Custom data loader for BERT

Is this custom data loader for BERT correct? I am getting a datatype error with this code (the bold line):

def __init__(self, path, use_tokenizer, max_sequence_len=None):
    df = pd.read_csv(path)

    texts = []
    labels = []
    for index, row in df.iterrows():
        source = row['source']
        pubDate = row['pubDate']
        author = row['author']
        title = row['title']
        content = row['content']
        **text.append((source, pubDate, author, title, content))**
        label_id = row['label']
        # Save encoded labels.
        labels.append(label_id)

The error is:

We don’t have enough code in your snippet to understand what is happening. Can you share more?


Hi @thomwolf, thanks for replying to this thread, much appreciated.
I think I was able to resolve it; I guess it is more related to the PyTorch data loader. Basically, BERT takes label IDs and text as its fields, and I want to give more features to the text field, such as the fields of my news dataset: source, title, date, news story, etc.
I learnt that we need to join them via the PyTorch data loader. My first question is: is it possible to pass this side information to BERT through some other means, rather than combining it in the data loader? If there is no such option in BERT or its variants, then I am left with the data loader option only.
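
For illustration, this is the kind of combining I have in mind: the metadata goes in as one segment and the news story as a second segment, so BERT sees them separated by [SEP]. This is just a sketch with a standard Hugging Face tokenizer and made-up values, not the code I am actually running:

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# First segment: the side information, second segment: the article body.
side_info = 'Reuters | Jane Doe | Some headline'   # source + author + title
story = 'Full text of the news article ...'

# Passing text and text_pair produces: [CLS] side_info [SEP] story [SEP]
encoded = tokenizer(side_info, story,
                    add_special_tokens=True,
                    truncation=True,
                    padding='max_length',
                    max_length=128,
                    return_tensors='pt')
print(encoded['input_ids'].shape)  # torch.Size([1, 128])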

I think I was able to resolve it after removing the date field from the data loader. I am not sure whether I can still pass the timestamp through the PyTorch data loader, though; I ordered the data temporally, so should it retain the time order during training? (More on this after the code below.)
Here is the source code:

import pandas as pd
import torch
from torch.utils.data import Dataset


class newsDataset(Dataset):
    def __init__(self, path, use_tokenizer, max_sequence_len=None):
        df = pd.read_csv(path, sep=',')
        max_sequence_len = use_tokenizer.max_len if max_sequence_len is None else max_sequence_len

        texts = []
        labels = []
        for index, row in df.iterrows():
            source = row['source']
            #pubDate = row['pubDate']
            author = row['author']
            title = row['title']
            content = row['content']

            # Save content.
            texts.append((source, author, title, content))
            label_id = row['label']
            # Save encoded labels.
            labels.append(label_id)

        # Number of examples.
        self.n_examples = len(labels)

        # Use tokenizer on texts. This can take a while.
        print('Using tokenizer on all texts. This can take a while...')
        self.inputs = use_tokenizer(texts, add_special_tokens=True, truncation=True, padding=True,
                                    return_tensors='pt', max_length=max_sequence_len)
        # Get maximum sequence length.
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        print('Texts padded or truncated to %d length!' % self.sequence_len)
        # Add labels.
        self.inputs.update({'labels': torch.tensor(labels)})
        print('Finished!\n')

        return

    def __len__(self):
        return self.n_examples

    def __getitem__(self, item):
        return {key: self.inputs[key][item] for key in self.inputs.keys()}
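
On the ordering question: the Dataset itself just returns rows by index, so whether training actually sees them in temporal order depends on how the batches are drawn. A rough sketch (the path, tokenizer and batch size are made up):

from torch.utils.data import DataLoader

news_dataset = newsDataset('news.csv', use_tokenizer=tokenizer)   # hypothetical path/tokenizer
# shuffle=False keeps the CSV row order, i.e. the temporal order;
# shuffle=True draws the rows in a random order each epoch.
train_dataloader = DataLoader(news_dataset, batch_size=8, shuffle=False)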

I was able to fix the error with the following code. The issue was the wrong way of appending the fields to the list of texts:

def __init__(self,  path, use_tokenizer, max_sequence_len=None):
    # Check max sequence length.
    df = pd.read_csv(path,sep=',')
    max_sequence_len = use_tokenizer.max_len if max_sequence_len is None else max_sequence_len
    texts = []
    labels = []
    # Loop over every row of the CSV; each row carries its own label.
    # df.columns: Index(['source', 'label', 'pubDate', 'title', 'author', 'content', 'pub'], dtype='object')
    
    for index, row in df.iterrows():
        source = row['source']
        pubDate = row['pubDate']
        author = row['author']
        title = row['title']
        content = row['content']

        # Clean up mis-encoded characters in the article text (fix_text, e.g. from the ftfy package).
        content = fix_text(content)
        texts.append(source)
        texts.append(author)
        texts.append(title)
        texts.append(content)

        label_id=row['label']
        # Save encoded labels.
        labels.append(label_id)


    # Number of examples.
    self.n_examples = len(labels)
  
    # Use tokenizer on texts. This can take a while.
    print('Using tokenizer on all texts. This can take a while...')
    self.inputs = use_tokenizer(texts, add_special_tokens=True, truncation=True, padding=True, return_tensors='pt',  max_length=max_sequence_len)
    # Get maximum sequence length.
    self.sequence_len = self.inputs['input_ids'].shape[-1]
    print('Texts padded or truncated to %d length!' % self.sequence_len)
    # Add labels.
    self.inputs.update({'labels':torch.tensor(labels)})
    print('Finished!\n')

    return
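
A variation that keeps texts and labels aligned one-to-one would be to join the fields into a single string per row, instead of appending them as separate list entries. Just a sketch with the same column names (the CSV path and the space separator are arbitrary choices, and fix_text is assumed to come from ftfy):

import pandas as pd
from ftfy import fix_text

df = pd.read_csv('news.csv', sep=',')   # hypothetical path, same columns as above
texts, labels = [], []
for index, row in df.iterrows():
    # One string per article: metadata first, then the cleaned story text.
    combined = ' '.join([str(row['source']),
                         str(row['author']),
                         str(row['title']),
                         fix_text(str(row['content']))])
    texts.append(combined)
    labels.append(row['label'])

assert len(texts) == len(labels)   # one text per label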