Transformers: 4.4.2
PyTorch: 1.8.0
I am trying to fine-tune BERT for sequence classification on my own dataset, which has 6 classes. Here is some example code:
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
import pandas as pd
class MyDataset(Dataset):

    def __init__(self, csv_file: str):
        self.df = pd.read_csv(csv_file, encoding='ISO-8859-1')
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", padding_side='right', local_files_only=True)
        self.label_list = self.df['label'].value_counts().keys().to_list()

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> str:
        if torch.is_tensor(idx):
            idx = idx.tolist()
        text = self.df.iloc[idx, 1]
        label = self.label_list.index(self.df.iloc[idx, 3])
        return (text, label)
def data_collator(dataset_samples_list):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", padding_side='right', local_files_only=True)
    examples = [example[0] for example in dataset_samples_list]
    encoded_results = tokenizer(examples, padding=True, truncation=True, return_tensors='pt',
                                return_attention_mask=True)
    batch = {}
    batch['input_ids'] = torch.stack([result for result in encoded_results['input_ids']])
    batch['attention_mask'] = torch.stack([result for result in encoded_results['attention_mask']])
    batch['labels'] = torch.stack([torch.tensor(example[1]) for example in dataset_samples_list])
    return batch
train_data_obj = MyDataset('/path/to/train/data.csv')
eval_data_obj = MyDataset('/path/to/eval/data.csv')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model.config.num_labels = 6
training_args = TrainingArguments(
    output_dir='/path/to/output/dir',
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy='epoch',
    num_train_epochs=2,
    save_steps=10,
    gradient_accumulation_steps=4,
    dataloader_drop_last=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_obj,
    eval_dataset=eval_data_obj,
    data_collator=data_collator
)
trainer.train()
trainer.save_model("/path/to/model/save/dir")
When I run this, I get the following error:
Traceback (most recent call last):
  File "/path/to/my/project/scratch.py", line 9, in <module>
    bert_processor.train()
  File "/path/to/my/project/BertClassifierProcessor.py", line 156, in train
    self.trainer.train()
  File "/path/to/python/lib/python3.7/site-packages/transformers/trainer.py", line 1053, in train
    tr_loss += self.training_step(model, inputs)
  File "/path/to/python/lib/python3.7/site-packages/transformers/trainer.py", line 1443, in training_step
    loss = self.compute_loss(model, inputs)
  File "/path/to/python/lib/python3.7/site-packages/transformers/trainer.py", line 1475, in compute_loss
    outputs = model(**inputs)
  File "/path/to/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/path/to/python/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 1526, in forward
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  File "/path/to/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/path/to/python/lib/python3.7/site-packages/torch/nn/modules/loss.py", line 1048, in forward
    ignore_index=self.ignore_index, reduction=self.reduction)
  File "/path/to/python/lib/python3.7/site-packages/torch/nn/functional.py", line 2690, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "/path/to/python/lib/python3.7/site-packages/torch/nn/functional.py", line 2385, in nll_loss
    ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
IndexError: Target 3 is out of bounds.
Any thoughts about how to correct this or what I may be doing wrong? Thanks in advance!
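
One thing I am not sure about: I set model.config.num_labels = 6 after loading, but I don't know whether that actually resizes the classification head. A minimal, untested sketch of the alternative I was considering, assuming from_pretrained accepts num_labels as a config override:

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=6,  # build the classifier head with 6 outputs instead of the default 2
)

Would that be the right way to set the label count, or is the problem elsewhere in my code?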