I am trying to follow the instructions from here, but using my own fine-tuned BERT model. I had to make a few adaptations, but I think I got the data into the same format as in the example. However, when I try to run the Trainer, I get the following error:
RuntimeError: The size of tensor a (8160) must match the size of tensor b (16) at non-singleton dimension 0
I set max_length=510 in the tokenizer call to match what I used when training the language model, and I also manually pad the labels to that length.
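To be concrete, after read_data each document is a list of word tokens with a parallel list of tags, along these lines (made-up values, not my real data):

train_texts[0]  # ['John', 'lives', 'in', 'Toronto']
train_tags[0]   # ['B-PER', 'O', 'O', 'B-LOC']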
The code I’m using is the following:
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments


def main():
    train_texts, train_tags = read_data('./data/ner_train.pkl')
    val_texts, val_tags = read_data('./data/ner_test.pkl')

    unique_tags = set(tag for doc in train_tags for tag in doc)
    tag2id = {tag: id for id, tag in enumerate(unique_tags)}
    id2tag = {id: tag for tag, id in tag2id.items()}

    tokenizer = AutoTokenizer.from_pretrained("C:\\Users\\Rogerio\\Documents\\bert-beaver-language")

    train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True,
                                padding=True, truncation=True, max_length=510)
    val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True,
                              padding=True, truncation=True, max_length=510)

    # align the tags with the tokenized inputs for the training set
    labels = [[tag2id[tag] for tag in doc] for doc in train_tags]
    train_labels = []
    for doc_labels, doc_offset in zip(labels, train_encodings.offset_mapping):
        # pad labels if necessary
        if len(doc_labels) < len(doc_offset):
            doc_labels += [-100] * (len(doc_offset) - len(doc_labels))
        for label, offset in zip(doc_labels, doc_offset):
            if offset[0] != 0 or offset == (0, 0):
                label = -100
            train_labels.append(label)

    # same alignment for the validation set
    labels = [[tag2id[tag] for tag in doc] for doc in val_tags]
    val_labels = []
    for doc_labels, doc_offset in zip(labels, val_encodings.offset_mapping):
        # pad labels if necessary
        if len(doc_labels) < len(doc_offset):
            doc_labels += [-100] * (len(doc_offset) - len(doc_labels))
        for label, offset in zip(doc_labels, doc_offset):
            if offset[0] != 0 or offset == (0, 0):
                label = -100
            val_labels.append(label)

    train_encodings.pop("offset_mapping")  # we don't want to pass this to the model
    val_encodings.pop("offset_mapping")

    train_dataset = NERDataset(train_encodings, train_labels)
    val_dataset = NERDataset(val_encodings, val_labels)

    model = AutoModelForTokenClassification.from_pretrained(
        "C:\\Users\\Rogerio\\Documents\\bert-beaver-language", num_labels=len(unique_tags))

    training_args = TrainingArguments(
        output_dir="C:\\Users\\Rogerio\\Documents\\bert-beaver-ner\\ner_output",
        logging_dir="C:\\Users\\Rogerio\\Documents\\bert-beaver-ner\\ner_logs",
        num_train_epochs=8,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_steps=10,
        save_total_limit=5,
        overwrite_output_dir=True,
        save_steps=750,
        do_eval=True,
        do_train=True,
        do_predict=True
    )

    trainer = Trainer(
        model=model,                  # the instantiated 🤗 Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset      # evaluation dataset
    )

    trainer.train()
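For completeness, NERDataset is essentially the dataset wrapper from the tutorial, reproduced here from memory (minor details may differ from my actual file):

import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # one item = the tokenizer outputs for this index plus its label entry
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)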
The full stack trace is below:
  0%|          | 0/1797240 [00:00<?, ?it/s]
Traceback (most recent call last):
File "C:/Users/Rogerio/Documents/Python_Projects/beaver-model-training/ner_bert.py", line 558, in <module>
main()
File "C:/Users/Rogerio/Documents/Python_Projects/beaver-model-training/ner_bert.py", line 496, in main
trainer.train()
File "C:\Users\Rogerio\python-virtual-envs\beaver-model-training\lib\site-packages\transformers\trainer.py", line 747, in train
tr_loss += self.training_step(model, inputs)
File "C:\Users\Rogerio\python-virtual-envs\beaver-model-training\lib\site-packages\transformers\trainer.py", line 1075, in training_step
loss = self.compute_loss(model, inputs)
File "C:\Users\Rogerio\python-virtual-envs\beaver-model-training\lib\site-packages\transformers\trainer.py", line 1099, in compute_loss
outputs = model(**inputs)
File "C:\Users\Rogerio\python-virtual-envs\beaver-model-training\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\Rogerio\python-virtual-envs\beaver-model-training\lib\site-packages\transformers\models\bert\modeling_bert.py", line 1541, in forward
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
RuntimeError: The size of tensor a (8160) must match the size of tensor b (16) at non-singleton dimension 0
Any ideas? I notice that 8160 = 16 × 510, i.e. my per-device train batch size times my max_length, but I can't see where the mismatch is coming from.