Hello,
I am trying to fine-tune a pretrained model (cardiffnlp/twitter-roberta-base-sentiment-latest) and am getting the following error when I run Trainer.train():
raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
ValueError: Target size (torch.Size([7])) must be the same as input size (torch.Size([7, 11]))
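For reference, here is how I have been inspecting the model's label setup while trying to make sense of the shapes in the error. This is just a diagnostic sketch; I am assuming the 11 in the input size corresponds to the classification head's num_labels:

from transformers import AutoConfig

# Diagnostic sketch: look at the label space the checkpoint was trained with.
config = AutoConfig.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
print(config.num_labels)    # number of logits the classification head produces
print(config.id2label)      # mapping from logit index to label name
print(config.problem_type)  # None, "single_label_classification", or "multi_label_classification"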
Note: the dataset I am using for the fine-tuning is only 7 rows, just for testing. Its format is:
,text,labels
0, "some text here", 2
1, "some text here", 0
2, "some text here", 9
...
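I also checked how the labels column comes back from load_dataset, since as far as I understand the sequence classification models pick their loss based on whether the labels are integer class ids or float vectors. A quick sanity-check sketch, using my file name:

from datasets import load_dataset

# Sanity check (sketch): make sure the labels column holds integer class ids
# and see how many distinct values the CSV actually contains.
dataset = load_dataset("csv", data_files={"train": "emotion_train_data.csv"})
print(dataset["train"].features)                # column names and dtypes
print(sorted(set(dataset["train"]["labels"])))  # distinct label values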
I have successfully run the same code on both the twitter-roberta-base-sentiment-latest and roberta-base-emotion models.
Here is a minimal example that produces the error with the model mentioned above and works for the others:
from numpy import argmax
from evaluate import load as evload
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
def compute_metrics(eval_pred) -> dict:
    load_accuracy = evload("accuracy")  # load_metric("accuracy")
    load_f1 = evload("f1")  # load_metric("f1")
    logits, labels = eval_pred
    predictions = argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}
def tokenization(examples):
    return tokenizer(examples["text"])  # , truncation=True)
MODEL = 'cardiffnlp/roberta-base-emotion'
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataset = load_dataset("csv", data_files={"train": "emotion_train_data.csv"})
tokenized_train = dataset["train"].map(tokenization, batched=True)
training_args = TrainingArguments(
    output_dir="trained_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
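One thing I have been wondering about, though I am not sure it is the right fix, is whether I need to re-initialize the classification head so its output size matches the label ids in my CSV. A rough sketch of what I mean (num_labels=10 is just a placeholder for however many distinct labels my data has):

from transformers import AutoModelForSequenceClassification

MODEL = "cardiffnlp/roberta-base-emotion"  # same checkpoint as in the example above

# Sketch: re-create the model with a label count matching my data.
# ignore_mismatched_sizes=True lets from_pretrained drop the pretrained head
# weights instead of failing on the shape mismatch.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=10,
    ignore_mismatched_sizes=True,
)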
Any advice on this would be great!