Hi folks,
I’m newbie and got stuck by this error msg “ValueError: Expected input batch_size (8) to match target batch_size (280)”.
Appreciate any advice, please… Thank you!
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
model_inputs = tokenizer(
text = examples["text"],
max_length=max_text_length,
truncation=True,
padding = True,
)
label = tokenizer(examples["label"], max_length=max_label_length, truncation=True, padding = True)
model_inputs["label"] = label["input_ids"]
return model_inputs
tokenized_aicgp = aicgp.map(preprocess_function, batched=True)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=no_of_labels, id2label=id2label, label2id=label2id)
training_args = TrainingArguments(
output_dir="my_model",
learning_rate=2e-5,
per_device_train_batch_size=8, #16
per_device_eval_batch_size=8, #16
num_train_epochs=2,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
push_to_hub=False,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_aicgp["train"],
eval_dataset=tokenized_aicgp["test"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
trainer.train()
Note:
The program aborted at trainer.train() with the below error message:
ValueError: Expected input batch_size (8) to match target batch_size (280).
tokenized_aicgp
DatasetDict({
train: Dataset({
features: ['label', 'text', 'lblId', 'input_ids', 'attention_mask'],
num_rows: 56
})
test: Dataset({
features: ['label', 'text', 'lblId', 'input_ids', 'attention_mask'],
num_rows: 14
})
})
length label 56
length text 56
length lblId 56
length input_ids 56
length attention_mask 56