I’m new to huggingface and transformers and I’m trying to finetune mt5-small on a custom dataset.
The problem is that trainer.train()
seems to finish really quickly instead of actually training. I used similar code to finetune on a model from the huggingface hub, and it went through all the steps as expected. So I suspect it has something to do with how I’m building the custom dataset, but there isn’t great documentation on this.
So what am I doing wrong?
Here’s the code:
train_inputs = train_df['linearized_input'].tolist()
train_targets = train_df['table_text'].tolist()
validation_inputs = dev_df['linearized_input'].tolist()
validation_targets = dev_df['target'].tolist()
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
# Define custom dataset
class ForT5Dataset(torch.utils.data.Dataset):
def __init__(self, inputs, targets):
self.inputs = inputs
self.targets = targets
def __len__(self):
return len(self.targets)
def __getitem__(self, index):
input_ids = self.inputs["input_ids"][index]
attention_mask = self.inputs["attention_mask"][index]
target_ids = self.targets["input_ids"][index]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": target_ids,
}
# Initialize the tokenizer and model
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Tokenize the dataset
train_input_encodings = tokenizer(train_inputs, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
train_target_encodings = tokenizer(train_targets, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
val_input_encodings = tokenizer(validation_inputs, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
val_target_encodings = tokenizer(validation_targets, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
train_dataset = ForT5Dataset(train_input_encodings, train_target_encodings)
eval_dataset = ForT5Dataset(val_input_encodings, val_target_encodings)
# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Prepare the Trainer arguments
training_args = Seq2SeqTrainingArguments(
output_dir="./mt5-small-finetuned",
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
evaluation_strategy="steps",
eval_steps=100,
save_steps=1000,
num_train_epochs=3,
save_total_limit=3,
predict_with_generate=True
)
# Initialize Trainer
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
# Fine-tune the model
trainer.train()
# Save the trained model
trainer.save_model()
# Evaluate the model
results = trainer.evaluate()
print(results)
And for the output I just get [3/3 00:34, Epoch 3/3]
- why is it only 3/3? I have thousands of samples.
I’ve tried creating the Dataset in different ways but some error, some just get the same result as this.