Trainer.train() seems to finish almost instantly

I’m new to Hugging Face and the transformers library, and I’m trying to fine-tune google/mt5-small on a custom dataset.

The problem is that trainer.train() finishes in a few seconds instead of actually training. I used similar code to fine-tune a model from the Hugging Face Hub, and that run went through all of the training steps as expected. So I suspect the issue is in how I’m building the custom dataset, but I haven’t found much documentation on this.
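
For reference, my understanding from the PyTorch docs is that a map-style dataset only needs __len__ and __getitem__, where __len__ reports the number of examples and __getitem__ returns one example as a dict of tensors. A generic sketch of that idea (made-up names, not my real code):

import torch

class ToyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # dict of tensors from the tokenizer
        self.labels = labels        # tensor of label ids

    def __len__(self):
        # number of examples in the dataset
        return self.labels.size(0)

    def __getitem__(self, idx):
        # one example as a dict of tensors, ready for the data collator
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item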

So what am I doing wrong?

Here’s the code:

train_inputs = train_df['linearized_input'].tolist()
train_targets = train_df['table_text'].tolist()

validation_inputs = dev_df['linearized_input'].tolist()
validation_targets = dev_df['target'].tolist()

import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define custom dataset
class ForT5Dataset(torch.utils.data.Dataset):
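    # inputs and targets are the tokenizer's BatchEncoding outputs
    # (dicts holding input_ids / attention_mask tensors)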
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, index):
        input_ids = self.inputs["input_ids"][index]
        attention_mask = self.inputs["attention_mask"][index]
        target_ids = self.targets["input_ids"][index]
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": target_ids,
        }

# Initialize the tokenizer and model
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the dataset
train_input_encodings = tokenizer(train_inputs, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
train_target_encodings = tokenizer(train_targets, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

val_input_encodings = tokenizer(validation_inputs, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
val_target_encodings = tokenizer(validation_targets, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

train_dataset = ForT5Dataset(train_input_encodings, train_target_encodings)
eval_dataset = ForT5Dataset(val_input_encodings, val_target_encodings)

# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Prepare the Trainer arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-small-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=1000,
    num_train_epochs=3,
    save_total_limit=3,
    predict_with_generate=True
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

# Save the trained model
trainer.save_model()

# Evaluate the model
results = trainer.evaluate()
print(results)

The only output I get is [3/3 00:34, Epoch 3/3]. As far as I understand, the 3/3 in the progress bar is the total number of training steps across all three epochs, i.e. a single batch per epoch. Why is it only 3 steps when I have thousands of samples?
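
In case it helps with diagnosing, this is the kind of sanity check I’d run next to see how many examples the Trainer actually sees (just a sketch, I haven’t captured this output):

# Sanity checks (sketch): the Trainer derives steps per epoch from
# len(train_dataset) and per_device_train_batch_size, so these numbers
# should line up with the thousands of rows in the dataframe.
print(len(train_df))                        # rows in the source dataframe
print(len(train_dataset))                   # examples the Trainer will see
print(len(trainer.get_train_dataloader()))  # batches per training epoch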

I’ve tried building the Dataset in a few different ways; some of them raise errors, and the rest just give the same result as above.