Fine-tuning T5 on SQuAD

I think I'm misunderstanding something in how I'm approaching this problem, but I believe I have the right setup. Any guidance y'all can offer would be great.

Question: If my tokenized dataset contains input_ids, attention_mask, target_ids, and target_attention_mask, why is the Trainer complaining about needing decoder_input_ids?

Goal: Fine-tune T5 on the SQuAD dataset for extractive question answering

Error: ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
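
For context, my (possibly wrong) mental model is that T5ForConditionalGeneration builds decoder_input_ids on its own when it is given labels, by shifting them one position to the right. A minimal standalone sketch of that assumption (the question/answer strings are just placeholders):

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

inputs = tokenizer("question: Who wrote Hamlet? context: Hamlet was written by Shakespeare.",
                   return_tensors="pt")
targets = tokenizer("Shakespeare", return_tensors="pt")

# No decoder_input_ids are passed here; the model derives them internally
# from `labels`, so this forward pass does not raise the ValueError above.
outputs = model(input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                labels=targets.input_ids)
print(outputs.loss)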

Helper Functions

def subset_dataset(dataset, train_size=32, validation_size=16):
    # Extract the specified number of examples for training and validation
    train_data = dataset["train"].shuffle(seed=42).select(range(train_size))
    validation_data = dataset["validation"].shuffle(seed=42).select(range(validation_size))

    # Create a new DatasetDict with the subsetted data
    subsetted_dataset = DatasetDict({"train": train_data, "validation": validation_data})

    return subsetted_dataset

def t5_preprocess(entry):
    """
    Process input for T5 question and answer format
    """
    context = entry['context']
    question = entry['question']
    answer = entry['answers']['text'][0]
    answer_start = entry['answers']['answer_start'][0]
    answer_end = answer_start + len(answer)
    
    input_text = f"question: {question} context: {context}"
    target_text = answer
    
    entry = {'input_text':input_text,
             'target_text': target_text, 
            #  'question': question, 
            #  'context': context, 
            #  'answers': answer, 
            #  'answer_start':answer_start, 
            #  'answer_end':answer_end
    }
    return entry

def convert_to_features(entry, tokenizer):
    """
    Convert a preprocessed entry into tokenized features
    """

    input_encodings = tokenizer(entry['input_text'], padding='max_length', truncation=True, max_length=512)
    target_encodings = tokenizer(entry['target_text'], padding='max_length', truncation=True, max_length=512)

    entry = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return entry
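
For reference, a quick way to sanity-check the helpers on a single example (assuming the tokenizer from the full code below is already instantiated); the resulting keys line up with the tokenized dataset printed at the bottom of this post:

from datasets import load_dataset

sample = load_dataset('squad')['train'][0]
processed = t5_preprocess(sample)
features = convert_to_features(processed, tokenizer)
print(processed['input_text'][:80])
print(list(features.keys()))
# ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask']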

Full Code

import datetime

import torch
from datasets import load_dataset, DatasetDict
from transformers import (T5Tokenizer, T5ForConditionalGeneration,
                          DataCollatorForSeq2Seq, TrainingArguments, Trainer)

# define model checkpoint
model_checkpoint = 't5-small'

# instantiate tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# instantiate model
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# load data
datasets = load_dataset('squad') #87,599 samples
datasets = subset_dataset(datasets) # 48 examples

# tokenize input
tokenized_datasets = datasets.map(t5_preprocess, remove_columns=datasets["train"].column_names) # prep data
tokenized_datasets = tokenized_datasets.map(lambda examples: convert_to_features(examples, tokenizer), remove_columns=tokenized_datasets["train"].column_names)
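
# The columns that end up going into the data collator -- these match the
# dataset printed at the bottom of this post (note: target_ids, not labels):
print(tokenized_datasets["train"].column_names)
# ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask']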

# define save directory
model_checkpoint_save_directory = './model_checkpoints'

# define model name
model_name = model_checkpoint.split("/")[-1] # model name for pushing to hugging face

# define checkpoint save location (the Trainer writes checkpoint folders under output_dir)
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
model_checkpoint_file_path = model_checkpoint_save_directory + "/" + model_name + "/" + timestamp

# define Training Arguments
batch_size = 16
learning_rate = 2e-5 # step size used by the optimizer at each parameter update
weight_decay = 0.01
num_epochs = 4 # full passes over the training set; more epochs can improve performance, but too many risk overfitting
total_train_steps = (len(tokenized_datasets["train"]) // batch_size) * num_epochs

# define the data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=512,
)
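
# Sanity check (my guess, not verified): DataCollatorForSeq2Seq only does the
# decoder-side preparation when the features carry a `labels` key -- it pads
# labels with -100 and, because `model` is passed, derives decoder_input_ids
# from them. The token ids below are made up purely to illustrate the shapes.
example_features = [
    {"input_ids": [100, 19, 3, 9, 794, 1], "labels": [71, 794, 1]},
    {"input_ids": [100, 19, 430, 794, 1], "labels": [430, 1]},
]
example_batch = data_collator(example_features)
print(example_batch.keys())  # expecting input_ids, attention_mask, labels, decoder_input_ids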

# select the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# move the model to the selected device
model.to(device)

# define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
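
# NOTE: as written, this optimizer is never handed to the Trainer below, so the
# Trainer builds its own optimizer from the TrainingArguments. If I want this
# exact instance to be used, I believe it needs to be passed via
# optimizers=(optimizer, None) when constructing the Trainer (scheduler left to
# the default) -- not 100% sure that's required, though.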

# define training args
training_args = TrainingArguments(
    output_dir=model_checkpoint_file_path,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay
)

# define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Tokenized Dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'],
        num_rows: 16
    })
})

@alifunseen I'm in the same boat; I can't find anything that explains how to use T5ForConditionalGeneration for Q&A. That said, I'm using T5ForConditionalGeneration because I also want to train other tasks (summarization) in the same model. If you don't have that requirement, I'd suggest looking at AutoModelForQuestionAnswering instead; you'll find far more examples to follow with that class than with T5ForConditionalGeneration.
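
Something roughly along these lines is what I have in mind (untested sketch; distilbert-base-cased-distilled-squad is just one example of a checkpoint that already has the extractive QA head fine-tuned on SQuAD):

import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."

# Extractive QA models predict start/end token positions inside the context,
# so the training features are start_positions/end_positions rather than labels.
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
print(tokenizer.decode(inputs["input_ids"][0][start:end + 1]))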

BTW, if you DO happen to figure out how to get it working with T5ForConditionalGeneration, please let me know :slight_smile: