Passing the tokenizer to Trainer for bucketing does not work for the evaluation set

Hi all,

The documentation says: "The tokenizer used to preprocess the data. If provided, it will be used to automatically pad the inputs to the maximum length when batching inputs, and it will be saved along with the model to make it easier to rerun an interrupted training or reuse the fine-tuned model."

If I understand it correctly, the library itself applies bucketing to the training data, right? By bucketing I mean that each batch is padded to the maximum length within that particular batch. I want to use this feature rather than padding everything to the global maximum length (across all samples). That’s why, when I create the input encodings for train_dataset and eval_dataset, I set truncation and padding to False.
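To illustrate what I mean by bucketing, here is a minimal sketch (assuming DataCollatorWithPadding, which I believe is what the Trainer falls back to when a tokenizer is passed; it pads each batch only to the longest sequence in that batch, and the sample strings are made up):

from transformers import DataCollatorWithPadding, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
collator = DataCollatorWithPadding(tokenizer)

# two unpadded examples of different lengths
features = [
    tokenizer('fix this short snippet'),
    tokenizer('fix this considerably longer snippet with many more tokens in it'),
]

batch = collator(features)
# input_ids are padded to the longest sequence in this batch, not to a global maximum
print(batch['input_ids'].shape)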

In this case the training works, but the validation throws an error: RuntimeError: Sizes of tensors must match except in dimension 0. Got 56 and 28 in dimension 1 (The offending index is 1)

I do not exactly understand the error, because my evaluation batch size is 1, so regardless of the sequence lengths I think it should have worked.

Does anyone have an idea what is going on here?

Could you post the code you’re using? Training and evaluation dataloaders are built the same way so there is no reason it should work for one and not the other.

Hi,

Thanks for the reply. Here is my code:

import torch
import argparse
import os
import sys
import numpy as np
import torch.nn.functional as F
sys.path.append('..')
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
from data_reader import GetDataAsPython
from sklearn.model_selection import train_test_split
from prepare_data import create_data, create_dataset
from transformers import T5Tokenizer

parser = argparse.ArgumentParser()
parser.add_argument('-e', '--epochs', type=int, default=100)
parser.add_argument('-bs', '--batch-size', type=int, default=1)
parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4)
parser.add_argument('-gcv', '--gradient-clip-val', type=float, default=0.0)
parser.add_argument('-wd', '--weight-decay', type=float, default=0.01)
args = parser.parse_args()

# delete previous log and results directories for this model
model_name = "t5"
os.system("rm -rf ./logs_" + model_name)
os.system("rm -rf ./results_" + model_name)

data = GetDataAsPython('../data_large.json')

train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = create_data(data, ['no-array-constructor'])

# from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
print('len of tokenizer before adding: ', len(tokenizer))
tokenizer.add_tokens(['{', '}', '<', '>'])
train_dataset = create_dataset(train_inputs, train_labels, tokenizer, True)
val_dataset = create_dataset(val_inputs, val_labels, tokenizer, False)
test_dataset = create_dataset(test_inputs, test_labels, tokenizer, False)

training_args = TrainingArguments(
    output_dir='./results_' + model_name,          
    num_train_epochs=args.epochs,              
    per_device_train_batch_size=args.batch_size,  
    per_device_eval_batch_size=4,   
    warmup_steps=500,                
    weight_decay=args.weight_decay,               
    logging_dir='./logs_' + model_name,
    logging_steps=10,
    do_eval=True,
    evaluation_strategy='epoch',
    learning_rate=args.learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # prediction_loss_only=True
)

model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
# resize the embedding matrix to account for the tokens added to the tokenizer above
model.resize_token_embeddings(len(tokenizer))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Trainer expects an (optimizer, lr_scheduler) tuple; None leaves the scheduler to the Trainer
    optimizers=(torch.optim.Adam(params=model.parameters(), lr=args.learning_rate), None),
    tokenizer=tokenizer,
    compute_metrics=compute_val_metrics  # defined elsewhere (not shown here)
)

trainer.train()

and I have a second file where I create the dataset:

from sklearn.model_selection import train_test_split
import torch

def filter_rule(data, rule_type):
    filtered_data = []
    for point in data:
        if point.linter_report.rule_id == rule_type:
            filtered_data.append(point)

    return filtered_data

def split_filtered(filtered_data, seed=11):
    inputs = [data_point.GetT5Representation()[0] for data_point in filtered_data]
    outputs = [data_point.GetT5Representation()[1] for data_point in filtered_data]
    
    test_size = 0.1 if len(inputs) >= 10 else 1 / len(inputs)
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, outputs, shuffle=True, random_state=seed, test_size=test_size)
    val_size = 0.1 if len(train_inputs) >= 10 else 1 / len(train_inputs)
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, shuffle=True, random_state=seed, test_size=val_size)
    
    return train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels


def create_data(data, linter_warnings: list):
    train, train_labels = [], []
    val, val_labels = [], []
    test, test_labels = [], []

    for warning in linter_warnings:
        filtered_data = filter_rule(data, warning)
        
        train_w, train_w_labels, val_w, val_w_labels, test_w, test_w_labels = split_filtered(filtered_data)

        train += train_w
        train_labels += train_w_labels

        val += val_w
        val_labels += val_w_labels

        test += test_w
        test_labels += test_w_labels

    print('train size: ', len(train))
    print('val size: ', len(val))
    print('test size: ', len(test))
    return train, train_labels, val, val_labels, test, test_labels



class BugFixDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.target_encodings = targets
    # each item pairs the tokenized input with its tokenized target (as 'labels')
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.target_encodings['input_ids'][index], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

def create_dataset(inputs, labels, tokenizer, training):
    # training data is left unpadded so the Trainer can pad per batch;
    # evaluation data is padded (and truncated) to a common length up front
    pad = not training
    input_encodings = tokenizer(inputs, truncation=pad, padding=pad)
    label_encodings = tokenizer(labels, truncation=pad, padding=pad)

    dataset = BugFixDataset(input_encodings, label_encodings)
    return dataset

As a workaround, I introduced the training boolean parameter in the create_dataset function. For training I do not pad and let the Trainer handle it; for evaluation I pass training=False so the tokenizer pads everything up front. If the evaluation dataset is also left unpadded, it crashes.
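To make the difference concrete, here is a small, self-contained sketch of what the two branches produce (the sample strings are made up):

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
samples = ['short input', 'a noticeably longer input with several more tokens in it']

# training branch: no padding, every example keeps its own length
unpadded = tokenizer(samples, truncation=False, padding=False)
print([len(ids) for ids in unpadded['input_ids']])   # lengths differ per example

# evaluation branch: pad (and truncate) everything to one common length
padded = tokenizer(samples, truncation=True, padding=True)
print([len(ids) for ids in padded['input_ids']])     # all lengths equal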

Ah yes, this can’t work because your model returns logits that have the sequence length as one of their dimensions. During evaluation the Trainer tries to concatenate the predictions of all batches and can’t do that when they have different shapes, so for evaluation you need to pad to the max length. (This wouldn’t be the case with a model whose outputs don’t have the sequence length in their dimensions, like a classification model.)
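You can reproduce the concatenation failure in isolation; this is just an illustrative sketch with shapes made up to match your error message:

import torch

vocab_size = 32104  # hypothetical: len(tokenizer) after the four added tokens

# logits from two evaluation batches, each of size 1 but with different sequence lengths
logits_batch_1 = torch.randn(1, 56, vocab_size)
logits_batch_2 = torch.randn(1, 28, vocab_size)

# the Trainer concatenates eval predictions along dimension 0, which requires
# every other dimension to match, hence the error even with a batch size of 1
torch.cat([logits_batch_1, logits_batch_2], dim=0)
# RuntimeError: Sizes of tensors must match except in dimension 0. Got 56 and 28 in dimension 1

(Passing prediction_loss_only=True, which is commented out in your TrainingArguments, would also sidestep the logit accumulation, at the cost of not being able to compute metrics from the predictions.)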


Okay, I understand the issue now. Thank you very much for your help!