Fine-tuning T5 on the STS-B dataset generates wrong outputs

Hello, everyone.

I was trying to fine-tune the T5 model on the STS-B dataset without task prefixes. However, when I decode some predictions, the output is a sentence instead of a number. I don’t understand why this happens. Any help is appreciated, thanks a lot!

My code is:


from transformers import T5Config, T5ForConditionalGeneration, get_scheduler,T5Tokenizer
from transformers.optimization import Adafactor, AdafactorSchedule
from datasets import load_dataset, load_dataset_builder, load_metric
from tqdm import tqdm
import torch 
from torch.utils.data import DataLoader
import time
from dataclasses import dataclass
import numpy as np 

def same_seed(seed): 
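    # fix all random seeds so results are reproducible across runs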
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = False 
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def evaluate_model(model, dataloader, metric, device, tokenizer):
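    # generate predictions for every batch, decode them, and feed them to the GLUE STS-B metric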
    model.eval().to(device)
    for i, batch in enumerate(tqdm(dataloader)):
        batch = {k: v.to(device)  for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(input_ids=batch['input_ids'], 
                          attention_mask=batch['attention_mask'],
                          max_length=16,
                          early_stopping=True)

            predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
            batch['labels'][batch['labels'][:, :] == -100] = 0  # restore the pad token id so the labels can be decoded
            real_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['labels']]

            predictions = [float(i) for i in predictions]  # this fails when the model generates ordinary text instead of a number
            real_labels = [float(i) for i in real_labels]

        metric.add_batch(predictions=predictions, references=real_labels)
    score = metric.compute()  # for STS-B this is Pearson/Spearman correlation, not a loss
    return score

def add_eos_to_examples(example):
    example['input_text'] = '%s</s>%s' % (example['sentence1'], example['sentence2'])
    # snap the label to the nearest multiple of 0.2 so the targets form a small, fixed set of strings
    processed_label = round(example['label'], 1)
    if int(processed_label * 10) % 2 != 0:
        min_num = processed_label - 0.1
        max_num = processed_label + 0.1

        if (example['label'] - min_num) <= (max_num - example['label']):
            processed_label = min_num
        else:
            processed_label = max_num

    example['target_text'] = '%s' % round(processed_label, 1)  # format the rounded score as a short string target
    return example

@dataclass
class T2TDataCollator:
    def __call__(self, batch):
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([example['input_ids'] for example in batch])
        lm_labels = torch.stack([example['target_ids'] for example in batch])
        lm_labels[lm_labels[:, :] == 0] = -100  # ignore padding (T5's pad_token_id is 0) when computing the loss
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        
        return {
            'input_ids': input_ids, 
            'attention_mask': attention_mask,
            'labels': lm_labels
        }

def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    config = {
        'seed': 5201314, 
        'model_name' : 't5-base',
        'dataset': 'stsb',
        'num_epochs': 20,
        'batch_size': 2,  
        'learning_rate': 3e-4,
        'optimizer': 'AdamW',
        'datetime': time.strftime('%Y%m%d-%H%M%S'),
        'save_path': 'results'
    }

    same_seed(config['seed'])
    tokenizer = T5Tokenizer.from_pretrained(config['model_name'], model_max_length=512)

    def convert_to_features(example_batch):
        max_input_len = 512
        max_target_len = 16
        input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], padding='max_length', max_length=max_input_len, truncation=True)
        target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], padding='max_length', max_length=max_target_len, truncation=True)
        encodings = {
            'input_ids': input_encodings['input_ids'], 
            'attention_mask': input_encodings['attention_mask'],
            'target_ids': target_encodings['input_ids']
        }

        return encodings

    raw_datasets = load_dataset('glue', config['dataset'])
    tokenized_datasets = raw_datasets.map(add_eos_to_examples)
    tokenized_datasets = tokenized_datasets.map(convert_to_features, batched=True) 
    columns = ['input_ids', 'target_ids', 'attention_mask']
    tokenized_datasets.set_format(type='torch', columns=columns)
    data_collator = T2TDataCollator()
    train_dataloader = DataLoader(
        tokenized_datasets['train'], shuffle=True, batch_size=config['batch_size'], collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        tokenized_datasets['validation'], batch_size=config['batch_size'], collate_fn=data_collator
    )


    model = T5ForConditionalGeneration.from_pretrained(config['model_name'], return_dict=True)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['learning_rate'])
    num_epochs = config['num_epochs']
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    metric = load_metric('glue', config['dataset'])
    for epoch in range(num_epochs):
        model.train().to(device)
        for i, batch in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
           
        train_score = evaluate_model(model, train_dataloader, metric, device, tokenizer)
        val_score = evaluate_model(model, eval_dataloader, metric, device, tokenizer)
        print(f'Epoch [{epoch+1}/{num_epochs}]: Train set: {train_score}, Validation set: {val_score}')

if __name__ == "__main__":
    main()

The generation space of T5ForConditionalGeneration is the whole vocabulary. That is, you may want a number, but the number is treated as a string, i.e. ordinary text. You may get what you want once the model is fine-tuned well enough. Otherwise, you can use the T5 encoder and add a custom layer that converts the last hidden state into a number.
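
In case it helps, here is a minimal sketch of that second option: a T5 encoder with a small regression head on top of the pooled last hidden state. The class name T5Regressor, the mean pooling, and the MSE loss are my own illustrative choices, not part of the code above.

import torch
import torch.nn as nn
from transformers import T5EncoderModel, T5Tokenizer

class T5Regressor(nn.Module):
    def __init__(self, model_name='t5-base'):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        self.head = nn.Linear(self.encoder.config.d_model, 1)  # project hidden size -> single score

    def forward(self, input_ids, attention_mask, labels=None):
        hidden = self.encoder(input_ids=input_ids,
                              attention_mask=attention_mask).last_hidden_state
        # mean-pool over non-padding tokens, then map to one similarity score
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        score = self.head(pooled).squeeze(-1)
        loss = None
        if labels is not None:
            loss = nn.functional.mse_loss(score, labels.float())  # regression loss instead of text generation
        return {'loss': loss, 'score': score}

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5Regressor()
enc = tokenizer(['A man is playing a guitar.'], ['A person plays an instrument.'],
                return_tensors='pt', padding=True, truncation=True)
out = model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'],
            labels=torch.tensor([3.8]))
print(out['score'])  # a real-valued prediction, no decoding needed

With this setup the similarity score comes straight out of the head as a float, so there is nothing to decode and nothing that can come back as a sentence.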

If I train the T5 model on text data for 100 epochs, it delivers good results. But after fine-tuning the same trained model on new data, it gives incorrect results.