Fine-tuning T5 on the STS-B dataset generates wrong outputs

Hello, everyone.

I was trying to fine-tune the T5 model on the STS-B dataset without task prefixes. However, when I decode some predictions, the output is a sentence instead of a number. I don’t understand why this happens. Any help is appreciated, thanks a lot!

My code is:


from transformers import T5Config, T5ForConditionalGeneration, get_scheduler,T5Tokenizer
from transformers.optimization import Adafactor, AdafactorSchedule
from datasets import load_dataset, load_dataset_builder, load_metric
from tqdm import tqdm
import torch 
from torch.utils.data import DataLoader
import time
from dataclasses import dataclass
import numpy as np 

def same_seed(seed): 
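    # fix all random seeds so results are reproducible across runs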
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = False 
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def evaluate_model(model, dataloader, metric, device, tokenizer):
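    # generate predictions for every batch, decode them, and feed them to the GLUE STS-B metric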
    model.eval().to(device)
    for i, batch in enumerate(tqdm(dataloader)):
        batch = {k: v.to(device)  for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(input_ids=batch['input_ids'], 
                          attention_mask=batch['attention_mask'],
                          max_length=16,
                          early_stopping=True)

            predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
            batch['labels'][batch['labels'][:, :] == -100] = 0  # restore the pad token id so the labels can be decoded
            real_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['labels']]

            predictions = [float(i) for i in predictions]  # this fails when the model generates ordinary text instead of a number
            real_labels = [float(i) for i in real_labels]

        metric.add_batch(predictions=predictions, references=real_labels)
    score = metric.compute()  # for STS-B this is Pearson/Spearman correlation, not a loss
    return score

def add_eos_to_examples(example):
    example['input_text'] = '%s</s>%s' % (example['sentence1'], example['sentence2'])
    # snap the label to the nearest multiple of 0.2 so the targets form a small, fixed set of strings
    processed_label = round(example['label'], 1)
    if int(processed_label * 10) % 2 != 0:
        min_num = processed_label - 0.1
        max_num = processed_label + 0.1

        if (example['label'] - min_num) <= (max_num - example['label']):
            processed_label = min_num
        else:
            processed_label = max_num

    example['target_text'] = '%s' % round(processed_label, 1)  # format the rounded score as a short string target
    return example

@dataclass
class T2TDataCollator:
    def __call__(self, batch):
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([example['input_ids'] for example in batch])
        lm_labels = torch.stack([example['target_ids'] for example in batch])
        lm_labels[lm_labels[:, :] == 0] = -100  # ignore padding (T5's pad_token_id is 0) when computing the loss
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        
        return {
            'input_ids': input_ids, 
            'attention_mask': attention_mask,
            'labels': lm_labels
        }

def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    config = {
        'seed': 5201314, 
        'model_name' : 't5-base',
        'dataset': 'stsb',
        'num_epochs': 20,
        'batch_size': 2,  
        'learning_rate': 3e-4,
        'optimizer': 'AdamW',
        'datetime': time.strftime('%Y%m%d-%H%M%S'),
        'save_path': 'results'
    }

    same_seed(config['seed'])
    tokenizer = T5Tokenizer.from_pretrained(config['model_name'], model_max_length=512)

    def convert_to_features(example_batch):
        max_input_len = 512
        max_target_len = 16
        input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], padding='max_length', max_length=max_input_len, truncation=True)
        target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], padding='max_length', max_length=max_target_len, truncation=True)
        encodings = {
            'input_ids': input_encodings['input_ids'], 
            'attention_mask': input_encodings['attention_mask'],
            'target_ids': target_encodings['input_ids']
        }

        return encodings

    raw_datasets = load_dataset('glue', config['dataset'])
    tokenized_datasets = raw_datasets.map(add_eos_to_examples)
    tokenized_datasets = tokenized_datasets.map(convert_to_features, batched=True) 
    columns = ['input_ids', 'target_ids', 'attention_mask']
    tokenized_datasets.set_format(type='torch', columns=columns)
    data_collator = T2TDataCollator()
    train_dataloader = DataLoader(
        tokenized_datasets['train'], shuffle=True, batch_size=config['batch_size'], collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        tokenized_datasets['validation'], batch_size=config['batch_size'], collate_fn=data_collator
    )


    model = T5ForConditionalGeneration.from_pretrained(config['model_name'], return_dict=True)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['learning_rate'])
    num_epochs = config['num_epochs']
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    metric = load_metric('glue', config['dataset'])
    for epoch in range(num_epochs):
        model.train().to(device)
        for i, batch in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
           
        train_score = evaluate_model(model, train_dataloader, metric, device, tokenizer)
        val_score = evaluate_model(model, eval_dataloader, metric, device, tokenizer)
        print(f'Epoch [{epoch+1}/{num_epochs}]: Train set: {train_score}, Validation set: {val_score}')

if __name__ == "__main__":
    main()

The generation space of T5ForConditionalGeneration is the whole vocabulary. That is, you may want a number, but the number is treated as a string, i.e. ordinary text. You may get what you want once the model is fine-tuned well enough. Otherwise, you can use the T5 encoder and add a custom layer that converts the last hidden state into a number.
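
In case it helps, here is a minimal sketch of that second option: a T5 encoder with a small regression head on top of the pooled last hidden state. The class name T5Regressor, the mean pooling, and the MSE loss are my own illustrative choices, not part of the code above.

import torch
import torch.nn as nn
from transformers import T5EncoderModel, T5Tokenizer

class T5Regressor(nn.Module):
    def __init__(self, model_name='t5-base'):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        self.head = nn.Linear(self.encoder.config.d_model, 1)  # project hidden size -> single score

    def forward(self, input_ids, attention_mask, labels=None):
        hidden = self.encoder(input_ids=input_ids,
                              attention_mask=attention_mask).last_hidden_state
        # mean-pool over non-padding tokens, then map to one similarity score
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        score = self.head(pooled).squeeze(-1)
        loss = None
        if labels is not None:
            loss = nn.functional.mse_loss(score, labels.float())  # regression loss instead of text generation
        return {'loss': loss, 'score': score}

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5Regressor()
enc = tokenizer(['A man is playing a guitar.'], ['A person plays an instrument.'],
                return_tensors='pt', padding=True, truncation=True)
out = model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'],
            labels=torch.tensor([3.8]))
print(out['score'])  # a real-valued prediction, no decoding needed

With this setup the similarity score comes straight out of the head as a float, so there is nothing to decode and nothing that can come back as a sentence.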

If I train the T5 model on text data for 100 epochs, it delivers good results. But after fine-tuning the same trained model on new data, it gives incorrect results.