No Improvement in Results after Implementing Unsupervised Denoising Training Technique for T5 Model using Hugging Face

Hello,

I am currently working on implementing an unsupervised denoising training technique using the Hugging Face library for the T5 model. I have written several versions of the code, but only one seems to run without errors. However, after training the model with this code, I am not seeing any improvement in the results.

Here is the code that I have been using:



import torch
import random
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

def generate_masked_sequence(paragraph, mask_prob):
    words = paragraph.split()
    num_words = len(words)
    num_extra_ids = max(1, round(mask_prob * num_words))
    extra_id_positions = random.sample(range(num_words), num_extra_ids)

    extra_id_positions.sort()
    input_ids = []
    labels = []
    for i, word in enumerate(words):
        if extra_id_positions and i == extra_id_positions[0]:
            input_ids.append(f"<extra_id_{len(input_ids)}>")
            labels.append(word)
            extra_id_positions.pop(0)  # Remove the used position
        else:
            input_ids.append(word)
            labels.append(f"<extra_id_{len(labels)}>")
    
    return " ".join(input_ids), " ".join(labels)

paragraphs = load_dataset('nima-nLc/dsm', split='train')

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hyperparameters
batch_size = 32
learning_rate = 5e-5
epochs = 3
mask_prob = 0.15

def preprocess_data(examples):
    input_seqs = []
    label_seqs = []
    for paragraph in examples['text']:
        input_seq, label_seq = generate_masked_sequence(paragraph, mask_prob)
        input_seqs.append(input_seq)
        label_seqs.append(label_seq)
    return {"input_ids": tokenizer(input_seqs, return_tensors="pt", padding=True, truncation=True).input_ids,
            "labels": tokenizer(label_seqs, return_tensors="pt", padding=True, truncation=True).input_ids}
train_dataset = paragraphs.map(preprocess_data, batched=True)

training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    report_to="none",  
    logging_steps=500,  
    save_steps=1000,  
    output_dir="./checkpoints" , 
    resume_from_checkpoint = True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reduced_dataset
)

print('Training Started')
trainer.train()
print('Training Finished')


Despite the code running successfully, the results from the model post-training do not show any noticeable improvements. I am unsure if the issue lies in the code itself or the approach I am using to implement the unsupervised denoising training technique.

I would greatly appreciate any insights or suggestions on what might be going wrong, or how I could modify my approach or code to improve the results.

Thank you in advance for your help!