Fine-tuning T5-small but getting lower performance

Hi guys,

I am trying to fine-tune the T5-small model on several customized datasets for translation tasks.
I have 10 customized datasets; each training set has 200-500 sentences (roughly 14,000 tokens),
and each test set has 400-1,000 sentences (roughly 28,000 tokens).

First I evaluate the BLEU score of the pre-trained T5-small model on each test set; call these 10 values score-0.
Then I fine-tune on each training set and evaluate on the corresponding test set again; call these 10 values score-1.

The weird thing is that score-1 < score-0 in most of the 10 cases.
I have tried different learning rates, but that doesn't help.

I have also read the T5 Finetuning Tips thread and tried its suggestions, but that doesn't help either.

Here is my code:

from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# args() and get_optimizer() are my own small helpers (argument parsing and
# optimizer construction); a rough version of get_optimizer is at the bottom of the post.


def get_data(split, batchsz, domain=None):
    # Load one of my JSON files and wrap it in a DataLoader
    if split == 'train_teacher':
        path = './data/aggs/all.json'
        temp_data = load_dataset('json', data_files=path)['train']
        temp_loader = DataLoader(temp_data, shuffle=True, batch_size=batchsz)
        return temp_loader

    if split == 'FT_target':
        path = './data/test_support/{}.json'.format(domain)
        temp_data = load_dataset('json', data_files=path)['train']
        temp_loader = DataLoader(temp_data, shuffle=True, batch_size=batchsz)
        return temp_loader

    if split == 'evaluation':
        path = './data/test_query/{}.json'.format(domain)
        temp_data = load_dataset('json', data_files=path)['train']
        temp_loader = DataLoader(temp_data, shuffle=True, batch_size=batchsz)
        return temp_loader
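
# (For reference: each JSON file has one field per language; the keys and
#  sentences here are hypothetical, the real keys come from
#  args().src_language / args().tgt_language. A file looks roughly like
#      [{"en": "Cases are rising again.", "de": "Die Fälle steigen wieder."}, ...]
#  and the DataLoader yields each batch as a dict of lists of strings,
#      {"en": ["...", ...], "de": ["...", ...]},
#  which is what encode_sentences below iterates over.)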


def compute_bleu(metric_name, y_pred, y_true):
    metric = load_metric(metric_name)
    metric.add_batch(predictions=y_pred, references=y_true)
    report = metric.compute()
    if metric_name == 'bleu':
        # the 'bleu' metric reports a 0-1 score, so scale it to 0-100
        return report['bleu'] * 100
    if metric_name == 'sacrebleu':
        # sacrebleu already reports a 0-100 score
        return report['score']
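
# Example of the input shapes each metric expects (toy tokens/sentences,
# just to show the format I pass in):
#   'bleu'      -> token lists:
#       compute_bleu('bleu',
#                    y_pred=[['ein', 'kleiner', 'Hund']],
#                    y_true=[[['ein', 'kleiner', 'Hund']]])
#   'sacrebleu' -> plain strings:
#       compute_bleu('sacrebleu',
#                    y_pred=['ein kleiner Hund'],
#                    y_true=[['ein kleiner Hund']])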


def encode_sentences(is_src, batch):

    # Prepare and tokenize the source (with task prefix) or target sentences
    prefix = "Translate English to German: "
    if is_src:
        lines = [prefix + line for line in batch[args().src_language]]
    else:
        lines = list(batch[args().tgt_language])

    encoded_sentences = tokenizer(
        lines,
        max_length=200,
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True).input_ids.to(device)
    return encoded_sentences


def evaluation(epoch):

    model.eval()
    y_true = []
    y_pred = []
    eva_loader = get_data('evaluation', args().batchsz, domain)

    for i, batch in enumerate(eva_loader):
        # Prepare and tokenize the source sentences
        encoded_src = encode_sentences(is_src=True, batch=batch)

        # Translate and decode the inputs
        outputs = model.generate(encoded_src, max_length=200)
        batch_pred = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Collect the reference and predicted sentences (each reference is
        # wrapped in a list, since both metrics expect a list of references
        # per prediction)
        for sentence in batch[args().tgt_language]:
            if args().metric == 'bleu':
                sentence = tokenizer.tokenize(sentence)
            y_true.append([sentence])
        for sentence in batch_pred:
            if args().metric == 'bleu':
                sentence = tokenizer.tokenize(sentence)
            y_pred.append(sentence)

    bleu = compute_bleu(args().metric, y_pred, y_true)
    now = datetime.now()
    print('Time: {}:{}, Domain: {}, {} score: {:.2f}'.format(now.hour, now.minute, domain, args().metric, bleu))
   

def fine_tune_target():

    model.train()
    ft_loader = get_data(split='FT_target', batchsz=args().batchsz, domain=domain)
    optimizer = get_optimizer(model, args().optimizer_name, args().tgt_ft_lr)

    for epoch in range(args().tgt_ft_epochs):

        loss = 0

        for i, batch in enumerate(ft_loader):
            # Prepare the source and target data corpus
            encoded_src = encode_sentences(is_src=True, batch=batch)
            encoded_tgt = encode_sentences(is_src=False, batch=batch)
            prediction = model(input_ids=encoded_src, labels=encoded_tgt)

            # T5 computes the cross-entropy loss internally when labels are passed
            train_loss = prediction.loss
            loss += train_loss.item()

            # Update model parameters
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

        loss = loss / len(ft_loader)
        now = datetime.now()
        print('Time: {}:{}, Epoch: {}, Fine Tune Loss: {:.4f}'.format(now.hour, now.minute, epoch + 1, loss))
        evaluation(epoch + 1)
    now = datetime.now()
    print('Time: {}:{}, {} Done'.format(now.hour, now.minute, domain))


model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-small")
domain = 'covid'
fine_tune_target()
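
One thing I am not sure about: I pass the padded target ids straight in as labels. From what I read, pad positions in the labels are usually replaced with -100 so the loss ignores them, roughly like this (just a sketch of that variant, not what my code above does):

encoded_tgt = encode_sentences(is_src=False, batch=batch)
labels = encoded_tgt.clone()
# Mask padding so the cross-entropy loss skips these positions
labels[labels == tokenizer.pad_token_id] = -100
prediction = model(input_ids=encoded_src, labels=labels)

Could that (or anything else above) explain the drop?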

Hyperparameters:
optimizer: adafactor
learning rate: 5e-4
fine-tune epochs: 20
batch size: 16
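
In case it matters, get_optimizer just maps the optimizer name and learning rate to an optimizer instance; for Adafactor it is roughly this (a sketch using the constant-learning-rate settings from the finetuning tips thread):

from transformers import Adafactor

def get_optimizer(model, optimizer_name, lr):
    if optimizer_name == 'adafactor':
        # Constant learning rate: disable Adafactor's relative-step schedule
        return Adafactor(model.parameters(), lr=lr,
                         scale_parameter=False, relative_step=False,
                         warmup_init=False)
    return torch.optim.AdamW(model.parameters(), lr=lr)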