Issues fine-tuning the t5-large model

Hi,

Has anyone encountered problems with updating weights in t5-large? I am using transformers 4.26.1.
The code snippet below should run standalone.

I artificially jacked up the learning rate to 10000 because I want to see a change in decoder.final_layer_norm.weight. I expect the weight to be different after training, but no weights have changed.

When I use t5-base, the weights do change, even with a much smaller learning rate, so I might be missing some intricacy of the t5-large model. Any help or tips would be appreciated.
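Stripped of the Trainer setup, the check I am doing amounts to this minimal sketch (the parameter path comes from the T5 module tree; the training step is elided):

import torch
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
# Clone so we keep a real copy, not a view that training would mutate in place.
w_before = model.decoder.final_layer_norm.weight.detach().clone()
# ... trainer.train() happens here ...
w_after = model.decoder.final_layer_norm.weight.detach()
# torch.equal is True only if every element matches exactly, i.e. no update happened.
print(torch.equal(w_before, w_after))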

import os
os.environ["WANDB_DISABLED"] = "true"

import torch
from datasets import load_dataset

raw_datasets = load_dataset("xsum")

from transformers import AutoTokenizer, AutoConfig, logging, set_seed

logging.set_verbosity_warning()
logger = logging.get_logger("transformers")

set_seed(42)

model_checkpoint = "t5-large"
config = AutoConfig.from_pretrained(
    model_checkpoint,
)
config.use_cache = False  # disable the cache because we are using gradient_checkpointing

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""


max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
    config=config,
)
w_before = model.decoder.final_layer_norm.weight.detach().clone()  # snapshot before training
logger.warning(w_before)

batch_size = 4
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    per_device_train_batch_size=batch_size,
    weight_decay=0.0,
    save_total_limit=0,
    optim="adafactor",
    learning_rate=10000.,  # deliberately huge, so any weight update would be obvious
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=8,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_dataset = tokenized_datasets["train"]
max_train_samples = 200
if max_train_samples is not None:
    max_train_samples = min(len(train_dataset), max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples))

logger.info(len(train_dataset))
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
w_after = trainer.model.decoder.final_layer_norm.weight

logger.warning("Before:")
logger.warning(w_before)
logger.warning("after:")
logger.warning(w_after)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
w_before = w_before.to(device)
weight_after = w_after.to(device)

logger.warning((w_before != weight_after).any())
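
For anyone reproducing this, a broader version of the same check (a sketch only, and memory permitting for a model of this size) snapshots every parameter before training and reports which ones moved:

# Diagnostic sketch: snapshot all parameters, train, then list what changed.
before = {name: p.detach().clone() for name, p in model.named_parameters()}
trainer.train()
changed = [name for name, p in model.named_parameters()
           if not torch.equal(before[name].to(p.device), p.detach())]
logger.warning(f"{len(changed)} of {len(before)} parameters changed")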

A bit late, but why max_input_length = 1024?
From the paper: “We use a maximum sequence length of 512”.
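
If you want the snippet above to match the paper's setting, the change would just be (same preprocessing function as before):

max_input_length = 512  # "maximum sequence length of 512", per the T5 paper
max_target_length = 128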