Hi,
Has anyone encountered problems with updating weights in t5-large? I am using transformers 4.26.1.
The code snippet below should work standalone.
I artificially jacked up the learning rate to 10000 because I want to see a change in decoder.final_layer_norm.weight. I expect the weights to be different after training, but none of them change.
When I use t5-base, the weights do change, even with a much smaller learning rate. I might be missing some intricacy of the t5-large model. Any help or tips would be appreciated.
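The check itself boils down to this pattern (just an illustrative sketch; model and trainer refer to the same objects built in the full script below):

w_before = model.decoder.final_layer_norm.weight.detach().clone()  # snapshot before training
trainer.train()
w_after = trainer.model.decoder.final_layer_norm.weight  # same tensor after training
print((w_before.to(w_after.device) != w_after).any())  # expect tensor(True) if the weights moved

The full script: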
import os
os.environ["WANDB_DISABLED"] = "true"
import torch
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("xsum")
from transformers import AutoTokenizer, AutoConfig, logging, set_seed
logging.set_verbosity_warning()
logger = logging.get_logger("transformers")
set_seed(42)
model_checkpoint = "t5-large"
config = AutoConfig.from_pretrained(
    model_checkpoint,
)
config.use_cache = False  # set to False because we are using gradient_checkpointing.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""
max_input_length = 1024
max_target_length = 128
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
    config=config,
)
w_before = model.decoder.final_layer_norm.weight.detach().clone()
logger.warning(w_before)
batch_size = 4
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    per_device_train_batch_size=batch_size,
    weight_decay=0.0,
    save_total_limit=0,
    optim="adafactor",
    learning_rate=10000.,  # make this really big, so we can see changes in the weights
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=8,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
train_dataset = tokenized_datasets["train"]
max_train_samples = 200
if max_train_samples is not None:
    max_train_samples = min(len(train_dataset), max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples))
logger.info(len(train_dataset))
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
w_after = trainer.model.decoder.final_layer_norm.weight
logger.warning("Before:")
logger.warning(w_before)
logger.warning("after:")
logger.warning(w_after)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
w_before = w_before.to(device)
w_after = w_after.to(device)
logger.warning((w_before != w_after).any())
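In case it is useful, a small extension I am considering (not part of the run above, just a sketch) is to snapshot every parameter before training and count how many tensors actually change afterwards:

# Hypothetical check, not in the script above: the snapshot would have to be
# taken before trainer.train() is called.
before = {name: p.detach().clone() for name, p in model.named_parameters()}
trainer.train()
changed = sum(
    not torch.equal(before[name].to(p.device), p.detach())
    for name, p in trainer.model.named_parameters()
)
logger.warning(f"{changed} of {len(before)} parameter tensors changed")

With the learning rate set this high I would expect essentially every tensor to change.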