I am logging gradients using a TrainerCallback function for a few Bert (‘bert-base-multilingual-cased’) layers. I am using Adapters to learn a small dataset (~2164 samples).
The problem is that logging the gradients reveals that all Adapter layers have zero gradients while other Bert layer gradients are none.
Am I using the callback correctly? Why are my gradients zero despite the loss being non-zero (the model is able to converge)?
Here is the Callback:
class GradientNormCallback(TrainerCallback):
def __init__(self, model, layer_names, lang, trainingstage):
self.model = model
self.layer_names = layer_names
self.gradient_norms = {}
def on_step_end(self, args, state, control, logs=None, **kwargs):
# Compute the L2 norm of gradients for each layer
for layer_name, layer_params in self.model.named_parameters():
if any(layer_name.startswith(prefix) for prefix in self.layer_names):
if layer_params.grad is not None:
grad_norm = torch.norm(layer_params.grad.data, p=2).item()
if layer_name not in self.gradient_norms:
self.gradient_norms[layer_name] = []
self.gradient_norms[layer_name].append(grad_norm)
else:
if layer_name not in self.gradient_norms:
self.gradient_norms[layer_name] = []
def on_epoch_end(self, args, state, control, **kwargs):
epoch = state.epoch
# Save the gradient norms to separate CSV files for each layer with the epoch number
for layer_name, norms in self.gradient_norms.items():
df = pd.DataFrame(norms, columns=['Gradient Norm'])
filename = f'{layer_name}_e{epoch}_norm.csv'
filename = os.path.join(save_best_model_path_pretrain, filename)
df.to_csv(filename, index=False)
Here are training arguments:
# train args
training_args = TrainingArguments(
seed=SEED,
evaluation_strategy="epoch",
save_strategy="epoch",
logging_strategy="epoch",
learning_rate=lr,
num_train_epochs=epoch1,
per_device_train_batch_size=batch_size1,
per_device_eval_batch_size=batch_size1,
output_dir=MODEL_DIR+'_'+lang,
overwrite_output_dir=True,
# The next line is important to ensure the dataset labels are properly passed to the model
remove_unused_columns=False,
save_total_limit=1,
load_best_model_at_end=True,
)
trainer = AdapterTrainer(
model=model,
args=training_args,
train_dataset=train_dataset1,
eval_dataset=valid_dataset1,
compute_metrics=compute_metrics
)
layer_names = ['bert.encoder.layer.1', 'bert.encoder.layer.6', 'bert.encoder.layer.11', 'bert.pooler', 'heads.disrpt-deu-rst-pcc.1']
grad_norm_callback_obj = GradientNormCallback(model, layer_names, lang, 'pretrain')
trainer.add_callback(CustomCallback(trainer))
trainer.add_callback(grad_norm_callback_obj)
trainer.add_callback(early_stop)