If reducing `max_len` doesn't resolve the issue, try enabling gradient checkpointing and freezing most of the model, following the sequence shown below: enable checkpointing, freeze all parameters, then unfreeze only the layers you want to train.
# `model` is the model loaded earlier; enable gradient checkpointing first
model.gradient_checkpointing_enable()

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Then unfreeze only the layers to fine-tune (decoder blocks 33-38 here)
layers_to_unfreeze = list(range(33, 39))
for i in layers_to_unfreeze:
    for param in model.model.layers[i].parameters():
        param.requires_grad = True