Hi there!
I'm trying to freeze the embedding layer and the first transformer layer of Phi-3-mini-128k-instruct, and I get the following error:
File "/home/user/anaconda3/envs/JupyterSystemEnv/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
Code snippet:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# script_args, training_args and train_dataset come from the surrounding training script (omitted here)

################
# Model & Tokenizer
################
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(script_args.model_id, token=script_args.token, trust_remote_code=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
# Model
torch_dtype = torch.bfloat16
quant_storage_dtype = torch.bfloat16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_quant_storage=quant_storage_dtype,
)
model = AutoModelForCausalLM.from_pretrained(
    script_args.model_id,
    token=script_args.token,
    trust_remote_code=True,
    quantization_config=quantization_config,
    # attn_implementation="sdpa",  # use sdpa, alternatively use "flash_attention_2"
    torch_dtype=quant_storage_dtype,
    use_cache=False if training_args.gradient_checkpointing else True,  # this is needed for gradient checkpointing
)
if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()
################
# PEFT
################
# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules='all-linear',
    task_type="CAUSAL_LM",
    # modules_to_save = ["lm_head", "embed_tokens"]  # add if you want to use the Llama 3 instruct template
)
peft_model = PeftModel(model, peft_config)
print(peft_model)
for param in peft_model.base_model.model.model.layers[0].parameters():
    param.requires_grad = False
for param in peft_model.base_model.model.model.embed_tokens.parameters():
    param.requires_grad = False
peft_model.print_trainable_parameters()
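# Optional sanity check (just a sketch, nothing below depends on it):
# list which parameter tensors still require gradients after the manual freeze above,
# using the standard named_parameters() API.
still_trainable = [name for name, param in peft_model.named_parameters() if param.requires_grad]
print(f"{len(still_trainable)} trainable tensors, e.g. {still_trainable[:3]}")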
################
# Training
################
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field="content",
    # peft_config=peft_config,
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False,  # No need to add additional separator token
    },
)
if trainer.accelerator.is_main_process:
    trainer.model.print_trainable_parameters()
##########################
# Train model
##########################
checkpoint = None
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
trainer.train(resume_from_checkpoint=checkpoint)
Any help would be appreciated!