When I load the gpt2 model without load_in_8bit=True, I get the error below while training with SFTTrainer:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
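For context, the quantized path that does work for me looks roughly like this (load_in_8bit is the standard transformers/bitsandbytes flag; the device_map matches the loading code further down):

from transformers import AutoModelForCausalLM
from accelerate import Accelerator

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    load_in_8bit=True,  # 8-bit quantized load; with this, training runs without the grad error
    device_map={"": Accelerator().process_index},
)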
Arguments I am using -
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f")  # added so parsing works inside a notebook
    parser.add_argument("--model_path", type=str, default="gpt2")  # model id from HF
    parser.add_argument("--seq_length", type=int, default=1024)  # specific to gpt2
    parser.add_argument("--max_steps", type=int, default=500)  # number of batches to fine-tune on
    parser.add_argument("--batch_size", type=int, default=1)  # single batch at a time due to memory limits
    parser.add_argument("--gradient_accumulation_steps", type=int, default=16)  # accumulate gradients since batch_size 1 is not stable
    # parser.add_argument("--eos_token_id", type=int, default=49152)  # eos already specified in gpt2
    parser.add_argument("--learning_rate", type=float, default=1e-4)
    # parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--num_warmup_steps", type=int, default=100)  # fixed learning rate during the first warmup steps
    # parser.add_argument("--weight_decay", type=float, default=0.05)
    parser.add_argument("--no_gradient_checkpointing", action="store_false", default=False)  # gradient checkpointing trades compute for memory; not required here
    parser.add_argument("--local_rank", type=int, default=0)  # process id, not useful in single-process training
    parser.add_argument("--no_fp16", action="store_false")  # fp16 not supported
    parser.add_argument("--bf16", action="store_true", default=False)  # using bf16
    parser.add_argument("--seed", type=int, default=0)  # seed
    parser.add_argument("--num_workers", type=int, default=None)  # single worker
    parser.add_argument("--output_dir", type=str, default="./gpt2-lora/checkpoints2")  # folder to save the model to
    parser.add_argument("--log_freq", default=100, type=int)  # log every log_freq steps
    parser.add_argument("--eval_freq", default=100, type=int)  # evaluate every eval_freq steps
    parser.add_argument("--save_freq", default=100, type=int)  # save a checkpoint every save_freq steps
    return parser.parse_args()
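In the notebook I call this once and reuse the parsed namespace everywhere below. Note that with these defaults the two negated flags resolve to gradient checkpointing on and fp16 off:

args = get_args()
# not args.no_gradient_checkpointing -> True  (gradient checkpointing enabled)
# not args.no_fp16                   -> False (fp16 disabled)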
lora_config = LoraConfig(
r=8,
lora_alpha=16,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
fan_in_fan_out=True,  # added from the docs: https://github.com/huggingface/peft/blob/v0.6.2/src/peft/tuners/lora/config.py#L24
)
training_args = TrainingArguments(
output_dir=args.output_dir,
dataloader_drop_last=True,  # drops the last batch if the dataset is not divisible by batch_size; irrelevant for batch_size = 1
evaluation_strategy="steps",
max_steps=args.max_steps,
eval_steps=args.eval_freq,
save_steps=args.save_freq,
logging_steps=args.log_freq,
per_device_train_batch_size=args.batch_size,
per_device_eval_batch_size=args.batch_size,
learning_rate=args.learning_rate,
# lr_scheduler_type=args.lr_scheduler_type,
warmup_steps=args.num_warmup_steps,
gradient_accumulation_steps=args.gradient_accumulation_steps,
gradient_checkpointing=not args.no_gradient_checkpointing,
fp16=not args.no_fp16,
bf16=args.bf16,
# weight_decay=args.weight_decay,
run_name="gpt2-finetuned2",
# report_to="wandb",
ddp_find_unused_parameters=False,
)
Training code -
model = AutoModelForCausalLM.from_pretrained(
args.model_path, device_map={"": Accelerator().process_index}
)
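As a side sanity check (on a separately loaded copy, not the model object passed to the trainer below), wrapping the base model with peft's get_peft_model and calling print_trainable_parameters shows whether the LoRA adapter weights are the ones marked trainable:

from peft import get_peft_model

check_model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(args.model_path), lora_config
)
check_model.print_trainable_parameters()  # only the LoRA adapter params should be reported as trainable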
trainer = SFTTrainer(
model=model, #checked
args=training_args,
train_dataset=train_data, #checked
eval_dataset=val_data, #checked
peft_config=lora_config, #checked
packing=True, #checked
)
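The error is raised as soon as training starts:

trainer.train()  # -> RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn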
Is quantization mandatory here, or can I make this work somehow?