LoRA finetuning without quantization (8bit)

When I load the gpt2 model without load_in_8bit=True, I get the error below while training with SFTTrainer.

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Arguments I am using:

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f")  # added so this also runs inside a notebook
    parser.add_argument("--model_path", type=str, default="gpt2")  # model id on the HF Hub

    parser.add_argument("--seq_length", type=int, default=1024)  # gpt2 context length
    parser.add_argument("--max_steps", type=int, default=500)  # number of training steps to fine-tune for
    parser.add_argument("--batch_size", type=int, default=1)  # single sample per batch due to memory constraints
    parser.add_argument("--gradient_accumulation_steps", type=int, default=16)  # accumulate gradients since batch_size=1 alone is unstable
    # parser.add_argument("--eos_token_id", type=int, default=49152)  # eos token is already defined for gpt2

    parser.add_argument("--learning_rate", type=float, default=1e-4)
    # parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--num_warmup_steps", type=int, default=100)  # warm up the learning rate over the first warmup steps
    # parser.add_argument("--weight_decay", type=float, default=0.05)

    parser.add_argument("--no_gradient_checkpointing", action="store_false", default=False)  # gradient checkpointing recomputes activations during backprop to save memory; not required here
    parser.add_argument("--local_rank", type=int, default=0)  # process id; not relevant for single-process training
    parser.add_argument("--no_fp16", action="store_false")  # fp16 not supported
    parser.add_argument("--bf16", action="store_true", default=False)  # bf16 disabled by default

    parser.add_argument("--seed", type=int, default=0)  # random seed
    parser.add_argument("--num_workers", type=int, default=None)  # single dataloader worker

    parser.add_argument("--output_dir", type=str, default="./gpt2-lora/checkpoints2")  # folder the model checkpoints are saved to
    parser.add_argument("--log_freq", default=100, type=int)  # log every log_freq steps
    parser.add_argument("--eval_freq", default=100, type=int)  # evaluate every eval_freq steps
    parser.add_argument("--save_freq", default=100, type=int)  # save a checkpoint every save_freq steps

    return parser.parse_args()
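
In the notebook I then build the argument namespace with:

args = get_args()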

from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    fan_in_fan_out=True,  # added per the docs, since gpt2 stores weights transposed (Conv1D); https://github.com/huggingface/peft/blob/v0.6.2/src/peft/tuners/lora/config.py#L24
)
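
For reference, I'm relying on PEFT's built-in target-module mapping for gpt2 here; as far as I understand it, the explicit equivalent would look like the sketch below (the target_modules value is my assumption, not something I've set in my actual run):

# Assumption: PEFT's default mapping for gpt2 adapts the fused attention projection,
# so this should be equivalent to the config above.
explicit_lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    fan_in_fan_out=True,        # gpt2 uses transposed Conv1D weights
    target_modules=["c_attn"],  # gpt2's fused QKV projection (my assumption)
)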

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=args.output_dir,
    dataloader_drop_last=True,  # drops the last batch if the dataset is not divisible by batch_size; irrelevant for batch_size=1
    evaluation_strategy="steps",
    max_steps=args.max_steps,
    eval_steps=args.eval_freq,
    save_steps=args.save_freq,
    logging_steps=args.log_freq,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    learning_rate=args.learning_rate,
    # lr_scheduler_type=args.lr_scheduler_type,
    warmup_steps=args.num_warmup_steps,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    gradient_checkpointing=not args.no_gradient_checkpointing,
    fp16=not args.no_fp16,
    bf16=args.bf16,
    # weight_decay=args.weight_decay,
    run_name="gpt2-finetuned2",
    # report_to="wandb",
    ddp_find_unused_parameters=False,
)
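
Just to spell out what the trainer actually receives with my defaults (this is how I read my own argparse setup, so treat it as my interpretation):

# With action="store_false" and default=False, no_gradient_checkpointing is False
# whether or not the flag is passed, so checkpointing always ends up enabled.
print(not args.no_gradient_checkpointing)  # True  -> gradient_checkpointing on
print(not args.no_fp16)                    # False -> fp16 off (store_false defaults to True)
print(args.bf16)                           # False -> bf16 off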

Training code:

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    args.model_path, device_map={"": Accelerator().process_index}
)
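
For contrast, the quantized setup I'm trying to move away from looks roughly like this (sketch from memory; assumes bitsandbytes is installed, and prepare_model_for_kbit_training is what normally takes care of the input-gradient handling):

from peft import prepare_model_for_kbit_training

# 8-bit variant (sketch, not my actual code for this run):
model_8bit = AutoModelForCausalLM.from_pretrained(
    args.model_path,
    load_in_8bit=True,
    device_map={"": Accelerator().process_index},
)
model_8bit = prepare_model_for_kbit_training(model_8bit)  # also enables input grads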

from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,                # checked
    args=training_args,
    train_dataset=train_data,   # checked
    eval_dataset=val_data,      # checked
    peft_config=lora_config,    # checked
    packing=True,               # checked
)
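
The RuntimeError above is raised as soon as training starts:

trainer.train()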

Is quantization mandatory, or can I make this work somehow?
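
For what it's worth, the workaround I plan to try next (untested; my understanding is that with gradient checkpointing on and every base weight frozen by LoRA, the checkpointed activations lose their grad_fn unless the model inputs require grad) is:

model = AutoModelForCausalLM.from_pretrained(
    args.model_path, device_map={"": Accelerator().process_index}
)

# Untested: force the embedding outputs to require grad so the checkpointed
# blocks keep a grad_fn even though the base weights are frozen.
model.enable_input_require_grads()

# Alternative: hard-code gradient_checkpointing=False in TrainingArguments,
# since my --no_gradient_checkpointing flag above can never actually disable it.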


I’m having the exact same issue; have you found a solution?