Fine-tuning existing LoRA adapters gives "Attempting to unscale FP16 gradients" error

Hello!

I have the following two notebooks, in which I am trying to fine-tune the Llama 3 8B Instruct model on a large custom dataset using LoRA. The dataset is very big, so I fine-tune in multiple sessions. In notebook 1 I create the LoRA adapters, fine-tune them, and push them to Hugging Face. Notebook 1 ran perfectly fine, but in notebook 2 I want to further fine-tune those previously trained adapters, and this is where I get the error: ValueError: Attempting to unscale FP16 gradients.

I read a lot about this error online and consulted ChatGPT-4, but sadly I was not able to find a fix that worked for my code. I tried setting fp16=False, but then I get an out-of-memory error even though I use an NVIDIA A6000 48 GB GPU. I would really be thankful if someone could have a quick look over my notebooks, see what causes the error in notebook 2, and tell me how I can fix it so I can successfully continue fine-tuning with the existing LoRA adapters from my Hugging Face repo.

Notebook 1:

!pip install huggingface_hub transformers datasets peft

from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
login(token="**********************")

dataset = load_dataset("maxdemian1006/dataset1", split="train")

def formatting_prompts_func(examples):
    formatted_texts = []
    for conversation in examples["conversations"]:
        formatted_convo = " ".join([f"Human: {msg['value']}" if msg['from'] == 'human' else f"Assistant: {msg['value']}" for msg in conversation])
        formatted_texts.append(formatted_convo)
    return {"text": formatted_texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)

import torch
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.half()

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)

total_samples = 80000
batch_size = 2
gradient_accumulation_steps = 8
max_steps = (total_samples // batch_size) // gradient_accumulation_steps

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    max_steps=max_steps,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    report_to=[],
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="adamw_torch",
    warmup_steps=int(0.1 * max_steps)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

model.push_to_hub("maxdemian1006/mymodel", use_temp_dir=True)




Notebook 2:

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import PeftModel, LoraConfig, get_peft_model, TaskType
from huggingface_hub import login

login(token="**********************")

base_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)

adapter_name = "maxdemian1006/mymodel"
model = PeftModel.from_pretrained(base_model, adapter_name)

new_dataset = load_dataset("maxdemian1006/dataset2", split="train")

def formatting_prompts_func(examples):
    formatted_texts = []
    for conversation in examples["conversations"]:
        formatted_text = ""
        for turn in conversation:
            if turn["from"] == "human":
                formatted_text += f"Human: {turn['value']}\n"
            elif turn["from"] == "gpt":
                formatted_text += f"Assistant: {turn['value']}\n"
        formatted_texts.append(formatted_text.strip())
    return {"text": formatted_texts}

formatted_dataset = new_dataset.map(formatting_prompts_func, batched=True)

def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)

total_samples = 80000
batch_size = 2
gradient_accumulation_steps = 8
max_steps = (total_samples // batch_size) // gradient_accumulation_steps

training_args = TrainingArguments(
    output_dir="./further_finetuned_results",
    evaluation_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    max_steps=max_steps,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./further_finetuned_logs",
    logging_steps=10,
    fp16=True,
    report_to=[],
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="adamw_torch",
    warmup_steps=int(0.1 * max_steps)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

model.push_to_hub("maxdemian1006/mymodel", use_temp_dir=True)

Okay, I finally seem to have found an answer online. I'm putting it here so it might be helpful to anyone who stumbles upon the same problem:

"Getting: ValueError: Attempting to unscale FP16 gradients
This error probably occurred because the model was loaded with torch_dtype=torch.float16 and then used in an automatic mixed precision (AMP) context, e.g. by setting fp16=True in the Trainer class from 🤗 Transformers. The reason is that when using AMP, trainable weights should never use fp16. To make this work without having to load the whole model in FP32, add the following snippet to your code:

peft_model = get_peft_model(…)

# add this:
for param in model.parameters():
    if param.requires_grad:
        param.data = param.data.float()

# proceed as usual
trainer = Trainer(model=peft_model, fp16=True, …)
trainer.train()"

