Hello!
I have the following two notebooks in which I am trying to fine-tune the Llama 3 8B Instruct model on a large custom dataset using LoRA. The dataset is very big, so I fine-tune in multiple sessions. In notebook 1 I create the LoRA adapters, fine-tune them, and push them to Hugging Face. Notebook 1 ran perfectly fine, but in notebook 2, where I want to further fine-tune those previously trained adapters, I get this error: ValueError: Attempting to unscale FP16 gradients.
I read a lot about this error online and also consulted ChatGPT (GPT-4), but sadly I was not able to find a fix that worked for my code. I tried setting fp16=False, but then I get an out-of-memory error, even though I use an NVIDIA A6000 48 GB GPU. I would be really thankful if someone could have a quick look over my notebooks, tell me what causes the error in notebook 2, and how I can fix it so I can continue fine-tuning from the existing LoRA adapters on my Hugging Face account.
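From what I've read, this error comes up when the trainable parameters themselves are stored in fp16 while fp16=True mixed-precision training expects them in fp32 (the GradScaler refuses to unscale fp16 gradients). One workaround I've seen suggested, though I'm not sure it's the right approach for my setup, is to keep the frozen base model in fp16 but upcast only the trainable LoRA parameters to fp32 before creating the Trainer, roughly like this:

import torch

# Workaround I've seen suggested: upcast only the trainable (LoRA) parameters
# to fp32 so the AMP GradScaler can unscale their gradients; the frozen fp16
# base weights stay as they are, so memory use should barely change.
for param in model.parameters():
    if param.requires_grad:
        param.data = param.data.to(torch.float32)

Would something like this be the correct fix here, or is something else wrong in notebook 2?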
Notebook 1:
!pip install huggingface_hub transformers datasets peft
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
login(token="**********************")

dataset = load_dataset("maxdemian1006/dataset1", split="train")
def formatting_prompts_func(examples):
    formatted_texts = []
    for conversation in examples["conversations"]:
        formatted_convo = " ".join(
            [f"Human: {msg['value']}" if msg['from'] == 'human' else f"Assistant: {msg['value']}" for msg in conversation]
        )
        formatted_texts.append(formatted_convo)
    return {"text": formatted_texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)
import torch
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.half()

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, lora_config)
total_samples = 80000
batch_size = 2
gradient_accumulation_steps = 8
max_steps = (total_samples // batch_size) // gradient_accumulation_steps

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    max_steps=max_steps,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    report_to=[],
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="adamw_torch",
    warmup_steps=int(0.1 * max_steps)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

model.push_to_hub("maxdemian1006/mymodel", use_temp_dir=True)
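As far as I understand, because model is a PeftModel at this point, push_to_hub uploads only the adapter files (adapter_config.json plus the adapter weights), not the full 8B model, which is why notebook 2 loads the base model first and then attaches the adapters. A quick way to double-check what actually ended up in the repo:

from huggingface_hub import list_repo_files

# Inspect the pushed repo; for a LoRA push I'd expect adapter_config.json
# and adapter_model.safetensors (or adapter_model.bin), no full model shards.
print(list_repo_files("maxdemian1006/mymodel"))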
Notebook 2:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import PeftModel, LoraConfig, get_peft_model, TaskType
from huggingface_hub import login

login(token="**********************")
base_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)
adapter_name = "maxdemian1006/mymodel"
model = PeftModel.from_pretrained(base_model, adapter_name)

new_dataset = load_dataset("maxdemian1006/dataset2", split="train")
def formatting_prompts_func(examples):
    formatted_texts = []
    for conversation in examples["conversations"]:
        formatted_text = ""
        for turn in conversation:
            if turn["from"] == "human":
                formatted_text += f"Human: {turn['value']}\n"
            elif turn["from"] == "gpt":
                formatted_text += f"Assistant: {turn['value']}\n"
        formatted_texts.append(formatted_text.strip())
    return {"text": formatted_texts}

formatted_dataset = new_dataset.map(formatting_prompts_func, batched=True)
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, lora_config)
total_samples = 80000
batch_size = 2
gradient_accumulation_steps = 8
max_steps = (total_samples // batch_size) // gradient_accumulation_steps

training_args = TrainingArguments(
    output_dir="./further_finetuned_results",
    evaluation_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    max_steps=max_steps,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./further_finetuned_logs",
    logging_steps=10,
    fp16=True,
    report_to=[],
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="adamw_torch",
    warmup_steps=int(0.1 * max_steps)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

model.push_to_hub("maxdemian1006/mymodel", use_temp_dir=True)
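In case it helps with debugging: right before trainer.train() in notebook 2, I believe all the trainable parameters end up as torch.float16 (the base model is loaded with torch_dtype=torch.float16 and the new adapter is created on top of it), which, from what I've read, is exactly the situation where the unscale error appears. A small snippet to confirm the dtypes:

from collections import Counter

# Count the dtypes of the trainable (requires_grad) parameters right before
# trainer.train(); if they show up as torch.float16 while fp16=True is set,
# that would match the "Attempting to unscale FP16 gradients" error I'm seeing.
print(Counter(p.dtype for p in model.parameters() if p.requires_grad))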