I am currently fine-tuning Llama 3.2 1B (meta-llama/Llama-3.2-1B) and trying to get it to work, but the run always fails with an out-of-memory error.
I think my hardware is decent enough, but I cannot get the training run to fit into memory. Any tips on how to fix this? I am using Anaconda Prompt 1.1.0 to run my Python script over my training dataset.
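For reference, this is roughly how I check what the GPU reports before training (a minimal sketch, assuming a single CUDA device):

import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}, total VRAM: {props.total_memory / 1024**3:.1f} GiB")
    print(f"Already allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")
else:
    print("No CUDA device visible to PyTorch")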
Here is my code:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import time
from huggingface_hub import login
import torch
import gc
import os
# Step 1: Load and preprocess your data
def load_and_prepare_data(csv_file):
    data = pd.read_csv(csv_file)
    data = data.dropna(subset=["text"]).reset_index(drop=True)  # Drop nulls
    return Dataset.from_pandas(data)
# Step 2: Tokenize the data
def tokenize_data(dataset, tokenizer, max_length=2048):
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",  # Every example is padded out to max_length tokens
            max_length=max_length,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    # For causal LM fine-tuning, the labels are simply a copy of the input IDs
    tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})
    return tokenized_dataset
# Step 3: Load the model and tokenizer with LoRA configuration
def load_model_and_tokenizer_with_lora(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit precision
        bnb_4bit_compute_dtype="float16",  # Use float16 for computation
    )

    # Load the model with 4-bit precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # Automatically map layers to GPU/CPU
        quantization_config=quantization_config,
    )

    # Handle missing padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.resize_token_embeddings(len(tokenizer))

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=24,  # Higher rank for domain-specific adaptation
        lora_alpha=48,  # Increased scaling to emphasize adaptation
        target_modules=["q_proj", "v_proj", "o_proj"],  # Add output projection for detailed control
        lora_dropout=0.15,  # Slightly higher dropout for better generalization
        bias="none",  # Keep biases unchanged
        task_type="CAUSAL_LM",  # Generative task
    )
    model = get_peft_model(model, lora_config)
    return model, tokenizer
# Step 4: Fine-tune the model
def fine_tune_model(
    tokenized_dataset,
    model,
    tokenizer,
    output_dir="./results_llama_1b",
    epochs=3,
    batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
):
    training_args = TrainingArguments(
        # auto_find_batch_size=True,
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=fp16,
        save_steps=500,
        eval_strategy="steps",
        eval_accumulation_steps=1,
        eval_steps=500,
        save_total_limit=2,
        logging_dir="./logs_llama_1b",
        logging_steps=100,
        optim="adafactor",
        gradient_checkpointing=True,
        report_to="none",
        hub_model_id="fine-tuned-llama3-1b-newsGenerator-PTBR",  # Repo used by push_to_hub below
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push model and tokenizer to the hub
    # (Trainer.push_to_hub takes a commit message, not a repo name; the repo comes from hub_model_id above)
    trainer.push_to_hub()
    tokenizer.push_to_hub("fine-tuned-llama3-1b-newsGenerator-PTBR")
# Main script
if __name__ == "__main__":
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    torch.cuda.empty_cache()
    gc.collect()

    # Read the token from the environment rather than hard-coding it in the script
    login(token=os.environ["HF_TOKEN"])

    start = time.time()
    csv_file = "dataset_noticias_final.csv"  # Replace with your CSV file
    model_name = "meta-llama/Llama-3.2-1B"  # Replace with the desired LLaMA model

    # Load data
    raw_dataset = load_and_prepare_data(csv_file)

    # Split dataset into train and test sets
    dataset = raw_dataset.train_test_split(test_size=0.2)

    # Load model and tokenizer with LoRA
    model, tokenizer = load_model_and_tokenizer_with_lora(model_name)

    # Tokenize data
    tokenized_dataset = tokenize_data(dataset, tokenizer)

    # Fine-tune the model
    fine_tune_model(
        tokenized_dataset=tokenized_dataset,
        model=model,
        tokenizer=tokenizer,
    )

    end = time.time()
    execution_time = end - start
    minutes = int(execution_time // 60)
    seconds = execution_time % 60
    print(f"Fine-tuning complete in {minutes} minutes and {seconds:.2f} seconds!")
    print("Model saved to ./results_llama_1b and pushed to the Hub as fine-tuned-llama3-1b-newsGenerator-PTBR")