CUDA out of memory even on an RTX 4070 Super

I am currently fine-tuning Llama 3.2 1B and trying to make it work, but I always end up with a CUDA out-of-memory error.

I think my hardware is decent enough, but the allocation still fails. Any tips on how to fix this? I'm using Anaconda Prompt 1.1.0 to run my Python script over my training dataset.
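For reference, a small diagnostic sketch (assuming a single CUDA device at index 0) to check how much VRAM PyTorch actually reports:

import torch

# Diagnostic sketch: report total / allocated / reserved VRAM on device 0
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}")
    print(f"Total VRAM: {props.total_memory / 1024**3:.2f} GiB")
    print(f"Allocated:  {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")
    print(f"Reserved:   {torch.cuda.memory_reserved(0) / 1024**3:.2f} GiB")
else:
    print("CUDA is not available")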

Here is my code:


import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import time
from huggingface_hub import login
import torch
import gc
import os

# Step 1: Load and preprocess your data
def load_and_prepare_data(csv_file):
    data = pd.read_csv(csv_file)
    data = data.dropna(subset=["text"]).reset_index(drop=True)  # Drop nulls
    
    return Dataset.from_pandas(data)

# Step 2: Tokenize the data
def tokenize_data(dataset, tokenizer, max_length=2048):
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})
    return tokenized_dataset

# Step 3: Load the model and tokenizer with LoRA configuration
def load_model_and_tokenizer_with_lora(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit precision
        bnb_4bit_compute_dtype="float16",  # Use float16 for computation
    )

    # Load the model with 4-bit precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # Automatically map layers to GPU/CPU
        quantization_config=quantization_config,
    )

    # Handle missing padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.resize_token_embeddings(len(tokenizer))

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=24,  # Higher rank for domain-specific adaptation
        lora_alpha=48,  # Increased scaling to emphasize adaptation
        target_modules=["q_proj", "v_proj", "o_proj"],  # Add output projection for detailed control
        lora_dropout=0.15,  # Slightly higher dropout for better generalization
        bias="none",  # Keep biases unchanged
        task_type="CAUSAL_LM",  # Generative task
    )

    model = get_peft_model(model, lora_config)

    return model, tokenizer

# Step 4: Fine-tune the model
def fine_tune_model(
    tokenized_dataset,
    model,
    tokenizer,
    output_dir="./results_llama_1b",
    epochs=3,
    batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
):
    training_args = TrainingArguments(
        #auto_find_batch_size=True,
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=fp16,
        save_steps=500,
        eval_strategy="steps",
        eval_accumulation_steps=1,
        eval_steps=500,
        save_total_limit=2,
        logging_dir="./logs_llama_1b",
        logging_steps=100,
        optim="adafactor",
        gradient_checkpointing=True,
        report_to="none",
        hub_model_id="fine-tuned-llama3-1b-newsGenerator-PTBR",  # repo targeted by trainer.push_to_hub
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push model and tokenizer to the Hub. Trainer.push_to_hub's first argument is a
    # commit message, not a repo id; the target repo comes from hub_model_id above.
    trainer.push_to_hub()
    tokenizer.push_to_hub("fine-tuned-llama3-1b-newsGenerator-PTBR")

# Main script
if __name__ == "__main__":

    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    torch.cuda.empty_cache()

    gc.collect()

    login(token=os.getenv("HF_TOKEN"))  # token redacted; set HF_TOKEN in your environment instead of hard-coding it

    start = time.time()

    csv_file = "dataset_noticias_final.csv"  # Replace with your CSV file
    model_name = "meta-llama/Llama-3.2-1B"  # Replace with the desired LLaMA model

    # Load data
    raw_dataset = load_and_prepare_data(csv_file)

    # Split dataset into train and test sets
    dataset = raw_dataset.train_test_split(test_size=0.2)

    # Load model and tokenizer with LoRA
    model, tokenizer = load_model_and_tokenizer_with_lora(model_name)

    # Tokenize data
    tokenized_dataset = tokenize_data(dataset, tokenizer)

    # Fine-tune the model
    fine_tune_model(
        tokenized_dataset=tokenized_dataset,
        model=model,
        tokenizer=tokenizer,
    )

    end = time.time()
    execution_time = end - start
    minutes = int(execution_time // 60)
    seconds = execution_time % 60

    print(f"Fine-tuning complete in {minutes} minutes and {seconds:.2f} seconds!")
    print("Model saved to ./fine-tuned-llama3-1b-newsGenerator-PTBR")


Have you tried setting gradient_accumulation_steps to 2?
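For example, with the fine_tune_model wrapper from the script above, that would just be (a sketch; 2 is only the suggested value, the script currently defaults to 4):

fine_tune_model(
    tokenized_dataset=tokenized_dataset,
    model=model,
    tokenizer=tokenizer,
    gradient_accumulation_steps=2,  # suggested value; the script's default is 4
)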


I’ll try it right now and see how it goes, thanks 🙂


Any other suggestions? My code is still running, but it’s taking longer than I expected given my last attempt.


You can increase your batch_size from 1 to 16.
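For example, through the batch_size parameter of fine_tune_model (a sketch; a larger per-device batch uses more VRAM, so this only helps if memory allows):

fine_tune_model(
    tokenized_dataset=tokenized_dataset,
    model=model,
    tokenizer=tokenizer,
    batch_size=16,  # per_device_train_batch_size; larger values need more VRAM
)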
