CUDA out of memory even on an RTX 4070 Super

I am currently fine-tuning Llama 3.2 1B and trying to make it work, but I always end up with a CUDA out-of-memory error.

I think my hardware is decent enough, but the allocation still fails. Any tips on how to fix this? I'm using Anaconda Prompt 1.1.0 to run my Python script over my training dataset.
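For reference, a small diagnostic sketch (assuming a single CUDA device at index 0) to check how much VRAM PyTorch actually reports:

import torch

# Diagnostic sketch: report total / allocated / reserved VRAM on device 0
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}")
    print(f"Total VRAM: {props.total_memory / 1024**3:.2f} GiB")
    print(f"Allocated:  {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")
    print(f"Reserved:   {torch.cuda.memory_reserved(0) / 1024**3:.2f} GiB")
else:
    print("CUDA is not available")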

Here is my code:


import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import time
from huggingface_hub import login
import torch
import gc
import os

# Step 1: Load and preprocess your data
def load_and_prepare_data(csv_file):
    data = pd.read_csv(csv_file)
    data = data.dropna(subset=["text"]).reset_index(drop=True)  # Drop nulls
    
    return Dataset.from_pandas(data)

# Step 2: Tokenize the data
def tokenize_data(dataset, tokenizer, max_length=2048):
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})
    return tokenized_dataset

# Step 3: Load the model and tokenizer with LoRA configuration
def load_model_and_tokenizer_with_lora(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit precision
        bnb_4bit_compute_dtype="float16",  # Use float16 for computation
    )

    # Load the model with 4-bit precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # Automatically map layers to GPU/CPU
        quantization_config=quantization_config,
    )

    # Handle missing padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.resize_token_embeddings(len(tokenizer))

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=24,  # Higher rank for domain-specific adaptation
        lora_alpha=48,  # Increased scaling to emphasize adaptation
        target_modules=["q_proj", "v_proj", "o_proj"],  # Add output projection for detailed control
        lora_dropout=0.15,  # Slightly higher dropout for better generalization
        bias="none",  # Keep biases unchanged
        task_type="CAUSAL_LM",  # Generative task
    )

    model = get_peft_model(model, lora_config)

    return model, tokenizer

# Step 4: Fine-tune the model
def fine_tune_model(
    tokenized_dataset,
    model,
    tokenizer,
    output_dir="./results_llama_1b",
    epochs=3,
    batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
):
    training_args = TrainingArguments(
        #auto_find_batch_size=True,
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=fp16,
        save_steps=500,
        eval_strategy="steps",
        eval_accumulation_steps=1,
        eval_steps=500,
        save_total_limit=2,
        logging_dir="./logs_llama_1b",
        logging_steps=100,
        optim="adafactor",
        gradient_checkpointing=True,
        report_to="none",
        hub_model_id="fine-tuned-llama3-1b-newsGenerator-PTBR",  # repo targeted by trainer.push_to_hub
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push model and tokenizer to the Hub. Trainer.push_to_hub's first argument is a
    # commit message, not a repo id; the target repo comes from hub_model_id above.
    trainer.push_to_hub()
    tokenizer.push_to_hub("fine-tuned-llama3-1b-newsGenerator-PTBR")

# Main script
if __name__ == "__main__":

    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    torch.cuda.empty_cache()

    gc.collect()

    login(token=os.getenv("HF_TOKEN"))  # token redacted; set HF_TOKEN in your environment instead of hard-coding it

    start = time.time()

    csv_file = "dataset_noticias_final.csv"  # Replace with your CSV file
    model_name = "meta-llama/Llama-3.2-1B"  # Replace with the desired LLaMA model

    # Load data
    raw_dataset = load_and_prepare_data(csv_file)

    # Split dataset into train and test sets
    dataset = raw_dataset.train_test_split(test_size=0.2)

    # Load model and tokenizer with LoRA
    model, tokenizer = load_model_and_tokenizer_with_lora(model_name)

    # Tokenize data
    tokenized_dataset = tokenize_data(dataset, tokenizer)

    # Fine-tune the model
    fine_tune_model(
        tokenized_dataset=tokenized_dataset,
        model=model,
        tokenizer=tokenizer,
    )

    end = time.time()
    execution_time = end - start
    minutes = int(execution_time // 60)
    seconds = execution_time % 60

    print(f"Fine-tuning complete in {minutes} minutes and {seconds:.2f} seconds!")
    print("Model saved to ./fine-tuned-llama3-1b-newsGenerator-PTBR")


Have you tried setting gradient_accumulation_steps to 2?
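For example, with the fine_tune_model wrapper from the script above, that would just be (a sketch; 2 is only the suggested value, the script currently defaults to 4):

fine_tune_model(
    tokenized_dataset=tokenized_dataset,
    model=model,
    tokenizer=tokenizer,
    gradient_accumulation_steps=2,  # suggested value; the script's default is 4
)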


I’ll try it right now and see how it goes, thanks 🙂


Any other suggestions? My code is still running, but it’s taking longer than I expected given my last attempt.


You can increase your batch_size from 1 to 16.
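For example, through the batch_size parameter of fine_tune_model (a sketch; a larger per-device batch uses more VRAM, so this only helps if memory allows):

fine_tune_model(
    tokenized_dataset=tokenized_dataset,
    model=model,
    tokenizer=tokenizer,
    batch_size=16,  # per_device_train_batch_size; larger values need more VRAM
)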
