CUDA Out of Memory Error with SFTTrainer

from datasets import load_dataset
dataset = load_dataset("Alok2304/Indian_Law_Final_Dataset",split="train[:30%]")
dataset

from transformers import BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config = bnb_config,
    device_map = "auto",   # 4-bit bitsandbytes models can't be moved with .to(device)
    trust_remote_code = True)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
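
As a quick sanity check on how much of the GPU the 4-bit weights themselves occupy (a minimal sketch; get_memory_footprint reports parameters and buffers only, not activations):

# rough footprint of the quantized weights (parameters + buffers, no activations)
print(f"quantized model weights: {model.get_memory_footprint() / 1024**3:.2f} GiB")
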
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules= ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
)


model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 6,078,464 || all params: 3,218,828,288 || trainable%: 0.1888
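
In case it helps to verify that figure independently of print_trainable_parameters, a minimal sketch that counts requires_grad parameters by hand (note that 4-bit layers report packed element counts, so the total can differ slightly from PEFT's own summary):

# count trainable vs. total parameters directly
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.4f}%)")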

import re

def preprocess_dataset(examples):

  messages = []

  for sentence in examples['text']:
    # strip the Llama-2 style [INST] ... [/INST] markers and split into question / answer
    sentence = re.sub(r"<s>\[INST\]","",sentence)
    sentence = sentence.split("[/INST]")
    question = sentence[0]
    answer = re.sub(r"</s>","",sentence[1])

    # rebuild each pair as a chat-format message list
    messages.append([
        {"role":"user" ,"content":question},
        {"role":"assistant","content":answer}
        ])

  examples['messages'] = messages
  return examples

dataset = dataset.map(preprocess_dataset,batched=True,remove_columns=['text'])
dataset = dataset.train_test_split(test_size=0.1, seed=42)  # split assumed here; the trainer below indexes dataset['train'] / dataset['test']

from trl import setup_chat_format
model,tokenizer = setup_chat_format(model,tokenizer)
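
Before training, it is worth rendering one mapped example through the chat template that setup_chat_format just installed, to confirm the question/answer pairs look the way you expect (a small check, assuming the 'messages' column produced by preprocess_dataset above):

# render one preprocessed example as the string the trainer will actually see
sample = dataset["train"][0]["messages"]
print(tokenizer.apply_chat_template(sample, tokenize=False))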

from trl import SFTConfig, SFTTrainer

args = SFTConfig(
    output_dir = "lora_model/",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    learning_rate = 2e-05,
    max_steps = 300,
    logging_strategy = "steps",
    logging_steps = 25,
    save_strategy = "steps",
    save_steps = 25,
    eval_strategy = "steps",
    eval_steps = 25,
    data_seed=42,
    max_seq_length = 2048,
    gradient_checkpointing=False,
    report_to = "none",
)

trainer = SFTTrainer(
    model = model,
    args = args,
    processing_class = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
)

trainer.train()
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
 [ 26/300 03:33 < 40:41, 0.11 it/s, Epoch 0.01/1]
Step	Training Loss	Validation Loss

 [ 43/283 02:11 < 12:32, 0.32 it/s]
---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
<ipython-input-24-3435b262f1ae> in <cell line: 0>()
----> 1 trainer.train()

13 frames
/usr/local/lib/python3.11/dist-packages/transformers/loss/loss_utils.py in ForCausalLMLoss(logits, labels, vocab_size, num_items_in_batch, ignore_index, **kwargs)
     37     labels = labels.to(logits.device)
     38     # Shift so that tokens < n predict n
---> 39     shift_logits = logits[..., :-1, :].contiguous()
     40     shift_labels = labels[..., 1:].contiguous()
     41 

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.69 GiB. GPU 0 has a total capacity of 14.74 GiB of which 4.29 GiB is free. Process 2635 has 10.45 GiB memory in use. Of the allocated memory 9.77 GiB is allocated by PyTorch, and 559.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Why am I getting an out-of-memory error even though I quantized the model and used LoRA? I thought training would fit since only about 6 million parameters are trainable. Please help me figure out what is going wrong.
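
For context on where a 6.69 GiB request can come from even with 4-bit weights and LoRA: quantization and LoRA shrink the weights and optimizer state, but not the activations, and the traceback points at the logits tensor built inside the loss. A rough back-of-the-envelope sketch (assuming batch size 8, max_seq_length 2048 and Llama 3's ~128k vocabulary; the real numbers depend on the actual sequence lengths in the failing batch):

# logits are batch x seq_len x vocab_size, and the loss shift makes another contiguous copy
batch, seq_len, vocab = 8, 2048, 128256   # assumed values, taken from the config above
bytes_per_elem = 2                        # bf16; an fp32 upcast would double this
logits_gib = batch * seq_len * vocab * bytes_per_elem / 1024**3
print(f"one logits tensor: ~{logits_gib:.1f} GiB")   # ~3.9 GiB in bf16, ~7.8 GiB in fp32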

Wouldn’t it be better to use gradient_checkpointing?
https://mmengine.readthedocs.io/en/latest/common_usage/save_gpu_memory.html
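
To make that concrete, here is one way the SFTConfig could be adjusted (a sketch under assumed values, not a drop-in fix; tune the numbers for your GPU). The main memory levers are the per-device batch sizes, the sequence length and gradient checkpointing:

args = SFTConfig(
    output_dir = "lora_model/",
    per_device_train_batch_size = 1,       # smaller micro-batch
    per_device_eval_batch_size = 1,        # eval builds the same huge logits tensor
    gradient_accumulation_steps = 8,       # keep the effective batch size at 8
    gradient_checkpointing = True,         # recompute activations instead of storing them
    gradient_checkpointing_kwargs = {"use_reentrant": False},  # also silences the warning in the log above
    max_seq_length = 1024,                 # shorter sequences shrink the logits tensor
    learning_rate = 2e-05,
    max_steps = 300,
    eval_strategy = "steps",
    eval_steps = 25,
    report_to = "none",
)

Shrinking the per-device batch size and max_seq_length directly reduces the batch x seq_len x vocab logits tensor the traceback is allocating, gradient accumulation keeps the effective batch size, and gradient checkpointing trades extra compute for most of the remaining activation memory.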