torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 39.56 GiB total capacity; 37.84 GiB already allocated; 242.56 MiB free; 37.96 GiB reserved in total by PyTorch)

Hello HuggingFace Team,

I’m encountering a CUDA out-of-memory error while trying to fine-tune a custom GPT-J-6B model on a dataset of around 50,000 samples. The model loads and the full dataset tokenizes without issues, but the error occurs during training.

Could you please review my code and provide any suggestions or solutions?

Here is my entire training script.

import os
import torch
import numpy as np
import pandas as pd
from functools import partial
from src.data_prepare import final_data
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedTokenizer, TrainingArguments, set_seed, Trainer, GPT2TokenizerFast

RESPONSE_KEY = " ### Response:"
DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6b"
seed = 42
MAX_LENGTH = 128
MICRO_BATCH_SIZE = 4
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

data = final_data('data/med_alpaca.json')
# Reduce fragmentation in the caching allocator; the value must be a key:value pair
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        batch = super().torch_call(examples)

        response_token_ids = self.tokenizer.encode(RESPONSE_KEY)
        # print("RTI:",response_token_ids)

        labels = batch["labels"].clone()

        for i in range(len(examples)):
            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                if np.array_equal(response_token_ids, batch["labels"][i, idx : idx + len(response_token_ids)]):
                    response_token_ids_start_idx = idx
                    break

            if response_token_ids_start_idx is None:
                raise RuntimeError("Could not find response key token IDs")

            response_token_ids_end_idx = response_token_ids_start_idx + len(response_token_ids)

            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch

def preprocess_batch(batch, tokenizer: AutoTokenizer, max_length: int = MAX_LENGTH):
    return tokenizer(batch["text"], max_length=max_length, truncation=True)

def load_training_dataset(training_data_id = data):
    # dataset: Dataset = load_dataset(training_data_id)
    
    dataset = training_data_id
    # Drop records whose text already starts with the response key (i.e. records with no prompt)
    dataset = dataset.filter(lambda rec: not rec["text"].strip().startswith(RESPONSE_KEY.strip()))
    
    def _func(rec):
        rec["text"] += "\n\n### End"
        return rec
    
    dataset = dataset.map(_func)
    return dataset

def load_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast = True)
    # print(tokenizer)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def load_model(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False):
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path, trust_remote_code=True, device_map = "auto", use_cache=False if gradient_checkpointing else True
    )
    return model


def get_model_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = True):
    tokenizer = load_tokenizer(pretrained_model_name_or_path)
    model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)
    return model, tokenizer

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int = MAX_LENGTH, seed=seed):
    
    dataset = load_training_dataset()

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "input", "output", "text"],
    )

    dataset = dataset.shuffle(seed=seed)
    return dataset

def train(
    local_output_dir,
    epochs,
    per_device_train_batch_size,
    per_device_eval_batch_size,
    gradient_accumulation_steps,
    lr,
    seed,
    test_size=500,
):
    set_seed(seed)

    model, tokenizer = get_model_tokenizer()
    processed_dataset = preprocess_dataset(tokenizer=tokenizer, seed=seed)
    split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)

    data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )
    

    training_args = TrainingArguments(
        output_dir=local_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps=100,
        learning_rate=lr,
        num_train_epochs=epochs,
        evaluation_strategy="steps",
        eval_steps=10,
        fp16=True,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to="tensorboard",
        disable_tqdm=True,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        data_collator=data_collator,
    )
    # breakpoint()
    
    model.config.use_cache = False
    trainer.train()
    trainer.save_model(output_dir=local_output_dir)
    torch.cuda.empty_cache()
    
def main(**kwargs):
    train(**kwargs)



if __name__ == "__main__":
    try:
        ia_dolly = {
            'local_output_dir':"output/",
            'epochs':1,
            'per_device_train_batch_size':2,
            'per_device_eval_batch_size':2,
            'gradient_accumulation_steps': GRADIENT_ACCUMULATION_STEPS,
            'lr':0.001,
            'seed':seed,
            'test_size':500,
        }
        main(**ia_dolly)
    except Exception:
        raise

#beginners #transformers #models

It is some sort of cache that never gets cleared. Especially if you run the same processes over and over again (perhaps while testing), it fills up and suddenly that’s it and you need to restart your GPU.
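For what it’s worth, that “cache” is PyTorch’s caching allocator (the “reserved in total by PyTorch” figure in the error above). While the process is still alive you can inspect it and hand the cached blocks back to the driver; a minimal sketch, assuming you still hold references named model and trainer as in the script above:

import gc
import torch

# Memory held by live tensors vs. memory the caching allocator keeps reserved
print(f"allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GiB")

# Drop the Python references first, then release the cached blocks.
# empty_cache() cannot free memory that live tensors still occupy.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

If memory still shows as used after a run has finished, it is usually because the old process (for example a notebook kernel) is still alive and holding it; killing that process releases the memory without a full GPU reset.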

Although, with a dataset of 51k samples you might just want to start with a more capable GPU (Vast.ai has some resources at that level).
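If a bigger card isn’t an option right away, a common first step is to shrink the footprint of the load itself. Here is a minimal sketch of a lower-memory variant of load_model (torch_dtype and gradient_checkpointing_enable are standard transformers/PyTorch arguments; note that in the posted script the gradient_checkpointing flag only toggles use_cache and never actually enables checkpointing, and whether 6B parameters plus Adam optimizer states then fit in 40 GiB still depends on your setup):

import torch
from transformers import AutoModelForCausalLM

def load_model(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False):
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.float16,             # half-precision weights: roughly 12 GiB instead of 24 GiB
        use_cache=not gradient_checkpointing,  # the generation KV cache is incompatible with checkpointing
    )
    if gradient_checkpointing:
        # The flag alone does nothing; activation checkpointing must be enabled explicitly
        model.gradient_checkpointing_enable()
    return model

Even with these changes, full Adam fine-tuning of a 6B-parameter model is tight on a single 40 GiB card, so the bigger-GPU suggestion above may well be the pragmatic answer.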

Note that this same error can happen when you run barely 10 samples through the code; you just need to do it enough times to fill up said “cache” (somehow PyTorch keeps the space reserved even for processes killed earlier).


Thanks @anon60462482
