Hello HuggingFace Team,
I’m encountering a CUDA out-of-memory error while trying to fine-tune a custom GPT-J-6B model on a dataset of around 50,000 samples. I can load the model and tokenize the entire dataset successfully, but the error is raised once training starts.
Could you please review my code and provide any suggestions or solutions?
Here is my full training script:
import os
import torch
import numpy as np
import pandas as pd
from functools import partial
from src.data_prepare import final_data
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedTokenizer, TrainingArguments, set_seed, Trainer, GPT2TokenizerFast
RESPONSE_KEY = " ### Response:"
DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6b"
seed = 42
MAX_LENGTH = 128
MICRO_BATCH_SIZE = 4
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
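# 64 // 4 = 16 accumulation steps; with per_device_train_batch_size=2 (set below),
# the effective batch size per device is 2 * 16 = 32.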
data = final_data('data/med_alpaca.json')
# Cap the allocator's split size (in MB) to reduce fragmentation; the value must be given as key:value.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
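# Collator that masks every label up to and including the response key with -100,
# so the loss is computed only on the response (completion) tokens.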
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        batch = super().torch_call(examples)
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY)
        # print("RTI:", response_token_ids)
        labels = batch["labels"].clone()
        for i in range(len(examples)):
            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                if np.array_equal(response_token_ids, batch["labels"][i, idx : idx + len(response_token_ids)]):
                    response_token_ids_start_idx = idx
                    break
            if response_token_ids_start_idx is None:
                raise RuntimeError("Could not find response key token IDs")
            response_token_ids_end_idx = response_token_ids_start_idx + len(response_token_ids)
            labels[i, :response_token_ids_end_idx] = -100
        batch["labels"] = labels
        return batch
def preprocess_batch(batch, tokenizer: AutoTokenizer, max_length: int = MAX_LENGTH):
    return tokenizer(batch["text"], max_length=max_length, truncation=True)
def load_training_dataset(training_data_id=data):
    # dataset: Dataset = load_dataset(training_data_id)
    dataset = training_data_id
    # Drop records whose text starts with the response key (i.e. there is no prompt before the response).
    # strip() removes leading whitespace, so compare against the stripped response key.
    dataset = dataset.filter(lambda rec: not rec["text"].strip().startswith(RESPONSE_KEY.strip()))

    def _func(rec):
        rec["text"] += "\n\n### End"
        return rec

    dataset = dataset.map(_func)
    return dataset
def load_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=True)
    # print(tokenizer)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
def load_model(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False):
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True,
        device_map="auto",
        use_cache=False if gradient_checkpointing else True,
    )
    # NOTE: this flag only toggles use_cache; gradient checkpointing itself is not enabled
    # anywhere in this script (no model.gradient_checkpointing_enable() call and no
    # gradient_checkpointing=True in TrainingArguments).
    return model

def get_model_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = True):
    tokenizer = load_tokenizer(pretrained_model_name_or_path)
    model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)
    return model, tokenizer
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int = MAX_LENGTH, seed=seed):
    dataset = load_training_dataset()
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "input", "output", "text"],
    )
    dataset = dataset.shuffle(seed=seed)
    return dataset
def train(
    local_output_dir,
    epochs,
    per_device_train_batch_size,
    per_device_eval_batch_size,
    gradient_accumulation_steps,
    lr,
    seed,
    test_size=500,
):
    set_seed(seed)
    model, tokenizer = get_model_tokenizer()
    processed_dataset = preprocess_dataset(tokenizer=tokenizer, seed=seed)
    split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)
    data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )
    training_args = TrainingArguments(
        output_dir=local_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        learning_rate=lr,
        num_train_epochs=epochs,
        evaluation_strategy="steps",
        eval_steps=10,
        fp16=True,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to="tensorboard",
        disable_tqdm=True,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        data_collator=data_collator,
    )
    # breakpoint()
    model.config.use_cache = False
    trainer.train()
    trainer.save_model(output_dir=local_output_dir)
    torch.cuda.empty_cache()
def main(**kwargs):
    train(**kwargs)

if __name__ == "__main__":
    try:
        ia_dolly = {
            'local_output_dir': "output/",
            'epochs': 1,
            'per_device_train_batch_size': 2,
            'per_device_eval_batch_size': 2,
            'gradient_accumulation_steps': GRADIENT_ACCUMULATION_STEPS,
            'lr': 0.001,
            'seed': seed,
            'test_size': 500,
        }
        main(**ia_dolly)
    except Exception:
        raise
I have also attached a screenshot of the error: