Hi,
I am trying to fine-tune a 7B model on a 24 GB RTX 3090; please see my training setup below.
Any ideas on how to optimize this so I stop getting CUDA out-of-memory errors?
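For context, my rough math (please correct me if it is off): 7B parameters in bf16 are already about 14 GB for the weights alone, gradients add roughly the same again, and AdamW keeps two extra state tensors per parameter, so I suspect a full fine-tune needs quite a bit more than my 24 GB.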
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers.trainer_utils import TrainOutput
from datasets import load_dataset
from tqdm import tqdm

# Set environment variables
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
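# Note: expandable_segments:True lets the caching allocator grow existing memory
# segments instead of reserving new ones; it mainly reduces fragmentation-related
# OOM rather than total memory usage.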
def main():
    # Clear any leftover memory to start fresh
    torch.cuda.empty_cache()

    os.environ["HF_TOKEN"] = "your_huggingface_token_here"
    model_name = "szymonrucinski/Krakowiak-7B-v3"

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True).to('cuda')

    # Use BF16 if supported, otherwise FP16
    if torch.cuda.is_bf16_supported():
        model = model.to(dtype=torch.bfloat16)
    else:
        print("BF16 not supported on this device, using FP16 instead.")
        model = model.to(dtype=torch.float16)

    model.train()  # Ensure the model is in training mode

    # Ensure padding token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Load datasets
    train_dataset = load_dataset('json', data_files='data/home_assistant_train.jsonl', split='train')
    eval_dataset = load_dataset('json', data_files='data/home_assistant_test.jsonl', split='train')

    # Define function to extract user text
    def extract_user_text(examples):
        user_texts = [conv["value"] for conversation in examples["conversations"] for conv in conversation if conv["from"] == "user"]
        tokenized_inputs = tokenizer(user_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt') if user_texts else {"input_ids": [], "attention_mask": []}
        torch.cuda.empty_cache()  # Clear cache after tokenization
        return tokenized_inputs

    # Map the extract_user_text function over the datasets
    train_dataset = train_dataset.map(extract_user_text, batched=True, remove_columns=['conversations'])
    eval_dataset = eval_dataset.map(extract_user_text, batched=True, remove_columns=['conversations'])
    train_dataset.set_format(type='torch', columns=['input_ids'])
    eval_dataset.set_format(type='torch', columns=['input_ids'])

    # Data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
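    # With mlm=False the collator builds causal-LM labels by copying input_ids
    # and setting padded positions to -100 so they are ignored by the loss.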
    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="steps",
        eval_steps=1000,
        logging_dir='./logs',
        logging_steps=200,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        save_steps=1000,
        save_total_limit=2,
        learning_rate=2e-5,
        num_train_epochs=3,
        report_to="tensorboard",
        bf16=torch.cuda.is_bf16_supported(),  # Use BF16 if supported
        dataloader_num_workers=1,
        gradient_accumulation_steps=8,
    )
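    # The custom Trainer below overrides training_step and train: it calls
    # torch.cuda.empty_cache() around every step and runs a manual loop with a
    # tqdm progress bar instead of the built-in one.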
    # Custom trainer class
    class CustomTrainer(Trainer):
        def training_step(self, model, inputs):
            torch.cuda.empty_cache()  # Clear cache before each training step
            model.train()
            inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Ensure inputs are on the right device
            outputs = model(**inputs)
            loss = outputs.loss if 'loss' in outputs else None
            if loss is None:
                print("No loss computed from the model outputs.")
                raise ValueError("No loss computed from the model outputs.")
            if not loss.requires_grad:
                print("Debug Info: Loss does not require gradients")
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        print(f"{name}: {param.requires_grad}, {param.device}")
                raise RuntimeError("Loss tensor is not connected to the computation graph.")
            return loss

        def train(self, *args, **kwargs):
            torch.cuda.empty_cache()  # Clear cache before starting the train loop
            if not self.optimizer:
                num_training_steps = int(len(self.train_dataset) / self.args.per_device_train_batch_size * self.args.num_train_epochs)
                self.create_optimizer_and_scheduler(num_training_steps=num_training_steps)
            total_steps = len(self.get_train_dataloader()) * int(self.args.num_train_epochs)
            progress_bar = tqdm(total=total_steps, desc="Training Progress", unit="step")
            for epoch in range(int(self.args.num_train_epochs)):
                for step, batch in enumerate(self.get_train_dataloader()):
                    batch = {k: v.to(self.model.device) for k, v in batch.items()}
                    loss = self.training_step(self.model, batch)
                    loss.backward()
                    torch.cuda.empty_cache()  # Clear cache after backward pass
                    if self.optimizer:
                        self.optimizer.step()
                        self.lr_scheduler.step()
                        self.model.zero_grad()
                        torch.cuda.empty_cache()  # Clear cache after optimizer step
                    else:
                        raise RuntimeError("Optimizer not initialized")
                    progress_bar.update(1)
                    if (step + 1) % self.args.logging_steps == 0:
                        self.log({"loss": loss.item()})
                        torch.cuda.empty_cache()  # Clear cache after logging
                    if (step + 1) % self.args.eval_steps == 0:
                        eval_output = self.evaluate()
                        progress_bar.write(f"Evaluation metrics: {eval_output}")
                        torch.cuda.empty_cache()  # Clear cache after evaluation
                    if (step + 1) % self.args.save_steps == 0:
                        self.save_model()
                        torch.cuda.empty_cache()  # Clear cache after saving model
            progress_bar.close()
            torch.cuda.empty_cache()  # Clear cache after training loop
            return TrainOutput(global_step=step, training_loss=loss.item(), metrics=None)
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )

    trainer.train()
    model.save_pretrained('./model')
    tokenizer.save_pretrained('./tokenizer')

if __name__ == '__main__':
    main()