Memory issues with RTX 3090 and 7B model

Hi

I am trying to fine-tune a 7B model on a 24 GB RTX 3090; please see below my dataset and training script.

Training dataset

Any ideas on how I can optimize this to stop getting CUDA out-of-memory errors? My script is below:

```python
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers.trainer_utils import TrainOutput
from datasets import load_dataset
from tqdm import tqdm

# Set environment variables
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

def main():
    # Clear any leftover memory to start fresh
    torch.cuda.empty_cache()
    os.environ["HF_TOKEN"] = "your_huggingface_token_here"

    model_name = "szymonrucinski/Krakowiak-7B-v3"

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True).to('cuda')

    # Use BF16 if supported, otherwise FP16
    if torch.cuda.is_bf16_supported():
        model = model.to(dtype=torch.bfloat16)
    else:
        print("BF16 not supported on this device, using FP16 instead.")
        model = model.to(dtype=torch.float16)
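    # Note: I've read that from_pretrained() loads the weights in FP32 by default when
    # torch_dtype isn't given, so the model briefly needs ~28 GB on the GPU before the
    # cast above; if that's right, passing torch_dtype=torch.bfloat16 to from_pretrained()
    # should avoid ever materializing the FP32 copy.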

    model.train()  # Ensure the model is in training mode

    # Ensure padding token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load datasets
    train_dataset = load_dataset('json', data_files='data/home_assistant_train.jsonl', split='train')
    eval_dataset = load_dataset('json', data_files='data/home_assistant_test.jsonl', split='train')

    # Define function to extract user text
    def extract_user_text(examples):
        user_texts = [conv["value"] for conversation in examples["conversations"] for conv in conversation if conv["from"] == "user"]
        tokenized_inputs = tokenizer(user_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt') if user_texts else {"input_ids": [], "attention_mask": []}
        torch.cuda.empty_cache()  # Clear cache after tokenization
        return tokenized_inputs

    # Map the extract_user_text function over the datasets
    train_dataset = train_dataset.map(extract_user_text, batched=True, remove_columns=['conversations'])
    eval_dataset = eval_dataset.map(extract_user_text, batched=True, remove_columns=['conversations'])

    train_dataset.set_format(type='torch', columns=['input_ids'])
    eval_dataset.set_format(type='torch', columns=['input_ids'])

    # Data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="steps",
        eval_steps=1000,
        logging_dir='./logs',
        logging_steps=200,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        save_steps=1000,
        save_total_limit=2,
        learning_rate=2e-5,
        num_train_epochs=3,
        report_to="tensorboard",
        bf16=torch.cuda.is_bf16_supported(),  # Use BF16 if supported
        dataloader_num_workers=1,
        gradient_accumulation_steps=8,
    )
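    # (Options I've seen suggested for reducing memory but haven't tried yet, assuming a
    # recent transformers and bitsandbytes install: gradient_checkpointing=True and
    # optim="paged_adamw_8bit" could also go into the TrainingArguments above.)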

    # Custom trainer class
    class CustomTrainer(Trainer):
        def training_step(self, model, inputs):
            torch.cuda.empty_cache()  # Clear cache before each training step
            model.train()
            inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Ensure inputs are on the right device

            outputs = model(**inputs)
            loss = outputs.loss if 'loss' in outputs else None
            if loss is None:
                print("No loss computed from the model outputs.")
                raise ValueError("No loss computed from the model outputs.")

            if not loss.requires_grad:
                print("Debug Info: Loss does not require gradients")
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        print(f"{name}: {param.requires_grad}, {param.device}")
                raise RuntimeError("Loss tensor is not connected to the computation graph.")

            return loss

        def train(self, *args, **kwargs):
            torch.cuda.empty_cache()  # Clear cache before starting the train loop
            if not self.optimizer:
                num_training_steps = int(len(self.train_dataset) / self.args.per_device_train_batch_size * self.args.num_train_epochs)
                self.create_optimizer_and_scheduler(num_training_steps=num_training_steps)
            total_steps = len(self.get_train_dataloader()) * self.args.num_train_epochs
            progress_bar = tqdm(total=total_steps, desc="Training Progress", unit="step")
            for epoch in range(self.args.num_train_epochs):
                for step, batch in enumerate(self.get_train_dataloader()):
                    batch = {k: v.to(self.model.device) for k, v in batch.items()}
                    loss = self.training_step(self.model, batch)
                    loss.backward()
                    torch.cuda.empty_cache()  # Clear cache after backward pass

                    if self.optimizer:
                        self.optimizer.step()
                        self.lr_scheduler.step()
                        self.model.zero_grad()
                        torch.cuda.empty_cache()  # Clear cache after the optimizer step
                    else:
                        raise RuntimeError("Optimizer not initialized")

                    progress_bar.update(1)
                    if (step + 1) % self.args.logging_steps == 0:
                        self.log({"loss": loss.item()})
                        torch.cuda.empty_cache()  # Clear cache after logging
                    if (step + 1) % self.args.eval_steps == 0:
                        eval_output = self.evaluate()
                        progress_bar.write(f"Evaluation metrics: {eval_output}")
                        torch.cuda.empty_cache()  # Clear cache after evaluation
                    if (step + 1) % self.args.save_steps == 0:
                        self.save_model()
                        torch.cuda.empty_cache()  # Clear cache after saving model
            progress_bar.close()
            torch.cuda.empty_cache()  # Clear cache after training loop
            return TrainOutput(global_step=step, training_loss=loss.item(), metrics=None)
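    # (One thing I noticed while writing this up: the loop above steps the optimizer on
    # every batch, so the gradient_accumulation_steps=8 setting isn't actually applied
    # in this custom train() yet.)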

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )

    trainer.train()
    model.save_pretrained('./model')
    tokenizer.save_pretrained('./tokenizer')

if __name__ == '__main__':
    main()
```
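
For context, my rough memory math: the 7B weights alone are about 14 GB in BF16, gradients add roughly another 14 GB, and AdamW's two moment buffers add at least that much again, so full fine-tuning needs far more than 24 GB before activations are even counted. That makes me think I should switch to something like 4-bit loading plus LoRA instead of full fine-tuning. Below is a minimal sketch of what I have in mind, assuming `peft` and `bitsandbytes` are installed; the `q_proj`/`v_proj` target module names are an assumption on my part (Llama/Mistral-style attention layers), not something I've verified for this model:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "szymonrucinski/Krakowiak-7B-v3"

# Quantize the base weights to 4-bit NF4 and keep compute in BF16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare the quantized model for training (gradient checkpointing, norm casting, etc.)
model = prepare_model_for_kbit_training(model)

# Train only small LoRA adapters instead of all 7B parameters
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # assumption: Llama/Mistral-style module names
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```

If I understand correctly, only the LoRA adapter weights would then be trainable, so the gradients and optimizer states shrink to a small fraction of the full model and the whole setup should fit in 24 GB. Does that sound like the right direction, or is there a way to make the full fine-tune above fit?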