Hi,
I am trying to fine-tune a 7B model on a 24 GB RTX 3090; please see my training setup below.
Any ideas on how to optimize this so I stop getting CUDA out-of-memory errors?
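For context, my rough math (please correct me if it is off): 7B parameters in bf16 are already about 14 GB for the weights alone, gradients add roughly the same again, and AdamW keeps two extra state tensors per parameter, so I suspect a full fine-tune needs quite a bit more than my 24 GB.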
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers.trainer_utils import TrainOutput
from datasets import load_dataset
from tqdm import tqdm

# Set environment variables
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
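# Note: expandable_segments:True lets the caching allocator grow existing memory
# segments instead of reserving new ones; it mainly reduces fragmentation-related
# OOM rather than total memory usage.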
def main():
    # Clear any leftover memory to start fresh
    torch.cuda.empty_cache()

    os.environ["HF_TOKEN"] = "your_huggingface_token_here"
    model_name = "szymonrucinski/Krakowiak-7B-v3"

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True).to('cuda')

    # Use BF16 if supported, otherwise FP16
    if torch.cuda.is_bf16_supported():
        model = model.to(dtype=torch.bfloat16)
    else:
        print("BF16 not supported on this device, using FP16 instead.")
        model = model.to(dtype=torch.float16)

    model.train()  # Ensure the model is in training mode

    # Ensure padding token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Load datasets
    train_dataset = load_dataset('json', data_files='data/home_assistant_train.jsonl', split='train')
    eval_dataset = load_dataset('json', data_files='data/home_assistant_test.jsonl', split='train')

    # Define function to extract user text
    def extract_user_text(examples):
        user_texts = [conv["value"] for conversation in examples["conversations"] for conv in conversation if conv["from"] == "user"]
        tokenized_inputs = tokenizer(user_texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt') if user_texts else {"input_ids": [], "attention_mask": []}
        torch.cuda.empty_cache()  # Clear cache after tokenization
        return tokenized_inputs

    # Map the extract_user_text function over the datasets
    train_dataset = train_dataset.map(extract_user_text, batched=True, remove_columns=['conversations'])
    eval_dataset = eval_dataset.map(extract_user_text, batched=True, remove_columns=['conversations'])
    train_dataset.set_format(type='torch', columns=['input_ids'])
    eval_dataset.set_format(type='torch', columns=['input_ids'])

    # Data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
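    # With mlm=False the collator builds causal-LM labels by copying input_ids
    # and setting padded positions to -100 so they are ignored by the loss.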
    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="steps",
        eval_steps=1000,
        logging_dir='./logs',
        logging_steps=200,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        save_steps=1000,
        save_total_limit=2,
        learning_rate=2e-5,
        num_train_epochs=3,
        report_to="tensorboard",
        bf16=torch.cuda.is_bf16_supported(),  # Use BF16 if supported
        dataloader_num_workers=1,
        gradient_accumulation_steps=8,
    )
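    # The custom Trainer below overrides training_step and train: it calls
    # torch.cuda.empty_cache() around every step and runs a manual loop with a
    # tqdm progress bar instead of the built-in one.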
    # Custom trainer class
    class CustomTrainer(Trainer):
        def training_step(self, model, inputs):
            torch.cuda.empty_cache()  # Clear cache before each training step
            model.train()
            inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Ensure inputs are on the right device
            outputs = model(**inputs)
            loss = outputs.loss if 'loss' in outputs else None
            if loss is None:
                print("No loss computed from the model outputs.")
                raise ValueError("No loss computed from the model outputs.")
            if not loss.requires_grad:
                print("Debug Info: Loss does not require gradients")
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        print(f"{name}: {param.requires_grad}, {param.device}")
                raise RuntimeError("Loss tensor is not connected to the computation graph.")
            return loss

        def train(self, *args, **kwargs):
            torch.cuda.empty_cache()  # Clear cache before starting the train loop
            if not self.optimizer:
                num_training_steps = int(len(self.train_dataset) / self.args.per_device_train_batch_size * self.args.num_train_epochs)
                self.create_optimizer_and_scheduler(num_training_steps=num_training_steps)
            total_steps = len(self.get_train_dataloader()) * int(self.args.num_train_epochs)
            progress_bar = tqdm(total=total_steps, desc="Training Progress", unit="step")
            for epoch in range(int(self.args.num_train_epochs)):
                for step, batch in enumerate(self.get_train_dataloader()):
                    batch = {k: v.to(self.model.device) for k, v in batch.items()}
                    loss = self.training_step(self.model, batch)
                    loss.backward()
                    torch.cuda.empty_cache()  # Clear cache after backward pass
                    if self.optimizer:
                        self.optimizer.step()
                        self.lr_scheduler.step()
                        self.model.zero_grad()
                        torch.cuda.empty_cache()  # Clear cache after optimizer step
                    else:
                        raise RuntimeError("Optimizer not initialized")
                    progress_bar.update(1)
                    if (step + 1) % self.args.logging_steps == 0:
                        self.log({"loss": loss.item()})
                        torch.cuda.empty_cache()  # Clear cache after logging
                    if (step + 1) % self.args.eval_steps == 0:
                        eval_output = self.evaluate()
                        progress_bar.write(f"Evaluation metrics: {eval_output}")
                        torch.cuda.empty_cache()  # Clear cache after evaluation
                    if (step + 1) % self.args.save_steps == 0:
                        self.save_model()
                        torch.cuda.empty_cache()  # Clear cache after saving model
            progress_bar.close()
            torch.cuda.empty_cache()  # Clear cache after training loop
            return TrainOutput(global_step=step, training_loss=loss.item(), metrics=None)
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )

    trainer.train()
    model.save_pretrained('./model')
    tokenizer.save_pretrained('./tokenizer')

if __name__ == '__main__':
    main()