Hello everyone!
I am trying to train the mistralai/Mistral-Nemo-Instruct-2407 model using the Hugging Face Trainer. My goal is to train layers 33 to 39 (7 layers) of the model. The model, when fully loaded, takes around 45 GiB of VRAM, and I am using an NVIDIA H100 GPU with 80 GiB of VRAM, so there should be plenty of headroom since I am only training the last 7 layers.
However, I am running into a CUDA out-of-memory error after a couple of steps. Is there anything wrong with my setup?
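In case it helps with debugging, below is a small TrainerCallback that could be attached to the Trainer to log the peak GPU memory after each optimizer step (a quick sketch, not part of the failing script; the print format and the per-step reset are arbitrary choices):

import torch
from transformers import TrainerCallback

class MemoryLogger(TrainerCallback):
    """Print the peak allocated GPU memory after every optimizer step."""
    def on_step_end(self, args, state, control, **kwargs):
        peak_gib = torch.cuda.max_memory_allocated() / 1024 ** 3
        print(f"step {state.global_step}: peak allocated {peak_gib:.2f} GiB")
        torch.cuda.reset_peak_memory_stats()  # so each step is measured independently

# Attached with: trainer.add_callback(MemoryLogger())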
Here is my complete script:
import torch
from train_data_v2 import get_data
from datasets import Dataset
from transformers import MistralForCausalLM, AutoTokenizer, Trainer, TrainingArguments


def tokenize_function(example, tokenizer):
    max_len = 2400

    # Tokenize the input
    model_inputs = tokenizer(
        example["input"],
        padding="max_length",
        truncation=True,
        max_length=max_len
    )

    # Tokenize the output/labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["output"],
            padding="max_length",
            truncation=True,
            max_length=max_len
        )

    # Replace padding token ids with -100 so they are ignored by the loss
    labels_tensor = torch.tensor(
        [label if label != tokenizer.pad_token_id else -100 for label in labels["input_ids"]],
        dtype=torch.long
    )

    return {
        "input_ids": torch.tensor(model_inputs["input_ids"], dtype=torch.long),
        "attention_mask": torch.tensor(model_inputs["attention_mask"], dtype=torch.long),
        "labels": labels_tensor
    }


def main():
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "mistralai/Mistral-Nemo-Instruct-2407",
        token="hf_...",  # token redacted
        padding_side="right"
    )
    tokenizer.pad_token = tokenizer.eos_token

    # Load model in fp16 and move it to the GPU
    model = MistralForCausalLM.from_pretrained(
        "mistralai/Mistral-Nemo-Instruct-2407",
        token="hf_...",  # token redacted
        torch_dtype=torch.float16
    ).to("cuda")

    # Freeze all parameters first
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze layers 33 to 39 (the last 7 layers)
    layers_to_unfreeze = list(range(33, 40))
    for i in layers_to_unfreeze:
        for param in model.model.layers[i].parameters():
            param.requires_grad = True

    # Load and process the dataset.
    # get_data is a custom function that loads my data; the tokenizer is passed in
    # so that special tokens are inserted in the right places.
    dataset = Dataset.from_list(get_data(tokenizer))
    tokenized_dataset = dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        remove_columns=dataset.column_names,
        batched=False,
        num_proc=1
    )
    print("Data loaded and tokenized!")

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="no",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=1e-6,
        weight_decay=0.01,
        warmup_steps=40,
        logging_dir="./logs",
        logging_steps=50,
        fp16=True
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    # Start training
    print("Starting training")
    trainer.train()
    print("Training completed!")

    # Save the model and tokenizer
    trainer.save_model("./final_model")
    tokenizer.save_pretrained("./final_tokenizer")
    print("Model and tokenizer saved!")


if __name__ == "__main__":
    main()
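For what it's worth, a quick way to confirm that only the intended layers ended up trainable is to count parameters right after the freezing step (a small sketch, using the same model object as in the script above):

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")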
I also tried calling model.gradient_checkpointing_enable() to enable gradient checkpointing (which should reduce memory usage), using this code:
for param in model.parameters():
    param.requires_grad = True  # Temporarily enable gradients

model.gradient_checkpointing_enable()  # Enable gradient checkpointing

# Freeze all parameters again
for param in model.parameters():
    param.requires_grad = False

# Unfreeze layers 33 to 39 (the last 7 layers)
layers_to_unfreeze = list(range(33, 40))
for i in layers_to_unfreeze:
    for param in model.model.layers[i].parameters():
        param.requires_grad = True
I also tried adding gradient_checkpointing=True to the training_args, as sketched below.
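(A sketch of that second attempt, trimmed to the relevant arguments; the omitted settings are the same as in the full script above.)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    gradient_checkpointing=True  # the extra flag for this attempt
)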
But with gradient checkpointing enabled, I get this error:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn.
Thank you!