Hey all,
I'm trying to learn a bit about training AI models, so I figured I'd try to roughly recreate the “Nothing, Forever” AI, but trained on the chat history of a WhatsApp friend group of mine. However, the dataset I'm using is fairly large (about 100k lines), so training takes a while and I'd like to be able to interrupt and resume it.
It all seems to work fine, except that whenever I resume from a checkpoint I see this message:
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
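For reference, this is how I've been checking which tensors are actually stored in a checkpoint (the checkpoint path is just an example from one of my runs, and I'm assuming the Trainer saved it in safetensors format):

from safetensors import safe_open

# List the tensor names saved in the checkpoint and look for lm_head.weight
with safe_open("./model_output/checkpoint-1000/model.safetensors", framework="pt") as f:
    saved_keys = list(f.keys())
print("lm_head.weight" in saved_keys)
print([k for k in saved_keys if "lm_head" in k or "wte" in k])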
Should I be worried about the message, or is it safe to ignore? I've copied my current training script below:
import os
from transformers import AutoTokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch
# Clear GPU memory
torch.cuda.empty_cache()
# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use only GPU 0
os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # Helps with debugging CUDA out-of-memory errors
# Define the output directory
output_dir = "./model_output"
os.makedirs(output_dir, exist_ok=True) # Ensure the directory exists
# Check for existing checkpoints
checkpoint_dir = None
if os.path.isdir(output_dir) and len(os.listdir(output_dir)) > 0:
    # Look for the last checkpoint
    checkpoint_dirs = sorted(
        [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint-")],
        key=lambda x: int(x.split('-')[-1]),
    )
    if checkpoint_dirs:
        checkpoint_dir = checkpoint_dirs[-1]  # Use the last checkpoint
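# (Side note: I believe transformers.trainer_utils.get_last_checkpoint does this same
# lookup, so the block above could probably be replaced with
#   from transformers.trainer_utils import get_last_checkpoint
#   checkpoint_dir = get_last_checkpoint(output_dir)
# but I haven't switched to it yet.)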
# Load the tokenizer from the output directory if one was already saved there,
# otherwise fall back to the base model (the output directory is empty on the very first run)
if os.path.exists(os.path.join(output_dir, "tokenizer_config.json")):
    tokenizer = AutoTokenizer.from_pretrained(output_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load the model (from the checkpoint if it exists, otherwise from the base model)
model = GPTNeoForCausalLM.from_pretrained(checkpoint_dir if checkpoint_dir else 'EleutherAI/gpt-neo-125m')
# Load training arguments
training_args = TrainingArguments(
    output_dir=output_dir, # Directory to save model and checkpoints
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5, # Adjusted learning rate
    weight_decay=0.0,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=1000,
    eval_strategy="steps",
    eval_steps=6000,
    save_total_limit=3, # Keep only the last 3 checkpoints to manage disk space
    fp16=True,
    resume_from_checkpoint=checkpoint_dir, # Automatically resume from checkpoint if found
    gradient_checkpointing=True, # Enable gradient checkpointing here
)
# Load and tokenize your dataset
def tokenize_function(examples):
    tokens = tokenizer(examples['text'], padding="max_length", truncation=True)
    # Standard causal-LM setup: the labels are a copy of the input ids
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
data_files = {"train": "chat_logs.txt", "validation": "validation_logs.txt"}
dataset = load_dataset("text", data_files=data_files)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
# Start training
try:
    trainer.train(resume_from_checkpoint=checkpoint_dir)
    # Save the final model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # Save evaluation results
    eval_output = trainer.evaluate()
    with open(os.path.join(output_dir, "evaluation_results.txt"), "w") as f:
        f.write(str(eval_output))
except KeyboardInterrupt:
    print("Training or evaluation interrupted. Saving model to output directory.")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
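For what it's worth, this is the quick smoke test I run against whatever ends up in the output directory after a run finishes or is interrupted, just to confirm the model still generates something plausible (the prompt and sampling settings are arbitrary):

from transformers import AutoTokenizer, GPTNeoForCausalLM

model = GPTNeoForCausalLM.from_pretrained("./model_output")
tokenizer = AutoTokenizer.from_pretrained("./model_output")

inputs = tokenizer("hey, are we still on for tonight?", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))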