Resuming training: There were missing keys in the checkpoint model loaded: ['lm_head.weight']

Hey all,

I'm trying to learn a bit about training AI models, so I figured I'd try to recreate something like the "Nothing, Forever" AI, but using a WhatsApp friend group of mine as the training set. However, the dataset I'm using is large (100k lines), so training takes a while and I'd like to be able to interrupt it and resume later.

It all seems to work fine, except that whenever I resume from a checkpoint I see this message:

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].

Should I be worried about this? I'll paste my current training script below:

import os
from transformers import AutoTokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Clear GPU memory
torch.cuda.empty_cache()

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only GPU 0
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Helps with debugging CUDA out-of-memory errors

# Define the output directory
output_dir = "./model_output"
os.makedirs(output_dir, exist_ok=True)  # Ensure the directory exists

# Check for existing checkpoints
checkpoint_dir = None
if os.path.isdir(output_dir) and len(os.listdir(output_dir)) > 0:
    # Look for the last checkpoint
    checkpoint_dirs = sorted([os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint-")], 
                             key=lambda x: int(x.split('-')[-1]))
    if checkpoint_dirs:
        checkpoint_dir = checkpoint_dirs[-1]  # Use the last checkpoint

# Load the tokenizer from the main output directory if one was saved there, otherwise from the base model
tokenizer_source = output_dir if os.path.exists(os.path.join(output_dir, "tokenizer_config.json")) else "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo has no pad token by default, so reuse EOS

# Load the model (from the checkpoint if it exists, otherwise from the base model)
model = GPTNeoForCausalLM.from_pretrained(checkpoint_dir if checkpoint_dir else 'EleutherAI/gpt-neo-125m')

# Load training arguments
training_args = TrainingArguments(
    output_dir=output_dir,  # Directory to save model and checkpoints
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,  # Adjusted learning rate
    weight_decay=0.0,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=1000,
    eval_strategy="steps",
    eval_steps=6000,
    save_total_limit=3,  # Keep only the last 3 checkpoints to manage disk space
    fp16=True,
    resume_from_checkpoint=checkpoint_dir,  # Automatically resume from checkpoint if found
    gradient_checkpointing=True,  # Enable gradient checkpointing here
)

# Load and tokenize your dataset
def tokenize_function(examples):
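    # Pad/truncate each line to the tokenizer's max length; for causal LM training the labels are a copy of the input ids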
    tokens = tokenizer(examples['text'], padding="max_length", truncation=True)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

data_files = {"train": "chat_logs.txt", "validation": "validation_logs.txt"}
dataset = load_dataset("text", data_files=data_files)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Start training
try:
    trainer.train(resume_from_checkpoint=checkpoint_dir)
    
    # Save the final model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    # Save evaluation results
    eval_output = trainer.evaluate()
    with open(os.path.join(output_dir, "evaluation_results.txt"), "w") as f:
        f.write(str(eval_output))

except KeyboardInterrupt:
    print("Training or evaluation interrupted. Saving model to output directory.")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

From what I know, you can make the message go away by adding save_safetensors=False to your TrainingArguments. If I understand it correctly, the warning appears because lm_head.weight is tied to the input embedding and the safetensors format doesn't store duplicated (tied) tensors, so the key looks "missing" when the checkpoint is loaded even though the weight is simply re-tied afterwards. In other words, it should be harmless. Setting save_safetensors=False fixed it for me, but note that your checkpoints will then be saved as .bin files instead of safetensors.
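
For reference, this is what I mean, as a minimal sketch showing only the arguments relevant here (keep the rest of yours as they are):

training_args = TrainingArguments(
    output_dir="./model_output",
    save_strategy="steps",
    save_steps=1000,
    save_safetensors=False,  # checkpoints get written as pytorch_model.bin instead of model.safetensors
    # ... keep the rest of your arguments unchanged
)

Either way, the warning itself should be safe to ignore, since the lm_head weight is re-created from the tied embedding when the model is loaded.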