Huge discrepancy in perplexity for Trainer vs. scratch implementation?

I tried to determine the perplexity of the GPT-2 model on the WikiText-2 dataset using two methods:

  1. The Hugging Face Trainer (ppl: 262915.39172431716)
  2. Following the Perplexity of fixed-length models guide (ppl: 16.45; a sketch of this approach is shown right below)
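
For reference, method 2 follows the sliding-window script from that guide, roughly like this (sketched here with gpt2 and a stride of 512; the exact value depends on the model and stride used):

import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

max_length = model.config.n_positions  # 1024 for GPT-2
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # only score tokens not scored in the previous window
    input_ids = encodings.input_ids[:, begin_loc:end_loc]
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # mask the overlapping context so it is not scored twice

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        nlls.append(outputs.loss)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)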

As I understand it, method 2 should be the more accurate of the two, as explained in the guide, but when I use the following script for method 1 I get the very high value mentioned above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import math

# Load the pre-trained language model (e.g., GPT-2)
model_name = "gpt2"  # You can replace this with another model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the Wikitext-2 dataset from Hugging Face
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
test_dataset = dataset["test"]

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], return_attention_mask=False, truncation=True, padding="max_length", max_length=512)

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Define the data collator
def data_collator(features):
    batch = {
        "input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in features]),
        "labels": torch.stack([torch.tensor(f["input_ids"]) for f in features]),
    }
    return batch

# Setup training arguments for evaluation
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=4,  # Adjust based on your GPU memory
    evaluation_strategy="no",
    logging_dir='./logs',
    do_train=False,
    do_eval=True,
    report_to="none",
)

# Define a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)

# Evaluate the model and calculate perplexity
eval_results = trainer.evaluate()

# Perplexity calculation
log_loss = eval_results["eval_loss"]
perplexity = math.exp(log_loss)

print(f"Perplexity on the Wikitext-2 dataset: {perplexity}")

Am I doing something wrong here?


Update:

Using the following preprocessing with the Trainer gives a ppl of 79.41:

from transformers import DataCollatorForLanguageModeling

def load_wikitext_dataset(model_name, batch_size=2):
    """
    Load and tokenize the WikiText-2 test split for causal LM evaluation.
    :param model_name: The Hugging Face model name, used to get the matching tokenizer.
    :param batch_size: The batch size to use if a DataLoader is built from the test set.
    :return: the tokenized test split and a data collator.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token 

    # DataCollatorForLanguageModeling with mlm=False builds the labels and sets pad-token positions to -100 so they are ignored in the loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    def tokenize_function(examples):
        # Tokenize the text and prepare input_ids and labels for causal language modeling
        return tokenizer(
            examples['text'],
            padding="max_length",  # Ensure padding to a fixed length
            truncation=True,       # Truncate texts that are too long
            max_length=512,        # Set maximum sequence length
            return_tensors="pt"    # Return PyTorch tensors
        )

    # Tokenize the dataset and split into train, validation, and test sets
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    # Remove the 'text' column from the tokenized dataset
    tokenized_dataset = tokenized_dataset.remove_columns(['text'])

    test_dataset = tokenized_dataset['test']

    # Create DataLoaders for each set
    # test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=data_collator)

    return test_dataset, data_collator
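
For completeness, the returned test set and collator are plugged into the Trainer roughly like this (reusing the model and training_args objects from the first script):

# Roughly how the returned pieces are wired into the Trainer
test_dataset, data_collator = load_wikitext_dataset("gpt2")

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    data_collator=data_collator,  # replaces pad-token label positions with -100
)

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss'])}")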

While this one:

import evaluate
import datasets

perplexity = evaluate.load("perplexity", module_type="metric")
input_texts = datasets.load_dataset("wikitext",
                                    "wikitext-2-raw-v1",
                                    split="test")["text"]
input_texts = [s for s in input_texts if s!='']
results = perplexity.compute(
    model_id="gpt2",
    batch_size=4,
    predictions=input_texts
)
print(results['mean_perplexity'])

Gives 546.62.
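
As far as I can tell, the evaluate perplexity metric scores each non-empty line separately and then averages the per-line perplexities, which is not the same quantity as exponentiating a mean token loss; a toy illustration with made-up per-line losses:

import math

# Hypothetical mean losses (nats per token) for three lines, just for illustration
line_losses = [2.0, 3.0, 7.0]

mean_of_perplexities = sum(math.exp(l) for l in line_losses) / len(line_losses)
perplexity_of_mean = math.exp(sum(line_losses) / len(line_losses))

print(mean_of_perplexities)  # ~374.7 -- a few hard lines dominate the average
print(perplexity_of_mean)    # ~54.6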

Update 2: This one, based on run_clm_no_trainer.py, gives 29.35:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, default_data_collator
from typing import Dict, List, Union, Tuple, Optional, Any, Mapping
from datasets import load_dataset
from torch.utils.data import DataLoader
from itertools import chain
from tqdm import tqdm
import math


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
    # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


model_id = "gpt2"
batch_size = 4

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
block_size = tokenizer.model_max_length

if block_size > model.config.max_position_embeddings:
    print(
        f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
        f"Using block_size={min(1024, model.config.max_position_embeddings)} instead."
    )
    block_size = min(1024, model.config.max_position_embeddings)


# Load the Wikitext-2 and preprocess dataset from Hugging Face

raw_datasets = load_dataset(
            "wikitext", "wikitext-2-raw-v1", trust_remote_code=True
        )

column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=2,
    desc=f"Grouping texts in chunks of {block_size}",
)

eval_dataset = lm_datasets["test"]

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=default_data_collator, batch_size=batch_size
)

model.eval()
losses = []

# Wrap the eval_dataloader in tqdm and provide a description for clarity
for step, batch in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader), desc="Evaluating"):
    with torch.no_grad():
        outputs = model(**batch)
    
    loss = outputs.loss
    losses.append(loss.repeat(batch_size))

# Combine the list of losses into a single tensor
losses = torch.cat(losses)

eval_loss = torch.mean(losses)
perplexity = math.exp(eval_loss)

print(f"Perplexity on the Wikitext-2 dataset: {perplexity}")
