Update: using the following preprocessing with Trainer gives a perplexity of 79.41:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

def load_wikitext_dataset(model_name, batch_size=2):
    """
    Load and tokenize the WikiText dataset and return the test set plus a collator.

    :param model_name: The Hugging Face model name to get the appropriate tokenizer.
    :param batch_size: The batch size to use for a DataLoader.
    :return: the tokenized test dataset and the data collator.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 has no pad token, so reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    # DataCollatorForLanguageModeling with mlm=False builds the labels for
    # causal LM and masks the padding positions out of the loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    def tokenize_function(examples):
        # Tokenize the text; the collator creates the labels later.
        return tokenizer(
            examples['text'],
            padding="max_length",  # Pad every example to a fixed length
            truncation=True,       # Truncate texts that are too long
            max_length=512,        # Maximum sequence length
        )

    # Tokenize the dataset and drop the raw 'text' column.
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset = tokenized_dataset.remove_columns(['text'])
    test_dataset = tokenized_dataset['test']
    # test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=data_collator)
    return test_dataset, data_collator
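The Trainer call itself isn't shown above, so here is a minimal sketch of how the returned dataset and collator can be plugged into Trainer.evaluate; the gpt2 model id and the output_dir value are my assumptions, not necessarily what produced the 79.41:

import math
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model_name = "gpt2"  # assumption: same model as in the snippets below
test_dataset, data_collator = load_wikitext_dataset(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="eval_out", per_device_eval_batch_size=2),
    data_collator=data_collator,
    eval_dataset=test_dataset,
)
metrics = trainer.evaluate()
# Trainer reports the mean cross-entropy as eval_loss; exponentiate it for perplexity.
print(math.exp(metrics["eval_loss"]))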
While this one, using the evaluate library:
import evaluate
import datasets
perplexity = evaluate.load("perplexity", module_type="metric")
input_texts = datasets.load_dataset(
    "wikitext", "wikitext-2-raw-v1", split="test"
)["text"]
input_texts = [s for s in input_texts if s != '']
results = perplexity.compute(
    model_id="gpt2",
    batch_size=4,
    predictions=input_texts,
)
print(results['mean_perplexity'])
gives 546.62.
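The gap is expected: the evaluate metric scores every line as an independent text and averages the per-line perplexities, so the many short WikiText fragments, scored without any surrounding context, dominate the result.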
Update 2: this one, based on run_clm_no_trainer.py, gives 29.35:
import math
from itertools import chain

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; if total_length < block_size the batch is
    # excluded and an empty dict is returned. You could pad instead of dropping,
    # if the model supports it; customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal LM the labels are the inputs; the model shifts them internally.
    result["labels"] = result["input_ids"].copy()
    return result
model_id = "gpt2"
batch_size = 4

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

block_size = tokenizer.model_max_length
if block_size > model.config.max_position_embeddings:
    print(
        f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
        f"Using block_size={min(1024, model.config.max_position_embeddings)} instead."
    )
    block_size = min(1024, model.config.max_position_embeddings)
# Load WikiText-2 from the Hugging Face Hub and preprocess it
raw_datasets = load_dataset(
    "wikitext", "wikitext-2-raw-v1", trust_remote_code=True
)
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=2,
    desc=f"Grouping texts in chunks of {block_size}",
)

eval_dataset = lm_datasets["test"]
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=default_data_collator, batch_size=batch_size
)
model.eval()
losses = []
# Wrap the eval_dataloader in tqdm for a progress bar
for step, batch in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader), desc="Evaluating"):
    with torch.no_grad():
        outputs = model(**batch)
    loss = outputs.loss
    # Repeat the mean batch loss once per example so every example counts equally
    # (the last, possibly smaller, batch gets slightly over-weighted, exactly as
    # in run_clm_no_trainer.py).
    losses.append(loss.repeat(batch_size))

# Average the per-example losses and exponentiate to get perplexity
losses = torch.cat(losses)
eval_loss = torch.mean(losses)
perplexity = math.exp(eval_loss)
print(f"Perplexity on the WikiText-2 test set: {perplexity}")