Code is working fine for Bert and Roberta However Fails During GPTNeo

Context:
Im trying to finetune and compare the performance of different Models. My code works for models such BERT, ROBERTA and Minilm. However for GPTNEO, I encounter the error message : "ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as
pad_token (tokenizer.pad_token = tokenizer.eos_token e.g.) or add a new pad token via
tokenizer.add_special_tokens({'pad_token': '[PAD]'}).

Code:
from transformers import AutoTokenizer
model_ckpt = “EleutherAI/gpt-neo-125m”
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_text(examples):
return tokenizer(examples[“sentence”], truncation=True, max_length=512)

ds = ds.map(tokenize_text, batched=True)
ds

DatasetDict({
train: Dataset({
features: [‘sentence’, ‘tec_name’, ‘label’, ‘input_ids’, ‘token_type_ids’, ‘attention_mask’],
num_rows: 19083
})
validation: Dataset({
features: [‘sentence’, ‘tec_name’, ‘label’, ‘input_ids’, ‘token_type_ids’, ‘attention_mask’],
num_rows: 6360
})
test: Dataset({
features: [‘sentence’, ‘tec_name’, ‘label’, ‘input_ids’, ‘token_type_ids’, ‘attention_mask’],
num_rows: 2358
})
})

class_weights = (1 - (labels_df[“label”].value_counts().sort_index() / len(labels_df))).values
class_weights

import torch
class_weights = torch.from_numpy(class_weights).float().to(“cuda”)
class_weights

from torch import nn
import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
logits = outputs.get(“logits”)
labels = inputs.get(“labels”)
loss_func = nn.CrossEntropyLoss(weight=class_weights)
loss = loss_func(logits, labels)
return (loss, outputs) if return_outputs else loss

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=164, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
f1 = f1_score(labels, preds, average=“weighted”)
acc = accuracy_score(labels, preds)
return {“accuracy”: acc, “f1”: f1}

from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
batch_size = 64
logging_steps = len(ds[“train”])
#output_dir1=“xxx”

training_args = TrainingArguments(output_dir=output_dir1,
num_train_epochs=30,
eval_steps=250,
learning_rate=2e-5,
per_device_train_batch_size=4,
per_device_eval_batch_size=batch_size,
gradient_accumulation_steps=16,
weight_decay=0.01,
evaluation_strategy=‘steps’,
metric_for_best_model = ‘f1’,
load_best_model_at_end=True,
push_to_hub=True)

trainer = WeightedLossTrainer(model=model,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=ds[“train”],
eval_dataset=ds[“validation”],
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
tokenizer=tokenizer)

trainer.train()

Error Message:ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as
pad_token (tokenizer.pad_token = tokenizer.eos_token e.g.) or add a new pad token via
tokenizer.add_special_tokens({'pad_token': '[PAD]'}).

Any Help is welcome. Please Help :frowning:

If I am not wrong you have to explicitly mention the padding token for this type of models to eos_token
tokenizer.pad_token = tokenizer.eos_token.

1 Like

Is this true? Is it neccasary to mention the pad token? Why is it not defined by default?