Code works fine for BERT and RoBERTa but fails for GPT-Neo

Context:
I'm trying to fine-tune and compare the performance of different models. My code works for models such as BERT, RoBERTa, and MiniLM. However, for GPT-Neo I encounter the error message: "ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token (tokenizer.pad_token = tokenizer.eos_token e.g.) or add a new pad token via tokenizer.add_special_tokens({'pad_token': '[PAD]'})."

Code:
from transformers import AutoTokenizer

model_ckpt = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_text(examples):
    return tokenizer(examples["sentence"], truncation=True, max_length=512)

ds = ds.map(tokenize_text, batched=True)
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19083
    })
    validation: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6360
    })
    test: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2358
    })
})

class_weights = (1 - (labels_df["label"].value_counts().sort_index() / len(labels_df))).values
class_weights

import torch
class_weights = torch.from_numpy(class_weights).float().to("cuda")
class_weights

from torch import nn
import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        # weight the cross-entropy loss with the per-class weights computed above
        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=164, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

batch_size = 64
logging_steps = len(ds["train"])
#output_dir1="xxx"

training_args = TrainingArguments(output_dir=output_dir1,
                                  num_train_epochs=30,
                                  eval_steps=250,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=batch_size,
                                  gradient_accumulation_steps=16,
                                  weight_decay=0.01,
                                  evaluation_strategy="steps",
                                  metric_for_best_model="f1",
                                  load_best_model_at_end=True,
                                  push_to_hub=True)

trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              compute_metrics=compute_metrics,
                              train_dataset=ds["train"],
                              eval_dataset=ds["validation"],
                              callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
                              tokenizer=tokenizer)

trainer.train()

Error Message:
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token (tokenizer.pad_token = tokenizer.eos_token e.g.) or add a new pad token via tokenizer.add_special_tokens({'pad_token': '[PAD]'}).

Any help is welcome. Please help :frowning:

If I'm not mistaken, you have to explicitly set the padding token to the eos_token for this type of model:
tokenizer.pad_token = tokenizer.eos_token
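
GPT-Neo uses a GPT-2-style tokenizer that ships without a pad token, so you have to set one yourself before anything tries to pad (the error message itself suggests reusing the eos token). A minimal sketch of the fix, reusing the checkpoint and label count from your question; the last line additionally copies the pad token id onto the model config, which the GPT-style sequence-classification head needs once batches contain more than one example:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_ckpt = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo defines no pad token by default; reuse EOS

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=164)
model.config.pad_token_id = tokenizer.pad_token_id  # tell the model which id marks padding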


Is this true? Is it necessary to set the pad token explicitly? Why is it not defined by default?