Context:
I'm trying to fine-tune and compare the performance of different models. My code works for models such as BERT, RoBERTa, and MiniLM. However, for GPT-Neo I run into the ValueError quoted in full at the bottom of this post: "Asking to pad but the tokenizer does not have a padding token. ..."
Code:
from transformers import AutoTokenizer

model_ckpt = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_text(examples):
    # Truncate only; no padding here, so batches get padded later at collation time
    return tokenizer(examples["sentence"], truncation=True, max_length=512)

ds = ds.map(tokenize_text, batched=True)
ds
DatasetDict({
    train: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19083
    })
    validation: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6360
    })
    test: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2358
    })
})
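Since tokenize_text doesn't pad, my understanding is that the Trainer falls back to dynamic per-batch padding (DataCollatorWithPadding is the default collator when a tokenizer is passed in), which is presumably the point where the padding error gets raised. In case it matters, I could also build that collator explicitly — a sketch of what I mean, not something I've verified changes anything:

from transformers import DataCollatorWithPadding

# Pads each batch to the length of its longest sequence using tokenizer.pad_token;
# this would be passed to the Trainer as data_collator=data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)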
class_weights = (1 - (labels_df["label"].value_counts().sort_index() / len(labels_df))).values
class_weights

import torch
class_weights = torch.from_numpy(class_weights).float().to("cuda")
class_weights
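To make the weighting scheme concrete: each class gets weight 1 - (count / total), so rarer classes end up with weights closer to 1. A toy example with made-up counts (this labels_df is purely illustrative, not my real data):

import pandas as pd

# Hypothetical label column: class 0 appears 3 times, class 1 once
labels_df = pd.DataFrame({"label": [0, 0, 0, 1]})
weights = (1 - (labels_df["label"].value_counts().sort_index() / len(labels_df))).values
print(weights)  # [0.25 0.75] -> the rare class 1 gets the larger weight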
from torch import nn
import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        # Weighted cross-entropy to counter class imbalance
        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss
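One thing I'm not sure about: I've read that newer transformers releases pass extra keyword arguments (e.g. num_items_in_batch) into compute_loss, so a more defensive signature might be needed — an untested variation:

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs any extra arguments newer versions may pass;
        # the body would stay identical to the override above
        ...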
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=164,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
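I also wonder whether the model config needs to know about the pad token for GPT-Neo, since (as I understand it) the sequence-classification head uses pad_token_id to locate the last non-padding token of each sequence. Something like this, assuming the tokenizer's pad token has been set first (see the error message below):

# Assumption: tokenizer.pad_token was already assigned before this line
model.config.pad_token_id = tokenizer.pad_token_id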
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

batch_size = 64
logging_steps = len(ds["train"])
# output_dir1 = "xxx"
training_args = TrainingArguments(
    output_dir=output_dir1,
    num_train_epochs=30,
    eval_steps=250,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    evaluation_strategy="steps",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    push_to_hub=True,
)
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    tokenizer=tokenizer,
)
trainer.train()
Error Message:

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token (tokenizer.pad_token = tokenizer.eos_token e.g.) or add a new pad token via tokenizer.add_special_tokens({'pad_token': '[PAD]'}).
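From the message itself, I assume the minimal fix is to reuse the EOS token as the pad token right after loading the tokenizer — a sketch of what I'd try, not something I've verified:

tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo ships without a pad token
# Alternatively, register a brand-new [PAD] token; as far as I know this would
# also require model.resize_token_embeddings(len(tokenizer)) after loading the model:
# tokenizer.add_special_tokens({"pad_token": "[PAD]"})

Would one of these be the right approach for GPT-Neo, and is anything else needed on the model side?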
Any help is welcome!