Context:
I'm trying to fine-tune and compare the performance of different models. My code works for models such as BERT, RoBERTa, and MiniLM. For GPT-Neo, however, training fails with a ValueError telling me the tokenizer has no padding token (the full message is under "Error Message:" below).
Code:
from transformers import AutoTokenizer
model_ckpt = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
def tokenize_text(examples):
    return tokenizer(examples["sentence"], truncation=True, max_length=512)
ds = ds.map(tokenize_text, batched=True)
ds
DatasetDict({
    train: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19083
    })
    validation: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6360
    })
    test: Dataset({
        features: ['sentence', 'tec_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2358
    })
})
class_weights = (1 - (labels_df["label"].value_counts().sort_index() / len(labels_df))).values
class_weights
import torch
class_weights = torch.from_numpy(class_weights).float().to("cuda")
class_weights
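For context, the weighting above is just one minus each class's relative frequency, so rarer classes get larger weights. On a hypothetical toy DataFrame (my real labels_df is a pandas DataFrame with an integer "label" column) it behaves like this:
import pandas as pd
toy_df = pd.DataFrame({"label": [0, 0, 0, 1, 2, 2]})  # hypothetical toy data, not my real dataset
(1 - (toy_df["label"].value_counts().sort_index() / len(toy_df))).values
# -> array([0.5, 0.83333333, 0.66666667]); class 1 is the rarest and gets the largest weight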
from torch import nn
import torch
from transformers import Trainer
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        # cross-entropy weighted by the precomputed class weights
        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=164, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
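For reference, compute_metrics only uses the prediction object's label_ids and the argmax of its predictions; on a hypothetical dummy object (a stand-in for the Trainer's EvalPrediction) it returns:
import numpy as np
from types import SimpleNamespace
dummy_pred = SimpleNamespace(predictions=np.array([[0.1, 0.9], [0.8, 0.2]]),
                             label_ids=np.array([1, 0]))  # hypothetical values for illustration
compute_metrics(dummy_pred)
# -> {'accuracy': 1.0, 'f1': 1.0}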
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
batch_size = 64
logging_steps = len(ds["train"])
# output_dir1 = "xxx"
training_args = TrainingArguments(output_dir=output_dir1,
                                  num_train_epochs=30,
                                  eval_steps=250,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=batch_size,
                                  gradient_accumulation_steps=16,
                                  weight_decay=0.01,
                                  evaluation_strategy="steps",
                                  metric_for_best_model="f1",
                                  load_best_model_at_end=True,
                                  push_to_hub=True)
trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              compute_metrics=compute_metrics,
                              train_dataset=ds["train"],
                              eval_dataset=ds["validation"],
                              callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
                              tokenizer=tokenizer)
trainer.train()
Error Message:
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token (tokenizer.pad_token = tokenizer.eos_token e.g.) or add a new pad token via tokenizer.add_special_tokens({'pad_token': '[PAD]'}).
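For reference, my rough understanding of the two options the error message suggests is sketched below; I haven't confirmed which (if either) is the right approach when fine-tuning GPT-Neo for classification, or whether the model config also needs to be updated:
# Option 1 from the error message: reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Option 2 from the error message: add a dedicated [PAD] token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))  # presumably needed so the new token gets an embedding

# In either case, does the model config need to know the pad token id as well?
model.config.pad_token_id = tokenizer.pad_token_id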
Any help is welcome. Please help.