I have been trying to fine-tune DeBERTa v3 on multiple NLI datasets at the same time, and I ran into the error below after training finished, on the line result = trainer.evaluate(dataset_test_final).
My guess is that the data format is incorrect, but I don't know how to fix it.
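In case the column layout matters, this is roughly how I'd check it (just a sketch; dataset_test_final and encoded_dataset_test are the variables built in the script below):
# compare the raw eval set with the tokenized one that the Trainer was given
print(dataset_test_final.column_names)    # e.g. ['premise', 'hypothesis', 'label']
print(encoded_dataset_test.column_names)  # should additionally list 'input_ids', 'attention_mask', ...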
The code I ran:
import torch
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
import os
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, DataCollatorWithPadding, DataCollator
device = "cuda"
print(f"Device: {device}")
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, model_max_length=512) # model_max_length=512
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, label2id=label2id, id2label=id2label).to(device) # num_labels=3
print(model.config)
from datasets import concatenate_datasets
#MNLI MNLI MNLI
dataset_train_mnli = load_dataset("glue", "mnli", split="train")
dataset_train_mnli = dataset_train_mnli.remove_columns(['idx'])
dataset_val_mnli_m = load_dataset("glue", "mnli", split="validation_matched")
dataset_val_mnli_m = dataset_val_mnli_m.remove_columns(['idx'])
dataset_val_mnli_mm = load_dataset("glue", "mnli", split="validation_mismatched")
dataset_val_mnli_mm = dataset_val_mnli_mm.remove_columns(['idx'])
dataset_test_mnli_m = load_dataset("glue", "mnli", split="test_matched")
dataset_test_mnli_m = dataset_test_mnli_m.remove_columns(['idx'])
dataset_test_mnli_mm = load_dataset("glue", "mnli", split="test_mismatched")
dataset_test_mnli_mm = dataset_test_mnli_mm.remove_columns(['idx'])
#ANLI ANLI ANLI
dataset_train_anli = load_dataset('anli', split=["train_r1", "train_r2", "train_r3"])
dataset_train_anli = concatenate_datasets([dataset_train_anli[0], dataset_train_anli[1], dataset_train_anli[2]])
dataset_train_anli = dataset_train_anli.remove_columns(["uid", "reason"])
dataset_test_anli = load_dataset('anli', split=["test_r1", "test_r2", "test_r3"])
dataset_test_anli = concatenate_datasets([dataset_test_anli[0], dataset_test_anli[1], dataset_test_anli[2]])
dataset_test_anli = dataset_test_anli.remove_columns(["uid", "reason"])
#SNLI SNLI SNLI
dataset_train_snli = load_dataset("snli", split="train")
dataset_test_snli = load_dataset("snli", split="test")
dataset_val_snli = load_dataset("snli", split="validation")
dataset_train_final = concatenate_datasets([dataset_train_mnli, dataset_train_snli, dataset_train_anli])
dataset_train_final = dataset_train_final.shuffle(seed=42)
dataset_test_final = concatenate_datasets([dataset_val_mnli_mm, dataset_val_mnli_m, dataset_test_mnli_mm, dataset_test_mnli_m, dataset_val_snli, dataset_test_snli, dataset_test_anli, dataset_train_anli])
dataset_test_final = dataset_test_final.shuffle(seed=42)
dynamic_padding = True
def tokenize_func(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
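# tokenize_func leaves padding out on purpose; each batch is padded dynamically at collation time instead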
encoded_dataset_train = dataset_train_final.map(tokenize_func, batched=True)
encoded_dataset_test = dataset_test_final.map(tokenize_func, batched=True)
data_collator = DataCollator(tokenizer)
if dynamic_padding:
    data_collator = DataCollatorWithPadding(tokenizer)
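# number of tokens per tokenized training example (computed here but not used further below)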
n_tokens = [len(encoding) for encoding in encoded_dataset_train["input_ids"]]
from datasets import list_metrics
metric = load_metric('accuracy')
train_args = TrainingArguments(
    output_dir='./results/output',
    logging_dir='./logs/output',
    learning_rate=6e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    warmup_ratio=0.06,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="epoch",
    seed=42,
    save_strategy="epoch",
    save_total_limit=5,
    logging_strategy="epoch",
    report_to="all")
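# eval_pred contains the model's logits and the gold labels; argmax over the 3 classes picks the predicted label ids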
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_test,
    compute_metrics=compute_metrics
)
trainer.train()
result = trainer.evaluate(dataset_test_final)
print(result)
The error:
Traceback (most recent call last):
  File "trainersv3.py", line 115, in <module>
    result = trainer.evaluate(dataset_test_final)
  File "/opt/conda/lib/python3.7/site-packages/transformers/trainer.py", line 2128, in evaluate
    metric_key_prefix=metric_key_prefix,
  File "/opt/conda/lib/python3.7/site-packages/transformers/trainer.py", line 2283, in evaluation_loop
    for step, inputs in enumerate(dataloader):
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
    data = self._next_data()
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 561, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/opt/conda/lib/python3.7/site-packages/transformers/data/data_collator.py", line 231, in __call__
    return_tensors=self.return_tensors,
  File "/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 2718, in pad
    "You should supply an encoding or a list of encodings to this method "
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']
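In case it helps narrow things down, I think the same ValueError can be reproduced outside of the Trainer with something like the snippet below (just a sketch; the bare {"label": ...} dicts are my stand-in for whatever the evaluation DataLoader actually hands the collator):
from transformers import AutoTokenizer, DataCollatorWithPadding
tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large", use_fast=True)
collator = DataCollatorWithPadding(tok)
# a properly tokenized example collates fine
print(collator([tok("a premise", "a hypothesis", truncation=True)]))
# but features without input_ids trigger the same "You should supply an encoding ..." ValueError
collator([{"label": 0}, {"label": 1}])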
Apologies for the long post. Thank you for reading and trying to help.