I’m attempting to do a multiclass finetuning of distilbert using the following code:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
from tqdm.notebook import tqdm
from datasets import load_dataset
import numpy as np
import os
dataset = load_dataset(“csv”, data_files=“datafile.csv”,keep_default_na=False)
dataset[“train”] = dataset[“train”].rename_column(“label”, “labels”)
dataset = dataset[“train”].train_test_split(test_size=0.3)
dataset[“train”] = dataset[“train”].shuffle()
dataset[“test”] = dataset[“test”].shuffle()
def preprocess_function(examples):
return tokenizer(examples[“text”], truncation=True)
model_id = “distilbert-base-uncased”
tokenizer = AutoTokenizer.from_pretrained(model_id, max_length = 512, model_max_length = 512)
tokenized_dataset = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load(“accuracy”)
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=1)
return accuracy.compute(predictions=predictions, references=labels)
id2label = {0: “casual”, 1: “possibly_needs_caution”, 2: “probably_needs_caution”, 3: “needs_caution”, 4: “needs_intervention”}
label2id = {“casual”: 0, “possibly_needs_caution”: 1, “probably_needs_caution”: 2, “needs_caution”: 3, “needs_intervention”: 4}
model = AutoModelForSequenceClassification.from_pretrained(
model_id, num_labels=5, id2label=id2label, label2id=label2id,problem_type="multi_class_classification"
)
tokenized_dataset[“train”] = tokenized_dataset[“train”].remove_columns([‘text’,‘Unnamed: 0’])
tokenized_dataset[“test”] = tokenized_dataset[“test”].remove_columns([‘text’,‘Unnamed: 0’])
training_args = TrainingArguments(
output_dir="safety_instruct_detect",
learning_rate=4.5e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
#fp16 = True,
warmup_steps = 1000,
num_train_epochs=2,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
push_to_hub=False,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["test"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
trainer.train()
I get the following error:
ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are labels,input_ids,attention_mask.
Here are a few things i’ve tried:
-
Using BertForSequenceClassification rather than AutoModelForSequenceClassification
-
Making the labels in the dataset a float rather than int
-
Using both “label” and “labels” as column names a few different times