I have 24 labels and my data looks like this:

```
text | labels
```
```python
from datasets import load_dataset

data = load_dataset('csv', data_files='/content/drive/MyDrive/new_stories.csv')

# Collect the unique label strings from the train split
unique_labels = data['train'].unique('labels')

# Create a dictionary to map IDs to label text, and one to map label text to IDs
id2text = {i: word for i, word in enumerate(unique_labels)}
text2id = {word: i for i, word in enumerate(unique_labels)}

# Print the mappings
print("ID to Text Mapping: ", id2text)
print("Text to ID Mapping: ", text2id)
```
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["story"], truncation=True)

tokenized_data = data.map(preprocess_function, batched=True, num_proc=10)
```
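A quick sanity check on the tokenized output (assuming the text column really is called `story`, as in `preprocess_function`):

```python
# map() keeps the original columns and adds the tokenizer outputs
print(tokenized_data["train"].column_names)
# something like: ['story', 'labels', 'input_ids', 'attention_mask']
```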
```python
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
```
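As I understand it, this collator just pads each batch to its longest sequence at runtime. A minimal sketch with two made-up sentences:

```python
# Pad two tokenized examples of different lengths into a single batch of tensors
features = [tokenizer("a short story"), tokenizer("a somewhat longer story about many things")]
batch = data_collator(features)
print(batch["input_ids"].shape)  # e.g. torch.Size([2, 9]); the short row is padded
```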
```python
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=24, id2label=id2text, label2id=text2id
)
```
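This step prints the usual warning that the classifier head weights are newly initialized, which I assume is expected since distilbert-base-uncased does not ship with a sequence-classification head.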
```python
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
```
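The metric function itself seems fine when I feed it made-up logits (hypothetical values, two classes only for brevity):

```python
# 3 examples, 2 classes; argmax gives [1, 0, 1] against references [1, 0, 0] -> 2/3 correct
dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
dummy_labels = np.array([1, 0, 0])
print(compute_metrics((dummy_logits, dummy_labels)))  # {'accuracy': 0.666...}
```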
```python
training_args = TrainingArguments(
    output_dir="text_classification",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=16,
)
```
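(With per_device_train_batch_size=2 and gradient_accumulation_steps=16, the effective train batch size works out to 2 * 16 = 32 per device.)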
```python
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
```
Running this shows me the error:

```
ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.
```