I want to train my first Hugginface model.
The code below generates the following error. What am I doing wrong?
## imports
from datasets import Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments,
)
## data
# Load 20-newsgroup dataset and arrange it into a list of tuples
# data = [("description1", "category1"), ("description2", "category2"), ...]
newsgroups_train = fetch_20newsgroups(subset="train")
data = [
(
newsgroups_train.data[i],
newsgroups_train.target_names[newsgroups_train.target[i]],
)
for i in range(len(newsgroups_train.data))
]
## Prepare the dataset
descriptions = [item[0] for item in data]
categories = [item[1] for item in data]
# Tokenizer and Model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
model_name, num_labels=len(set(categories))
)
## Encoding data
def encode(examples):
return tokenizer(examples["text"], truncation=True, padding="max_length")
## Train/Test split
(
train_descriptions,
test_descriptions,
train_categories,
test_categories,
) = train_test_split(descriptions, categories, test_size=0.2)
training_args = TrainingArguments("test_trainer")
def hugginface_dataset(text, labels):
return Dataset.from_dict(
{
"text": text,
"labels": labels,
}
)
train_dataset = hugginface_dataset(train_descriptions, train_categories)
test_dataset = hugginface_dataset(test_descriptions, test_categories)
## Define trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=encode(train_dataset),
eval_dataset=encode(test_dataset),
# compute_metrics=compute_metrics,
)
## Train the model
trainer.train()