Hello everyone,
I am trying to build a multiclass classifier with a pretrained BERT model. I am completely new to the topic. I have 8 classes and use Huggingface’s Dataset infrastructure to finetune a pretrained model for the German language:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
# Classification setup: 8 mutually exclusive classes on a German uncased BERT.
num_labels_cla = 8
model_name_cla = "bert-base-german-dbmdz-uncased"
batch_size_cla = 8
# Loads the pretrained encoder and attaches a fresh, randomly initialised
# 8-way classification head; the head's weights are learned during finetuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name_cla, num_labels=num_labels_cla)
def tokenize(batch):
    """Tokenize a batch of examples for the BERT classifier.

    Pads all sequences in the batch to a common length and truncates
    anything longer than 260 tokens.

    NOTE(review): `tokenizer` is never defined in this snippet. It must be
    loaded from the SAME checkpoint as the model (`model_name_cla`); a
    tokenizer from a different checkpoint can produce token ids beyond the
    model's embedding table, which raises exactly the reported
    "IndexError: index out of range in self" inside nn.Embedding — confirm.
    """
    return tokenizer(
        batch["text"],
        padding=True,
        truncation=True,
        max_length=260,
    )
def compute_metrics(pred):
    """Compute evaluation metrics from a Trainer EvalPrediction.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores, shape [n_samples, n_classes]).

    Returns:
        dict with "accuracy" and weighted "f1".
    """
    y_true = pred.label_ids
    # Highest-scoring class per sample.
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }
My model shouldn’t be a sentiment classifier but a multiclass classifier which assigns customer reviews to one of several labels (e.g. customer support).
When I train/finetune my model with the Huggingface Trainer() instance:
# Tokenize the whole dataset at once (batch_size=None maps it as a single
# batch, so padding=True pads everything to the longest sequence overall).
data_encoded = data_dict.map(tokenize, batched=True, batch_size=None)
data_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# Training configuration.
# NOTE(review): this is the number of training EXAMPLES, not optimizer steps;
# logging once per epoch would be len(data_encoded["train"]) // batch_size_cla.
logging_steps=len(data_encoded["train"])
training_args = TrainingArguments(output_dir='./results',
num_train_epochs=3,
learning_rate=2e-5,
per_device_train_batch_size=batch_size_cla,
per_device_eval_batch_size=batch_size_cla,
# NOTE(review): recent transformers versions require the save strategy to
# match evaluation_strategy when load_best_model_at_end=True — confirm.
load_best_model_at_end=True,
metric_for_best_model="f1",
weight_decay=0.01,
evaluation_strategy="steps",
eval_steps = 2,  # evaluate every 2 optimizer steps — very frequent; consider raising
disable_tqdm=False,
logging_steps=logging_steps)
# Wire model, arguments, metrics, and the train/test splits into the Trainer.
trainer = Trainer(
model=model, args=training_args, compute_metrics=compute_metrics, train_dataset=data_encoded['train'], eval_dataset=data_encoded['test']
)
# Finetune.
trainer.train()
After 6 steps I get the following error:
~/miniconda3/envs/textmallet/lib/python3.9/site-packages/torch/nn/modules/sparse.py in forward(self, input)
156
157 def forward(self, input: Tensor) -> Tensor:
--> 158 return F.embedding(
159 input, self.weight, self.padding_idx, self.max_norm,
160 self.norm_type, self.scale_grad_by_freq, self.sparse)
~/miniconda3/envs/textmallet/lib/python3.9/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2041 # remove once script supports set_grad_enabled
2042 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2043 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2044
2045
IndexError: index out of range in self
Does anyone have any idea what I could change in my code?
Cheers