Hi all,
while using Trainer to train a BERT model, I receive the following error/warning:
“Not all data has been set. Are you sure you passed all values?”
I’m not able to fix it and it seems to calculate the wrong metrics. I assume because of the missing data.
About my setup: I want to train a BERT model with a custom head for multilabel classification.
This is my code:
import pandas as pd
import numpy as np
import datasets
import json
import torch
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from torch import cuda
device = ‘cuda’ if cuda.is_available() else ‘cpu’MODEL_NAME = ‘dbmdz/bert-base-german-uncased’
SEED = 321def compute_metrics_multilables_b(eval_pred):
predictions, labels = eval_pred
predictions = torch.tensor(predictions)
preds_full = torch.sigmoid(predictions).cpu().detach().numpy().tolist()preds_full = np.array(preds_full) >= 0.5 labels = np.array(labels) >= 0.5 accuracy = metrics.accuracy_score(labels, preds_full) f1_score_micro = metrics.f1_score(labels, preds_full, average='micro') f1_score_macro = metrics.f1_score(labels, preds_full, average='macro') metrics_result = { 'accuracy': accuracy, 'f1_micro': f1_score_micro, 'f1_macro': f1_score_macro, } return metrics_result
class EmotionDataset(torch.utils.data.Dataset):
def init(self, encodings, labels):
self.encodings = encodings
self.labels = labelsdef __getitem__(self, idx): item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} item['labels'] = torch.tensor(self.labels[idx]) return item def __len__(self): return len(self.labels)
class CustomTrainer(Trainer):
def compute_loss(self, model, inputs):
labels = inputs.pop(“labels”)
outputs = model(inputs[‘input_ids’], inputs[‘attention_mask’], inputs[‘token_type_ids’])
labels = labels.type_as(outputs)
logits = outputs
return torch.nn.BCEWithLogitsLoss()(logits, labels)class MultiLabelClassifier(torch.nn.Module):
def init(self):
super(MultiLabelClassifier, self).init()
self.l1 = BertModel.from_pretrained(MODEL_NAME)
self.l2 = torch.nn.Dropout(0.3)
# output is a 8-dim vector
self.l3 = torch.nn.Linear(768, 8)def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None): output_1 = self.l1(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids).pooler_output output_2 = self.l2(output_1) output = self.l3(output_2) return output
dataset_train = Dataset.from_pandas(df_train)
dataset_validation = Dataset.from_pandas(df_validation)
dataset_test = Dataset.from_pandas(df_test)load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)preprocess data
field_text = “Text”
field_label = “list”tokenize data
train_encodings = tokenizer(dataset_train[field_text], truncation=True, padding=True)
val_encodings = tokenizer(dataset_validation[field_text], truncation=True, padding=True)
test_encodings = tokenizer(dataset_test[field_text], truncation=True, padding=True)train_dataset = EmotionDataset(train_encodings, dataset_train[field_label])
val_dataset = EmotionDataset(val_encodings, dataset_validation[field_label])
test_dataset = EmotionDataset(test_encodings, dataset_test[field_label])model = MultiLabelClassifier()
_ = model.to(device)training_args = TrainingArguments(
output_dir=‘./results’, # output directory
num_train_epochs=1, # total # of training epochs
per_device_train_batch_size=8, # batch size per device during training
per_device_eval_batch_size=20, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir=‘./logs’, # directory for storing logs
)trainer = CustomTrainer(
model=model, # the instantiatedTransformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=test_dataset, # evaluation dataset
compute_metrics=compute_metrics_multilables_b
)_ = trainer.train()
trainer.evaluate()
The target/predicition is a binary 8-dim vector for each data record. The error/warning is thrown by trainer.evaluate().
Any idea what I did wrong?
Since the code is hard to read here, here the link to the pastebin snippet: https://pastebin.com/MNf68rfn
Thanks,
Max