I’ve been racking my brain over this bug for two days now. I have a set of German texts that I want to classify into one of 10 classes. Training runs smoothly, but I have problems with the evaluation. I obviously can’t share the actual texts because they are confidential, so below I only sketch the data with a small mock example; let me know if more is needed.
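For context, these are the imports the snippets below rely on, plus a minimal mock of what texts and labels look like (made up, purely for illustration):

import torch
import transformers
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          GPT2Config, TrainingArguments)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Mock data, purely for illustration (the real texts are longer and confidential)
texts = [f"Beispieltext Nummer {i} zum Thema {i % 10}." for i in range(40)]  # 40 short German strings
labels = [i % 10 for i in range(40)]                                         # 40 integer class ids in 0-9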
Here is the code I use to split my data:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, random_state=111, test_size=0.1)
print("TRAIN TEXTS LENGTH", len(train_texts))
print("VAL TEXTS LENGTH", len(val_texts))
print("TRAIN LABELS LENGTH", len(train_labels))
print("VAL LABELS LENGTH", len(val_labels))
TRAIN TEXTS LENGTH 36
VAL TEXTS LENGTH 4
TRAIN LABELS LENGTH 36
VAL LABELS LENGTH 4
First, I prepare the tokenizer and the model:
###########################
# Prepare model
###########################
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("benjamin/gerpt2", model_max_length=300)
tokenizer.padding_side = "left" # GPT-2 must be padded to the left
tokenizer.pad_token = tokenizer.eos_token
# Model
config = GPT2Config.from_pretrained(pretrained_model_name_or_path="benjamin/gerpt2",
                                    id2label=id2label,  # dictionary of {'id': 'label'}
                                    label2id=label2id)  # dictionary of {'label': 'id'}
model = AutoModelForSequenceClassification.from_pretrained("benjamin/gerpt2", num_labels = 10)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
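A quick sanity check one could run at this point (illustrative only, not part of my actual pipeline) to confirm the padding and label setup:
# Illustrative sanity check of the padding / label setup
print("PAD TOKEN:", tokenizer.pad_token, tokenizer.pad_token_id)  # should be the EOS token
print("MODEL PAD ID:", model.config.pad_token_id)                 # should match tokenizer.pad_token_id
print("NUM LABELS:", model.config.num_labels)                     # should be 10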
Then I tokenize the text and create the dataset objects (see the custom class and helper function below):
def tokenize_text(text, tokenizer):
    '''
    Tokenizes text using a loaded tokenizer
    '''
    return tokenizer(text, max_length=300, truncation=True, padding=True)


class CustomDataset(torch.utils.data.Dataset):
    '''
    Defines a Dataset class to feed the model.
    '''
    def __init__(self, encodings, labels=None):
        '''
        Initializes the class with the preprocessed text (encodings), labels and number of examples.
        '''
        self.encodings = encodings
        self.labels = labels
        self.n_examples = len(self.labels)

    def __getitem__(self, idx):
        '''
        Defines a method that pulls a single item with its idx from the dataset.
        '''
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}  # get from dictionary
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        '''
        Defines a method that returns the length of the dataset.
        '''
        return len(self.encodings["input_ids"])
###########################
# Encode text
###########################
train_encodings = tokenize_text(train_texts, tokenizer)
val_encodings = tokenize_text(val_texts, tokenizer)
###########################
# Create dataset objects
###########################
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
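To confirm the dataset objects behave as expected, a quick illustrative check could look like this (expected values based on my setup):
# Illustrative check: each item should contain input_ids, attention_mask and labels
sample = val_dataset[0]
print(sample.keys())                          # expect: input_ids, attention_mask, labels
print(sample["input_ids"].shape)              # expect: torch.Size([padded_sequence_length])
print(len(train_dataset), len(val_dataset))   # expect: 36 4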
Next I define my own compute_metrics function and a custom Trainer class, because I want to use the weight argument of the loss function with my own class weights:
###########################
# Training arguments
###########################
def compute_metrics(pred):
    '''
    Calculates metrics to evaluate model.
    '''
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    print('\npred.predictions:\n', pred.predictions)
    print('\npred:\n', pred)
    print()
    print('y_true', labels)
    print('y_hat', preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Model Artifacts should fall into this folder (or sub folders)
artifacts_out_dir = './outputs'

training_args = TrainingArguments(
    output_dir=artifacts_out_dir,
    # checkpoint saving strategy
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    # model hyperparameters
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    # evaluation strategy and logging
    logging_dir='./logs/tensorboard',
    logging_steps=2
)
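
# --- Toy illustration with made-up numbers, just to show what I expect compute_metrics
# --- to receive: one row of logits per validation example, so 4 examples should yield
# --- 4 argmax predictions, i.e. the same length as label_ids.
import numpy as np
dummy_logits = np.random.rand(4, 10)           # shape (n_examples, n_classes)
dummy_labels = np.array([0, 2, 7, 7])
dummy_preds = dummy_logits.argmax(-1)          # shape (4,) -> one prediction per example
print(precision_recall_fscore_support(dummy_labels, dummy_preds, average='weighted'))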
class TrainerCustom(transformers.Trainer):
    def __init__(self, weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # initialize weights from argument
        self.weights = weights

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        labels = inputs.pop("labels")
        outputs, _ = model(**inputs, return_dict=False)  # returns a tuple, which is why the '_'
        # move weights to the same device as the labels
        self.weights = self.weights.to(labels.device)
        print("LABELS:", labels)
        print('LABELS device:', labels.device)
        print("WEIGHTS:", self.weights)
        print('WEIGHTS device:', self.weights.device)
        # Save past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]
        cross_entropy_loss_func = torch.nn.CrossEntropyLoss(weight=self.weights)
        print('OUTPUTS:', outputs)
        loss = cross_entropy_loss_func(outputs, labels.long())  # cast labels to LongTensor, got an error before
        print('LOSS', loss)
        return (loss, outputs) if return_outputs else loss
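As far as I can tell, the weighted loss works in isolation; a standalone toy check with dummy tensors (not my real data) looks like this:
# Standalone toy check of the weighted cross-entropy loss (dummy tensors, 10 classes)
dummy_logits = torch.randn(8, 10)                  # batch of 8, one row of logits per example
dummy_labels = torch.randint(0, 10, (8,))
class_weights = torch.tensor([1.2, 0.9, 1.2, 0.9, 0.9, 0.9, 0.9, 1.8, 0.9, 0.9])
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
print(loss_fn(dummy_logits, dummy_labels.long()))  # a single scalar loss, as in the training prints below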
For now I’m only running in debug mode (1 epoch, 36 training examples, 4 validation examples).
###########################
# Trainer class
###########################
trainer = TrainerCustom(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,  # own function, defined above
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    weights=torch.tensor([1.2000, 0.9000, 1.2000, 0.9000, 0.9000, 0.9000, 0.9000, 1.8000, 0.9000, 0.9000])  # 10 weights for 10 classes
)
Now I run training (evaluation happens as part of training due to the arguments I set above):
print('\nRunning training...\n')
trainer.train()
However, the issue is that the evaluation step returns the correct number of labels (4) but one prediction fewer than the batch size (3), so the lengths don’t match and I get an error (see below). I added all the print statements to track down where the bug happens and also looked into the Trainer source code, but I just can’t find where I’m making a mistake.
# TRAINING STEP PRINTS - EVERYTHING IS OK
LABELS: tensor([3, 9, 5, 9, 0, 6, 4, 0], dtype=torch.int32)
LABELS device: cpu
WEIGHTS: tensor([1.2000, 0.9000, 1.2000, 0.9000, 0.9000, 0.9000, 0.9000, 1.8000, 0.9000,
0.9000])
WEIGHTS device: cpu
OUTPUTS: tensor([[-0.0889, 0.2450, 0.3983, 0.1111, -0.1511, -0.0520, -0.3428, 0.2376,
-0.1851, -0.5946],
[ 0.3004, 0.1739, 0.4019, 0.1611, -0.2102, -0.1775, -0.0751, 0.4822,
-0.3875, -0.5656],
[ 0.2611, 0.1720, 0.0378, 0.0174, -0.1998, -0.1694, 0.0667, 0.7277,
-0.0311, -0.4646],
[ 0.3728, 0.6940, 0.0792, 0.1359, -0.0296, 0.2614, -0.1489, 0.5426,
-0.0150, -0.7283],
[ 0.3806, 0.3427, 0.2283, -0.0392, -0.0176, -0.2239, -0.1351, 0.8266,
-0.4894, -0.5863],
[ 0.0585, 0.3695, 0.5742, -0.7659, -0.1160, -0.2615, 0.1515, 1.7408,
-0.7622, -1.0512],
[-0.1374, 0.0696, 0.1904, 0.2616, 0.1822, -0.3327, -0.4270, 0.6404,
-0.2022, -0.5745],
[ 0.4530, 0.3680, 0.4304, -0.4875, -0.4661, -0.2198, 0.0557, 0.4714,
-0.3884, -0.2292]], grad_fn=<IndexBackward>)
LOSS tensor(2.4015, grad_fn=<NllLossBackward>)
# EVAL STEP
LABELS: tensor([0, 2, 7, 7], dtype=torch.int32)
LABELS device: cpu
WEIGHTS: tensor([1.2000, 0.9000, 1.2000, 0.9000, 0.9000, 0.9000, 0.9000, 1.8000, 0.9000,
0.9000])
WEIGHTS device: cpu
OUTPUTS: tensor([[ 0.1938, -0.2064, 0.3387, 0.0504, 0.0684, -0.2160, -0.2775, 0.4145,
-0.2933, -0.1107],
[ 0.1445, 0.0269, 0.1467, 0.1527, -0.2904, 0.0661, -0.2611, 0.5330,
-0.0186, -0.4184],
[-0.0918, -0.0234, 0.2311, 0.1614, -0.1304, -0.1700, -0.1917, 0.2001,
-0.3553, -0.2138],
[-0.0918, -0.0234, 0.2311, 0.1614, -0.1304, -0.1700, -0.1917, 0.2001,
-0.3553, -0.2138]])
LOSS tensor(2.1039)
pred.predictions:
[[ 0.14445858 0.02692143 0.14672504 0.1527456 -0.29039353 0.06611381
-0.26105392 0.5329592 -0.01855119 -0.41837007]
[-0.09184867 -0.02340093 0.23106857 0.16139469 -0.13035089 -0.17000316
-0.19174051 0.20007178 -0.3553058 -0.2137518 ]
[-0.09184867 -0.02340093 0.23106857 0.16139469 -0.13035089 -0.17000316
-0.19174051 0.20007178 -0.3553058 -0.2137518 ]]
pred:
EvalPrediction(predictions=array([[ 0.14445858, 0.02692143, 0.14672504, 0.1527456 , -0.29039353,
0.06611381, -0.26105392, 0.5329592 , -0.01855119, -0.41837007],
[-0.09184867, -0.02340093, 0.23106857, 0.16139469, -0.13035089,
-0.17000316, -0.19174051, 0.20007178, -0.3553058 , -0.2137518 ],
[-0.09184867, -0.02340093, 0.23106857, 0.16139469, -0.13035089,
-0.17000316, -0.19174051, 0.20007178, -0.3553058 , -0.2137518 ]],
dtype=float32), label_ids=array([0, 2, 7, 7]))
y_true [0 2 7 7]
y_hat [7 2 2]
As you can see, y_hat contains one prediction fewer than y_true. I’m not sure why; the bug must happen in the step above, because I only get three rows of class logits instead of four in the EvalPrediction object.
Here is the error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8820/2277851298.py in <module>
4
5 print('\nRunning training...\n')
----> 6 trainer.train()
~\Anaconda3\envs\mailbot\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1405
1406 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 1407 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1408
1409 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
~\Anaconda3\envs\mailbot\lib\site-packages\transformers\trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
1512 metrics = None
1513 if self.control.should_evaluate:
-> 1514 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
1515 self._report_to_hp_search(trial, epoch, metrics)
1516
~\Anaconda3\envs\mailbot\lib\site-packages\transformers\trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
2156 prediction_loss_only=True if self.compute_metrics is None else None,
2157 ignore_keys=ignore_keys,
-> 2158 metric_key_prefix=metric_key_prefix,
2159 )
2160
~\Anaconda3\envs\mailbot\lib\site-packages\transformers\trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
2390 # Metrics!
2391 if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
-> 2392 metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
2393 else:
2394 metrics = {}
~\AppData\Local\Temp/ipykernel_8820/1156443582.py in compute_metrics(pred)
120 print('y_hat', preds)
121
--> 122 precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
123 acc = accuracy_score(labels, preds)
124 return {
~\Anaconda3\envs\mailbot\lib\site-packages\sklearn\metrics\_classification.py in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight, zero_division)
1532 if beta < 0:
1533 raise ValueError("beta should be >=0 in the F-beta score")
-> 1534 labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
1535
1536 # Calculate tp_sum, pred_sum, true_sum ###
~\Anaconda3\envs\mailbot\lib\site-packages\sklearn\metrics\_classification.py in _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
1336 raise ValueError("average has to be one of " + str(average_options))
1337
-> 1338 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
1339 # Convert to Python primitive type to avoid NumPy type / Python str
1340 # comparison. See https://github.com/numpy/numpy/issues/6784
~\Anaconda3\envs\mailbot\lib\site-packages\sklearn\metrics\_classification.py in _check_targets(y_true, y_pred)
82 y_pred : array or indicator matrix
83 """
---> 84 check_consistent_length(y_true, y_pred)
85 type_true = type_of_target(y_true)
86 type_pred = type_of_target(y_pred)
~\Anaconda3\envs\mailbot\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
331 raise ValueError(
332 "Found input variables with inconsistent numbers of samples: %r"
--> 333 % [int(l) for l in lengths]
334 )
335
ValueError: Found input variables with inconsistent numbers of samples: [4, 3]
PYTORCH VERSION: 1.7.1+cpu
TRANSFORMERS VERSION: 4.12.3