So I changed the toy dataset (which serves as both the train and the eval dataset) to 16 examples and set the train and eval batch sizes to their default of 8.
Code
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

class CustomDataset(Dataset):
    """Toy dataset: four one-hot inputs, repeated four times (16 examples)."""

    def __init__(self):
        self.input = [[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.],
                      [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.],
                      [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.],
                      [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.]]
        self.labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
        self.n_tokens = 4
        self.n_labels = 2

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        # The default data collator renames "label_ids" to "labels".
        return {"inputs": torch.tensor(self.input[idx]),
                "label_ids": torch.tensor(self.labels[idx])}

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"].long()
        logits = model(inputs["inputs"])
        loss_function = nn.CrossEntropyLoss()
        loss = loss_function(logits, labels)
        return (loss, logits) if return_outputs else loss

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    print(preds)  # debug: inspect the gathered predictions
    preds = np.argmax(preds, axis=1)
    accuracy = accuracy_score(labels, preds)
    micro_f1 = f1_score(labels, preds, average="micro")
    macro_f1 = f1_score(labels, preds, average="macro")
    return {"Accuracy": accuracy, "Micro F1": micro_f1, "Macro F1": macro_f1}

dataset = CustomDataset()
n_tokens = dataset.n_tokens
n_hidden = 2
n_labels = dataset.n_labels

# Small MLP with BatchNorm layers.
model = nn.Sequential(
    nn.Linear(n_tokens, n_hidden),
    nn.ReLU(),
    nn.BatchNorm1d(n_hidden),
    nn.Linear(n_hidden, n_hidden),
    nn.ReLU(),
    nn.BatchNorm1d(n_hidden),
    nn.Linear(n_hidden, n_labels))

train_dataset = eval_dataset = dataset

args = TrainingArguments(output_dir="example",
                         report_to=[],
                         num_train_epochs=3,
                         per_device_train_batch_size=8,
                         per_device_eval_batch_size=8,
                         evaluation_strategy="steps",
                         logging_steps=4)

trainer = MyTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
Here is what happens when comparing the output of trainer.predict with the output of trainer.model:
preds1 = trainer.model(dataset[0:16]["inputs"]).detach().numpy()
preds2 = trainer.predict(dataset)[0]
preds2 == np.concatenate((preds1[1:8], preds1[9:16], [[-100., -100.], [-100., -100.]]))
Output:
array([[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True],
[ True, True]])
So what got lost here is the first example of each of the two eval batches (preds1[0] and preds1[8]).
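Spelled out as assertions, the comparison above amounts to this alignment:
assert np.allclose(preds2[:7], preds1[1:8])     # batch 1 minus its first example
assert np.allclose(preds2[7:14], preds1[9:16])  # batch 2 minus its first example
assert (preds2[14:] == -100).all()              # two rows of -100 padding at the end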
I changed it to a 3-class dataset and the same two rows were missing, so this small example shows no sign of the relationship between the number of missing values and the number of classes that I saw in the full setup.
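A quick way to count the padded rows directly (assuming, as in the comparison above, that -100 is the fill value used when gathering the predictions):
n_pad = int((preds2 == -100).all(axis=1).sum())
print(n_pad)  # 2 in this run, with both the 2-class and the 3-class dataset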
Edit:
I had to run the line preds1 = trainer.model(dataset[0:16]["inputs"]).detach().numpy() twice to get these results. The first call of trainer.model returns different results than subsequent calls do.
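My guess (not verified beyond this run) is that trainer.model is still in training mode right after trainer.train(), so the BatchNorm layers use per-batch statistics on that first manual forward pass, whereas trainer.predict switches the model to eval mode, where the running statistics are used instead. Putting the model into eval mode before the manual call should make it reproducible:
trainer.model.eval()  # BatchNorm now uses its running statistics
with torch.no_grad():
    preds1 = trainer.model(dataset[0:16]["inputs"]).numpy()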