So I changed the toy dataset (=train=eval dataset) to 16 examples and set the train and eval batch sizes to their default of 8.

##
Code

import torch

import torch.nn as nn

from torch.utils.data import Dataset

from transformers import TrainingArguments, Trainer

import numpy as np

from sklearn.metrics import accuracy_score, f1_score

class CustomDataset(Dataset):

```
def __init__(self):
self.input = [[1.,0.,0.,0.],[0.,1.,0.,0.],[0.,0.,1.,0.],[0.,0.,0.,1.],
[1.,0.,0.,0.],[0.,1.,0.,0.],[0.,0.,1.,0.],[0.,0.,0.,1.],
[1.,0.,0.,0.],[0.,1.,0.,0.],[0.,0.,1.,0.],[0.,0.,0.,1.],
[1.,0.,0.,0.],[0.,1.,0.,0.],[0.,0.,1.,0.],[0.,0.,0.,1.]]
self.labels = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
self.n_tokens = 4
self.n_labels = 2
def __len__(self):
return(len(self.input))
def __getitem__(self, idx):
input_dict = {"inputs": torch.tensor(self.input[idx]),
"label_ids": torch.tensor(self.labels[idx])}
return input_dict
```

class MyTrainer(Trainer):

def compute_loss(self, model, inputs, return_outputs = False):

labels = inputs[â€ślabelsâ€ť].long()

logits = model(inputs[â€śinputsâ€ť])

loss_function = nn.CrossEntropyLoss()

loss = loss_function(logits,labels)

return (loss, logits) if return_outputs else loss

def compute_metrics(eval_pred):

preds, labels = eval_pred

print(preds)

preds = np.argmax(preds, axis=1)

```
accuracy = accuracy_score(labels, preds)
micro_f1 = f1_score(labels, preds, average = "micro")
macro_f1 = f1_score(labels, preds, average = "macro")
return {"Accuracy": accuracy, "Micro F1": micro_f1, "Macro F1": macro_f1}
```

dataset = CustomDataset()

n_tokens = dataset.n_tokens

n_hidden = 2

n_labels = dataset.n_labels

model = nn.Sequential(

nn.Linear(n_tokens,n_hidden),

nn.ReLU(),

nn.BatchNorm1d(n_hidden),

nn.Linear(n_hidden,n_hidden),

nn.ReLU(),

nn.BatchNorm1d(n_hidden),

nn.Linear(n_hidden, n_labels))

train_dataset = eval_dataset = dataset

args = TrainingArguments(output_dir = â€śexampleâ€ť,

report_to=[],

num_train_epochs = 3,

per_device_train_batch_size =8,

per_device_eval_batch_size = 8,

evaluation_strategy = â€śstepsâ€ť,

logging_steps =4)

trainer = MyTrainer(

model = model,

args=args,

train_dataset=train_dataset,

eval_dataset=eval_dataset,

compute_metrics=compute_metrics

)

trainer.train()

Here is what happens, when comparing the output of trainer.predict and trainer.model:

preds1 = trainer.model(dataset[0:16][â€śinputsâ€ť]).detach().numpy()

preds2 = trainer.predict(dataset)[0]

preds2 == np.concatenate((preds1[1:8], preds1[9:16], [[-100.,-100],[-100.,-100]]))

Output:

array([[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True],

[ True, True]])

**So what got lost here is the first instance of both the two batches.**

I changed it to a 3-class data set and the same two rows were missing, so in this small example there is no indication of the relationship between number of missing values and number of classes here that I saw in the full setup.

Edit:

I had to run the line preds1 = trainer.model(dataset[0:16][â€śinputsâ€ť]).detach().numpy() twice to get these results. The first call of trainer.model gives different results than subsequent ones.