Hi,
I’m trying to train an XLNetForSequenceClassification model using Trainer to classify sentences into 3 categories. It works fine on the training and eval datasets during trainer.train() (the loss decreases as expected), but if I pass a compute_metrics argument to my trainer, or try to obtain predictions on the same eval dataset using trainer.predict(), it crashes with the following error:
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in predict(self, test_dataset)
1353 test_dataloader = self.get_test_dataloader(test_dataset)
1354
-> 1355 return self.prediction_loop(test_dataloader, description="Prediction")
1356
1357 def prediction_loop(
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in prediction_loop(self, dataloader, description, prediction_loss_only)
1442 eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
1443 if not prediction_loss_only:
-> 1444 preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
1445 labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
1446
/usr/local/lib/python3.6/dist-packages/transformers/trainer_pt_utils.py in add_arrays(self, arrays)
328 # If we get new arrays that are too big too fit, we expand the shape fo the storage
329 self._storage = nested_expand_like(self._storage, arrays_shape[1], padding_index=self.padding_index)
--> 330 slice_len = self._nested_set_tensors(self._storage, arrays)
331 for i in range(self.world_size):
332 self._offsets[i] += slice_len
/usr/local/lib/python3.6/dist-packages/transformers/trainer_pt_utils.py in _nested_set_tensors(self, storage, arrays)
335 if isinstance(arrays, (list, tuple)):
336 for x, y in zip(storage, arrays):
--> 337 slice_len = self._nested_set_tensors(x, y)
338 return slice_len
339 assert (
/usr/local/lib/python3.6/dist-packages/transformers/trainer_pt_utils.py in _nested_set_tensors(self, storage, arrays)
335 if isinstance(arrays, (list, tuple)):
336 for x, y in zip(storage, arrays):
--> 337 slice_len = self._nested_set_tensors(x, y)
338 return slice_len
339 assert (
/usr/local/lib/python3.6/dist-packages/transformers/trainer_pt_utils.py in _nested_set_tensors(self, storage, arrays)
347 else:
348 storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
--> 349 i * slice_len : (i + 1) * slice_len
350 ]
351 return slice_len
ValueError: could not broadcast input array from shape (4565,16,768) into shape (916,16,768)
Here 916 is the size of the eval dataset and 16 is the batch_size, and my guess is that 4565 is the longest concatenated feature list?
My code is as follows :
class XLNetDataset(data.Dataset):
    """Dataset that serves raw ``(sentence, label)`` pairs from a DataFrame.

    No tokenization happens here: each item is the value of the
    ``'sentence'`` column together with the value of the ``'p_typ'`` column.
    """

    def __init__(self, dfObject):
        """Store ``dfObject``, a pandas DataFrame that must provide the
        ``'sentence'`` and ``'p_typ'`` columns read by ``__getitem__``."""
        self.dfObject = dfObject  # Pandas dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.dfObject.shape[0]

    def __getitem__(self, idx):
        """Return ``(sentence, label)`` for the row(s) at position ``idx``.

        ``idx`` is positional (it is passed to ``DataFrame.iloc``). A tensor
        index is first converted to a plain Python int or list.
        """
        # ``iloc`` is given plain Python indices, so unwrap tensor indices.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        dfRows = self.dfObject.iloc[idx]
        dfSentences = dfRows['sentence']
        dfLabels = dfRows['p_typ']
        return dfSentences, dfLabels
def XLNetCollatFunc(data):
    """Collate a list of ``(sentence, label)`` samples into one model batch.

    Args:
        data: Sequence of samples; element 0 of each sample is the sentence
            text and element 1 is its label. (Inside this function the name
            shadows any module-level ``data``.)

    Returns:
        A dict with the tokenizer's ``'input_ids'``, ``'attention_mask'`` and
        ``'token_type_ids'`` tensors plus a ``'labels'`` tensor built from the
        sample labels.
    """
    sents = [elem[0] for elem in data]
    labels = [elem[1] for elem in data]

    # Uses the module-level ``xlTokenizer``; it must be defined before the
    # first batch is collated. Sequences are truncated to at most 128 tokens
    # and returned as PyTorch tensors.
    encoded_result = xlTokenizer(
        sents,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
        return_attention_mask=True,
    )
    output = {
        'input_ids': encoded_result['input_ids'],
        'attention_mask': encoded_result['attention_mask'],
        'token_type_ids': encoded_result['token_type_ids'],
        'labels': torch.tensor(labels),
    }
    return output
# Wrap the train/test DataFrames so Trainer can index them as datasets.
trainDataset = XLNetDataset(trainData)  # trainData is pandas DF containing train sentences
testDataset = XLNetDataset(testData)  # testData is pandas DF containing test sentences

xlTokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlNetModel = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=3)

# Freeze every parameter of the base model (no gradients are computed for them).
for param in xlNetModel.base_model.parameters():
    param.requires_grad = False

# NOTE(review): TrainingArguments usually also needs an ``output_dir``;
# presumably omitted from this snippet -- confirm.
trainArgs = TrainingArguments(
    num_train_epochs=1,
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

trainer = Trainer(
    model=xlNetModel,
    args=trainArgs,
    train_dataset=trainDataset,
    eval_dataset=testDataset,
    data_collator=XLNetCollatFunc,
)

trainer.train()

# NOTE(review): the ValueError reported for this call looks consistent with
# the model returning extra outputs (XLNet ``mems``) alongside the logits,
# which the prediction loop then tries to gather -- confirm by inspecting
# the model's outputs in eval mode.
trainer.predict(testDataset)
I’m guessing the problem lies somewhere in my custom data collator (I’m still a little unsure of the exact data format the data collator or trainer expects to receive), but I can’t understand how it is able to produce training and evaluation losses during trainer.train() and yet fails during the predict() call.
I’m using the latest API version (3.5)