I’m trying to use HuggingFace’s tokenizers and datasets with a PyTorch dataloader, like so:
dataset = load_dataset(
'wikitext',
'wikitext-2-raw-v1',
split='train[:5%]', # take only first 5% of the dataset
cache_dir=cache_dir)
tokenized_dataset = dataset.map(
lambda e: self.tokenizer(e['text'],
padding=True,
max_length=512,
# padding='max_length',
truncation=True),
batched=True)
with a dataloader:
dataloader = torch.utils.data.DataLoader(
dataset=tokenized_dataset,
batch_size=batch_size,
shuffle=True)
But the dataloader throws the following error:
File "/home/rschaef/CoCoSci-Language-Distillation/distillation_v2/ratchet_learning/train.py", line 139, in run_epoch
for batch_idx, batch in enumerate(task.dataloader):
File "/home/rschaef/CoCoSci-Language-Distillation/cocosci/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/home/rschaef/CoCoSci-Language-Distillation/cocosci/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 475, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/rschaef/CoCoSci-Language-Distillation/cocosci/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/home/rschaef/CoCoSci-Language-Distillation/cocosci/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 73, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/home/rschaef/CoCoSci-Language-Distillation/cocosci/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 73, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/home/rschaef/CoCoSci-Language-Distillation/cocosci/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 81, in default_collate
raise RuntimeError('each element in list of batch should be of equal size')
RuntimeError: each element in list of batch should be of equal size
Why is this happening and how do I prevent it from happening?