How does one create a pytorch data loader with a custom hugging face data set without having errors?

Currently my custom dataset gives None values in the data loader, but NOT in the pure dataset: it only fails once I wrap it in a PyTorch DataLoader.

The code is in Colab, but I'll also put it here in case the Colab link dies someday:

pip install datasets
pip install torch
pip install transformers

then run

token = None
batch_size = 10
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
  tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)

# -- Get batch from dataset
from datasets import load_dataset
# path, name = 'brando/debug1_af', 'debug1_af'
path, name = 'brando/debug0_af', 'debug0_af'
remove_columns = []
dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format("torch")
print(f'{dataset=}')
batch = dataset.take(batch_size)
# print(f'{next(iter(batch))=}')

# - Prepare functions to tokenize batch
def preprocess(examples):  # tokenize the raw text batch, keyed by the relevant column name in the dataset's table
    return tokenizer(examples["link"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):  # apply preprocess to all examples in the batch (the batch is itself a dataset)
    return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(batch)  # equivalent to the previous line; overwrites it
# print(f'{next(iter(tokenized_batch))=}')

from torch.utils.data import Dataset, DataLoader, SequentialSampler
dataset = tokenized_batch
print(f'{type(dataset)=}')
print(f'{dataset.__class__=}')
print(f'{isinstance(dataset, Dataset)=}')
# for i, d in enumerate(dataset):
#     assert isinstance(d, dict)
#     # dd = dataset[i]
#     # assert isinstance(dd, dict)
loader_opts = {}
classifier_opts = {} 
# data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
#                         num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=SequentialSampler(range(512))  )
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
                    num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
print(f'{iter(data_loader)=}')
print(f'{next(iter(data_loader))=}')
print('Done\a')

Error:

dataset=<datasets.iterable_dataset.IterableDataset object at 0x7e42c2f21d20>
type(dataset)=<class 'datasets.iterable_dataset.IterableDataset'>
dataset.__class__=<class 'datasets.iterable_dataset.IterableDataset'>
isinstance(dataset, Dataset)=True
iter(data_loader)=<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7e42c2f21660>
/usr/local/lib/python3.10/dist-packages/datasets/formatting/torch_formatter.py:68: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    126         try:
--> 127             return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
    128         except TypeError:

9 frames
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
    126         try:
--> 127             return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
    128         except TypeError:

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    149 
--> 150     raise TypeError(default_collate_err_msg_format.format(elem_type))
    151 

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-6-1153c5915bd8> in <cell line: 49>()
     47                     num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
     48 print(f'{iter(data_loader)=}')
---> 49 print(f'{next(iter(data_loader))=}')
     50 print('Done\a')

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    631                 # TODO(https://github.com/pytorch/pytorch/issues/76750)
    632                 self._reset()  # type: ignore[call-arg]
--> 633             data = self._next_data()
    634             self._num_yielded += 1
    635             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    675     def _next_data(self):
    676         index = self._next_index()  # may raise StopIteration
--> 677         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    678         if self._pin_memory:
    679             data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     40         else:
     41             data = next(self.dataset_iter)
---> 42         return self.collate_fn(data)
     43 
     44 

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
    263             >>> default_collate(batch)  # Handle `CustomType` automatically
    264     """
--> 265     return collate(batch, collate_fn_map=default_collate_fn_map)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    128         except TypeError:
    129             # The mapping type may not support `__init__(iterable)`.
--> 130             return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
    131     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
    132         return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
    128         except TypeError:
    129             # The mapping type may not support `__init__(iterable)`.
--> 130             return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
    131     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
    132         return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    148                 return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
    149 
--> 150     raise TypeError(default_collate_err_msg_format.format(elem_type))
    151 
    152 

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

Why is this error happening?

I’ve done all the usual checks, e.g., made sure the returned items are dicts, and even went into detailed debugging with pdb inside PyTorch's own code.
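
For concreteness, this is the kind of check I mean (a minimal sketch, assuming the same 'brando/debug0_af' dataset as above): iterate the raw streamed rows and look for fields whose value is None, since NoneType is what default_collate ends up complaining about.

from datasets import load_dataset

# stream the raw dataset exactly as in the code above (no tokenization yet)
raw = load_dataset('brando/debug0_af', 'debug0_af', streaming=True, split='train', token=None)
for i, example in enumerate(raw.take(10)):
    assert isinstance(example, dict), f'row {i} is not a dict: {type(example)}'
    none_fields = [k for k, v in example.items() if v is None]
    print(f'row {i}: keys={list(example.keys())}, None-valued fields={none_fields}')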

Cross-posted on Stack Overflow: python - How does one create a pytorch data loader with a custom hugging face data set without having errors? - Stack Overflow

Similar issue: How does one create a pytorch data loader using an interleaved hugging face dataset?

This is useful: HuggingFace dataset: each element in list of batch should be of equal size

Solution answer: python - How does one create a pytorch data loader with a custom hugging face data set without having errors? - Stack Overflow
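
For reference, the direction that solution points in, as far as I can tell (a hedged sketch, not a verified fix): drop all of the dataset's original columns during .map, so that only the tokenizer's outputs (input_ids, attention_mask) ever reach default_collate and no None-valued metadata field is left to trip over. The column list is discovered from one raw example because column_names may be None for a streaming dataset; everything else mirrors the code in the question.

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess(examples):
    # same tokenization as in the question: pad/truncate the 'link' column to 128 tokens
    return tokenizer(examples["link"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")

raw = load_dataset('brando/debug0_af', 'debug0_af', streaming=True, split='train', token=None).with_format('torch')
columns_to_remove = list(next(iter(raw)).keys())  # all original fields, whatever metadata the dataset has

# remove every original column so the mapped examples contain only tensor fields
tokenized = raw.map(preprocess, batched=True, remove_columns=columns_to_remove)

data_loader = DataLoader(tokenized, batch_size=1, shuffle=False, num_workers=0, drop_last=False)
print(next(iter(data_loader)))  # expected: a dict of stacked tensors, with no NoneType collate error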