Hi,
I am following the Hugging Face tutorial on masked language modelling (the notebook `language_modeling.ipynb` on the master branch of the huggingface/notebooks GitHub repository) with my own dataset, and I am running into the following error when grouping the tokenized texts:
Input:
lm_datasets = tokenized_datasets.map(
group_texts,
batched=True,
batch_size=1000,
num_proc=4,
)
Output:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 186, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py", line 397, in wrapper
out = func(self, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 1977, in _map_single
writer.write_batch(batch)
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_writer.py", line 383, in write_batch
pa_table = pa.Table.from_pydict(typed_sequence_examples)
File "pyarrow/table.pxi", line 1559, in pyarrow.lib.Table.from_pydict
arrays.append(asarray(v))
File "pyarrow/array.pxi", line 331, in pyarrow.lib.asarray
return array(values, type=type)
File "pyarrow/array.pxi", line 222, in pyarrow.lib.array
return _handle_arrow_array_protocol(obj, type, mask, size)
File "pyarrow/array.pxi", line 110, in pyarrow.lib._handle_arrow_array_protocol
res = obj.__arrow_array__(type=type)
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_writer.py", line 100, in __arrow_array__
if trying_type and out[0].as_py() != self.data[0]:
File "pyarrow/array.pxi", line 1067, in pyarrow.lib.Array.__getitem__
return self.getitem(_normalize_index(key, self.length()))
File "pyarrow/array.pxi", line 549, in pyarrow.lib._normalize_index
raise IndexError("index out of bounds")
IndexError: index out of bounds
"""
The above exception was the direct cause of the following exception:
IndexError Traceback (most recent call last)
<ipython-input-34-e35eeb51570c> in <module>()
3 batched=True,
4 batch_size=1000,
----> 5 num_proc=4,
6 )
16 frames
/usr/local/lib/python3.7/dist-packages/pyarrow/array.pxi in pyarrow.lib._normalize_index()
547 raise IndexError("index out of bounds")
548 elif index >= length:
--> 549 raise IndexError("index out of bounds")
550 return index
551
IndexError: index out of bounds