Hi, I have a custom dataset with the following structure:
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 12638343
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1560290
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1404261
    })
})
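For context, this is roughly how I built it (the file names are placeholders, not my real ones):

from datasets import load_dataset

# load_dataset converts the CSV files to Arrow and memory-maps them on disk,
# which is why I went this route instead of reading everything with pandas
dataset = load_dataset("csv", data_files={"train": "train.csv",
                                          "test": "test.csv",
                                          "validation": "validation.csv"})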
When I encode the training examples using the dataset's map method, with the tokenizer already defined:
Note: I can only use num_proc = 1. No matter how I structure the code under if __name__ == "__main__": after defining the tokenizer for multiprocessing, it says "tokenizer" not found (a rough sketch of what I tried is below the map call)… anyway, let's focus on this error for the time being.
train_encodings = dataset["train"].map(lambda examples: tokenizer(examples['sentence'], truncation=True, padding=True), batched=True, num_proc=1, keep_in_memory=True)
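For reference, here is a rough sketch of the multiprocessing variant that fails with "tokenizer not found" (the model name and file path are placeholders, not my real ones):

from datasets import load_dataset
from transformers import AutoTokenizer

def tokenize_batch(examples):
    # `tokenizer` is looked up at call time; my understanding is that the
    # spawned Windows worker processes never see the one defined under
    # __main__, which would explain the "tokenizer not found" error
    return tokenizer(examples["sentence"], truncation=True, padding=True)

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder model
    dataset = load_dataset("csv", data_files="train.csv")           # placeholder path
    train_encodings = dataset["train"].map(tokenize_batch, batched=True, num_proc=4)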
I ran into an IndexError about 10% of the way through, and no matter how I change batch_size it still gets stuck at this:
C:\Users\xxxxx\AppData\Roaming\Python\Python38\site-packages\datasets\table.py:84: RuntimeWarning: overflow encountered in int_scalars
k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-13-b3bd4b3dbd29> in <module>
1 ## Pass our Tokenizer to our Train, test, validation set ##
2
----> 3 train_encodings = dataset["train"].map(lambda examples: tokenizer(examples['sentence'], truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)
4
5 # val_encodings = dataset["validation"].map(lambda examples: tokenizer(examples['sentence'], truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint)
1481
1482 if num_proc is None or num_proc == 1:
-> 1483 return self._map_single(
1484 function=function,
1485 with_indices=with_indices,
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in wrapper(*args, **kwargs)
172 }
173 # apply actual function
--> 174 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
175 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
176 # re-apply format to the output
~\AppData\Roaming\Python\Python38\site-packages\datasets\fingerprint.py in wrapper(*args, **kwargs)
338 # Call actual function
339
--> 340 out = func(self, *args, **kwargs)
341
342 # Update fingerprint of in-place transforms + update in-place history of transforms
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in _map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
1812 if drop_last_batch and i + batch_size > input_dataset.num_rows:
1813 continue
-> 1814 batch = input_dataset[i : i + batch_size]
1815 indices = list(
1816 range(*(slice(i, i + batch_size).indices(input_dataset.num_rows)))
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in __getitem__(self, key)
1343 def __getitem__(self, key: Union[int, slice, str]) -> Union[Dict, List]:
1344 """Can be used to index columns (by string names) or rows (by integer index or iterable of
indices or bools)."""
-> 1345 return self._getitem(
1346 key,
1347 format_type=self._format_type,
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in _getitem(self, key, format_type, format_columns, output_all_columns, format_kwargs)
1335 format_kwargs = format_kwargs if format_kwargs is not None else {}
1336 formatter = get_formatter(format_type, **format_kwargs)
-> 1337 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
1338 formatted_output = format_table(
1339 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
~\AppData\Roaming\Python\Python38\site-packages\datasets\formatting\formatting.py in query_table(table, key, indices)
366 # Query the main table
367 if indices is None:
--> 368 pa_subtable = _query_table(table, key)
369 else:
370 pa_subtable = _query_table_with_indices_mapping(table, key, indices=indices)
~\AppData\Roaming\Python\Python38\site-packages\datasets\formatting\formatting.py in _query_table(table, key)
82 if isinstance(key, range):
83 if _is_range_contiguous(key) and key.start >= 0:
---> 84 return table.fast_slice(key.start, key.stop - key.start)
85 else:
86 pass # treat as an iterable
~\AppData\Roaming\Python\Python38\site-packages\datasets\table.py in fast_slice(self, offset, length)
131 batches[0] = batches[0].slice(offset - self._offsets[i])
132 else:
--> 133 j = _interpolation_search(self._offsets, offset + length - 1)
134 batches = self._batches[i : j + 1]
135 batches[-1] = batches[-1].slice(0, offset + length - self._offsets[j])
~\AppData\Roaming\Python\Python38\site-packages\datasets\table.py in _interpolation_search(arr, x)
89 else:
90 i, j = i, k
---> 91 raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
92
93
IndexError: Invalid query '1698999' for size 12638343.
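Note that the query index 1698999 is actually smaller than the table size 12638343, so the lookup itself seems to go wrong, and the overflow warning right before the failure makes me suspect the interpolation search computes a bogus index on Windows, where numpy's default integer is 32-bit. A quick check of the scale involved (the operand values below are hypothetical, just matching the magnitude of my dataset):

import numpy as np

# table.py:84 computes k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]));
# with 32-bit ints the intermediate product can exceed 2**31 - 1 and wrap
batch_span = np.int32(10000)      # hypothetical j - i
row_offset = np.int32(1698999)    # x - arr[i] at the scale of my query
print(batch_span * row_offset)    # RuntimeWarning about overflow, wrapped value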
I could have loaded the csv with pandas and then encoded from that, but I come close to running out of memory that way, so I'm trying to go through the datasets library's load_dataset instead (as sketched above)… Any help would be appreciated! Thanks