Hi, I have a custom dataset with the following structure:
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 12638343
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1560290
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1404261
    })
})
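For context, this is roughly how I built it (the file names are placeholders, not my real ones):

from datasets import load_dataset

# load_dataset converts the CSV files to Arrow and memory-maps them on disk,
# which is why I went this route instead of reading everything with pandas
dataset = load_dataset("csv", data_files={"train": "train.csv",
                                          "test": "test.csv",
                                          "validation": "validation.csv"})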
When I encode the training examples using the dataset's map method, with the tokenizer already defined:
Note: I can only use num_proc = 1. No matter how I structure the code under if __name__ == "__main__": after defining the tokenizer for multiprocessing, it says "tokenizer" not found (a rough sketch of what I tried is below the map call)… anyway, let's focus on this error for the time being.
train_encodings = dataset["train"].map(lambda examples: tokenizer(examples['sentence'], truncation=True, padding=True), batched=True, num_proc=1, keep_in_memory=True)
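For reference, here is a rough sketch of the multiprocessing variant that fails with "tokenizer not found" (the model name and file path are placeholders, not my real ones):

from datasets import load_dataset
from transformers import AutoTokenizer

def tokenize_batch(examples):
    # `tokenizer` is looked up at call time; my understanding is that the
    # spawned Windows worker processes never see the one defined under
    # __main__, which would explain the "tokenizer not found" error
    return tokenizer(examples["sentence"], truncation=True, padding=True)

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder model
    dataset = load_dataset("csv", data_files="train.csv")           # placeholder path
    train_encodings = dataset["train"].map(tokenize_batch, batched=True, num_proc=4)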
I ran into an IndexError about 10% of the way through, and no matter how I change batch_size it still gets stuck at this:
C:\Users\xxxxx\AppData\Roaming\Python\Python38\site-packages\datasets\table.py:84: RuntimeWarning: overflow encountered in int_scalars
k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-13-b3bd4b3dbd29> in <module>
1 ## Pass our Tokenizer to our Train, test, validation set ##
2
----> 3 train_encodings = dataset["train"].map(lambda examples: tokenizer(examples['sentence'], truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)
4
5 # val_encodings = dataset["validation"].map(lambda examples: tokenizer(examples['sentence'], truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint)
1481
1482 if num_proc is None or num_proc == 1:
-> 1483 return self._map_single(
1484 function=function,
1485 with_indices=with_indices,
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in wrapper(*args, **kwargs)
172 }
173 # apply actual function
--> 174 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
175 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
176 # re-apply format to the output
~\AppData\Roaming\Python\Python38\site-packages\datasets\fingerprint.py in wrapper(*args, **kwargs)
338 # Call actual function
339
--> 340 out = func(self, *args, **kwargs)
341
342 # Update fingerprint of in-place transforms + update in-place history of transforms
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in _map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
1812 if drop_last_batch and i + batch_size > input_dataset.num_rows:
1813 continue
-> 1814 batch = input_dataset[i : i + batch_size]
1815 indices = list(
1816 range(*(slice(i, i + batch_size).indices(input_dataset.num_rows)))
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in __getitem__(self, key)
1343 def __getitem__(self, key: Union[int, slice, str]) -> Union[Dict, List]:
1344 """Can be used to index columns (by string names) or rows (by integer index or iterable of
indices or bools)."""
-> 1345 return self._getitem(
1346 key,
1347 format_type=self._format_type,
~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in _getitem(self, key, format_type, format_columns, output_all_columns, format_kwargs)
1335 format_kwargs = format_kwargs if format_kwargs is not None else {}
1336 formatter = get_formatter(format_type, **format_kwargs)
-> 1337 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
1338 formatted_output = format_table(
1339 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
~\AppData\Roaming\Python\Python38\site-packages\datasets\formatting\formatting.py in query_table(table, key, indices)
366 # Query the main table
367 if indices is None:
--> 368 pa_subtable = _query_table(table, key)
369 else:
370 pa_subtable = _query_table_with_indices_mapping(table, key, indices=indices)
~\AppData\Roaming\Python\Python38\site-packages\datasets\formatting\formatting.py in _query_table(table, key)
82 if isinstance(key, range):
83 if _is_range_contiguous(key) and key.start >= 0:
---> 84 return table.fast_slice(key.start, key.stop - key.start)
85 else:
86 pass # treat as an iterable
~\AppData\Roaming\Python\Python38\site-packages\datasets\table.py in fast_slice(self, offset, length)
131 batches[0] = batches[0].slice(offset - self._offsets[i])
132 else:
--> 133 j = _interpolation_search(self._offsets, offset + length - 1)
134 batches = self._batches[i : j + 1]
135 batches[-1] = batches[-1].slice(0, offset + length - self._offsets[j])
~\AppData\Roaming\Python\Python38\site-packages\datasets\table.py in _interpolation_search(arr, x)
89 else:
90 i, j = i, k
---> 91 raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
92
93
IndexError: Invalid query '1698999' for size 12638343.
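Note that the query index 1698999 is actually smaller than the table size 12638343, so the lookup itself seems to go wrong, and the overflow warning right before the failure makes me suspect the interpolation search computes a bogus index on Windows, where numpy's default integer is 32-bit. A quick check of the scale involved (the operand values below are hypothetical, just matching the magnitude of my dataset):

import numpy as np

# table.py:84 computes k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]));
# with 32-bit ints the intermediate product can exceed 2**31 - 1 and wrap
batch_span = np.int32(10000)      # hypothetical j - i
row_offset = np.int32(1698999)    # x - arr[i] at the scale of my query
print(batch_span * row_offset)    # RuntimeWarning about overflow, wrapped value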
I could have loaded the csv with pandas and then encoded from that, but I come close to running out of memory that way, so I'm trying to go through the datasets library's load_dataset instead (as sketched above)… Any help would be appreciated! Thanks