Map method to tokenize raises IndexError

Hi, I have a custom dataset with the following structure:

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 12638343
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1560290
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1404261
    })
})

The error occurs when I encode the training examples with the dataset's map method, using a tokenizer I have already defined:

Note: I can only use num_proc = 1. No matter how I arrange the if __name__ == "__main__": guard after defining the tokenizer for multiprocessing, the worker processes say "tokenizer" is not found. Anyway, let's focus on this error for the time being.

train_encodings = dataset["train"].map(lambda examples: tokenizer(examples['sentence'], truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)
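
For reference, here is a stripped-down, self-contained version of the kind of script I am running; the checkpoint name and the CSV paths are placeholders, not my real ones:

from datasets import load_dataset
from transformers import AutoTokenizer

# Placeholder checkpoint; my real tokenizer is loaded the same way
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_batch(examples):
    # Same call as the lambda above, just as a named, importable function
    return tokenizer(examples["sentence"], truncation=True, padding=True)

if __name__ == "__main__":
    # Placeholder file paths; the real CSVs give the DatasetDict shown at the top
    dataset = load_dataset(
        "csv",
        data_files={"train": "train.csv", "test": "test.csv", "validation": "validation.csv"},
    )
    # Even with this layout, num_proc > 1 gives me the missing "tokenizer" error,
    # so I stay at num_proc = 1, and that is where the IndexError below shows up
    train_encodings = dataset["train"].map(
        tokenize_batch, batched=True, num_proc=1, keep_in_memory=True
    )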

I run into an IndexError about 10% of the way through; no matter how I change batch_size, it still gets stuck at the same point:

C:\Users\xxxxx\AppData\Roaming\Python\Python38\site-packages\datasets\table.py:84: 
RuntimeWarning: overflow encountered in int_scalars
k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-13-b3bd4b3dbd29> in <module>
  1 ## Pass our Tokenizer to our Train, test, validation set ##
  2 
----> 3 train_encodings = dataset["train"].map(lambda examples: tokenizer(examples['sentence'], 
truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)
  4 
  5 # val_encodings = dataset["validation"].map(lambda examples: tokenizer(examples['sentence'], 
truncation = True, padding = True), batched=True, num_proc = 1, keep_in_memory = True)

~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in map(self, 
function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, 
keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, 
disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint)
1481 
1482         if num_proc is None or num_proc == 1:
-> 1483             return self._map_single(
1484                 function=function,
1485                 with_indices=with_indices,

~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in wrapper(*args, 
**kwargs)
172         }
173         # apply actual function
--> 174         out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
175         datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
176         # re-apply format to the output

~\AppData\Roaming\Python\Python38\site-packages\datasets\fingerprint.py in wrapper(*args, 
**kwargs)
338             # Call actual function
339 
--> 340             out = func(self, *args, **kwargs)
341 
342             # Update fingerprint of in-place transforms + update in-place history of transforms

~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in 
_map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, 
remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, 
features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
1812                         if drop_last_batch and i + batch_size > input_dataset.num_rows:
1813                             continue
-> 1814                         batch = input_dataset[i : i + batch_size]
1815                         indices = list(
1816                             range(*(slice(i, i + batch_size).indices(input_dataset.num_rows)))

~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in __getitem__(self, 
key)
1343     def __getitem__(self, key: Union[int, slice, str]) -> Union[Dict, List]:
1344         """Can be used to index columns (by string names) or rows (by integer index or iterable of 
indices or bools)."""
-> 1345         return self._getitem(
1346             key,
1347             format_type=self._format_type,

~\AppData\Roaming\Python\Python38\site-packages\datasets\arrow_dataset.py in _getitem(self, 
key, format_type, format_columns, output_all_columns, format_kwargs)
1335         format_kwargs = format_kwargs if format_kwargs is not None else {}
1336         formatter = get_formatter(format_type, **format_kwargs)
-> 1337         pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not 
None else None)
1338         formatted_output = format_table(
1339             pa_subtable, key, formatter=formatter, format_columns=format_columns, 
output_all_columns=output_all_columns

~\AppData\Roaming\Python\Python38\site-packages\datasets\formatting\formatting.py in 
 query_table(table, key, indices)
 366     # Query the main table
 367     if indices is None:
--> 368         pa_subtable = _query_table(table, key)
 369     else:
 370         pa_subtable = _query_table_with_indices_mapping(table, key, indices=indices)

~\AppData\Roaming\Python\Python38\site-packages\datasets\formatting\formatting.py in 
_query_table(table, key)
 82     if isinstance(key, range):
 83         if _is_range_contiguous(key) and key.start >= 0:
---> 84             return table.fast_slice(key.start, key.stop - key.start)
 85         else:
 86             pass  # treat as an iterable

~\AppData\Roaming\Python\Python38\site-packages\datasets\table.py in fast_slice(self, offset, length)
131             batches[0] = batches[0].slice(offset - self._offsets[i])
132         else:
--> 133             j = _interpolation_search(self._offsets, offset + length - 1)
134             batches = self._batches[i : j + 1]
135             batches[-1] = batches[-1].slice(0, offset + length - self._offsets[j])

~\AppData\Roaming\Python\Python38\site-packages\datasets\table.py in _interpolation_search(arr, x)
 89         else:
 90             i, j = i, k
---> 91     raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
 92 
 93 

IndexError: Invalid query '1698999' for size 12638343.

I could have loaded the CSVs with pandas and tokenized the resulting DataFrame, but I was close to running out of memory that way, which is why I loaded the data with the datasets library's load_dataset in the first place; a rough sketch of that pandas route is below.
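
For context, the pandas route I am trying to avoid would look roughly like this (the path and checkpoint are placeholders); reading all ~12.6M training rows into memory and tokenizing the whole column at once is what nearly exhausts my RAM:

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint

# Loads the entire training split into memory at once (~12.6M rows)
df = pd.read_csv("train.csv")  # placeholder path
train_encodings = tokenizer(df["sentence"].tolist(), truncation=True, padding=True)

Any help would be appreciated! Thanks!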