I am following the Chapter 7 tutorial to fine-tune a language model (LM) on my own dataset, and here is the error log:
The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: text. If text are not expected by `DistilBertForMaskedLM.forward`, you can safely ignore this message.
***** Running training *****
Num examples = 4595
Num Epochs = 3
Instantaneous batch size per device = 64
Total train batch size (w. parallel, distributed & accumulation) = 64
Gradient Accumulation steps = 1
Total optimization steps = 216
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
Input In [69], in <module>
----> 1 trainer.train()
File /opt/conda/lib/python3.8/site-packages/transformers/trainer.py:1317, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1312 self.model_wrapped = self.model
1314 inner_training_loop = find_executable_batch_size(
1315 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1316 )
-> 1317 return inner_training_loop(
1318 args=args,
1319 resume_from_checkpoint=resume_from_checkpoint,
1320 trial=trial,
1321 ignore_keys_for_eval=ignore_keys_for_eval,
1322 )
File /opt/conda/lib/python3.8/site-packages/transformers/trainer.py:1528, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1525 self._load_rng_state(resume_from_checkpoint)
1527 step = -1
-> 1528 for step, inputs in enumerate(epoch_iterator):
1529
1530 # Skip past any already trained steps if resuming training
1531 if steps_trained_in_current_epoch > 0:
1532 steps_trained_in_current_epoch -= 1
File /opt/conda/lib/python3.8/site-packages/torch/utils/data/dataloader.py:530, in _BaseDataLoaderIter.__next__(self)
528 if self._sampler_iter is None:
529 self._reset()
--> 530 data = self._next_data()
531 self._num_yielded += 1
532 if self._dataset_kind == _DatasetKind.Iterable and \
533 self._IterableDataset_len_called is not None and \
534 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.8/site-packages/torch/utils/data/dataloader.py:570, in _SingleProcessDataLoaderIter._next_data(self)
568 def _next_data(self):
569 index = self._next_index() # may raise StopIteration
--> 570 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
571 if self._pin_memory:
572 data = _utils.pin_memory.pin_memory(data)
File /opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py:49, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py:49, in <listcomp>(.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2124, in Dataset.__getitem__(self, key)
2122 def __getitem__(self, key): # noqa: F811
2123 """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 2124 return self._getitem(
2125 key,
2126 )
File /opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2108, in Dataset._getitem(self, key, decoded, **kwargs)
2106 format_kwargs = format_kwargs if format_kwargs is not None else {}
2107 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
-> 2108 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
2109 formatted_output = format_table(
2110 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
2111 )
2112 return formatted_output
File /opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:491, in query_table(table, key, indices)
489 pa_subtable = _query_table(table, key)
490 else:
--> 491 pa_subtable = _query_table_with_indices_mapping(table, key, indices=indices)
492 return pa_subtable
File /opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:57, in _query_table_with_indices_mapping(table, key, indices)
55 if isinstance(key, int):
56 key = indices.fast_slice(key % indices.num_rows, 1).column(0)[0].as_py()
---> 57 return _query_table(table, key)
58 if isinstance(key, slice):
59 key = range(*key.indices(indices.num_rows))
File /opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:81, in _query_table(table, key)
77 """
78 Query a pyarrow Table to extract the subtable that correspond to the given key.
79 """
80 if isinstance(key, int):
---> 81 return table.fast_slice(key % table.num_rows, 1)
82 if isinstance(key, slice):
83 key = range(*key.indices(table.num_rows))
ZeroDivisionError: integer division or modulo by zero
The only difference between the tutorial and what I have done is that I load the dataset from a CSV file, as opposed to from the Hub. Can someone please tell me what I could be doing wrong?