I cant train the whole codeparrot dataset so i tried to use a subset here before training
model=model,
tokenizer=tokenizer,
args=args,
data_collator=data_collator,
train_dataset=tokenized_datasets["train"][:10000],
eval_dataset=tokenized_datasets["valid"],
)```
but it gives me error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[36], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1768 try:
1769 # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
1770 hf_hub_utils.disable_progress_bars()
-> 1771 return inner_training_loop(
1772 args=args,
1773 resume_from_checkpoint=resume_from_checkpoint,
1774 trial=trial,
1775 ignore_keys_for_eval=ignore_keys_for_eval,
1776 )
1777 finally:
1778 hf_hub_utils.enable_progress_bars()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2085, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2082 rng_to_sync = True
2084 step = -1
-> 2085 for step, inputs in enumerate(epoch_iterator):
2086 total_batched_samples += 1
2088 if self.args.include_num_input_tokens_seen:
File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:452, in DataLoaderShard.__iter__(self)
450 # We iterate one batch ahead to check when we are at the end
451 try:
--> 452 current_batch = next(dataloader_iter)
453 except StopIteration:
454 yield
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
672 def _next_data(self):
673 index = self._next_index() # may raise StopIteration
--> 674 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
675 if self._pin_memory:
676 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
49 data = self.dataset.__getitems__(possibly_batched_index)
50 else:
---> 51 data = [self.dataset[idx] for idx in possibly_batched_index]
52 else:
53 data = self.dataset[possibly_batched_index]
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in <listcomp>(.0)
49 data = self.dataset.__getitems__(possibly_batched_index)
50 else:
---> 51 data = [self.dataset[idx] for idx in possibly_batched_index]
52 else:
53 data = self.dataset[possibly_batched_index]
KeyError: 0