Chapter 7 questions

sahlebrahim · May 12, 2024, 10:06am

I can't train on the whole codeparrot dataset, so I tried to use a subset here before training:

```python
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"][:10000],
    eval_dataset=tokenized_datasets["valid"],
)
```

But it gives me an error:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[36], line 1
----> 1 trainer.train()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1768 try:
   1769     # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
   1770     hf_hub_utils.disable_progress_bars()
-> 1771     return inner_training_loop(
   1772         args=args,
   1773         resume_from_checkpoint=resume_from_checkpoint,
   1774         trial=trial,
   1775         ignore_keys_for_eval=ignore_keys_for_eval,
   1776     )
   1777 finally:
   1778     hf_hub_utils.enable_progress_bars()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2085, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2082     rng_to_sync = True
   2084 step = -1
-> 2085 for step, inputs in enumerate(epoch_iterator):
   2086     total_batched_samples += 1
   2088     if self.args.include_num_input_tokens_seen:

File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:452, in DataLoaderShard.__iter__(self)
    450 # We iterate one batch ahead to check when we are at the end
    451 try:
--> 452     current_batch = next(dataloader_iter)
    453 except StopIteration:
    454     yield

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
    672 def _next_data(self):
    673     index = self._next_index()  # may raise StopIteration
--> 674     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    675     if self._pin_memory:
    676         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:51, in <listcomp>(.0)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

KeyError: 0

Topic		Replies	Views
Chapter 3 questions Course	151	10656	October 6, 2025
Fine Tuning IMDb tutorial - Unable to reproduce and adapt Beginners	19	8607	August 21, 2020
Transformers v3.0.0 is out! 🤗Transformers	0	1953	July 7, 2020
Seq2SeqTrainer: enabled must be a bool (got NoneType) 🤗Transformers	15	3972	December 5, 2022
Tutorial: Fine-tuning with custom datasets – sentiment, NER, and question answering 🤗Transformers	19	12953	February 12, 2024

Chapter 7 questions

Related topics