Hi, I followed the code from the lesson and I'm getting the following error:
```
RuntimeError: Could not infer dtype of NoneType
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In[21], line 3
1 import math
----> 3 eval_results = trainer.evaluate()
4 print(f">>> Perplexity before training: {math.exp(eval_results['eval_loss']):.2f}")
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3011, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
3008 start_time = time.time()
3010 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3011 output = eval_loop(
3012 eval_dataloader,
3013 description="Evaluation",
3014 # No point gathering the predictions if there are no metrics, otherwise we defer to
3015 # self.args.prediction_loss_only
3016 prediction_loss_only=True if self.compute_metrics is None else None,
3017 ignore_keys=ignore_keys,
3018 metric_key_prefix=metric_key_prefix,
3019 )
3021 total_batch_size = self.args.eval_batch_size * self.args.world_size
3022 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3190, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3188 observed_num_examples = 0
3189 # Main evaluation loop
-> 3190 for step, inputs in enumerate(dataloader):
3191 # Update the observed num examples
3192 observed_batch_size = find_batch_size(inputs)
3193 if observed_batch_size is not None:
File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:384, in DataLoaderShard.__iter__(self)
382 # We iterate one batch ahead to check when we are at the end
383 try:
--> 384 current_batch = next(dataloader_iter)
385 except StopIteration:
386 yield
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:633, in _BaseDataLoaderIter.__next__(self)
630 if self._sampler_iter is None:
631 # TODO(https://github.com/pytorch/pytorch/issues/76750)
632 self._reset() # type: ignore[call-arg]
--> 633 data = self._next_data()
634 self._num_yielded += 1
635 if self._dataset_kind == _DatasetKind.Iterable and \
636 self._IterableDataset_len_called is not None and \
637 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:677, in _SingleProcessDataLoaderIter._next_data(self)
675 def _next_data(self):
676 index = self._next_index() # may raise StopIteration
--> 677 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
678 if self._pin_memory:
679 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)
File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:45, in DataCollatorMixin.__call__(self, features, return_tensors)
43 return self.tf_call(features)
44 elif return_tensors == "pt":
---> 45 return self.torch_call(features)
46 elif return_tensors == "np":
47 return self.numpy_call(features)
File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:732, in DataCollatorForLanguageModeling.torch_call(self, examples)
729 def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
730 # Handle dict or lists with proper padding and conversion to tensor.
731 if isinstance(examples[0], Mapping):
--> 732 batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
733 else:
734 batch = {
735 "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
736 }
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3295, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3292 batch_outputs[key] = []
3293 batch_outputs[key].append(value)
-> 3295 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:223, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
219 n_sequences = encoding[0].n_sequences
221 self._n_sequences = n_sequences
--> 223 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:764, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
759 if key == "overflowing_tokens":
760 raise ValueError(
761 "Unable to create tensor returning overflowing tokens of different lengths. "
762 "Please see if a fast version of this tokenizer is available to have this feature available."
763 ) from e
--> 764 raise ValueError(
765 "Unable to create tensor, you should probably activate truncation and/or padding with"
766 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
767 f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
768 " expected)."
769 ) from e
771 return self
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`word_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
```
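For reference, my tokenize_function follows the lesson and looks roughly like this (the `"text"` column name comes from my dataset, and `tokenizer` is the fast tokenizer loaded in an earlier cell):
```
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    # Keep the word IDs so whole-word masking can group sub-tokens later
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
```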
If I comment out these lines in the tokenize_function:
```
if tokenizer.is_fast:
    result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
```
then the evaluation works. However, I can no longer use the whole_word_masking_data_collator, since it relies on the word_ids column.
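For reference, the whole_word_masking_data_collator from the lesson consumes that word_ids column roughly as sketched below (wwm_probability, tokenizer, and the labels column created in group_texts all come from earlier cells):
```
import collections
import numpy as np
from transformers import default_data_collator

wwm_probability = 0.2  # probability of masking each whole word


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Map each word to the indices of its sub-tokens
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask whole words and set labels only on masked positions
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)
```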
Do you know how to fix the error?