When a dataset's loading script requires an explicit encoding="utf-8" when opening its files, load_dataset with streaming=True behaves differently from the default streaming=False. For example, this works fine:
from datasets import load_dataset

dataset_nostream = load_dataset("facebook/flores", "eng_Latn-deu_Latn",
                                streaming=False, split="dev")
dataset_nostream = dataset_nostream.rename_column('sentence_eng_Latn', 'SRC')
dataset_nostream = dataset_nostream.rename_column('sentence_deu_Latn', 'TRG')

for row in dataset_nostream:
    break
print(row)
[out]:
{'id': 1, 'URL': 'https://en.wikinews.org/wiki/Scientists_say_new_medical_diagnostic_chip_can_sort_cells_anywhere_with_an_inkjet', 'domain': 'wikinews', 'topic': 'health', 'has_image': 0, 'has_hyperlink': 0, 'SRC': 'On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.', 'TRG': 'Am Montag haben die Wisenschaftler der Stanford University School of Medicine die Erfindung eines neuen Diagnosetools bekanntgegeben, mit dem Zellen nach ihrem Typ sortiert werden können: ein winziger, ausdruckbarer Chip, der für jeweils etwa einen US-Cent mit Standard-Tintenstrahldruckern hergestellt werden kann.'}
But when doing this:
dataset_stream = load_dataset("facebook/flores", "eng_Latn-deu_Latn",
                              streaming=True, split="dev")
dataset_stream = dataset_stream.rename_column('sentence_eng_Latn', 'SRC')
dataset_stream = dataset_stream.rename_column('sentence_deu_Latn', 'TRG')

for row in dataset_stream:
    break
print(row)
it throws an error:
[out]:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
/tmp/ipykernel_27/4098039311.py in <module>
5 dataset_stream = dataset_stream.rename_column('sentence_deu_Latn', 'TRG')
6
----> 7 for row in dataset_stream:
8 break
9 print(row)
/opt/conda/lib/python3.7/site-packages/datasets/iterable_dataset.py in __iter__(self)
495
496 def __iter__(self):
--> 497 for key, example in self._iter():
498 if self.features:
499 # we encode the example for ClassLabel feature types for example
/opt/conda/lib/python3.7/site-packages/datasets/iterable_dataset.py in _iter(self)
492 else:
493 ex_iterable = self._ex_iterable
--> 494 yield from ex_iterable
495
496 def __iter__(self):
/opt/conda/lib/python3.7/site-packages/datasets/iterable_dataset.py in __iter__(self)
231 current_idx += batch_idx + 1
232 else:
--> 233 for key, example in iterator:
234 # If not batched, we can apply the transform and yield the example directly
235 # first copy the example, since we might drop some keys
/opt/conda/lib/python3.7/site-packages/datasets/iterable_dataset.py in __iter__(self)
231 current_idx += batch_idx + 1
232 else:
--> 233 for key, example in iterator:
234 # If not batched, we can apply the transform and yield the example directly
235 # first copy the example, since we might drop some keys
/opt/conda/lib/python3.7/site-packages/datasets/iterable_dataset.py in __iter__(self)
415
416 def __iter__(self):
--> 417 for key, example in self.ex_iterable:
418 # we encode the example for ClassLabel feature types for example
419 encoded_example = self.features.encode_example(example)
/opt/conda/lib/python3.7/site-packages/datasets/iterable_dataset.py in __iter__(self)
85
86 def __iter__(self):
---> 87 yield from self.generate_examples_fn(**self.kwargs)
88
89 def shuffle_data_sources(self, generator: np.random.Generator) -> "ExamplesIterable":
~/.cache/huggingface/modules/datasets_modules/datasets/facebook--flores/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef/flores.py in _generate_examples(self, sentence_paths, metadata_path, langs)
210 for path, lang in zip(sentence_paths, langs):
211 with open(path, "r") as sent_file:
--> 212 sentences[lang] = [l.strip() for l in sent_file.readlines()]
213 with open(metadata_path, "r") as metadata_file:
214 metadata_lines = [l.strip() for l in metadata_file.readlines()[1:]]
/opt/conda/lib/python3.7/codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 108: invalid start byte
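The traceback bottoms out in the dataset script itself: flores.py calls open(path, "r") with no explicit encoding. A minimal sketch of the fix I have in mind, assuming the script can simply pass the encoding through (the surrounding lines are reproduced from the traceback above):

# flores.py, _generate_examples -- sketch: pass the encoding explicitly
for path, lang in zip(sentence_paths, langs):
    with open(path, "r", encoding="utf-8") as sent_file:
        sentences[lang] = [l.strip() for l in sent_file.readlines()]
with open(metadata_path, "r", encoding="utf-8") as metadata_file:
    metadata_lines = [l.strip() for l in metadata_file.readlines()[1:]]

If I understand the streaming internals correctly, in streaming mode the script's open is patched by datasets to stream remote files, so the encoding would have to be forwarded there as well.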
Looking at datasets/iterable_dataset.py on main in the huggingface/datasets repo on GitHub, there seems to be no **kwargs through which an encoding could be passed, but I'm not sure that's the right place to add it. Or maybe I should be looking directly at the BuilderConfig:

Flores200Config(name='eng_Latn-deu_Latn', version=1.0.0, data_dir=None, data_files=None, description='FLORES-200: eng_Latn-deu_Latn aligned subset.')

and add the encoding specifics there.
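For completeness, a minimal sketch of what the BuilderConfig route might look like, assuming an encoding field can be added to the script's config class (the attribute name and default below are my own, not part of the actual flores.py):

import datasets

class Flores200Config(datasets.BuilderConfig):
    # Hypothetical: carry the text encoding in the config so the
    # builder doesn't hard-code it at every open() call.
    def __init__(self, *args, encoding="utf-8", **kwargs):
        super().__init__(*args, **kwargs)
        self.encoding = encoding

# _generate_examples would then open files as:
#     with open(path, "r", encoding=self.config.encoding) as sent_file:
#         ...

Either way, the encoding still has to end up in the open() calls inside _generate_examples.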