Unable to load mozilla-foundation/common_voice_6_0 dataset

I am trying to load the mozilla-foundation/common_voice_6_0 dataset, but I am getting a JSONDecodeError. The same error occurs with other versions as well.

P.S. I can load the common_voice dataset without any problem.

from datasets import load_dataset

common_voice = load_dataset("mozilla-foundation/common_voice_6_0", "tr", split = "train+test+validation", use_auth_token=True)
Downloading and preparing dataset common_voice/tr to /home/ramil/.cache/huggingface/datasets/mozilla-foundation___common_voice/tr/6.0.0/1f4733442c50d49580aa34c328fae5f022e2e9fde47683da3cd3e71950cf7a6e...
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
File ~/miniconda3/envs/torch110/lib/python3.8/site-packages/requests/models.py:910, in Response.json(self, **kwargs)
    909 try:
--> 910     return complexjson.loads(self.text, **kwargs)
    911 except JSONDecodeError as e:
    912     # Catch JSON-related errors and raise as requests.JSONDecodeError
    913     # This aliases json.JSONDecodeError and simplejson.JSONDecodeError

File ~/miniconda3/envs/torch110/lib/python3.8/json/__init__.py:357, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    354 if (cls is None and object_hook is None and
    355         parse_int is None and parse_float is None and
    356         parse_constant is None and object_pairs_hook is None and not kw):
--> 357     return _default_decoder.decode(s)
    358 if cls is None:

File ~/miniconda3/envs/torch110/lib/python3.8/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
    333 """Return the Python representation of ``s`` (a ``str`` instance
    334 containing a JSON document).
    335 
    336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338 end = _w(s, end).end()

File ~/miniconda3/envs/torch110/lib/python3.8/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
    354 except StopIteration as err:
--> 355     raise JSONDecodeError("Expecting value", s, err.value) from None
    356 return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

JSONDecodeError                           Traceback (most recent call last)
Input In [9], in <module>
      1 from datasets import load_dataset, load_metric, Audio
----> 3 common_voice = load_dataset("mozilla-foundation/common_voice_6_0", "tr", split = "train+test+validation", use_auth_token=True)

File ~/miniconda3/envs/torch110/lib/python3.8/site-packages/datasets/load.py:1702, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)
   1699 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1701 # Download and prepare data
-> 1702 builder_instance.download_and_prepare(
   1703     download_config=download_config,
   1704     download_mode=download_mode,
   1705     ignore_verifications=ignore_verifications,
   1706     try_from_hf_gcs=try_from_hf_gcs,
   1707     use_auth_token=use_auth_token,
   1708 )
   1710 # Build dataset for splits
   1711 keep_in_memory = (
   1712     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1713 )

File ~/miniconda3/envs/torch110/lib/python3.8/site-packages/datasets/builder.py:594, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    592         logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    593 if not downloaded_from_gcs:
--> 594     self._download_and_prepare(
    595         dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    596     )
    597 # Sync info
    598 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/miniconda3/envs/torch110/lib/python3.8/site-packages/datasets/builder.py:661, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    659 split_dict = SplitDict(dataset_name=self.name)
    660 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
--> 661 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
    663 # Checksums verification
    664 if verify_infos:

File ~/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_6_0/1f4733442c50d49580aa34c328fae5f022e2e9fde47683da3cd3e71950cf7a6e/common_voice_6_0.py:153, in CommonVoice._split_generators(self, dl_manager)
    150 dl_manager.download_config.ignore_url_params = True
    152 self._log_download(self.config.name, bundle_version, hf_auth_token)
--> 153 archive = dl_manager.download(self._get_bundle_url(self.config.name, bundle_url_template))
    155 if self.config.version < datasets.Version("5.0.0"):
    156     path_to_data = ""

File ~/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_6_0/1f4733442c50d49580aa34c328fae5f022e2e9fde47683da3cd3e71950cf7a6e/common_voice_6_0.py:132, in CommonVoice._get_bundle_url(self, locale, url_template)
    130 path = urllib.parse.quote(path.encode("utf-8"), safe="~()*!.'")
    131 use_cdn = self.config.size_bytes < 20 * 1024 * 1024 * 1024
--> 132 response = requests.get(f"{_API_URL}/bucket/dataset/{path}/{use_cdn}", timeout=10.0).json()
    133 return response["url"]

File ~/miniconda3/envs/torch110/lib/python3.8/site-packages/requests/models.py:917, in Response.json(self, **kwargs)
    915     raise RequestsJSONDecodeError(e.message)
    916 else:
--> 917     raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)

JSONDecodeError: [Errno Expecting value] Not Found: 0

Hi, thanks for reporting! The issue should now be fixed (see "Loading mozilla-foundation/common_voice_7_0 dataset failed", Issue #4062 in huggingface/datasets on GitHub). Let us know if that’s not the case.

1 Like

Hi, it works perfectly. Thanks for fixing it!