Missing extraction tool? - load_dataset extraction crashes on Windows

Hello,

I wanted to use the following line to load a dataset

from datasets import load_dataset

dataset = load_dataset("facebook/voxpopuli", "fr")
len(dataset)

Downloading this dataset requires fetching ~10 GB of compressed data and extracting it, amounting to ~100 GB of data. All of this should be handled by this line of code. Unfortunately, it looks like my machine is missing an extraction plugin or something. This command runs fine on Colab, but on my machine it crashes after downloading the compressed data:

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[5], line 3
      1 from datasets import load_dataset
----> 3 dataset = load_dataset("facebook/voxpopuli", "fr", download_mode="force_redownload")
      4 len(dataset)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\load.py:1687, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1684 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1686 # Download and prepare data
-> 1687 builder_instance.download_and_prepare(
   1688     download_config=download_config,
   1689     download_mode=download_mode,
   1690     ignore_verifications=ignore_verifications,
   1691     try_from_hf_gcs=try_from_hf_gcs,
   1692     use_auth_token=use_auth_token,
   1693 )
   1695 # Build dataset for splits
   1696 keep_in_memory = (
   1697     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1698 )

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\builder.py:605, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    603         logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    604 if not downloaded_from_gcs:
--> 605     self._download_and_prepare(
    606         dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    607     )
    608 # Sync info
    609 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\builder.py:1104, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verify_infos)
   1103 def _download_and_prepare(self, dl_manager, verify_infos):
-> 1104     super()._download_and_prepare(dl_manager, verify_infos, check_duplicate_keys=verify_infos)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\builder.py:672, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    670 split_dict = SplitDict(dataset_name=self.name)
    671 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
--> 672 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
    674 # Checksums verification
    675 if verify_infos:

File ~\.cache\huggingface\modules\datasets_modules\datasets\facebook--voxpopuli\b5ff837284f0778eefe0f642734e142d8c3f574eba8c9c8a4b13602297f73604\voxpopuli.py:146, in Voxpopuli._split_generators(self, dl_manager)
    142 meta_paths = dl_manager.download_and_extract(meta_urls)
    143 audio_paths = dl_manager.download(audio_urls)
    145 local_extracted_audio_paths = (
--> 146     dl_manager.extract(audio_paths) if not dl_manager.is_streaming else
    147     {
    148         split: {lang: [None] * len(audio_paths[split][lang]) for lang in self.config.languages} for split in splits
    149     }
    150 )
    151 if self.config.name == "en_accented":
    152     return [
    153         datasets.SplitGenerator(
    154             name=datasets.Split.TEST,
   (...)
    163         ),
    164     ]

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\download_manager.py:355, in DownloadManager.extract(self, path_or_paths, num_proc)
    353 if download_config.download_desc is None:
    354     download_config.download_desc = "Downloading data"
--> 355 extracted_paths = map_nested(
    356     partial(cached_path, download_config=download_config),
    357     path_or_paths,
    358     num_proc=num_proc,
    359     disable_tqdm=not is_progress_bar_enabled(),
    360     desc="Extracting data files",
    361 )
    362 path_or_paths = NestedDataStructure(path_or_paths)
    363 extracted_paths = NestedDataStructure(extracted_paths)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:315, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, types, disable_tqdm, desc)
    312     num_proc = 1
    313 if num_proc <= 1 or len(iterable) <= num_proc:
    314     mapped = [
--> 315         _single_map_nested((function, obj, types, None, True, None))
    316         for obj in logging.tqdm(iterable, disable=disable_tqdm, desc=desc)
    317     ]
    318 else:
    319     split_kwds = []  # We organize the splits ourselve (contiguous splits)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:267, in _single_map_nested(args)
    264 pbar = logging.tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc)
    266 if isinstance(data_struct, dict):
--> 267     return {k: _single_map_nested((function, v, types, None, True, None)) for k, v in pbar}
    268 else:
    269     mapped = [_single_map_nested((function, v, types, None, True, None)) for v in pbar]

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:269, in _single_map_nested(args)
    267     return {k: _single_map_nested((function, v, types, None, True, None)) for k, v in pbar}
    268 else:
--> 269     mapped = [_single_map_nested((function, v, types, None, True, None)) for v in pbar]
    270     if isinstance(data_struct, list):
    271         return mapped

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:251, in _single_map_nested(args)
    249 # Singleton first to spare some computation
    250 if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
--> 251     return function(data_struct)
    253 # Reduce logging to keep things readable in multiprocessing with tqdm
    254 if rank is not None and logging.get_verbosity() < logging.WARNING:

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\file_utils.py:262, in cached_path(url_or_filename, download_config, **download_kwargs)
    259     return output_path
    261 if download_config.extract_compressed_file:
--> 262     output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(
    263         output_path, force_extract=download_config.force_extract
    264     )
    266 return output_path

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\extract.py:40, in ExtractManager.extract(self, input_path, force_extract)
     38 output_path = self._get_output_path(input_path)
     39 if self._do_extract(output_path, force_extract):
---> 40     self.extractor.extract(input_path, output_path, extractor=extractor)
     41 return output_path

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\extract.py:179, in Extractor.extract(cls, input_path, output_path, extractor)
    177 os.makedirs(os.path.dirname(output_path), exist_ok=True)
    178 if extractor:
--> 179     return extractor.extract(input_path, output_path)
    180 for extractor in cls.extractors:
    181     if extractor.is_extractable(input_path):

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\extract.py:53, in TarExtractor.extract(input_path, output_path)
     51 os.makedirs(output_path, exist_ok=True)
     52 tar_file = tarfile.open(input_path)
---> 53 tar_file.extractall(output_path)
     54 tar_file.close()

File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2269, in TarFile.extractall(self, path, members, numeric_owner, filter)
   2264     if tarinfo.isdir():
   2265         # For directories, delay setting attributes until later,
   2266         # since permissions can interfere with extraction and
   2267         # extracting contents can reset mtime.
   2268         directories.append(tarinfo)
-> 2269     self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(),
   2270                       numeric_owner=numeric_owner)
   2272 # Reverse sort directories.
   2273 directories.sort(key=lambda a: a.name, reverse=True)

File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2336, in TarFile._extract_one(self, tarinfo, path, set_attrs, numeric_owner)
   2332     self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
   2333                          set_attrs=set_attrs,
   2334                          numeric_owner=numeric_owner)
   2335 except OSError as e:
-> 2336     self._handle_fatal_error(e)
   2337 except ExtractError as e:
   2338     self._handle_nonfatal_error(e)

File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2332, in TarFile._extract_one(self, tarinfo, path, set_attrs, numeric_owner)
   2329 self._check("r")
   2331 try:
-> 2332     self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
   2333                          set_attrs=set_attrs,
   2334                          numeric_owner=numeric_owner)
   2335 except OSError as e:
   2336     self._handle_fatal_error(e)

File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2415, in TarFile._extract_member(self, tarinfo, targetpath, set_attrs, numeric_owner)
   2412     self._dbg(1, tarinfo.name)
   2414 if tarinfo.isreg():
-> 2415     self.makefile(tarinfo, targetpath)
   2416 elif tarinfo.isdir():
   2417     self.makedir(tarinfo, targetpath)

File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2461, in TarFile.makefile(self, tarinfo, targetpath)
   2459 source.seek(tarinfo.offset_data)
   2460 bufsize = self.copybufsize
-> 2461 with bltn_open(targetpath, "wb") as target:
   2462     if tarinfo.sparse is not None:
   2463         for offset, size in tarinfo.sparse:

OSError: [Errno 22] Invalid argument: 'C:\\Users\\crass\\.cache\\huggingface\\datasets\\downloads\\extracted\\a94dd87c5acd289650c238fcf4d2fe9b7e0c8bfecb9f21e95415fafee5583f82\\train_part_0\\20200212-0900-PLENARY-fr_20200212-18:11:25_1.wav'

Navigating through the folder, I noticed that the “extracted” directory is not filled with data. I use datasets==2.20 (and I already tried downgrading to some earlier versions). I don’t have any disk-space issues. I have 7zip installed on my computer as well as the pip package rarfile. I use Python 3.12.4 and Windows 11. Thanks for helping.

*** CORRECTION ***
I tried again on my Linux distro and it works fine!
Looking at the `OSError` above, the target filename contains a colon (`20200212-18:11:25_1.wav`), which is an illegal character in Windows paths — so extraction fails on Windows regardless of which extraction tool is installed. Maybe there should be a warning somewhere that Windows won't work for this dataset :confused: