Hello,
I wanted to use the following line to load a dataset
from datasets import load_dataset
dataset = load_dataset("facebook/voxpopuli", "fr")
len(dataset)
Downloading this dataset requires downloading ~10 GB of compressed data and extracting it, amounting to ~100 GB of data in total. All of this should be handled by that single line of code. Unfortunately, it looks like my machine is missing an extraction plugin or something similar. The command runs fine on Colab, but on my machine it crashes after downloading the compressed data:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Cell In[5], line 3
1 from datasets import load_dataset
----> 3 dataset = load_dataset("facebook/voxpopuli", "fr", download_mode="force_redownload")
4 len(dataset)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\load.py:1687, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
1684 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
1686 # Download and prepare data
-> 1687 builder_instance.download_and_prepare(
1688 download_config=download_config,
1689 download_mode=download_mode,
1690 ignore_verifications=ignore_verifications,
1691 try_from_hf_gcs=try_from_hf_gcs,
1692 use_auth_token=use_auth_token,
1693 )
1695 # Build dataset for splits
1696 keep_in_memory = (
1697 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1698 )
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\builder.py:605, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
603 logger.warning("HF google storage unreachable. Downloading and preparing it from source")
604 if not downloaded_from_gcs:
--> 605 self._download_and_prepare(
606 dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
607 )
608 # Sync info
609 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\builder.py:1104, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verify_infos)
1103 def _download_and_prepare(self, dl_manager, verify_infos):
-> 1104 super()._download_and_prepare(dl_manager, verify_infos, check_duplicate_keys=verify_infos)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\builder.py:672, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
670 split_dict = SplitDict(dataset_name=self.name)
671 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
--> 672 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
674 # Checksums verification
675 if verify_infos:
File ~\.cache\huggingface\modules\datasets_modules\datasets\facebook--voxpopuli\b5ff837284f0778eefe0f642734e142d8c3f574eba8c9c8a4b13602297f73604\voxpopuli.py:146, in Voxpopuli._split_generators(self, dl_manager)
142 meta_paths = dl_manager.download_and_extract(meta_urls)
143 audio_paths = dl_manager.download(audio_urls)
145 local_extracted_audio_paths = (
--> 146 dl_manager.extract(audio_paths) if not dl_manager.is_streaming else
147 {
148 split: {lang: [None] * len(audio_paths[split][lang]) for lang in self.config.languages} for split in splits
149 }
150 )
151 if self.config.name == "en_accented":
152 return [
153 datasets.SplitGenerator(
154 name=datasets.Split.TEST,
(...)
163 ),
164 ]
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\download_manager.py:355, in DownloadManager.extract(self, path_or_paths, num_proc)
353 if download_config.download_desc is None:
354 download_config.download_desc = "Downloading data"
--> 355 extracted_paths = map_nested(
356 partial(cached_path, download_config=download_config),
357 path_or_paths,
358 num_proc=num_proc,
359 disable_tqdm=not is_progress_bar_enabled(),
360 desc="Extracting data files",
361 )
362 path_or_paths = NestedDataStructure(path_or_paths)
363 extracted_paths = NestedDataStructure(extracted_paths)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:315, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, types, disable_tqdm, desc)
312 num_proc = 1
313 if num_proc <= 1 or len(iterable) <= num_proc:
314 mapped = [
--> 315 _single_map_nested((function, obj, types, None, True, None))
316 for obj in logging.tqdm(iterable, disable=disable_tqdm, desc=desc)
317 ]
318 else:
319 split_kwds = [] # We organize the splits ourselve (contiguous splits)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:267, in _single_map_nested(args)
264 pbar = logging.tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc)
266 if isinstance(data_struct, dict):
--> 267 return {k: _single_map_nested((function, v, types, None, True, None)) for k, v in pbar}
268 else:
269 mapped = [_single_map_nested((function, v, types, None, True, None)) for v in pbar]
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:269, in _single_map_nested(args)
267 return {k: _single_map_nested((function, v, types, None, True, None)) for k, v in pbar}
268 else:
--> 269 mapped = [_single_map_nested((function, v, types, None, True, None)) for v in pbar]
270 if isinstance(data_struct, list):
271 return mapped
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\py_utils.py:251, in _single_map_nested(args)
249 # Singleton first to spare some computation
250 if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
--> 251 return function(data_struct)
253 # Reduce logging to keep things readable in multiprocessing with tqdm
254 if rank is not None and logging.get_verbosity() < logging.WARNING:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\file_utils.py:262, in cached_path(url_or_filename, download_config, **download_kwargs)
259 return output_path
261 if download_config.extract_compressed_file:
--> 262 output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(
263 output_path, force_extract=download_config.force_extract
264 )
266 return output_path
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\extract.py:40, in ExtractManager.extract(self, input_path, force_extract)
38 output_path = self._get_output_path(input_path)
39 if self._do_extract(output_path, force_extract):
---> 40 self.extractor.extract(input_path, output_path, extractor=extractor)
41 return output_path
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\extract.py:179, in Extractor.extract(cls, input_path, output_path, extractor)
177 os.makedirs(os.path.dirname(output_path), exist_ok=True)
178 if extractor:
--> 179 return extractor.extract(input_path, output_path)
180 for extractor in cls.extractors:
181 if extractor.is_extractable(input_path):
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\datasets\utils\extract.py:53, in TarExtractor.extract(input_path, output_path)
51 os.makedirs(output_path, exist_ok=True)
52 tar_file = tarfile.open(input_path)
---> 53 tar_file.extractall(output_path)
54 tar_file.close()
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2269, in TarFile.extractall(self, path, members, numeric_owner, filter)
2264 if tarinfo.isdir():
2265 # For directories, delay setting attributes until later,
2266 # since permissions can interfere with extraction and
2267 # extracting contents can reset mtime.
2268 directories.append(tarinfo)
-> 2269 self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(),
2270 numeric_owner=numeric_owner)
2272 # Reverse sort directories.
2273 directories.sort(key=lambda a: a.name, reverse=True)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2336, in TarFile._extract_one(self, tarinfo, path, set_attrs, numeric_owner)
2332 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2333 set_attrs=set_attrs,
2334 numeric_owner=numeric_owner)
2335 except OSError as e:
-> 2336 self._handle_fatal_error(e)
2337 except ExtractError as e:
2338 self._handle_nonfatal_error(e)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2332, in TarFile._extract_one(self, tarinfo, path, set_attrs, numeric_owner)
2329 self._check("r")
2331 try:
-> 2332 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2333 set_attrs=set_attrs,
2334 numeric_owner=numeric_owner)
2335 except OSError as e:
2336 self._handle_fatal_error(e)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2415, in TarFile._extract_member(self, tarinfo, targetpath, set_attrs, numeric_owner)
2412 self._dbg(1, tarinfo.name)
2414 if tarinfo.isreg():
-> 2415 self.makefile(tarinfo, targetpath)
2416 elif tarinfo.isdir():
2417 self.makedir(tarinfo, targetpath)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1264.0_x64__qbz5n2kfra8p0\Lib\tarfile.py:2461, in TarFile.makefile(self, tarinfo, targetpath)
2459 source.seek(tarinfo.offset_data)
2460 bufsize = self.copybufsize
-> 2461 with bltn_open(targetpath, "wb") as target:
2462 if tarinfo.sparse is not None:
2463 for offset, size in tarinfo.sparse:
OSError: [Errno 22] Invalid argument: 'C:\\Users\\crass\\.cache\\huggingface\\datasets\\downloads\\extracted\\a94dd87c5acd289650c238fcf4d2fe9b7e0c8bfecb9f21e95415fafee5583f82\\train_part_0\\20200212-0900-PLENARY-fr_20200212-18:11:25_1.wav'
Navigating through the folder, I noticed that the “extracted” directory is not filled with data. I use datasets == 2.20 (and I already tried some downgrading). I don’t have any disk space issues. I have 7zip installed on my computer along with the pip package rarfile. I use Python 3.12.4 and Windows 11. Note that the failing path in the error above contains colons (`…-fr_20200212-18:11:25_1.wav`), and `:` is not a valid character in Windows filenames — that would explain the `OSError: [Errno 22] Invalid argument` during extraction. Thanks for helping.
*** CORRECTION ***
I tried again on my Linux distro and it works fine!
Maybe there should be a warning somewhere that Windows won't work for this dataset, since its archive members have colons in their filenames, which Windows does not allow.