Hello there,
I’m trying to load a few datasets using datasets.load_dataset
. However, many of them failed with an error similar to this one:
FileNotFoundError: Unable to find 'hf://datasets/gigaword@598c069a6891a3566cea85b7b7a02b75a3475275/default/train/0000.parquet' with any supported extension ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.arrow', '.txt', '.tar', '.blp', '.bmp', '.dib', '.bufr', '.cur', '.pcx', '.dcx', '.dds', '.ps', '.eps', '.fit', '.fits', '.fli', '.flc', '.ftc', '.ftu', '.gbr', '.gif', '.grib', '.h5', '.hdf', '.png', '.apng', '.jp2', '.j2k', '.jpc', '.jpf', '.jpx', '.j2c', '.icns', '.ico', '.im', '.iim', '.tif', '.tiff', '.jfif', '.jpe', '.jpg', '.jpeg', '.mpg', '.mpeg', '.msp', '.pcd', '.pxr', '.pbm', '.pgm', '.ppm', '.pnm', '.psd', '.bw', '.rgb', '.rgba', '.sgi', '.ras', '.tga', '.icb', '.vda', '.vst', '.webp', '.wmf', '.emf', '.xbm', '.xpm', '.BLP', '.BMP', '.DIB', '.BUFR', '.CUR', '.PCX', '.DCX', '.DDS', '.PS', '.EPS', '.FIT', '.FITS', '.FLI', '.FLC', '.FTC', '.FTU', '.GBR', '.GIF', '.GRIB', '.H5', '.HDF', '.PNG', '.APNG', '.JP2', '.J2K', '.JPC', '.JPF', '.JPX', '.J2C', '.ICNS', '.ICO', '.IM', '.IIM', '.TIF', '.TIFF', '.JFIF', '.JPE', '.JPG', '.JPEG', '.MPG', '.MPEG', '.MSP', '.PCD', '.PXR', '.PBM', '.PGM', '.PPM', '.PNM', '.PSD', '.BW', '.RGB', '.RGBA', '.SGI', '.RAS', '.TGA', '.ICB', '.VDA', '.VST', '.WEBP', '.WMF', '.EMF', '.XBM', '.XPM', '.aiff', '.au', '.avr', '.caf', '.flac', '.htk', '.svx', '.mat4', '.mat5', '.mpc2k', '.ogg', '.paf', '.pvf', '.raw', '.rf64', '.sd2', '.sds', '.ircam', '.voc', '.w64', '.wav', '.nist', '.wavex', '.wve', '.xi', '.mp3', '.opus', '.AIFF'...
I have encountered issues for these datasets: gigaword
, allenai/multi_lexsum
, super_glue
. The issue can be reproduced on Google Colab using the default Python 3 CPU runtime. Here’s the full stack trace for datasets.load_dataset("allenai/multi_lexsum", "v20220616")
:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-24-3788860837e7> in <cell line: 1>()
----> 1 multilexsum = datasets.load_dataset("allenai/multi_lexsum", "v20220616")
7 frames
/usr/local/lib/python3.10/dist-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2521
2522 # Create a dataset builder
-> 2523 builder_instance = load_dataset_builder(
2524 path=path,
2525 name=name,
/usr/local/lib/python3.10/dist-packages/datasets/load.py in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, use_auth_token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)
2230 builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)
2231 # Instantiate the dataset builder
-> 2232 builder_instance: DatasetBuilder = builder_cls(
2233 cache_dir=cache_dir,
2234 dataset_name=dataset_name,
/usr/local/lib/python3.10/dist-packages/datasets/builder.py in __init__(self, cache_dir, dataset_name, config_name, hash, base_path, info, features, token, use_auth_token, repo_id, data_files, data_dir, storage_options, writer_batch_size, name, **config_kwargs)
369 if data_dir is not None:
370 config_kwargs["data_dir"] = data_dir
--> 371 self.config, self.config_id = self._create_builder_config(
372 config_name=config_name,
373 custom_features=features,
/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _create_builder_config(self, config_name, custom_features, **config_kwargs)
618
619 # resolve data files if needed
--> 620 builder_config._resolve_data_files(
621 base_path=self.base_path,
622 download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _resolve_data_files(self, base_path, download_config)
209 if isinstance(self.data_files, DataFilesPatternsDict):
210 base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path
--> 211 self.data_files = self.data_files.resolve(base_path, download_config)
212
213
/usr/local/lib/python3.10/dist-packages/datasets/data_files.py in resolve(self, base_path, download_config)
786 out = DataFilesDict()
787 for key, data_files_patterns_list in self.items():
--> 788 out[key] = data_files_patterns_list.resolve(base_path, download_config)
789 return out
790
/usr/local/lib/python3.10/dist-packages/datasets/data_files.py in resolve(self, base_path, download_config)
739 try:
740 data_files.extend(
--> 741 resolve_pattern(
742 pattern,
743 base_path=base_path,
/usr/local/lib/python3.10/dist-packages/datasets/data_files.py in resolve_pattern(pattern, base_path, allowed_extensions, download_config)
381 if allowed_extensions is not None:
382 error_msg += f" with any supported extension {list(allowed_extensions)}"
--> 383 raise FileNotFoundError(error_msg)
384 return out
385
FileNotFoundError: Unable to find 'hf://datasets/allenai/multi_lexsum@fac423ae35d43248f966361fa96cd909fd1ef243/v20220616/train/0000.parquet' with any supported extension ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.arrow', '.txt', '.tar', '.blp', '.bmp', '.dib', '.bufr', '.cur', '.pcx', '.dcx', '.dds', '.ps', '.eps', '.fit', '.fits', '.fli', '.flc', '.ftc', '.ftu', '.gbr', '.gif', '.grib', '.h5', '.hdf', '.png', '.apng', '.jp2', '.j2k', '.jpc', '.jpf', '.jpx', '.j2c', '.icns', '.ico', '.im', '.iim', '.tif', '.tiff', '.jfif', '.jpe', '.jpg', '.jpeg', '.mpg', '.mpeg', '.msp', '.pcd', '.pxr', '.pbm', '.pgm', '.ppm', '.pnm', '.psd', '.bw', '.rgb', '.rgba', '.sgi', '.ras', '.tga', '.icb', '.vda', '.vst', '.webp', '.wmf', '.emf', '.xbm', '.xpm', '.BLP', '.BMP', '.DIB', '.BUFR', '.CUR', '.PCX', '.DCX', '.DDS', '.PS', '.EPS', '.FIT', '.FITS', '.FLI', '.FLC', '.FTC', '.FTU', '.GBR', '.GIF', '.GRIB', '.H5', '.HDF', '.PNG', '.APNG', '.JP2', '.J2K', '.JPC', '.JPF', '.JPX', '.J2C', '.ICNS', '.ICO', '.IM', '.IIM', '.TIF', '.TIFF', '.JFIF', '.JPE', '.JPG', '.JPEG', '.MPG', '.MPEG', '.MSP', '.PCD', '.PXR', '.PBM', '.PGM', '.PPM', '.PNM', '.PSD', '.BW', '.RGB', '.RGBA', '.SGI', '.RAS', '.TGA', '.ICB', '.VDA', '.VST', '.WEBP', '.WMF', '.EMF', '.XBM', '.XPM', '.aiff', '.au', '.avr', '.caf', '.flac', '.htk', '.svx', '.mat4', '.mat5', '.mpc2k', '.ogg', '.paf', '.pvf', '.raw', '.rf64', '.sd2', '.sds', '.ircam', '.voc', '.w64', '.wav', '.nist', '.wavex', '.wve', '.xi', '.mp3', '....
As far as I can tell, the data is there and is publicly accessible.