Hi all,
I have been working with the Banking77 dataset.
from datasets import load_dataset
ds = load_dataset("PolyAI/banking77")
It worked about an hour ago; however, I now get the following error:
KeyError Traceback (most recent call last)
Cell In[14], line 3
1 from datasets import load_dataset
----> 3 ds = load_dataset("PolyAI/banking77")
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/load.py:2628, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2625 return builder_instance.as_streaming_dataset(split=split)
2627 # Download and prepare data
→ 2628 builder_instance.download_and_prepare(
2629 download_config=download_config,
2630 download_mode=download_mode,
2631 verification_mode=verification_mode,
2632 num_proc=num_proc,
2633 storage_options=storage_options,
2634 )
2636 # Build dataset for splits
2637 keep_in_memory = (
2638 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2639 )
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/builder.py:1029, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
1027 if num_proc is not None:
1028 prepare_split_kwargs[“num_proc”] = num_proc
→ 1029 self._download_and_prepare(
1030 dl_manager=dl_manager,
1031 verification_mode=verification_mode,
1032 **prepare_split_kwargs,
1033 **download_and_prepare_kwargs,
1034 )
1035 # Sync info
1036 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/builder.py:1102, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
1100 split_dict = SplitDict(dataset_name=self.dataset_name)
1101 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
→ 1102 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
1104 # Checksums verification
1105 if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/packaged_modules/parquet/parquet.py:47, in Parquet._split_generators(self, dl_manager)
45 raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
46 dl_manager.download_config.extract_on_the_fly = True
—> 47 data_files = dl_manager.download_and_extract(self.config.data_files)
48 splits = []
49 for split_name, files in data_files.items():
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/download/download_manager.py:434, in DownloadManager.download_and_extract(self, url_or_urls)
418 def download_and_extract(self, url_or_urls):
419 """Download and extract given `url_or_urls`.
420
421 Is roughly equivalent to:
(…)
432 extracted_path(s): `str`, extracted paths of given URL(s).
433 """
→ 434 return self.extract(self.download(url_or_urls))
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/download/download_manager.py:257, in DownloadManager.download(self, url_or_urls)
255 start_time = datetime.now()
256 with stack_multiprocessing_download_progress_bars():
→ 257 downloaded_path_or_paths = map_nested(
258 download_func,
259 url_or_urls,
260 map_tuple=True,
261 num_proc=download_config.num_proc,
262 desc=“Downloading data files”,
263 batched=True,
264 batch_size=-1,
265 )
266 duration = datetime.now() - start_time
267 logger.info(f"Downloading took {duration.total_seconds() // 60} min")
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/py_utils.py:511, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, batched, batch_size, types, disable_tqdm, desc)
509 batch_size = max(len(iterable) // num_proc + int(len(iterable) % num_proc > 0), 1)
510 iterable = list(iter_batched(iterable, batch_size))
→ 511 mapped = [
512 _single_map_nested((function, obj, batched, batch_size, types, None, True, None))
513 for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
514 ]
515 if batched:
516 mapped = [mapped_item for mapped_batch in mapped for mapped_item in mapped_batch]
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/py_utils.py:512, in (.0)
509 batch_size = max(len(iterable) // num_proc + int(len(iterable) % num_proc > 0), 1)
510 iterable = list(iter_batched(iterable, batch_size))
511 mapped = [
→ 512 _single_map_nested((function, obj, batched, batch_size, types, None, True, None))
513 for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
514 ]
515 if batched:
516 mapped = [mapped_item for mapped_batch in mapped for mapped_item in mapped_batch]
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/py_utils.py:399, in _single_map_nested(args)
395 return {
396 k: _single_map_nested((function, v, batched, batch_size, types, None, True, None)) for k, v in pbar
397 }
398 else:
→ 399 mapped = [_single_map_nested((function, v, batched, batch_size, types, None, True, None)) for v in pbar]
400 if isinstance(data_struct, list):
401 return mapped
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/py_utils.py:399, in (.0)
395 return {
396 k: _single_map_nested((function, v, batched, batch_size, types, None, True, None)) for k, v in pbar
397 }
398 else:
→ 399 mapped = [_single_map_nested((function, v, batched, batch_size, types, None, True, None)) for v in pbar]
400 if isinstance(data_struct, list):
401 return mapped
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/py_utils.py:380, in _single_map_nested(args)
373 return function(data_struct)
374 if (
375 batched
376 and not isinstance(data_struct, dict)
377 and isinstance(data_struct, types)
378 and all(not isinstance(v, (dict, types)) for v in data_struct)
379 ):
→ 380 return [mapped_item for batch in iter_batched(data_struct, batch_size) for mapped_item in function(batch)]
382 # Reduce logging to keep things readable in multiprocessing with tqdm
383 if rank is not None and logging.get_verbosity() < logging.WARNING:
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/py_utils.py:380, in (.0)
373 return function(data_struct)
374 if (
375 batched
376 and not isinstance(data_struct, dict)
377 and isinstance(data_struct, types)
378 and all(not isinstance(v, (dict, types)) for v in data_struct)
379 ):
→ 380 return [mapped_item for batch in iter_batched(data_struct, batch_size) for mapped_item in function(batch)]
382 # Reduce logging to keep things readable in multiprocessing with tqdm
383 if rank is not None and logging.get_verbosity() < logging.WARNING:
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/download/download_manager.py:313, in DownloadManager._download_batched(self, url_or_filenames, download_config)
300 return thread_map(
301 download_func,
302 url_or_filenames,
(…)
310 tqdm_class=tqdm,
311 )
312 else:
→ 313 return [
314 self._download_single(url_or_filename, download_config=download_config)
315 for url_or_filename in url_or_filenames
316 ]
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/download/download_manager.py:314, in (.0)
300 return thread_map(
301 download_func,
302 url_or_filenames,
(…)
310 tqdm_class=tqdm,
311 )
312 else:
313 return [
→ 314 self._download_single(url_or_filename, download_config=download_config)
315 for url_or_filename in url_or_filenames
316 ]
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/download/download_manager.py:323, in DownloadManager._download_single(self, url_or_filename, download_config)
320 if is_relative_path(url_or_filename):
321 # append the relative path to the base_path
322 url_or_filename = url_or_path_join(self._base_path, url_or_filename)
→ 323 out = cached_path(url_or_filename, download_config=download_config)
324 out = tracked_str(out)
325 out.set_origin(url_or_filename)
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/file_utils.py:211, in cached_path(url_or_filename, download_config, **download_kwargs)
205 if (
206 storage_options
207 and storage_options.keys() < {“http”, “https”}
208 and not (download_config.storage_options and download_config.storage_options.keys() < {“http”, “https”})
209 ):
210 storage_options = {}
→ 211 output_path = get_from_cache(
212 url_or_filename,
213 cache_dir=cache_dir,
214 force_download=download_config.force_download,
215 proxies=download_config.proxies,
216 resume_download=download_config.resume_download,
217 user_agent=download_config.user_agent,
218 local_files_only=download_config.local_files_only,
219 use_etag=download_config.use_etag,
220 max_retries=download_config.max_retries,
221 token=download_config.token,
222 ignore_url_params=download_config.ignore_url_params,
223 storage_options=storage_options,
224 download_desc=download_config.download_desc,
225 disable_tqdm=download_config.disable_tqdm,
226 )
227 elif os.path.exists(url_or_filename):
228 # File, and it exists.
229 output_path = url_or_filename
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/file_utils.py:583, in get_from_cache(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, token, use_auth_token, ignore_url_params, storage_options, download_desc, disable_tqdm)
581 connected = ftp_head(url)
582 elif scheme not in {“http”, “https”} or storage_options.get(scheme):
→ 583 response = fsspec_head(url, storage_options=storage_options)
584 # s3fs uses “ETag”, gcsfs uses “etag”
585 etag = (response.get(“ETag”, None) or response.get(“etag”, None)) if use_etag else None
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/datasets/utils/file_utils.py:361, in fsspec_head(url, storage_options)
359 _raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
360 fs, path = url_to_fs(url, **(storage_options or {}))
→ 361 return fs.info(path)
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py:540, in HfFileSystem.info(self, path, refresh, revision, **kwargs)
538 out = out1[0]
539 if refresh or out is None or (expand_info and out and out[“last_commit”] is None):
→ 540 paths_info = self._api.get_paths_info(
541 resolved_path.repo_id,
542 resolved_path.path_in_repo,
543 expand=expand_info,
544 revision=resolved_path.revision,
545 repo_type=resolved_path.repo_type,
546 )
547 if not paths_info:
548 _raise_file_not_found(path, None)
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
→ 114 return fn(*args, **kwargs)
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/huggingface_hub/hf_api.py:3145, in HfApi.get_paths_info(self, repo_id, paths, expand, revision, repo_type, token)
3143 hf_raise_for_status(response)
3144 paths_info = response.json()
→ 3145 return [
3146 RepoFile(**path_info) if path_info[“type”] == “file” else RepoFolder(**path_info)
3147 for path_info in paths_info
3148 ]
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/huggingface_hub/hf_api.py:3146, in (.0)
3143 hf_raise_for_status(response)
3144 paths_info = response.json()
3145 return [
→ 3146 RepoFile(**path_info) if path_info[“type”] == “file” else RepoFolder(**path_info)
3147 for path_info in paths_info
3148 ]
File ~/anaconda3/envs/rapids-24.08/lib/python3.11/site-packages/huggingface_hub/hf_api.py:638, in RepoFile.__init__(self, **kwargs)
635 security = kwargs.pop(“security”, None)
636 if security is not None:
637 security = BlobSecurityInfo(
→ 638 safe=security["safe"], av_scan=security["avScan"], pickle_import_scan=security["pickleImportScan"]
639 )
640 self.security = security
642 # backwards compatibility
KeyError: 'safe'
I would just like some advice on how to overcome this error.
thanks
Kind Regards
Reshay