Hi! I have a small public test dataset here (trajectories from a reinforcement learning agent): Howuhh/nle_hf_dataset at main
It has a simple structure with two directories: metadata, which holds a JSON file for each episode, and data, which holds an HDF5 file for each episode.
Thus, metadata/*.json and data/*.hdf5.
I wrote a custom loading script for the dataset. I don’t want to load all the files, since the final dataset will be big, so I tried to load only a subset of them with the data_files option. It works locally when I call it like this:
load_dataset("nle_hf_dataset.py", "metadata", data_files=["metadata/1.json", "metadata/2.json"])
That call loads only the needed files. However, when I try to load from the Hub repo, it fails:
load_dataset("Howuhh/nle_hf_dataset", "metadata", data_files=["metadata/1.json", "metadata/2.json"])
with a strange error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In [36], line 1
----> 1 load_dataset("Howuhh/nle_hf_dataset", "metadata", data_files=["metadata/1.json", "metadata/2.json"])
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/load.py:1773, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
1768 verification_mode = VerificationMode(
1769 (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
1770 )
1772 # Create a dataset builder
-> 1773 builder_instance = load_dataset_builder(
1774 path=path,
1775 name=name,
1776 data_dir=data_dir,
1777 data_files=data_files,
1778 cache_dir=cache_dir,
1779 features=features,
1780 download_config=download_config,
1781 download_mode=download_mode,
1782 revision=revision,
1783 use_auth_token=use_auth_token,
1784 storage_options=storage_options,
1785 **config_kwargs,
1786 )
1788 # Return iterable dataset in case of streaming
1789 if streaming:
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/load.py:1528, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, storage_options, **config_kwargs)
1525 raise ValueError(error_msg)
1527 # Instantiate the dataset builder
-> 1528 builder_instance: DatasetBuilder = builder_cls(
1529 cache_dir=cache_dir,
1530 config_name=config_name,
1531 data_dir=data_dir,
1532 data_files=data_files,
1533 hash=hash,
1534 features=features,
1535 use_auth_token=use_auth_token,
1536 storage_options=storage_options,
1537 **builder_kwargs,
1538 **config_kwargs,
1539 )
1541 return builder_instance
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/builder.py:329, in DatasetBuilder.__init__(self, cache_dir, config_name, hash, base_path, info, features, use_auth_token, repo_id, data_files, data_dir, storage_options, writer_batch_size, name, **config_kwargs)
326 self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
328 if data_files is not None and not isinstance(data_files, DataFilesDict):
--> 329 data_files = DataFilesDict.from_local_or_remote(
330 sanitize_patterns(data_files), base_path=base_path, use_auth_token=use_auth_token
331 )
333 # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
334 if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:783, in DataFilesDict.from_local_or_remote(cls, patterns, base_path, allowed_extensions, use_auth_token)
780 out = cls()
781 for key, patterns_for_key in patterns.items():
782 out[key] = (
--> 783 DataFilesList.from_local_or_remote(
784 patterns_for_key,
785 base_path=base_path,
786 allowed_extensions=allowed_extensions,
787 use_auth_token=use_auth_token,
788 )
789 if not isinstance(patterns_for_key, DataFilesList)
790 else patterns_for_key
791 )
792 return out
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:751, in DataFilesList.from_local_or_remote(cls, patterns, base_path, allowed_extensions, use_auth_token)
742 @classmethod
743 def from_local_or_remote(
744 cls,
(...)
748 use_auth_token: Optional[Union[bool, str]] = None,
749 ) -> "DataFilesList":
750 base_path = base_path if base_path is not None else str(Path().resolve())
--> 751 data_files = resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
752 origin_metadata = _get_origin_metadata_locally_or_by_urls(data_files, use_auth_token=use_auth_token)
753 return cls(data_files, origin_metadata)
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:349, in resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
347 data_files.append(Url(pattern))
348 else:
--> 349 for path in _resolve_single_pattern_locally(base_path, pattern, allowed_extensions):
350 data_files.append(path)
352 if not data_files:
File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:293, in _resolve_single_pattern_locally(base_path, pattern, allowed_extensions)
291 if allowed_extensions is not None:
292 error_msg += f" with any supported extension {list(allowed_extensions)}"
--> 293 raise FileNotFoundError(error_msg)
294 return sorted(out)
FileNotFoundError: Unable to find 'https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json' at /Users/a.p.nikulin/All/nle_hf_dataset/https:/huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main
I don’t understand why it is searching in such a strange path (and why there is https in it…). As I understood from the documentation, I can specify relative paths even for a repository, not just for local files.
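My guess, based purely on the error message, is that the already-resolved hub URL ends up being joined onto my local working directory as if it were a relative pattern, something like this (just reproducing the odd path, not the actual library code):

```python
from pathlib import Path

# Guess at how the path in the error could be produced: joining the resolved
# hub URL onto the local base path also collapses "https://" into "https:/".
base_path = "/Users/a.p.nikulin/All/nle_hf_dataset"
pattern = "https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json"
print(Path(base_path, pattern))
# /Users/a.p.nikulin/All/nle_hf_dataset/https:/huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json
```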