data_files not working with custom loading script and remote dataset

Hi! I have a small public test dataset (trajectories from a reinforcement learning agent) here: Howuhh/nle_hf_dataset (main branch)

It has a simple structure with two directories: metadata, which contains a JSON file for each episode, and data, which contains an HDF5 file for each episode.

So the file patterns are metadata/*.json and data/*.hdf5.
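
For reference, the repo layout looks roughly like this (I'm assuming the data files follow the same per-episode numbering as the metadata files):

nle_hf_dataset/
├── metadata/
│   ├── 1.json
│   ├── 2.json
│   └── ...
└── data/
    ├── 1.hdf5
    ├── 2.hdf5
    └── ...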
I wrote a custom script to load it as a dataset. I don't want to load all the files, since the full dataset will be big, so I tried to load only a subset with the data_files option. It works locally when I call it like this:

load_dataset("nle_hf_dataset.py", "metadata", data_files=["metadata/1.json", "metadata/2.json"])

This loads only the needed files. However, when I try to load from the remote repo, it fails:

load_dataset("Howuhh/nle_hf_dataset", "metadata", data_files=["metadata/1.json", "metadata/2.json"])

with a strange error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In [36], line 1
----> 1 load_dataset("Howuhh/nle_hf_dataset", "metadata", data_files=["metadata/1.json", "metadata/2.json"])

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/load.py:1773, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1768 verification_mode = VerificationMode(
   1769     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1770 )
   1772 # Create a dataset builder
-> 1773 builder_instance = load_dataset_builder(
   1774     path=path,
   1775     name=name,
   1776     data_dir=data_dir,
   1777     data_files=data_files,
   1778     cache_dir=cache_dir,
   1779     features=features,
   1780     download_config=download_config,
   1781     download_mode=download_mode,
   1782     revision=revision,
   1783     use_auth_token=use_auth_token,
   1784     storage_options=storage_options,
   1785     **config_kwargs,
   1786 )
   1788 # Return iterable dataset in case of streaming
   1789 if streaming:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/load.py:1528, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, storage_options, **config_kwargs)
   1525     raise ValueError(error_msg)
   1527 # Instantiate the dataset builder
-> 1528 builder_instance: DatasetBuilder = builder_cls(
   1529     cache_dir=cache_dir,
   1530     config_name=config_name,
   1531     data_dir=data_dir,
   1532     data_files=data_files,
   1533     hash=hash,
   1534     features=features,
   1535     use_auth_token=use_auth_token,
   1536     storage_options=storage_options,
   1537     **builder_kwargs,
   1538     **config_kwargs,
   1539 )
   1541 return builder_instance

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/builder.py:329, in DatasetBuilder.__init__(self, cache_dir, config_name, hash, base_path, info, features, use_auth_token, repo_id, data_files, data_dir, storage_options, writer_batch_size, name, **config_kwargs)
    326 self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
    328 if data_files is not None and not isinstance(data_files, DataFilesDict):
--> 329     data_files = DataFilesDict.from_local_or_remote(
    330         sanitize_patterns(data_files), base_path=base_path, use_auth_token=use_auth_token
    331     )
    333 # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
    334 if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:783, in DataFilesDict.from_local_or_remote(cls, patterns, base_path, allowed_extensions, use_auth_token)
    780 out = cls()
    781 for key, patterns_for_key in patterns.items():
    782     out[key] = (
--> 783         DataFilesList.from_local_or_remote(
    784             patterns_for_key,
    785             base_path=base_path,
    786             allowed_extensions=allowed_extensions,
    787             use_auth_token=use_auth_token,
    788         )
    789         if not isinstance(patterns_for_key, DataFilesList)
    790         else patterns_for_key
    791     )
    792 return out

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:751, in DataFilesList.from_local_or_remote(cls, patterns, base_path, allowed_extensions, use_auth_token)
    742 @classmethod
    743 def from_local_or_remote(
    744     cls,
   (...)
    748     use_auth_token: Optional[Union[bool, str]] = None,
    749 ) -> "DataFilesList":
    750     base_path = base_path if base_path is not None else str(Path().resolve())
--> 751     data_files = resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
    752     origin_metadata = _get_origin_metadata_locally_or_by_urls(data_files, use_auth_token=use_auth_token)
    753     return cls(data_files, origin_metadata)

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:349, in resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
    347         data_files.append(Url(pattern))
    348     else:
--> 349         for path in _resolve_single_pattern_locally(base_path, pattern, allowed_extensions):
    350             data_files.append(path)
    352 if not data_files:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:293, in _resolve_single_pattern_locally(base_path, pattern, allowed_extensions)
    291     if allowed_extensions is not None:
    292         error_msg += f" with any supported extension {list(allowed_extensions)}"
--> 293     raise FileNotFoundError(error_msg)
    294 return sorted(out)

FileNotFoundError: Unable to find 'https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json' at /Users/a.p.nikulin/All/nle_hf_dataset/https:/huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main

I don't understand why it is searching such a strange path (or why there is an https in it…). As I understood from the documentation, I can specify relative paths even for a repository, not just for local files.
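
My best guess at what is going on, judging by the last traceback frames: the relative pattern is first resolved to a full hf.co URL, but that URL is then treated as a relative local path, joined with my local working directory, and normalized, which collapses the // in https:// to a single slash. A minimal sketch that reproduces the exact path from the error (this is my reconstruction from the traceback, not the actual datasets internals):

import os

# the URL that the pattern "metadata/1.json" was apparently resolved to
pattern = "https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json"
# my local working directory, as shown in the error message
base_path = "/Users/a.p.nikulin/All/nle_hf_dataset"

# joining the URL as if it were a relative path and then normalizing it
# collapses "https://" into "https:/", matching the error path exactly
print(os.path.normpath(os.path.join(base_path, pattern)))
# /Users/a.p.nikulin/All/nle_hf_dataset/https:/huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json

If that reading is right, then passing fully qualified URLs in data_files (e.g. "https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/1.json") might take the remote-URL branch in resolve_patterns_locally_or_by_urls and sidestep the local resolution entirely, though I haven't verified this.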