Data files not working with custom loading script and dataset

Hi! I have a small test dataset of reinforcement learning agent episodes at Howuhh/nle_hf_dataset with a simple structure:

  1. a separate dir with metadata: one JSON file per episode
  2. a separate dir with data: one HDF5 file per episode

I wrote a custom loading script that loads the data according to the config name, "metadata" or "data". I want to be able to load only parts of the data based on metadata filtering, so I need the data_files functionality to work, like this:

load_dataset("Howuhh/nle_hf_dataset", "metadata", data_files=["metadata/2.json"])

I tested it locally, passing the path to the script instead of the dataset name, and it works: only the specified parts are loaded. However, when I try the same call with the dataset name, it fails with this error:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/load.py:1773, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1768 verification_mode = VerificationMode(
   1769     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1770 )
   1772 # Create a dataset builder
-> 1773 builder_instance = load_dataset_builder(
   1774     path=path,
   1775     name=name,
   1776     data_dir=data_dir,
   1777     data_files=data_files,
   1778     cache_dir=cache_dir,
   1779     features=features,
   1780     download_config=download_config,
   1781     download_mode=download_mode,
   1782     revision=revision,
   1783     use_auth_token=use_auth_token,
   1784     storage_options=storage_options,
   1785     **config_kwargs,
   1786 )
   1788 # Return iterable dataset in case of streaming
   1789 if streaming:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/load.py:1528, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, storage_options, **config_kwargs)
   1525     raise ValueError(error_msg)
   1527 # Instantiate the dataset builder
-> 1528 builder_instance: DatasetBuilder = builder_cls(
   1529     cache_dir=cache_dir,
   1530     config_name=config_name,
   1531     data_dir=data_dir,
   1532     data_files=data_files,
   1533     hash=hash,
   1534     features=features,
   1535     use_auth_token=use_auth_token,
   1536     storage_options=storage_options,
   1537     **builder_kwargs,
   1538     **config_kwargs,
   1539 )
   1541 return builder_instance

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/builder.py:329, in DatasetBuilder.__init__(self, cache_dir, config_name, hash, base_path, info, features, use_auth_token, repo_id, data_files, data_dir, storage_options, writer_batch_size, name, **config_kwargs)
    326 self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
    328 if data_files is not None and not isinstance(data_files, DataFilesDict):
--> 329     data_files = DataFilesDict.from_local_or_remote(
    330         sanitize_patterns(data_files), base_path=base_path, use_auth_token=use_auth_token
    331     )
    333 # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
    334 if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:783, in DataFilesDict.from_local_or_remote(cls, patterns, base_path, allowed_extensions, use_auth_token)
    780 out = cls()
    781 for key, patterns_for_key in patterns.items():
    782     out[key] = (
--> 783         DataFilesList.from_local_or_remote(
    784             patterns_for_key,
    785             base_path=base_path,
    786             allowed_extensions=allowed_extensions,
    787             use_auth_token=use_auth_token,
    788         )
    789         if not isinstance(patterns_for_key, DataFilesList)
    790         else patterns_for_key
    791     )
    792 return out

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:751, in DataFilesList.from_local_or_remote(cls, patterns, base_path, allowed_extensions, use_auth_token)
    742 @classmethod
    743 def from_local_or_remote(
    744     cls,
   (...)
    748     use_auth_token: Optional[Union[bool, str]] = None,
    749 ) -> "DataFilesList":
    750     base_path = base_path if base_path is not None else str(Path().resolve())
--> 751     data_files = resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
    752     origin_metadata = _get_origin_metadata_locally_or_by_urls(data_files, use_auth_token=use_auth_token)
    753     return cls(data_files, origin_metadata)

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:349, in resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
    347         data_files.append(Url(pattern))
    348     else:
--> 349         for path in _resolve_single_pattern_locally(base_path, pattern, allowed_extensions):
    350             data_files.append(path)
    352 if not data_files:

File ~/miniconda3/envs/jax/lib/python3.9/site-packages/datasets/data_files.py:293, in _resolve_single_pattern_locally(base_path, pattern, allowed_extensions)
    291     if allowed_extensions is not None:
    292         error_msg += f" with any supported extension {list(allowed_extensions)}"
--> 293     raise FileNotFoundError(error_msg)
    294 return sorted(out)

FileNotFoundError: Unable to find 'https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/2.json' at /Users/a.p.nikulin/All/nle_hf_dataset/https:/huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main

Why did it search in such a strange path?

Interestingly, it works for the allenai/c4 dataset from the documentation example:

load_dataset("allenai/c4", name="en", data_files=["en/c4-train.00000-of-01024.json.gz"])

From the debugger I can see that at some point during execution data_files gets transformed to an absolute URL:

https://huggingface.co/datasets/allenai/c4/resolve/607bd4c8450a42878aa9ddc051a65a055450ef87/en/c4-train.00000-of-01024.json.gz

However, this is done only for datasets without a loading script, in dataset_module_factory via HubDatasetModuleFactoryWithoutScript. Later, data_files is popped from builder_kwargs here:

# datasets/load.py
    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_dir=data_dir,
        data_files=data_files,
    )

    # Get dataset builder class from the processing script
    builder_cls = import_main_class(dataset_module.module_path)
    builder_kwargs = dataset_module.builder_kwargs
    data_files = builder_kwargs.pop("data_files", data_files)   <-------- HERE, it stays relative for datasets with a loading script!
    config_name = builder_kwargs.pop("config_name", name)
    hash = builder_kwargs.pop("hash")
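
In other words, for a dataset without a script the builder already receives data_files resolved to full Hub URLs, while for a script-based dataset it receives the raw relative patterns, which DataFilesDict.from_local_or_remote then tries to resolve against a local base_path - hence the strange path in the error above. Schematically (the c4 URL is the one I saw in the debugger):

# no loading script (allenai/c4): builder_kwargs already holds the resolved URL
data_files = ["https://huggingface.co/datasets/allenai/c4/resolve/607bd4c8450a42878aa9ddc051a65a055450ef87/en/c4-train.00000-of-01024.json.gz"]

# custom loading script (Howuhh/nle_hf_dataset): the pattern stays relative,
# so DataFilesDict.from_local_or_remote later tries to resolve it locally
data_files = ["metadata/2.json"]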

How, then, should relative paths be handled for datasets with a custom loading script?

Hi! It looks like data_files is only implemented for datasets without loading scripts - can you open an issue on GitHub about this?

Actually, they kind of work, but only when you pass an absolute URL, not a relative path (and manually filter in the loading script based on data_files). So the real issue is the path resolution: the relative path should be resolved against the repo base path for datasets with custom scripts too.
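
Concretely, this is the difference between the call that currently works for me and the one I would expect to work (the absolute URL is built by hand from the main revision):

from datasets import load_dataset

# works today: absolute URL, then the loading script filters on self.config.data_files
load_dataset(
    "Howuhh/nle_hf_dataset",
    "metadata",
    data_files=["https://huggingface.co/datasets/Howuhh/nle_hf_dataset/resolve/main/metadata/2.json"],
)

# what I would expect to also work: relative path, resolved against the repo base path
load_dataset("Howuhh/nle_hf_dataset", "metadata", data_files=["metadata/2.json"])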