NotImplementedError when solidifying a streaming dataset

Hello,

I’m trying to take only the top 100 rows from a very large dataset, using the following method from the documentation here.

dataset = load_dataset("laion/laion-coco", split="train", streaming=True)
list(dataset.take(100))

However, this raises the error

NotImplementedError: os.walk is not extended to support URLs in streaming mode

The code from the documentation worked fine; I’ve only changed the dataset URL in the snippet above.

Am I missing something? Any help would be great, thanks in advance.

Environment:

  • MacOS Ventura 13.1
  • Python 3.10.6
  • Datasets 2.8.0

Also tried on Ubuntu 20.04.4 LTS with the same Python and Datasets versions.

The full stacktrace is here:

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
Cell In[29], line 2
      1 dataset = load_dataset("laion/laion-coco", split="train", streaming=True)
----> 2 list(dataset.take(100))

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:846, in IterableDataset.__iter__(self)
    843         yield from self._iter_pytorch(worker_info)
    844         return
--> 846 for key, example in self._iter():
    847     if self.features:
    848         # `IterableDataset` automatically fills missing columns with None.
    849         # This is done with `_apply_feature_types_on_example`.
    850         yield _apply_feature_types_on_example(
    851             example, self.features, token_per_repo_id=self._token_per_repo_id
    852         )

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:788, in IterableDataset._iter(self)
    786 else:
    787     ex_iterable = self._ex_iterable
--> 788 yield from ex_iterable

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:628, in TakeExamplesIterable.__iter__(self)
    627 def __iter__(self):
--> 628     yield from islice(self.ex_iterable, self.n)

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:113, in ExamplesIterable.__iter__(self)
    112 def __iter__(self):
--> 113     yield from self.generate_examples_fn(**self.kwargs)

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:713, in _generate_examples_from_tables_wrapper.<locals>.wrapper(**kwargs)
    711 def wrapper(**kwargs):
    712     python_formatter = PythonFormatter()
--> 713     for key, table in generate_tables_fn(**kwargs):
    714         batch = python_formatter.format_batch(table)
    715         for i, example in enumerate(_batch_to_examples(batch)):

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/packaged_modules/parquet/parquet.py:65, in Parquet._generate_tables(self, files)
     61     if sorted(field.name for field in schema) != sorted(self.config.columns):
     62         raise ValueError(
     63             f"Tried to load parquet data with columns '{self.config.columns}' with mismatching features '{self.config.features}'"
     64         )
---> 65 for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
     66     with open(file, "rb") as f:
     67         parquet_file = pq.ParquetFile(f)

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:792, in _IterableFromGenerator.__iter__(self)
    791 def __iter__(self):
--> 792     yield from self.generator(*self.args, **self.kwargs)

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:847, in FilesIterable._iter_from_urlpaths(cls, urlpaths, use_auth_token)
    845     yield urlpath
    846 else:
--> 847     for dirpath, dirnames, filenames in xwalk(urlpath, use_auth_token=use_auth_token):
    848         # skipping hidden directories; prune the search
    849         # [:] for the in-place list modification required by os.walk
    850         # (only works for local paths as fsspec's walk doesn't support the in-place modification)
    851         dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((".", "__"))])
    852         if xbasename(dirpath).startswith((".", "__")):
    853             # skipping hidden directories

File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:561, in xwalk(urlpath, use_auth_token, **kwargs)
    558 else:
    559     # walking inside a zip in a private repo requires authentication
    560     if not rest_hops and (main_hop.startswith("http://") or main_hop.startswith("https://")):
--> 561         raise NotImplementedError("os.walk is not extended to support URLs in streaming mode")
    562     elif rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
    563         url = rest_hops[0]

NotImplementedError: os.walk is not extended to support URLs in streaming mode

Hi ! Sorry for the inconvenience, we are fixing this right now for datasets hosted on huggingface.co - will keep you posted

1 Like

It’s fixed now! You can now stream data again from the HF Hub 🙂

1 Like