Hello,
I’m trying to take only the top 100 rows from a very large dataset, using the following method from the documentation here.
dataset = load_dataset("laion/laion-coco", split="train", streaming=True)
list(dataset.take(100))
However, this raises the following error:
NotImplementedError: os.walk is not extended to support URLs in streaming mode
The code from the documentation worked fine; the only thing I’ve changed is the dataset name passed to `load_dataset` in the snippet above.
Am I missing something? Any help would be great, thanks in advance.
Environment:
- MacOS Ventura 13.1
- Python 3.10.6
- Datasets 2.8.0
I also tried this on Ubuntu 20.04.4 LTS with the same Python and Datasets versions.
The full stacktrace is here:
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[29], line 2
1 dataset = load_dataset("laion/laion-coco", split="train", streaming=True)
----> 2 list(dataset.take(100))
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:846, in IterableDataset.__iter__(self)
843 yield from self._iter_pytorch(worker_info)
844 return
--> 846 for key, example in self._iter():
847 if self.features:
848 # `IterableDataset` automatically fills missing columns with None.
849 # This is done with `_apply_feature_types_on_example`.
850 yield _apply_feature_types_on_example(
851 example, self.features, token_per_repo_id=self._token_per_repo_id
852 )
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:788, in IterableDataset._iter(self)
786 else:
787 ex_iterable = self._ex_iterable
--> 788 yield from ex_iterable
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:628, in TakeExamplesIterable.__iter__(self)
627 def __iter__(self):
--> 628 yield from islice(self.ex_iterable, self.n)
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:113, in ExamplesIterable.__iter__(self)
112 def __iter__(self):
--> 113 yield from self.generate_examples_fn(**self.kwargs)
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/iterable_dataset.py:713, in _generate_examples_from_tables_wrapper.<locals>.wrapper(**kwargs)
711 def wrapper(**kwargs):
712 python_formatter = PythonFormatter()
--> 713 for key, table in generate_tables_fn(**kwargs):
714 batch = python_formatter.format_batch(table)
715 for i, example in enumerate(_batch_to_examples(batch)):
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/packaged_modules/parquet/parquet.py:65, in Parquet._generate_tables(self, files)
61 if sorted(field.name for field in schema) != sorted(self.config.columns):
62 raise ValueError(
63 f"Tried to load parquet data with columns '{self.config.columns}' with mismatching features '{self.config.features}'"
64 )
---> 65 for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
66 with open(file, "rb") as f:
67 parquet_file = pq.ParquetFile(f)
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:792, in _IterableFromGenerator.__iter__(self)
791 def __iter__(self):
--> 792 yield from self.generator(*self.args, **self.kwargs)
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:847, in FilesIterable._iter_from_urlpaths(cls, urlpaths, use_auth_token)
845 yield urlpath
846 else:
--> 847 for dirpath, dirnames, filenames in xwalk(urlpath, use_auth_token=use_auth_token):
848 # skipping hidden directories; prune the search
849 # [:] for the in-place list modification required by os.walk
850 # (only works for local paths as fsspec's walk doesn't support the in-place modification)
851 dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((".", "__"))])
852 if xbasename(dirpath).startswith((".", "__")):
853 # skipping hidden directories
File /opt/homebrew/Caskroom/miniconda/base/envs/fyp/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:561, in xwalk(urlpath, use_auth_token, **kwargs)
558 else:
559 # walking inside a zip in a private repo requires authentication
560 if not rest_hops and (main_hop.startswith("http://") or main_hop.startswith("https://")):
--> 561 raise NotImplementedError("os.walk is not extended to support URLs in streaming mode")
562 elif rest_hops and (rest_hops[0].startswith("http://") or rest_hops[0].startswith("https://")):
563 url = rest_hops[0]
NotImplementedError: os.walk is not extended to support URLs in streaming mode