Hi all,
I want to download a portion of the RefinedWeb dataset (tiiuae/falcon-refinedweb), but apart from streaming I couldn't get it to work. So my question is: how can I download only a portion of the RefinedWeb dataset?
Here are some of the things I tried that failed:
# Attempt 1: percent-style slice of the train split.
# NOTE(review): the documented percent-slice form looks like "train[:10%]";
# "train[%10]" may not be parsed as intended -- confirm against the datasets docs.
subset = load_dataset("tiiuae/falcon-refinedweb", num_proc=10, split="train[%10]")
# Attempt 2: index-range slice of the train split (rows 10 to 20).
subset = load_dataset("tiiuae/falcon-refinedweb", num_proc=10, split="train[10:20]")
I wrote these following the previously asked question "Subset of split via fraction".
However, both of these examples just tried to download the whole dataset.
Then I tried to query the file list and download only a subset of the files, which resulted in an error:
# Hub repository id of the dataset to load.
dataset_name = "tiiuae/falcon-refinedweb"
# Access token sent as a bearer token; empty in this snippet.
API_TOKEN = ""
# Placeholder value; reassigned further down to the list of parquet URLs.
data_files = ""
# Authorization header used for the datasets-server request.
headers = {"Authorization": "Bearer " + API_TOKEN}
# datasets-server endpoint queried for this dataset's parquet files.
API_URL = "https://datasets-server.huggingface.co/parquet?dataset=tiiuae/falcon-refinedweb"
def query():
    """Fetch the parquet file listing for the dataset from the datasets-server API.

    Sends a GET request to ``API_URL`` with the bearer-token ``headers``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(API_URL, headers=headers)
    # Fail here on an HTTP error instead of returning an error payload
    # that would only break later at data["parquet_files"].
    response.raise_for_status()
    return response.json()
# Retrieve the JSON listing of the dataset's parquet files.
data = query()
# Number of parquet files in the listing (files, not examples).
no_samples = len(data["parquet_files"])
# Fraction of the listed files to keep.
portion = 0.1
iteration_limit = int(no_samples * portion)
# Collect the URLs of the first `iteration_limit` listed files.
data_files = [entry["url"] for entry in data["parquet_files"][:iteration_limit]]
# NOTE(review): the traceback reported below is raised by verify_splits, which
# compares the recorded split size against the expected size of the full
# 'train' split. Passing verification_mode="no_checks" to load_dataset
# presumably skips that check -- confirm against the datasets documentation.
subset = load_dataset(dataset_name, num_proc=10, data_files=data_files)
This resulted in the following error:
Traceback (most recent call last):βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 54/56 [26:20<00:47, 23.95s/files]
File "load_dataset.py", line 28, in <module>βββββββββββββββββββββββββββββββββββββββββββ| 56/56 [26:50<00:00, 18.33s/files]
subset = load_dataset("tiiuae/falcon-refinedweb", num_proc=10, data_files=data_files)
File " python3.10/site-packages/datasets/load.py", line 2609, in load_dataset 53/56 [25:47<01:07, 22.35s/files]
builder_instance.download_and_prepare(βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 54/56 [26:02<00:40, 20.15s/files]
File " python3.10/site-packages/datasets/builder.py", line 1027, in download_and_prepare32<00:00, 17.41s/files]
self._download_and_prepare(
File " python3.10/site-packages/datasets/builder.py", line 1140, in _download_and_prepare
verify_splits(self.info.splits, split_dict)
File " python3.10/site-packages/datasets/utils/info_utils.py", line 101, in verify_splits
raise NonMatchingSplitsSizesError(str(bad_splits))
datasets.utils.info_utils.NonMatchingSplitsSizesError: [{'expected': SplitInfo(name='train', num_bytes=2766953721769, num_examples=968000015, shard_lengths=None, dataset_name=None), 'recorded': SplitInfo(name='train', num_bytes=278244261454, num_examples=96730207, shard_lengths=[169000, 177919, 178838, 175919, 172000, 173919, 176838, 180919, 181919, 177919, 178919, 170919, 170919, 180919, 173919, 172919, 174919, 161919, 165000, 168919, 177919, 175919, 174919, 170919, 176919, 173919, 179919, 176919, 184838, 180919, 175919, 169919, 171919, 185919, 169919, 175919, 167919, 172919, 165000, 174919, 174919, 176919, 174919, 170919, 172919, 176919, 178919, 180919, 177838, 177919, 173919, 173919, 181919, 161000, 173919, 172919, 6919, 166000, 164919, 163919, 178919, 177919, 177919, 171919, 176919, 178919, 176919, 177919, 184919, 179838, 175919, 169000, 172919, 175919, 169919, 176919, 174919, 172919, 157919, 175919, 171919, 178919, 173919, 170919, 178919, 179919, 181919, 176919, 181919, 178838, 174919, 170000, 180838, 174919, 170000, 173919, 169919, 168919, 169919, 181919, 176919, 172919, 175919, 178919, 178919, 177919, 177838, 178919, 178919, 175919, 170919, 181919, 160919, 174919, 174000, 173919, 170919, 172919, 175919, 175919, 176919, 170919, 174919, 164919, 176919, 176919, 180919, 178919, 177919, 174919, 171919, 177919, 177838, 175919, 173919, 168000, 166919, 178919, 175919, 175919, 176919, 177919, 162919, 174919, 181919, 177919, 180838, 177919, 178919, 166919, 180919, 172919, 172919, 174919, 172919, 165000, 174919, 179919, 175919, 172919, 167919, 178919, 165919, 175919, 177919, 180919, 177919, 173919, 171919, 7919, 175919, 175919, 176919, 173919, 171919, 169000, 170919, 175919, 177919, 172919, 162919, 176919, 160919, 174919, 179919, 176919, 182919, 175919, 159919, 165919, 181919, 172919, 172919, 174919, 175919, 170919, 176919, 172919, 174919, 174919, 162919, 174919, 175919, 175919, 180919, 179919, 177919, 171919, 172919, 179919, 177919, 173919, 174919, 178919, 163919, 
162919, 176919, 173919, 174919, 166919, 180919, 168919, 173919, 176919, 181919, 51919, 177919, 165000, 164919, 175919, 176919, 173919, 174919, 169919, 164919, 156919, 178919, 173919, 171919, 165919, 181919, 167919, 176919, 174919, 176919, 180919, 178919, 168919, 169919, 179919, 172919, 174919, 172919, 168919, 153919, 176919, 174919, 174919, 171919, 164919, 180919, 176919, 175919, 176919, 180919, 177919, 170919, 155919, 169919, 175919, 175919, 176919, 171919, 156919, 173919, 175919, 175919, 172919, 165919, 181919, 177919, 115919, 176919, 176919, 178919, 179919, 160000, 176838, 174919, 175919, 173000, 176838, 174919, 168000, 171919, 177919, 173919, 170919, 178919, 179919, 171919, 178919, 178838, 179919, 176919, 167919, 165000, 178919, 174919, 173919, 173919, 176919, 168919, 160919, 176919, 173919, 170919, 177919, 180919, 178919, 172919, 173919, 181919, 180919, 175919, 173919, 165919, 178919, 178919, 176919, 172919, 172919, 169919, 179919, 170919, 170919, 177919, 10919, 181919, 176919, 171919, 176919, 179919, 178919, 173919, 173919, 164919, 178919, 172919, 173919, 175919, 172919, 167000, 173919, 175919, 175919, 171919, 175919, 181838, 173919, 173919, 179919, 175919, 180919, 172919, 170919, 175919, 171919, 171919, 176919, 173919, 161000, 173919, 175919, 177919, 170919, 176919, 177919, 175919, 176919, 170919, 183838, 179919, 173919, 169000, 167919, 178919, 170919, 177919, 170919, 163919, 162919, 176919, 29919, 178919, 169000, 175919, 179838, 180919, 175919, 172919, 181919, 179919, 173919, 171919, 162919, 177919, 172919, 175919, 173919, 170919, 175919, 174919, 175919, 173919, 176919, 177919, 186919, 179919, 175919, 174919, 176919, 178919, 185919, 165919, 177919, 172919, 174919, 174919, 165919, 170919, 178919, 177919, 171919, 170919, 175919, 180919, 180919, 184919, 176919, 178919, 175919, 176919, 171919, 173919, 172919, 168919, 175919, 131919, 169000, 174919, 172919, 176919, 175919, 172919, 179919, 179838, 165000, 175919, 178919, 179838, 173919, 183919, 168919, 179919, 
171919, 175919, 163000, 174919, 170919, 175919, 174919, 163919, 173919, 175919, 179919, 165919, 182919, 173919, 178919, 175919, 187838, 168000, 178838, 172000, 171919, 173919, 172919, 171919, 176919, 173919, 173919, 165919, 178919, 178919, 169919, 177919, 173919, 177919, 173919, 180919, 171919, 178919, 171919, 9919, 169000, 173919, 155919, 161919, 163919, 172919, 174919, 162919, 175919, 179919, 179919, 161919, 174919, 177919, 178919, 176919, 180919, 165919, 171919, 171919, 172919, 168919, 159919, 166919, 167919, 175919, 174919, 167919, 176919, 181919, 167919, 173919, 174919, 179919, 174919, 178919, 168919, 167919, 173919, 172919, 173919, 173919, 166919, 168919, 174919, 173919, 157919, 174919, 178919, 183919, 169919, 170919, 179919, 178919, 170919, 140919], dataset_name='falcon-refinedweb')}]