import time
import torch
import argparse
from datasets import load_dataset
from torch.utils.data import DataLoader


def test_dataset_load_speed():
    """Test the speed of loading a dataset using the datasets library"""
    print("Testing datasets library loading speed...")
    start_time = time.time()
    dataset = load_dataset("./data/parquet", keep_in_memory=True)
    load_time = time.time() - start_time
    print(f"Time taken to load dataset: {load_time:.4f} seconds")
    print(
        f"Dataset size: train set {len(dataset['train'])} samples, validation set {len(dataset['validation'])} samples"
    )
    print(f"Feature columns: {dataset['train'].column_names}")
    return dataset


def test_random_access_speed(dataset, num_samples=100):
    """Test the speed of random access to dataset items"""
    print("\nTesting random access speed...")
    train_dataset = dataset["train"]
    total_items = len(train_dataset)
    if total_items == 0:
        print("Training dataset is empty, cannot test access speed")
        return
    # Adjust sample count to avoid exceeding dataset size
    # num_samples = min(num_samples, total_items)
    start_time = time.time()
    for _ in range(num_samples):
        idx = torch.randint(0, total_items, (1,)).item()
        _ = train_dataset[idx]
    access_time = time.time() - start_time
    print(
        f"Time taken to randomly access {num_samples} samples: {access_time:.4f} seconds"
    )
    print(f"Average access time per sample: {access_time / num_samples * 1000:.2f} ms")


def test_by_dataloader(dataset, batch_size, num_workers):
    """Test the speed of loading the dataset using a DataLoader"""
    print(f"\nTesting data loading speed with batch size {batch_size}...")
    dataloader = DataLoader(dataset["train"], batch_size=batch_size, num_workers=num_workers)
    start_time = 0
    for batch in dataloader:
        # Start timing after the first batch so worker startup is excluded
        if start_time == 0:
            start_time = time.time()
        # Simulate data processing
        _ = batch
    load_time = time.time() - start_time
    print(f"Time taken to load batches of size {batch_size}: {load_time:.4f} seconds")


def main():
    parser = argparse.ArgumentParser(
        description="Test the reading speed of the datasets library and data loader"
    )
    parser.add_argument(
        "--random-samples",
        type=int,
        default=100,
        help="Number of samples to test random access",
    )
    args = parser.parse_args()
    print("Starting datasets library read speed test...\n")
    # Test dataset loading speed
    dataset = test_dataset_load_speed()
    dataset.with_format("torch")
    # Test random access speed
    test_random_access_speed(dataset, args.random_samples)
    # Test DataLoader loading speed
    # test_by_dataloader(dataset, 1, 2)
    print("\nTest completed!")


if __name__ == "__main__":
    main()
And I got this result:
➜ uv run .\test.py
Starting datasets library read speed test...
Testing datasets library loading speed...
Time taken to load dataset: 0.2677 seconds
Dataset size: train set 44 samples, validation set 2 samples
Feature columns: ['f0', 'volume', 'aug_vol', 'spk_id', 'frame_len', 'pitch_aug', 'mel', 'aug_mel', 'units']
Testing random access speed...
Time taken to randomly access 100 samples: 24.9198 seconds
Average access time per sample: 249.20 ms
Test completed!
The training .parquet file is only 175 MB, yet loading the same data from fragmented .npy files is significantly faster. I'm puzzled by this discrepancy and would appreciate any insights!
Certain data types, such as lists, are slower to load in pure Python than others. If your dataset contains arrays or long lists, it's faster to load them as NumPy arrays, e.g.
ds = ds.with_format("numpy")
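Applied to your script, that might look something like this (a minimal sketch; note that with_format returns a new dataset rather than modifying it in place, so the result has to be reassigned):

dataset = load_dataset("./data/parquet", keep_in_memory=True)
# with_format returns a new DatasetDict, so reassign it instead of calling it in place
dataset = dataset.with_format("numpy")
# Array columns now come back as numpy.ndarray instead of nested Python lists
sample = dataset["train"][0]
print(type(sample["mel"]))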
Btw, you can also access multiple examples faster by passing a list of indices to ds[...], which decodes them in a single batched call instead of one sample at a time.
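For example (a rough sketch continuing the snippet above; the index values are arbitrary):

indices = [0, 7, 21, 42]
batch = dataset["train"][indices]  # one call returns a dict of batched columns
print(list(batch.keys()))          # same column names as before
print(len(batch["f0"]))            # 4 entries, one per requested index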