Hi! Right now you have to shard the dataset yourself to save multiple files, but I’m working on supporting saving into multiple files; it will be available soon.
In the meantime you can do:
from datasets import load_dataset

# Workaround: split the dataset into `num_shards` pieces and save each piece
# to its own directory.
# NOTE(review): `.shard` is called on `ds` directly, so `ds` presumably has to
# be a single `Dataset` (e.g. pass `split=...`), not a `DatasetDict` - confirm.
ds = load_dataset(...)
num_shards = 32
for shard_idx in range(num_shards):
    # contiguous=True asks for each shard to be one consecutive run of rows,
    # so loading the shards back in index order keeps the original row order.
    shard = ds.shard(num_shards=num_shards, index=shard_idx, contiguous=True)
    shard.save_to_disk(f"path/to/shard_{shard_idx}")
# Later on: read every saved shard back, in index order, and stitch the
# pieces together into a single dataset again.
from datasets import concatenate_datasets, load_from_disk

shard_paths = [f"path/to/shard_{shard_idx}" for shard_idx in range(num_shards)]
ds = concatenate_datasets([load_from_disk(shard_path) for shard_path in shard_paths])