When using a StreamingDataset, the only approach I can think of is to use `.shuffle` and `.take` to pick data pseudo-randomly…
If we can somehow load it as a normal `Dataset`, we can simply use `.select`.
from datasets import load_dataset
# Pseudo-randomly sample a couple of examples from a streamed dataset
# without downloading the whole split first.
ds_id = "mozilla-foundation/common_voice_17_0"
# Swap in the longer list to sample several language configs in one run.
#config_list = ["en", "es", "fr", "de", "it", "pt", "zh"]
config_list = ["en"]

for config in config_list:
    # streaming=True gives an iterable dataset, so examples are fetched
    # lazily and index-based access such as .select is not available.
    # NOTE(review): trust_remote_code=True lets the dataset repository's
    # loading script run locally -- confirm the installed `datasets`
    # version still accepts this argument.
    ds = load_dataset(
        ds_id,
        config,
        trust_remote_code=True,
        split="train",
        streaming=True,
    )
    print(f"Config: {config}")
    print("Shuffling...")
    # The shuffle draws from a 10,000-example buffer, so the sample is only
    # pseudo-random; the fixed seed makes the run repeatable.
    # https://huggingface.co/docs/datasets/v4.0.0/stream#shuffle
    ds = ds.shuffle(seed=42, buffer_size=10_000)
    # Keep just the first two shuffled examples.
    # https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.IterableDataset.take
    ds = ds.take(2)
    # list() drives the lazy pipeline and materializes the two examples.
    print(list(ds))
#Config: en
#Shuffling...
#Reading metadata...: 1101170it [00:32, 33788.82it/s]
#[{'client_id': '403e12b207165717054b33f2499ef897add3c30d6f23db3d59657269eb7c76136c3dc483075b9230f26bac3cbdfb749524575f2ce4a96da5a1b9dfe93834801e', 'path': 'en_train_7/common_voice_en_20044076.mp3', 'audio': {'path': 'en_train_7/common_voice_en_20044076.mp3', 'array': array([ 0.00000000e+00, -2.55246364e-16, -3.18319667e-16, ...,
# -8.15750082e-06, -1.59247902e-05, -3.02179451e-05]), 'sampling_rate': 48000}, 'sentence': 'Public opinion in Italy was outraged.', 'up_votes': 2, 'down_votes': 0, 'age': 'twenties', 'gender': 'male_masculine', 'accent': '', 'locale': 'en', 'segment': '', 'variant': ''}, {'client_id': '6c602be8a0dccb7a1a888009bc2211262882f091b85a79c86d0384f6df09f7b7dde560fbdff1f7b8e51dc0944afa85bcffd45c28a333b2dc2fcfce56972e3d9f', 'path': 'en_train_7/common_voice_en_25336162.mp3', 'audio': {'path': 'en_train_7/common_voice_en_25336162.mp3', 'array': array([ 2.01526852e-14, 3.12324653e-13, 4.46545295e-13, ...,
# -2.86631730e-04, -2.47185675e-04, -1.32337733e-04]), 'sampling_rate': 48000}, 'sentence': 'He and his second wife live on "Mirene", a -long working tugboat.', 'up_votes': 2, 'down_votes': 1, 'age': '', 'gender': '', 'accent': '', 'locale': 'en', 'segment': '', 'variant': ''}]