How to save an audio dataset in Parquet format on disk

Currently, the Audio feature is inconsistent: saved files may contain either URLs or embedded bytes. With the Arrow format this is fine, because I can use save_to_disk to get a dataset with embedded bytes regardless of the original format (URLs or embedded bytes). However, save_to_disk has no option to write Parquet files, and using something like to_parquet still produces a dataset with URLs instead of forcibly embedded bytes.

It seems I need a function like this one: datasets/src/datasets/arrow_dataset.py at 092118fc00f7dd718ab3643739d7b23ff16c9eff · huggingface/datasets · GitHub

I am asking here to find a better solution. Is there any plan to address this inconsistency?

1 Like

Here is a workaround: some code that mimics save_to_disk or push_to_hub.

import os
from datasets import load_dataset
from datasets.features.features import require_decoding
from datasets.download.streaming_download_manager import xgetsize
from datasets import config
from datasets.utils.py_utils import convert_file_size_to_int
from datasets.table import embed_table_storage
from tqdm import tqdm


# Export settings: output directory, dataset split, and the target size of
# each Parquet shard (a human-readable size string, converted to an int below).
data_dir = "lj_speech_parquets"
split = "train"
max_shard_size = "500MB"

dataset = load_dataset("lj_speech", split=split)

# Names of the columns whose feature type requires decoding.
# NOTE(review): this list is not read anywhere else in the visible script.
decodable_columns = [
    column_name
    for column_name, feature in dataset.features.items()
    if require_decoding(feature, ignore_decode_attribute=True)
]

# Derive the shard count from the estimated dataset size; an empty size
# setting falls back to the library default.
dataset_nbytes = dataset._estimate_nbytes()
max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)
num_shards = max(int(dataset_nbytes / max_shard_size) + 1, 1)

# Lazy generator: each contiguous shard is only built when it is consumed.
shards = (
    dataset.shard(num_shards=num_shards, index=shard_index, contiguous=True)
    for shard_index in range(num_shards)
)

def shards_with_embedded_external_files(shards):
    """Lazily yield each shard after mapping ``embed_table_storage`` over it.

    Every shard is temporarily switched to the "arrow" format, processed in
    batches of 1000 rows with the result kept in memory, and then switched
    back to the format it had on entry.

    Args:
        shards: Iterable of dataset shards exposing ``format``,
            ``with_format`` and ``map``.

    Yields:
        The processed shard, restored to its original format.
    """
    # presumably embed_table_storage replaces external file references with
    # the file bytes — confirm against the datasets.table documentation.
    embed_options = {"batched": True, "batch_size": 1000, "keep_in_memory": True}
    for shard in shards:
        # Remember the caller-visible format so it can be restored afterwards.
        original_format = shard.format
        arrow_shard = shard.with_format("arrow")
        embedded_shard = arrow_shard.map(embed_table_storage, **embed_options)
        yield embedded_shard.with_format(**original_format)
# Wrap the shard generator so every shard is processed just before it is saved.
shards = shards_with_embedded_external_files(shards)

# Without exist_ok, os.makedirs raises FileExistsError if data_dir already exists.
os.makedirs(data_dir)

# Write one Parquet file per shard, named "<split>-<index>-of-<total>.parquet".
progress = tqdm(enumerate(shards), desc="Save the dataset shards", total=num_shards)
for index, shard in progress:
    shard_path = f"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet"
    shard.to_parquet(shard_path)

@mariosasko, could you please take a look?