Currently, the audio feature has an inconsistency: saved files may contain either URLs or embedded bytes. With the Arrow format this is fine, since I can use `save_to_disk` to get a dataset with embedded bytes regardless of the original format (URLs or embedded bytes). However, there's no option for `save_to_disk` to write Parquet files, and using something like `to_parquet` still produces a dataset with URLs instead of forced embedded bytes.
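For example (a minimal sketch of what I mean; the output paths are illustrative):

```python
from datasets import load_dataset

ds = load_dataset("lj_speech", split="train")

# save_to_disk embeds the audio bytes in the Arrow files it writes
ds.save_to_disk("lj_speech_arrow")

# to_parquet writes the storage as-is, so the audio column can still
# hold local paths/URLs instead of embedded bytes
ds.to_parquet("lj_speech.parquet")
```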
It seems I need a function like this one: datasets/src/datasets/arrow_dataset.py at 092118fc00f7dd718ab3643739d7b23ff16c9eff · huggingface/datasets · GitHub
I'm asking here in case there is a better solution. Is there any plan to address this inconsistency?
Here is a workaround with some code that mimics `save_to_disk` or `push_to_hub`:
```python
import os

from datasets import config, load_dataset
from datasets.features.features import require_decoding
from datasets.table import embed_table_storage
from datasets.utils.py_utils import convert_file_size_to_int
from tqdm import tqdm

data_dir = "lj_speech_parquets"
split = "train"
max_shard_size = "500MB"

dataset = load_dataset("lj_speech", split=split)

# Columns (e.g. Audio/Image) whose storage may hold paths/URLs instead of bytes
decodable_columns = [
    k for k, v in dataset.features.items() if require_decoding(v, ignore_decode_attribute=True)
]

# Pick a shard count so each Parquet file stays below max_shard_size
dataset_nbytes = dataset._estimate_nbytes()
max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)
num_shards = max(int(dataset_nbytes / max_shard_size) + 1, 1)
shards = (dataset.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards))

def shards_with_embedded_external_files(shards):
    # Rewrite each shard's Arrow storage so external files are embedded as bytes
    for shard in shards:
        format = shard.format
        shard = shard.with_format("arrow")
        shard = shard.map(
            embed_table_storage,
            batched=True,
            batch_size=1000,
            keep_in_memory=True,
        )
        shard = shard.with_format(**format)
        yield shard

if decodable_columns:
    shards = shards_with_embedded_external_files(shards)

os.makedirs(data_dir, exist_ok=True)
for index, shard in tqdm(
    enumerate(shards),
    desc="Save the dataset shards",
    total=num_shards,
):
    shard_path = f"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet"
    shard.to_parquet(shard_path)
```
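To sanity-check the result, the shards can be read back with the standard `parquet` loader. This is a sketch: the glob pattern matches the filenames written above, and it assumes the feature metadata that `to_parquet` stores in the Parquet schema restores the `Audio` type on load:

```python
from datasets import load_dataset

# Reload the written shards; the audio column should now decode from
# embedded bytes rather than from external file paths/URLs.
reloaded = load_dataset(
    "parquet",
    data_files="lj_speech_parquets/train-*.parquet",
    split="train",
)
print(reloaded[0]["audio"]["sampling_rate"])
```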
@mariosasko, please help take a look.