@lhoestq, just to make things clear: the following is my original code; it is not part of the HF dataset loading script:
import functools
import seqio
import tensorflow as tf
import t5.data
from datasets import load_from_disk
from t5.data import postprocessors
from t5.data import preprocessors
from t5.evaluation import metrics
from seqio import FunctionDataSource, utils
import glob
TaskRegistry = seqio.TaskRegistry
def gen_dataset(split, shuffle=False, seed=None, column="text", dataset_path=None):
    """Yield values of `column` from one split of a dataset saved on disk.

    Loops forever over the split so downstream consumers (e.g.
    tf.data.Dataset.from_generator) see an endless stream of examples.

    Args:
        split: Split name (e.g. "train"); coerced to str when indexing.
        shuffle: Whether to shuffle the dataset before iterating.
        seed: Optional shuffle seed; only consulted when `shuffle` is True.
        column: Name of the column whose values are yielded.
        dataset_path: Path previously given to `Dataset.save_to_disk`.

    Yields:
        The value of `column` for each example, repeating indefinitely.
    """
    dataset = load_from_disk(dataset_path)
    if shuffle:
        # Check `is not None` rather than truthiness so that seed=0 is
        # honored instead of silently falling back to an unseeded shuffle.
        if seed is not None:
            dataset = dataset.shuffle(seed=seed)
        else:
            dataset = dataset.shuffle()
    while True:
        for item in dataset[str(split)]:
            yield item[column]
def dataset_fn(split, shuffle_files, seed=None, dataset_path=None):
    """Wrap `gen_dataset` in a tf.data.Dataset of scalar string tensors."""
    generator = functools.partial(
        gen_dataset, split, shuffle_files, seed, dataset_path=dataset_path
    )
    # Each element is a single (rank-0) tf.string tensor.
    signature = tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_path)
    return tf.data.Dataset.from_generator(generator, output_signature=signature)
@utils.map_over_dataset
def target_to_key(x, key_map, target_key):
    """Assign the value from the dataset to target_key in key_map"""
    merged = dict(key_map)
    merged[target_key] = x
    return merged
# One sub-directory per language under the combined corpus root.
data_path = glob.glob(
    "/home/stephen/Desktop/MEGA_CORPUS/COMBINED_CORPUS/*", recursive=False
)

seqio_train_list = []
for lang in data_path:
    # Each seqio task is named after its dataset directory.
    dataset_name = lang.split("/")[-1]
    dataset_shapes = None  # example counts unknown up front

    TaskRegistry.add(
        str(dataset_name),
        source=seqio.FunctionDataSource(
            dataset_fn=functools.partial(dataset_fn, dataset_path=lang),
            splits=("train", "test"),
            caching_permitted=False,
            num_input_examples=dataset_shapes,
        ),
        preprocessors=[
            functools.partial(
                target_to_key,
                key_map={"targets": None},
                target_key="targets",
            )
        ],
        # NOTE(review): `vocabulary=seqio.PassThroughVocabulary` passes the
        # class itself, not an instance — seqio.Feature normally expects a
        # Vocabulary instance. Confirm against the seqio version in use.
        output_features={
            "targets": seqio.Feature(
                vocabulary=seqio.PassThroughVocabulary,
                add_eos=False,
                dtype=tf.string,
                rank=0,
            )
        },
        metric_fns=[],
    )

    seqio_train_dataset = seqio.get_mixture_or_task(dataset_name).get_dataset(
        sequence_length=None,
        split="train",
        shuffle=True,
        num_epochs=1,
        shard_info=seqio.ShardInfo(index=0, num_shards=10),
        use_cached=False,
        seed=42,
    )
    seqio_train_list.append(seqio_train_dataset)
# Collect the task names (directory basenames) registered above.
lang_name_list = [path.split("/")[-1] for path in data_path]

# Combine every per-language task into one mixture with a shared rate.
seqio_mixture = seqio.MixtureRegistry.add(
    "seqio_mixture",
    lang_name_list,
    default_rate=0.7,
)
seqio_mixture_dataset = seqio.get_mixture_or_task("seqio_mixture").get_dataset(
    sequence_length=None,
    split="train",
    shuffle=True,
    num_epochs=1,
    shard_info=seqio.ShardInfo(index=0, num_shards=10),
    use_cached=False,
    seed=42,
)

# Print the first 15 examples of the mixed stream as a sanity check.
for _, example in zip(range(15), seqio_mixture_dataset):
    print(example["targets"].numpy().decode())
Here, seqio_mixture_dataset is the generator that I wanted wrapped in an HF dataset.
Additionally, could you please tell me how to expose the default_rate=0.7 argument (used where seqio_mixture is defined) as a custom option of the HF load_dataset() method, maybe like this:
seqio_mixture_dataset = datasets.load_dataset("seqio_loader", temperature=0.5)