Hi,
here's an example of creating your own dataset for translation.
This one parses a .po gettext file.
# coding=utf-8
# SEE https://huggingface.co/docs/datasets/about_dataset_load
#https://huggingface.co/docs/datasets/loading#hugging-face-hub
#
#https://huggingface.co/docs/datasets/dataset_script
# Lint as: python3
import os
import datasets
from pofile import pofile
# --- Dataset metadata -------------------------------------------------------
_DESCRIPTION = """T&F translation dataset"""
_HOMEPAGE_URL = ""
_CITATION = ""
_VERSION = "1.0.0"
# Template for per-language data file names: poedit.<pair-folder>.<lang>
_BASE_NAME = "poedit.{}.{}"
_BASE_URL = ""

# Build the list of (source, target) language pairs by scanning ./DATASET for
# files named "main_<lang>.po".  The source language is always French ("fr").
# Please note that only a few pairs are shown here. You can use config to
# generate data for all language pairs.
_LANGUAGE_PAIRS = []
# Guard against a missing DATASET directory so that merely importing this
# module does not crash before the data has been downloaded.
if os.path.isdir("./DATASET"):
    # sorted() makes the configuration order deterministic across platforms;
    # os.listdir() order is otherwise arbitrary.
    for f in sorted(os.listdir("./DATASET")):
        if f.startswith("main_"):
            # "main_en.po" -> target language "en"
            tgt = f.split("_")[1].split(".")[0]
            _LANGUAGE_PAIRS.append(("fr", tgt))

# The Poedit dataset is a set of multiple configurations: one for each
# language pair.  Each configuration is named after its pair ("fr-en",
# "fr-...") as defined by _LANGUAGE_PAIRS, which makes it easy to load a
# dataset by its name:
from datasets import load_dataset
#dataset = load_dataset('main_po_dataset_loader', 'fr-en')
class PoeditConfig(datasets.BuilderConfig):
    """BuilderConfig for one Poedit language pair.

    The configuration name is derived from the pair, e.g. "fr-en".
    """

    def __init__(self, *args, srcl=None, tgtl=None, **kwargs):
        # The pair string ("<src>-<tgt>") doubles as the configuration name.
        pair_name = f"{srcl}-{tgtl}"
        super().__init__(*args, name=pair_name, **kwargs)
        self.srcl = srcl
        self.tgtl = tgtl
class Poedit(datasets.GeneratorBasedBuilder):
    """Translation dataset built from local gettext .po files.

    One configuration exists per (source, target) language pair; the data for
    a pair is read from ``DATASET/main_<tgt>.po``.
    """

    BUILDER_CONFIGS = [
        PoeditConfig(
            srcl=lang1,
            tgtl=lang2,
            description=f"Translating {lang1} to {lang2} or vice versa",
            version=datasets.Version(_VERSION),
        )
        for lang1, lang2 in _LANGUAGE_PAIRS
    ]
    BUILDER_CONFIG_CLASS = PoeditConfig

    def _info(self):
        """Return dataset metadata: a string id plus a source/target pair."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "translation": datasets.Translation(
                        languages=(self.config.srcl, self.config.tgtl)
                    ),
                },
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Organize the data files into splits.

        Nothing is downloaded: the .po files are expected to already be on
        disk under ./DATASET.  Everything goes into a single TRAIN split.
        """
        path = os.path.join(os.getcwd(), "DATASET")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"datapath": path},
            )
        ]

    def _generate_examples(self, datapath):
        """Yield (key, example) pairs parsed from ``main_<tgt>.po``.

        Each example maps the source-language msgid to its target-language
        translation.
        """
        l1, l2 = self.config.srcl, self.config.tgtl
        # Bug fix: build the file path from the `datapath` argument supplied
        # by _split_generators instead of hard-coding "./DATASET".
        filename = os.path.join(datapath, f"main_{l2}.po")
        main = pofile(filename)
        main.read()
        # NOTE(review): assumes pofile.msgid is an ordered mapping of
        # msgid -> translation — confirm against the pofile API.
        for sentence_counter, (src, tgt) in enumerate(main.msgid.items()):
            yield sentence_counter, {
                "id": str(sentence_counter),
                "translation": {l1: src.strip(), l2: tgt.strip()},
            }