Create a dataset for translation

Hello, I need to create a translation dataset based on my text corpus. It looks like I need to use a DatasetDict, but I don’t know how to create one from my data.

It would be very convenient to turn the dictionary into a dataset like {word1: translation1, word2: translation2}

Given that you already have a dictionary of key value pairs, you can use the Dataset.from_dict method to create an object of the Dataset class.

Take a look at this link in the ‘from_local_files’ section on how to do it.
https://huggingface.co/docs/datasets/v2.14.5/create_dataset

How can one get the same dataset as in the picture? I mean the “dict” under the feature: should we add it with code, or should we import a specific CSV format?

If I understand the question correctly, you are asking if one can directly import the data as it is. Yes you can, you can also choose to convert the dataset into a pandas dataframe if that’s more convenient.

You can directly convert the dataset into a pandas df by using pd.DataFrame(huggingface dataset)
A naïve solution is to iterate over all the items in the dataset in the picture.
Define two lists, one for English and another one for Hindi.
Then, for each row, append the corresponding source and target sentences in their corresponding lists.
Create a dictionary {‘en’: en_list, ‘hi’: hi_list}. Finally, create a pandas dataframe if that’s what you want to work with.

Regardless, if you’re just using PyTorch datasets, the format in the picture should be fine when initializing dataloaders.

Hi,

Here’s an example of how to create your own dataset for translation.
This one parses a .po gettext file.

# coding=utf-8

# See https://huggingface.co/docs/datasets/about_dataset_load
#https://huggingface.co/docs/datasets/loading#hugging-face-hub
#
#https://huggingface.co/docs/datasets/dataset_script
# Lint as: python3
import os

import datasets
from pofile import pofile

# Free-text description passed to datasets.DatasetInfo.
_DESCRIPTION = """T&F translation dataset"""
# Homepage and citation are intentionally empty; both are passed to DatasetInfo as-is.
_HOMEPAGE_URL = ""
_CITATION = ""

# Version string attached to every builder configuration.
_VERSION = "1.0.0"
# Two-placeholder file-name template (language-pair folder, language code).
# NOTE(review): not used to open any file in the visible code.
_BASE_NAME = "poedit.{}.{}"
# NOTE(review): not referenced anywhere in the visible code.
_BASE_URL = ""




# Language pairs are discovered from the data directory: every file named
# "main_<lang>.po" contributes the pair ("fr", "<lang>"), French being the
# fixed source language.
#
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order, and the configuration order should be reproducible.
# - Only ".po" files are accepted, and the language code is whatever sits
#   between the "main_" prefix and the ".po" suffix, so that the file later
#   opened as "main_<lang>.po" is guaranteed to be the one found here.
_LANGUAGE_PAIRS = []
filenames = sorted(os.listdir("./DATASET"))
for f in filenames:
    if f.startswith("main_") and f.endswith(".po"):
        tgt = f[len("main_"):-len(".po")]
        if tgt:  # ignore a bare "main_.po", which carries no language code
            _LANGUAGE_PAIRS.append(("fr", tgt))


# The Poedit dataset is a set of multiple configurations: one for each language pair.
# Each configuration is named after its pair (fr-en, fr-..., as defined by _LANGUAGE_PAIRS),
# so it is easy to load a dataset by its name:
from datasets import load_dataset
#dataset = load_dataset('main_po_dataset_loader', 'fr-en')
class PoeditConfig(datasets.BuilderConfig):
    """Builder configuration for a single source/target language pair.

    The configuration name is always derived from the pair, as
    ``"<srcl>-<tgtl>"``.
    """

    def __init__(self, *args, srcl=None, tgtl=None, **kwargs):
        """Build the configuration for the pair ``(srcl, tgtl)``.

        Args:
            *args: Positional arguments forwarded to ``datasets.BuilderConfig``.
            srcl: Source language code.
            tgtl: Target language code.
            **kwargs: Keyword arguments forwarded to ``datasets.BuilderConfig``.
        """
        pair_name = "{}-{}".format(srcl, tgtl)
        super().__init__(*args, name=pair_name, **kwargs)
        self.srcl, self.tgtl = srcl, tgtl


class Poedit(datasets.GeneratorBasedBuilder):
    """Translation dataset builder backed by gettext ``.po`` files.

    One configuration is declared per entry of ``_LANGUAGE_PAIRS``. Each
    configuration reads ``main_<target language>.po`` from the ``DATASET``
    directory located under the current working directory.
    """

    BUILDER_CONFIGS = [
        PoeditConfig(
            srcl=lang1,
            tgtl=lang2,
            description=f"Translating {lang1} to {lang2} or vice versa",
            version=datasets.Version(_VERSION),
        )
        for lang1, lang2 in _LANGUAGE_PAIRS
    ]
    BUILDER_CONFIG_CLASS = PoeditConfig

    def _info(self):
        """Describe the dataset: a string ``id`` and a ``translation`` pair.

        Returns:
            A ``datasets.DatasetInfo`` whose ``translation`` feature covers the
            source and target languages of the active configuration.
        """
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "translation": datasets.Translation(
                        languages=(self.config.srcl, self.config.tgtl)
                    ),
                },
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Organize the data into splits.

        Nothing is downloaded: the data is read from the local ``DATASET``
        directory, and everything goes into a single TRAIN split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; unused here.

        Returns:
            A one-element list holding the TRAIN ``datasets.SplitGenerator``,
            whose ``datapath`` keyword is the absolute data directory.
        """
        datapath = os.path.join(os.getcwd(), "DATASET")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"datapath": datapath},
            )
        ]

    def _generate_examples(self, datapath):
        """Read the ``.po`` file of the active configuration and yield examples.

        Args:
            datapath: Directory containing the ``main_<lang>.po`` files, as
                provided by ``_split_generators``.

        Yields:
            ``(key, example)`` tuples where ``key`` is the running entry index
            and ``example`` holds the index as a string ``id`` plus a
            ``translation`` dict mapping each language code to its stripped
            text.
        """
        src_lang, tgt_lang = self.config.srcl, self.config.tgtl
        # Honour the directory handed over by _split_generators instead of a
        # second, hard-coded copy of the path.
        po_path = os.path.join(datapath, "main_" + tgt_lang + ".po")
        catalog = pofile(po_path)
        catalog.read()
        # NOTE(review): presumably `msgid` maps each source string to its
        # translation -- confirm against the pofile implementation.
        for index, (source, target) in enumerate(catalog.msgid.items()):
            yield (
                index,
                {
                    "id": str(index),
                    "translation": {
                        src_lang: source.strip(),
                        tgt_lang: target.strip(),
                    },
                },
            )