Need to read subset of data files in WMT14

asking28 · February 12, 2022, 10:34pm

Thanks for helping
wmt_gigafren_only.py

import datasets

from .wmt_utils import Wmt, WmtConfig


# Web page of the WMT14 translation task.
_URL = "http://www.statmt.org/wmt14/translation-task.html"

# BibTeX entry for the WMT14 findings paper.
# This must be a raw string: the author list contains the LaTeX accent macro
# `\v{s}`, and in a regular string literal `\v` is the vertical-tab escape,
# which would silently corrupt the entry.
_CITATION = r"""
@InProceedings{bojar-EtAl:2014:W14-33,
  author    = {Bojar, Ondrej  and  Buck, Christian  and  Federmann, Christian  and  Haddow, Barry  and  Koehn, Philipp  and  Leveling, Johannes  and  Monz, Christof  and  Pecina, Pavel  and  Post, Matt  and  Saint-Amand, Herve  and  Soricut, Radu  and  Specia, Lucia  and  Tamchyna, Ale\v{s}},
  title     = {Findings of the 2014 Workshop on Statistical Machine Translation},
  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  month     = {June},
  year      = {2014},
  address   = {Baltimore, Maryland, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {12--58},
  url       = {http://www.aclweb.org/anthology/W/W14/W14-3302}
}
"""

# Language pairs as (xx, "en") tuples; the second element is always "en".
_LANGUAGE_PAIRS = [(lang, "en") for lang in ["cs", "de", "fr", "hi", "ru"]]


class Wmt14(Wmt):
    """WMT 2014 translation builder with one config per (xx, "en") language pair."""

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    BUILDER_CONFIGS = [
        WmtConfig(  # pylint:disable=g-complex-comprehension
            description="WMT 2014 {}-{} translation task dataset.".format(source, target),
            url=_URL,
            citation=_CITATION,
            language_pair=(source, target),
            version=datasets.Version("1.0.0"),
        )
        for source, target in _LANGUAGE_PAIRS
    ]

    @property
    def manual_download_instructions(self):
        """Return the manual-download message for this config, or None.

        The message is returned when the second element of the config's
        language pair is "cs", "hi" or "ru"; otherwise None is returned.
        """
        # NOTE(review): every pair in _LANGUAGE_PAIRS has "en" as its second
        # element, so this check looks like it can never match for the configs
        # defined in this file -- confirm whether index 0 was intended.
        second_language = self.config.language_pair[1]
        if second_language not in ("cs", "hi", "ru"):
            return None
        return "Please download the data manually as explained. TODO(PVP)"

    @property
    def _subsets(self):
        """Map each dataset split to the list of subset names it is built from."""
        # Only "europarl_v7" is enabled for training. The following training
        # subsets are currently disabled:
        #   commoncrawl, multiun, newscommentary_v9, gigafren, czeng_10,
        #   yandexcorpus, wikiheadlines_hi, wikiheadlines_ru, hindencorp_01
        # NOTE(review): "gigafren" is among the disabled entries -- enable it
        # here if the Giga-Fr-En corpus is what this script is meant to load.
        train_subsets = ["europarl_v7"]
        validation_subsets = ["newsdev2014", "newstest2013"]
        test_subsets = ["newstest2014"]
        return {
            datasets.Split.TRAIN: train_subsets,
            datasets.Split.VALIDATION: validation_subsets,
            datasets.Split.TEST: test_subsets,
        }

loading as

dataset=load_dataset('./wmt14_gigafren.py')

also tried

dataset=load_dataset('wmt14','fr-en','./wmt14_gigafren.py')

Topic		Replies	Views
How to download subset of a dataset scripted 🤗Datasets	6	6426	December 7, 2023
Cannot download wmt16 🤗Datasets	0	435	November 16, 2020
Is there any ways to download only a subset of dataset using huggingface-cli? 🤗Hub	0	297	July 17, 2024
Load_dataset split=‘test’ not working again Beginners	3	41	April 19, 2025
Downloading a portion of parquet files 🤗Datasets	3	694	May 23, 2024

Need to read subset of data files in WMT14

Related topics