Hello,
I’m trying to upload a multilingual, low-resource West Balkan machine translation dataset called rosetta_balcanica to the Hugging Face Hub. The data is stored on GitHub and was manually extracted; this is an ongoing project. I’ve written a dataset loading script that should let users download and load the dataset based on the configuration they specify, and I’m following the documentation page for creating a dataset loading script.
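To make the intended usage concrete, this is roughly how I expect the dataset to be loaded once the script works (config names just follow the '<source> to <target>' pattern my config class builds, so 'en to sr' here is only an example):

from datasets import load_dataset

# Illustrative only: config names follow the '<source> to <target>' pattern
# built by RosettaBalcanicaConfig; 'en to sr' is an example pair.
ds = load_dataset('./rosetta_balcanica', 'en to sr')
print(ds['train'][0]['translation'])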
After writing the script, I use datasets-cli to test it. First off, it’s not clear where to run this command, even though the docs say to run it at the root of the datasets directory. To me that implies the root of the repo, but it seems to be one directory above that.
When I run the command datasets-cli test rosetta_balcanica --save_infos --all_configs (the directory rosetta_balcanica contains only a README.md and rosetta_balcanica.py), I get this error:
Traceback (most recent call last):
File "/home/sudarshan/anaconda3/envs/rb_hub/bin/datasets-cli", line 8, in <module>
sys.exit(main())
File "/home/sudarshan/anaconda3/envs/rb_hub/lib/python3.7/site-packages/datasets/commands/datasets_cli.py", line 33, in main
service.run()
File "/home/sudarshan/anaconda3/envs/rb_hub/lib/python3.7/site-packages/datasets/commands/test.py", line 119, in run
module = dataset_module_factory(path)
File "/home/sudarshan/anaconda3/envs/rb_hub/lib/python3.7/site-packages/datasets/load.py", line 1083, in dataset_module_factory
combined_path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path
File "/home/sudarshan/anaconda3/envs/rb_hub/lib/python3.7/site-packages/datasets/load.py", line 669, in get_module
download_config=self.download_config,
File "/home/sudarshan/anaconda3/envs/rb_hub/lib/python3.7/site-packages/datasets/load.py", line 292, in _download_additional_modules
f"To be able to use {name}, you need to install the following dependencies"
ImportError: To be able to use rosetta_balcanica, you need to install the following dependencies['datasets,'] using 'pip install datasets,' for instance'
However, I have already installed datasets, so I’m not sure why I’m getting this error. This is my dataset creation script rosetta_balcanica.py:
_DESCRIPTION="""
Rosetta-Balcanica is a set of evaluation datasets for low resource western Balkan languages manually sourced from articles from OSCE website.
"""
_HOMEPAGE='https://github.com/ebegoli/rosetta-balcanica'
_DATA_URL='https://github.com/ebegoli/rosetta-balcanica/raw/main/rosetta_balcanica.tar.gz'
_VERSION=datasets.Version('1.0.0')
class RosettaBalcanicaConfig(datasets.BuilderConfig):
    """BuilderConfig for Rosetta Balcanica for low resource West Balkan languages
    """

    def __init__(self, lang_pair=(None, None), **kwargs):
        assert lang_pair in _VALID_LANGUAGE_PAIRS, f"Language pair {lang_pair} not supported (yet)"
        name = f'{lang_pair[0]} to {lang_pair[1]}'
        desc = f'Translation dataset from {lang_pair[0]} to {lang_pair[1]}'
        super(RosettaBalcanicaConfig, self).__init__(
            name=name,
            description=desc,
            version=_VERSION,
            **kwargs
        )
        self.lang_pair = lang_pair
class RosettaBalcanica(datasets.GeneratorBasedBuilder):
    logger.debug("i'm in builder")

    BUILDER_CONFIGS = [
        RosettaBalcanicaConfig(
            lang_pair=lang_pair,
            # version is already pinned inside RosettaBalcanicaConfig.__init__
        )
        for lang_pair in _VALID_LANGUAGE_PAIRS
    ]
    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {'translation': datasets.features.Translation(languages=self.config.lang_pair)}
            ),
            homepage=_HOMEPAGE,
            supervised_keys=self.config.lang_pair,
            citation=_CITATION,
        )
    def _split_generators(self, dl_manager):
        # download the tarball once and stream its members for each split
        archive = dl_manager.download(_DATA_URL)
        source, target = self.config.lang_pair
        # folders inside the archive are named en-<non-English language>
        non_en = source if target == 'en' else target
        data_dir = f'en-{non_en}'
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    'source_file': f'{data_dir}/train_{source}.txt',
                    'target_file': f'{data_dir}/train_{target}.txt',
                    'files': dl_manager.iter_archive(archive),
                }
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'source_file': f'{data_dir}/test_{source}.txt',
                    'target_file': f'{data_dir}/test_{target}.txt',
                    'files': dl_manager.iter_archive(archive),
                }
            ),
        ]
    def _generate_examples(self, source_file, target_file, files):
        # locate the source and target files while streaming the archive
        source_sents, target_sents = None, None
        for path, f in files:
            if path == source_file:
                source_sents = f.read().decode('utf-8').split('\n')
            elif path == target_file:
                target_sents = f.read().decode('utf-8').split('\n')
            if source_sents is not None and target_sents is not None:
                break

        assert len(target_sents) == len(source_sents), f"Sizes do not match: {len(source_sents)} vs {len(target_sents)} for {source_file} vs {target_file}"

        source, target = self.config.lang_pair
        for idx, (l1, l2) in enumerate(zip(source_sents, target_sents)):
            result = {
                'translation': {source: l1, target: l2}
            }
            if all(result.values()):
                yield idx, result
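For reference, a quick way to sanity-check that the archive really contains the en-<lang>/train_<lang>.txt and en-<lang>/test_<lang>.txt paths assumed in _split_generators is to list its members directly (this uses the same _DATA_URL as the script):

import tarfile
import urllib.request

# Download the same archive the loading script uses and print its member paths
# to confirm the en-<lang>/{train,test}_<lang>.txt layout assumed above.
url = 'https://github.com/ebegoli/rosetta-balcanica/raw/main/rosetta_balcanica.tar.gz'
urllib.request.urlretrieve(url, 'rosetta_balcanica.tar.gz')
with tarfile.open('rosetta_balcanica.tar.gz', 'r:gz') as tar:
    for member in tar.getnames():
        print(member)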
Please could I get some help on this?
Thanks!