Hello everyone,
I already store some CSV files for my dataset in a private S3 bucket, and I would like to write a cloud-storage loading script that pulls the dataset from S3 and saves it locally, so that I can then call it with load_dataset (basic usage).
I was not able to find in the documentation how to do that in a proper way…
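In the end, all I want is to be able to run something like this (assuming the script below is saved as umsab.py; the file name is just an example):

```python
from datasets import load_dataset

# Load the dataset through the local loading script (hypothetical file name).
dataset = load_dataset("./umsab.py")
```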
My tests tell me:

- The Cloud Storage documentation shows how to load data from a private S3 bucket, but not how to save it locally using the download_and_extract() method.
- The download_custom() method documentation shows that we could define our own download method, but not how to do that in a clean way (I sketch what I mean right after this list).
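For reference, here is roughly how I understood download_custom() could be used. This is an untested sketch: I am assuming the callable receives a source URL and a destination path, the s3_download helper is a name I made up, and _URL / _URLS are the same variables as in the full script below.

```python
import os

import datasets
import s3fs

# Same credentials as in the full script below.
aws_fs = s3fs.S3FileSystem(key=os.getenv("AWS_ACCESS_KEY"), secret=os.getenv("AWS_SECRET_KEY"))


def s3_download(src_url, dst_path):
    # Hypothetical helper: copy a single file from S3 to the destination chosen by the download manager.
    aws_fs.download(src_url, dst_path)


def _split_generators(self, dl_manager):
    # My guess at how download_custom() is meant to be wired up: it would call
    # s3_download(src_url, dst_path) for each URL and give back the local paths.
    downloaded_files = dl_manager.download_custom(
        {split: f"{_URL}/{filename}" for split, filename in _URLS.items()},
        s3_download,
    )
    return [
        datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
        datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["validation"]}),
        datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}),
    ]
```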
In the meantime, here is a working solution to my problem, but it is not clean. Do you have a better idea?
```python
import os

import s3fs
import datasets
import pandas as pd
from datasets.tasks import TextClassification

logger = datasets.logging.get_logger(__name__)

_CITATION = """
HIDDEN
"""

_DESCRIPTION = """
HIDDEN
"""

_URL = S3_STORAGE_PATH  # HIDDEN, but trust me, it works

_URLS = {
    "train": "umsab_train_split.csv",
    "validation": "umsab_val_split.csv",
    "test": "umsab_test_split.csv",
}

storage_options = {
    "key": os.getenv("AWS_ACCESS_KEY"),
    "secret": os.getenv("AWS_SECRET_KEY"),
}

aws_fs = s3fs.S3FileSystem(**storage_options)


def custom_download_func(urls):
    # Copy the whole S3 folder into the local datasets cache, then return the local path of each split file.
    aws_fs.download(_URL, f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab", recursive=True)
    return {
        "train": f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab/{urls['train']}",
        "validation": f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab/{urls['validation']}",
        "test": f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab/{urls['test']}",
    }
class UmsabConfig(datasets.BuilderConfig):
    """BuilderConfig for UMSAB."""

    def __init__(self, **kwargs):
        """BuilderConfig for UMSAB.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super(UmsabConfig, self).__init__(**kwargs)


class Umsab(datasets.GeneratorBasedBuilder):
    """UMSAB: the Unified Multilingual Sentiment Analysis Benchmark."""

    BUILDER_CONFIGS = [
        UmsabConfig(
            name="UMSAB",
            version=datasets.Version("1.0.0", ""),
            description=_DESCRIPTION,
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "label": datasets.ClassLabel(num_classes=3, names=["negative", "neutral", "positive"]),
                    "language": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://huggingface.co/datasets/bstrai/umsab",
            citation=_CITATION,
            task_templates=[
                TextClassification(
                    text_column="text",
                    label_column="label",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        # My problem is here:
        downloaded_files = custom_download_func(_URLS)
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["validation"]}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", filepath)
        split_df = pd.read_csv(
            filepath,
            # storage_options=storage_options,
            # could be another solution for me?
        )
        for key, item in split_df.iterrows():
            yield key, {
                **item,
            }
```
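As the commented-out storage_options argument hints, another option I am considering is to skip the local copy entirely and let pandas read the CSVs straight from S3 through s3fs. Here is an untested sketch of the two methods inside the Umsab class (assuming _URL is the bucket prefix without a trailing slash); the downside is that nothing would be saved locally anymore:

```python
    def _split_generators(self, dl_manager):
        # Pass the raw S3 paths straight through; nothing is downloaded here.
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{_URL}/{_URLS['train']}"}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": f"{_URL}/{_URLS['validation']}"}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": f"{_URL}/{_URLS['test']}"}),
        ]

    def _generate_examples(self, filepath):
        # pandas forwards storage_options to s3fs, so the credentials defined above are reused.
        split_df = pd.read_csv(filepath, storage_options=storage_options)
        for key, item in split_df.iterrows():
            yield key, {**item}
```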
Thank you in advance for your answers and your help!