How to write a dataset loading script using private S3 storage

Hello everyone,
I already store the CSV files for my dataset in private S3 storage, and I want to write a cloud-storage loading script that downloads the dataset from S3 and saves it locally, so that I can then call it with load_dataset as usual.
I was not able to find in the documentation how to do that properly …

Here is what my tests led me to:

This is a working solution for my problem, but it is not clean. Do you have a better idea?

import os
import s3fs

import datasets
import pandas as pd
from datasets.tasks import TextClassification


logger = datasets.logging.get_logger(__name__)


_CITATION = """
HIDDEN
 """

_DESCRIPTION = """
HIDDEN
"""


_URL = S3_STORAGE_PATH  # HIDDEN, but trust me, it works
_URLS = {
    "train": "umsab_train_split.csv",
    "validation": "umsab_val_split.csv",
    "test": "umsab_test_split.csv",
}

storage_options = {
    "key": os.getenv("AWS_ACCESS_KEY"),
    "secret": os.getenv("AWS_SECRET_KEY"),
}

aws_fs = s3fs.S3FileSystem(**storage_options)


def custom_download_func(urls):
    # Download the whole S3 prefix into the local datasets download cache,
    # then return the local path of each split's CSV file.
    aws_fs.download(_URL, f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab", recursive=True)

    return {
        "train": f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab/{urls['train']}",
        "validation": f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab/{urls['validation']}",
        "test": f"{datasets.config.DOWNLOADED_DATASETS_PATH}/umsab/{urls['test']}",
    }

class UmsabConfig(datasets.BuilderConfig):
    """BuilderConfig for UMSAB."""

    def __init__(self, **kwargs):
        """BuilderConfig for UMSAB.

        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(UmsabConfig, self).__init__(**kwargs)


class Umsab(datasets.GeneratorBasedBuilder):
    """UMSAB: the Unified Multilingual Sentiment Analysis Benchmark."""

    BUILDER_CONFIGS = [
        UmsabConfig(
            name="UMSAB",
            version=datasets.Version("1.0.0", ""),
            description=_DESCRIPTION,
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "label": datasets.ClassLabel(num_classes=3, names=["negative", "neutral", "positive"]),
                    "language": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://huggingface.co/datasets/bstrai/umsab",
            citation=_CITATION,
            task_templates=[
                TextClassification(
                    text_column="text",
                    label_column="label",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        # My problem is here:
        downloaded_files = custom_download_func(_URLS)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["validation"]}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", filepath)

        split_df = pd.read_csv(
            filepath,
            # storage_options=storage_options,
            # could be another solution for me ?
        )
        for key, row in split_df.iterrows():
            yield key, dict(row)
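
For reference, here is how I would call the script once it is saved locally. This is just a minimal sketch, assuming the file above is saved as umsab.py (the name is only an example) and that the AWS credentials are already set in the environment:

from datasets import load_dataset

# Assumes AWS_ACCESS_KEY and AWS_SECRET_KEY are already exported in the environment,
# since the loading script above reads them with os.getenv().
# "umsab.py" is just an example local path for the script shown above.
dataset = load_dataset("umsab.py")
print(dataset)  # DatasetDict with the train / validation / test splits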

Thank you in advance for your answers and your help 🙂

Hi! We plan to add native support for S3 paths; we opened a discussion here: Support cloud storage in load_dataset · Issue #5281 · huggingface/datasets · GitHub

In the meantime you can indeed use download_custom:

    def _split_generators(self, dl_manager):
        downloaded_files = dl_manager.download_custom(_URLS, aws_fs.download)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["validation"]}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}),
        ]
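
One detail to double-check (this is my assumption, not something guaranteed by the snippet above): download_custom passes each URL together with a local cache path to the callable, so the values in _URLS should be full S3 URIs rather than bare file names for aws_fs.download to resolve them. A minimal sketch, reusing the _URL prefix from the script above:

# Hypothetical: build full S3 URIs from the hidden bucket prefix so that
# dl_manager.download_custom(_URLS, aws_fs.download) can fetch each file.
_URLS = {
    "train": f"{_URL}/umsab_train_split.csv",
    "validation": f"{_URL}/umsab_val_split.csv",
    "test": f"{_URL}/umsab_test_split.csv",
}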

Nice, thank you for your solution, it works as expected! I had tried to implement my own replacement for download_custom(), but this solution is better.