Persistent storage who can access?

Hi
I save some files to my Persistent storage , who can access to files ?
Is it safe from other Space users, or not?

1 Like

I don’t think that Persistent Storage is a free pass either, because even in a normal Space, most paths cannot be touched unless explicitly allowed by the program.
I think so, but it’s about a paid service, so you’d better ask the HF representative.:sweat:

1 Like

Yes, I mean the paid service (Persistent Storage). How can I ask an HF representative?

Thank you

1 Like

Hi @ADalsrehy
If you want to save your data into a huggingface dataset instead you can use a commit scheduler.
These are some methods proposed by wauplin to push your data (I have hot patched his space in here Space to Dataset Saver - a Hugging Face Space by not-lain )

save as json

import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr

from huggingface_hub import CommitScheduler


# Local staging directory whose contents are periodically committed to the Hub.
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Unique file name per app start so concurrent Space replicas never append to
# the same file (which would cause conflicting commits).
JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"

# Background scheduler: commits everything under `folder_path` into the "data/"
# folder of the dataset repo. Per the note below, pushes happen roughly every
# 5 minutes, not in real time.
scheduler = CommitScheduler(
    repo_id="example-space-to-dataset-json",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)


def greet(name: str) -> str:
    """Return a greeting string for *name*."""
    return f"Hello {name}!"


def save_json(name: str, greetings: str) -> None:
    """Append one record (name, greeting, timestamp) as a JSON line to the local file.

    Holding the scheduler lock keeps the append atomic with respect to the
    background commit job.
    """
    record = {
        "name": name,
        "greetings": greetings,
        "datetime": datetime.now().isoformat(),
    }
    with scheduler.lock, JSON_DATASET_PATH.open("a") as f:
        f.write(json.dumps(record) + "\n")


# Gradio UI: greet on click, then persist the (input, output) pair.
with gr.Blocks() as demo:
    with gr.Row():
        greet_name = gr.Textbox(label="Name")
        greet_output = gr.Textbox(label="Greetings")
    greet_btn = gr.Button("Greet")
    # `.success(...)` fires only when `greet` completed without raising,
    # so only successful runs are written to the dataset file.
    greet_btn.click(fn=greet, inputs=greet_name, outputs=greet_output).success(
        fn=save_json,
        inputs=[greet_name, greet_output],
        outputs=None,
    )


demo.launch()

save image dataset

import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr
import numpy as np
from PIL import Image

from huggingface_hub import CommitScheduler, InferenceClient


# Local staging folder, unique per app start, so parallel replicas don't collide.
IMAGE_DATASET_DIR = Path("image_dataset") / f"train-{uuid4()}"
IMAGE_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# JSON-lines metadata file mapping each saved image file to its prompt.
IMAGE_JSONL_PATH = IMAGE_DATASET_DIR / "metadata.jsonl"

# Background scheduler: commits the staging folder to the dataset repo
# (per the note below, roughly every 5 minutes).
scheduler = CommitScheduler(
    repo_id="example-space-to-dataset-image",
    repo_type="dataset",
    folder_path=IMAGE_DATASET_DIR,
    path_in_repo=IMAGE_DATASET_DIR.name,
)

# Client used for text-to-image generation (see `generate_image` below).
client = InferenceClient()


def generate_image(prompt: str) -> Image.Image:
    # Annotation fixed: `Image` alone names the PIL module, not the image class.
    """Generate an image from *prompt* via the inference client."""
    return client.text_to_image(prompt)


def save_image(prompt: str, image_array: np.ndarray) -> None:
    """Write the generated image and a matching metadata JSON line to the staging folder.

    Both writes happen under the scheduler lock so a background commit never
    sees an image without its metadata line (or vice versa).
    """
    image_path = IMAGE_DATASET_DIR / f"{uuid4()}.png"
    record = {
        "prompt": prompt,
        "file_name": image_path.name,
        "datetime": datetime.now().isoformat(),
    }
    with scheduler.lock:
        Image.fromarray(image_array).save(image_path)
        with IMAGE_JSONL_PATH.open("a") as f:
            f.write(json.dumps(record) + "\n")


# Gradio UI: generate an image on click, then persist prompt + image on success.
with gr.Blocks() as demo:
    with gr.Row():
        prompt_value = gr.Textbox(label="Prompt")
        image_value = gr.Image(label="Generated image")
    text_to_image_btn = gr.Button("Generate")
    # `.success(...)` only runs when generation did not raise, so failed
    # generations are never written to the dataset.
    text_to_image_btn.click(fn=generate_image, inputs=prompt_value, outputs=image_value).success(
        fn=save_image,
        inputs=[prompt_value, image_value],
        outputs=None,
    )


demo.launch()

save 1 million samples

import json
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr
import numpy as np
from PIL import Image

from huggingface_hub import CommitScheduler, InferenceClient


# Local staging folder for generated images, unique per app start so parallel
# replicas never write into the same folder.
IMAGE_DATASET_DIR = Path("image_dataset_1M") / f"train-{uuid4()}"

IMAGE_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# JSON-lines metadata file mapping each saved image file to its prompt.
IMAGE_JSONL_PATH = IMAGE_DATASET_DIR / "metadata.jsonl"


class ZipScheduler(CommitScheduler):
    """
    Custom CommitScheduler with an overwritten `push_to_hub` that zips images before pushing them to the Hub.

    Workflow:
    1. Read metadata + list PNG files.
    2. Zip png files in a single archive.
    3. Create commit (metadata + archive).
    4. Delete local png files to avoid re-uploading them later.

    Only step 1 requires to activate the lock. Once the metadata is read, the lock is released and the rest of the
    process can be done without blocking the Gradio app.
    """

    def push_to_hub(self):
        # 1. Read metadata + list PNG files. Done under the lock so writers
        #    cannot add files while we snapshot the current state.
        with self.lock:
            png_files = list(self.folder_path.glob("*.png"))
            if not png_files:
                return None  # return early if nothing to commit

            # Read and delete metadata file. Unlink is best-effort: a transient
            # filesystem error must not abort the whole push.
            metadata = IMAGE_JSONL_PATH.read_text()
            try:
                IMAGE_JSONL_PATH.unlink()
            except Exception:
                pass

        with tempfile.TemporaryDirectory() as tmpdir:
            # 2. Zip png files + metadata in a single archive.
            #    (Renamed the handle from `zip` to `archive`: `zip` shadows the builtin.)
            archive_path = Path(tmpdir) / "train.zip"
            with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
                # PNG files
                for png_file in png_files:
                    archive.write(filename=png_file, arcname=png_file.name)

                # Metadata
                tmp_metadata = Path(tmpdir) / "metadata.jsonl"
                tmp_metadata.write_text(metadata)
                archive.write(filename=tmp_metadata, arcname="metadata.jsonl")

            # 3. Create commit. A unique archive name per push means successive
            #    pushes never overwrite each other in the repo.
            self.api.upload_file(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                path_in_repo=f"train-{uuid4()}.zip",
                path_or_fileobj=archive_path,
            )

        # 4. Delete local png files to avoid re-uploading them later (best effort).
        for png_file in png_files:
            try:
                png_file.unlink()
            except Exception:
                pass


# Background scheduler using the custom zip-and-push logic above. No
# `path_in_repo` is given: `push_to_hub` uploads each archive at the repo root
# under a unique `train-<uuid>.zip` name.
scheduler = ZipScheduler(
    repo_id="example-space-to-dataset-image-zip",
    repo_type="dataset",
    folder_path=IMAGE_DATASET_DIR,
)

# Client used for text-to-image generation (see `generate_image` below).
client = InferenceClient()


def generate_image(prompt: str) -> Image.Image:
    # Annotation fixed: `Image` alone names the PIL module, not the image class.
    """Generate an image from *prompt* via the inference client."""
    return client.text_to_image(prompt)


def save_image(prompt: str, image_array: np.ndarray) -> None:
    """Persist the generated image plus a metadata JSON line into the staging folder."""
    print("Saving: " + prompt)
    image_path = IMAGE_DATASET_DIR / f"{uuid4()}.png"
    record = {
        "prompt": prompt,
        "file_name": image_path.name,
        "datetime": datetime.now().isoformat(),
    }
    # Lock so the background push never sees an image without its metadata line.
    with scheduler.lock:
        Image.fromarray(image_array).save(image_path)
        with IMAGE_JSONL_PATH.open("a") as f:
            f.write(json.dumps(record) + "\n")


# Gradio UI: generate an image on click, then persist prompt + image on success.
with gr.Blocks() as demo:
    with gr.Row():
        prompt_value = gr.Textbox(label="Prompt")
        image_value = gr.Image(label="Generated image")
    text_to_image_btn = gr.Button("Generate")
    # `.success(...)` only runs when generation did not raise, so failed
    # generations are never saved.
    text_to_image_btn.click(fn=generate_image, inputs=prompt_value, outputs=image_value).success(
        fn=save_image,
        inputs=[prompt_value, image_value],
        outputs=None,
    )


demo.launch()

note that commit schedulers do not push your data in real time and they upload once every 5 minutes.

Sorry, but I mean: if I saved files into the Persistent Storage paid service, who can access the files? Only the Space app, or all users?
Not interested to use datasets
Thanks

1 Like

@ADalsrehy
I have found a Space that is using persistent storage, and they are saving the data under /data — and no, we can’t access their data.

I am not sure of my information, but I think that all data stays hidden if it is stored in any place other than /app.
I’m not sure what happens if you store something under /app, but I don’t think it’s public either, because everything is actually placed in a Docker container underneath the hood and deployed in Spaces.

1 Like

Also even in Gradio, users should not be able to access most folders unless we specify it this way.
Even if you allow it in this way, the list of files in the images folder will not be public, but only downloadable if you specify the path and hit a lucky guess.

with gr.Blocks() as app:
~
app.launch(allowed_paths=["./images/"]) 
1 Like