Hi @ADalsrehy
If you want to save your data into a Hugging Face dataset instead, you can use a `CommitScheduler`.
Here are some methods proposed by wauplin for pushing your data (I have hot-patched his Space here: Space to Dataset Saver - a Hugging Face Space by not-lain).
Save as JSON
```python
import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr
from huggingface_hub import CommitScheduler

JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Each Space restart writes to a new file so concurrent replicas don't clash
JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"

scheduler = CommitScheduler(
    repo_id="example-space-to-dataset-json",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)


def greet(name: str) -> str:
    return "Hello " + name + "!"


def save_json(name: str, greetings: str) -> None:
    # Hold the scheduler's lock so a commit can't happen mid-write
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump({"name": name, "greetings": greetings, "datetime": datetime.now().isoformat()}, f)
            f.write("\n")


with gr.Blocks() as demo:
    with gr.Row():
        greet_name = gr.Textbox(label="Name")
        greet_output = gr.Textbox(label="Greetings")
    greet_btn = gr.Button("Greet")
    greet_btn.click(fn=greet, inputs=greet_name, outputs=greet_output).success(
        fn=save_json,
        inputs=[greet_name, greet_output],
        outputs=None,
    )

demo.launch()
```
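Once a few commits have gone through, you can load the accumulated JSON-lines files back with the `datasets` library. A minimal sketch, assuming your actual repo id (the id below is a placeholder):

```python
from datasets import load_dataset

# The scheduler pushes JSON-lines files under data/ in the dataset repo;
# load_dataset auto-detects them as a JSON dataset.
ds = load_dataset("your-username/example-space-to-dataset-json")
print(ds["train"][0])
```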
Save an image dataset
```python
import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr
import numpy as np
from PIL import Image
from huggingface_hub import CommitScheduler, InferenceClient

IMAGE_DATASET_DIR = Path("image_dataset") / f"train-{uuid4()}"
IMAGE_DATASET_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_JSONL_PATH = IMAGE_DATASET_DIR / "metadata.jsonl"

scheduler = CommitScheduler(
    repo_id="example-space-to-dataset-image",
    repo_type="dataset",
    folder_path=IMAGE_DATASET_DIR,
    path_in_repo=IMAGE_DATASET_DIR.name,
)

client = InferenceClient()


def generate_image(prompt: str) -> Image.Image:
    return client.text_to_image(prompt)


def save_image(prompt: str, image_array: np.ndarray) -> None:
    image_path = IMAGE_DATASET_DIR / f"{uuid4()}.png"

    # Lock while writing so a scheduled commit never sees a half-written file
    with scheduler.lock:
        Image.fromarray(image_array).save(image_path)
        with IMAGE_JSONL_PATH.open("a") as f:
            json.dump({"prompt": prompt, "file_name": image_path.name, "datetime": datetime.now().isoformat()}, f)
            f.write("\n")


with gr.Blocks() as demo:
    with gr.Row():
        prompt_value = gr.Textbox(label="Prompt")
        image_value = gr.Image(label="Generated image")
    text_to_image_btn = gr.Button("Generate")
    text_to_image_btn.click(fn=generate_image, inputs=prompt_value, outputs=image_value).success(
        fn=save_image,
        inputs=[prompt_value, image_value],
        outputs=None,
    )

demo.launch()
```
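Because the PNGs sit next to a `metadata.jsonl`, the repo should follow the ImageFolder convention, so it can be loaded directly. A sketch, again assuming a placeholder repo id:

```python
from datasets import load_dataset

# metadata.jsonl alongside the images lets datasets attach the `prompt`
# column to each decoded image automatically (ImageFolder convention).
ds = load_dataset("your-username/example-space-to-dataset-image")
print(ds["train"][0]["prompt"])
```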
Save 1 million samples
```python
import json
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr
import numpy as np
from PIL import Image
from huggingface_hub import CommitScheduler, InferenceClient

IMAGE_DATASET_DIR = Path("image_dataset_1M") / f"train-{uuid4()}"
IMAGE_DATASET_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_JSONL_PATH = IMAGE_DATASET_DIR / "metadata.jsonl"


class ZipScheduler(CommitScheduler):
    """
    Example of a custom CommitScheduler with an overridden `push_to_hub` that zips images before pushing them to the Hub.

    Workflow:
        1. Read metadata + list PNG files.
        2. Zip the PNG files into a single archive.
        3. Create the commit (metadata + archive).
        4. Delete the local PNG files to avoid re-uploading them later.

    Only step 1 requires the lock to be held. Once the metadata is read, the lock is released and the rest of the
    process can be done without blocking the Gradio app.
    """

    def push_to_hub(self):
        # 1. Read metadata + list PNG files
        with self.lock:
            png_files = list(self.folder_path.glob("*.png"))
            if len(png_files) == 0:
                return None  # return early if nothing to commit

            # Read and delete the metadata file
            metadata = IMAGE_JSONL_PATH.read_text()
            try:
                IMAGE_JSONL_PATH.unlink()
            except Exception:
                pass

        with tempfile.TemporaryDirectory() as tmpdir:
            # 2. Zip PNG files + metadata into a single archive
            archive_path = Path(tmpdir) / "train.zip"
            with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
                # PNG files
                for png_file in png_files:
                    archive.write(filename=png_file, arcname=png_file.name)

                # Metadata
                tmp_metadata = Path(tmpdir) / "metadata.jsonl"
                tmp_metadata.write_text(metadata)
                archive.write(filename=tmp_metadata, arcname="metadata.jsonl")

            # 3. Create the commit
            self.api.upload_file(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                path_in_repo=f"train-{uuid4()}.zip",
                path_or_fileobj=archive_path,
            )

        # 4. Delete local PNG files to avoid re-uploading them later
        for png_file in png_files:
            try:
                png_file.unlink()
            except Exception:
                pass


scheduler = ZipScheduler(
    repo_id="example-space-to-dataset-image-zip",
    repo_type="dataset",
    folder_path=IMAGE_DATASET_DIR,
)

client = InferenceClient()


def generate_image(prompt: str) -> Image.Image:
    return client.text_to_image(prompt)


def save_image(prompt: str, image_array: np.ndarray) -> None:
    print("Saving: " + prompt)
    image_path = IMAGE_DATASET_DIR / f"{uuid4()}.png"

    with scheduler.lock:
        Image.fromarray(image_array).save(image_path)
        with IMAGE_JSONL_PATH.open("a") as f:
            json.dump({"prompt": prompt, "file_name": image_path.name, "datetime": datetime.now().isoformat()}, f)
            f.write("\n")


with gr.Blocks() as demo:
    with gr.Row():
        prompt_value = gr.Textbox(label="Prompt")
        image_value = gr.Image(label="Generated image")
    text_to_image_btn = gr.Button("Generate")
    text_to_image_btn.click(fn=generate_image, inputs=prompt_value, outputs=image_value).success(
        fn=save_image,
        inputs=[prompt_value, image_value],
        outputs=None,
    )

demo.launch()
```
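The zipped archives are not in a layout `datasets` can stream directly, so you would pull them back down yourself. A minimal sketch using `HfApi`, assuming a placeholder repo id:

```python
import zipfile
from huggingface_hub import HfApi, hf_hub_download

repo_id = "your-username/example-space-to-dataset-image-zip"  # assumption: your actual repo id

api = HfApi()
# List every train-*.zip archive the scheduler has pushed so far
zip_files = [f for f in api.list_repo_files(repo_id, repo_type="dataset") if f.endswith(".zip")]

for filename in zip_files:
    local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=filename)
    with zipfile.ZipFile(local_path) as archive:
        archive.extractall(f"extracted/{filename.removesuffix('.zip')}")
```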
Note that the `CommitScheduler` does not push your data in real time; by default it commits once every 5 minutes (configurable via the `every` parameter, in minutes).
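For example, a short sketch of tuning the interval and forcing a push (the repo values are placeholders):

```python
from huggingface_hub import CommitScheduler

scheduler = CommitScheduler(
    repo_id="your-username/example-space-to-dataset-json",  # assumption: your repo id
    repo_type="dataset",
    folder_path="json_dataset",
    path_in_repo="data",
    every=10,  # push every 10 minutes instead of the default 5
)

# Force an immediate push without waiting for the next tick
# (trigger() returns a Future you can wait on)
scheduler.trigger().result()

# Stop the background thread on shutdown; stop() by itself does not
# trigger a final push (using the scheduler as a context manager does)
scheduler.stop()
```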