Hey!
I have a dataset of images and text, and I am trying to upload it to the Hub using the script below. I was wondering how to create a subset, because everything is being put in the "gabrielsantosrv--pracegover" subset.
Thanks in advance
from datasets import Dataset, load_dataset
from PIL import Image
import io
import json
def try_load_image(filepath):
    """Load the image stored at ``filepath`` as a PIL image.

    The file is read fully into memory and decoded from an in-memory
    buffer, so the file handle is already closed when the image is returned.

    Returns the opened ``Image.Image``, or ``None`` when PIL cannot identify
    the file contents as an image. Errors raised while opening or reading the
    file itself are not caught here.
    """
    try:
        with open(filepath, 'rb') as handle:
            raw_bytes = handle.read()
        candidate = Image.open(io.BytesIO(raw_bytes))
    except Image.UnidentifiedImageError:
        return None
    return candidate if isinstance(candidate, Image.Image) else None
if __name__ == "__main__":
    # Build an image+text dataset from a local JSON file and push it to the Hub.
    split = "demo"
    filepath = "sample/dataset_sample.json"
    repo = "gabrielsantosrv/pracegover"

    # Records are read from the "data" field of the JSON file.
    dataset = load_dataset('json', data_files=filepath, field="data")
    dataset[split] = dataset.pop("train")  # renaming key from train to `split`

    # map() returns a new dataset instead of modifying the existing one, so the
    # result has to be stored back; otherwise the "img" column is dropped and
    # never reaches the pushed dataset.
    dataset[split] = dataset[split].map(
        lambda example: {"img": try_load_image(f"sample/images/{example['filename']}")}
    )

    # NOTE(review): recent `datasets` releases appear to accept a `config_name`
    # argument on push_to_hub to choose the subset name; confirm against the
    # installed version before relying on it.
    dataset.push_to_hub(repo)