I’m getting started with Hugging Face Datasets to build an object detection model.
The dataset is in COCO format, and my attempts at writing a dataset loading script have failed!
Each image in the COCO dataset has an S3 path (s3://bucket_name/path/to/image.jpeg) which is used to lazily download images. The Create an image dataset tutorial seemed easy enough to adapt for this use case.
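For context, the annotation file looks roughly like this (standard COCO fields; the bucket name and values are made up, shown as a Python literal for brevity):

# data/coco_dataset.json, abridged
{
    "info": {"description": "My object detection dataset", "url": "https://example.com"},
    "images": [
        {
            "id": 1,
            "file_name": "image.jpeg",
            "coco_url": "https://bucket_name.s3.eu-west-1.amazonaws.com/path/to/image.jpeg",
            "width": 640,
            "height": 480,
        },
    ],
    "annotations": [
        {"id": 100, "image_id": 1, "category_id": 1, "bbox": [25, 36, 120, 80]},
    ],
    "categories": [
        {"id": 1, "name": "cat"},
    ],
}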
The test command fails with this error:
self.session = aiobotocore.session.AioSession(**self.kwargs)
TypeError: AioSession.__init__() got an unexpected keyword argument 'hf'
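For reference, I’m running roughly the test command from the tutorial:

datasets-cli test coco_dataset.py --save_info --all_configs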
I’ve also tried leveraging load_dataset_builder as described in the Cloud storage tutorial, but the storage_options still contain that hf key, which shouldn’t be there!
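For example, this is a sketch of what I tried (assuming the script lives at coco_dataset.py; the credentials are placeholders):

from datasets import load_dataset_builder

# Placeholder credentials, per the Cloud storage tutorial
storage_options = {"key": "aws_access_key_id", "secret": "aws_secret_access_key"}
builder = load_dataset_builder("coco_dataset.py", storage_options=storage_options)
builder.download_and_prepare()  # still ends up with the unexpected "hf" entry in storage_options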
Here’s the solution I came up with, which doesn’t seem to work. Passing the dl_manager into _generate_examples seems odd, but how else can the files be lazily loaded instead of all being downloaded up front in the split generator?
import json

import datasets
from PIL import Image as PILImage


class CocoDataset(datasets.GeneratorBasedBuilder):
    def __init__(self, **kwargs):
        # Parse the COCO annotation file before the builder is initialised,
        # since _info() needs the category names.
        self.dataset_path = "data/coco_dataset.json"
        self._init_coco_dataset()
        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
    def _init_coco_dataset(self):
        with open(self.dataset_path, "r") as file:
            dataset_dict = json.load(file)
        if "info" not in dataset_dict:
            self.coco_info = None
        elif isinstance(dataset_dict["info"], dict):
            self.coco_info = dataset_dict["info"]
        else:
            raise ValueError("Invalid COCO dataset info")
        self.dataset = dataset_dict
        # Map category ids to human-readable names for the ClassLabel feature.
        self.id2cat = {cat["id"]: cat["name"] for cat in self.dataset["categories"]}
    def _info(self):
        category_names = [category["name"] for category in self.dataset["categories"]]
        return datasets.DatasetInfo(
            description=self.coco_info.get("description", "") if self.coco_info else "",
            homepage=self.coco_info.get("url", "") if self.coco_info else "",
            version="1.0.0",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "image_id": datasets.Value(dtype="int64"),
                    "file_path": datasets.Value(dtype="string"),
                    "width": datasets.Value(dtype="int64"),
                    "height": datasets.Value(dtype="int64"),
                    "objects": {
                        "bbox": datasets.Sequence(
                            feature=datasets.Sequence(feature=datasets.Value(dtype="int64"), length=4)
                        ),
                        "category": datasets.Sequence(datasets.ClassLabel(names=category_names)),
                    },
                }
            ),
        )
    def _parse_http_to_s3(self, http_url: str) -> str:
        # e.g. "https://bucket_name.s3.eu-west-1.amazonaws.com/path/to/image.jpeg"
        #   -> "s3://bucket_name/path/to/image.jpeg"
        bucket = http_url.replace("https://", "").split(".s3.")[0]
        object_key = http_url.split(".amazonaws.com/")[-1]
        return f"s3://{bucket}/{object_key}"
    def _split_generators(self, dl_manager):
        # Collect per-image metadata keyed by image id...
        samples = dict()
        for image in self.dataset["images"]:
            metadata = {
                "s3_uri": self._parse_http_to_s3(image["coco_url"]),
                "file_name": image["file_name"],
                "width": image["width"],
                "height": image["height"],
            }
            samples[image["id"]] = metadata
        # ...then attach each annotation to its image.
        for ann in self.dataset["annotations"]:
            im_id = ann["image_id"]
            anns = samples[im_id].get("annotation", list())
            anns.append(ann)
            samples[im_id]["annotation"] = anns
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Pass the dl_manager down so images can be fetched one by one
                # in _generate_examples instead of all up front here.
                gen_kwargs={"dl_manager": dl_manager, "data": samples},
            )
        ]
    def _generate_examples(self, dl_manager, data):
        """Generate images and labels for splits."""
        for image_id, metadata in data.items():
            # Download (and cache) the image from S3 only when the example
            # is actually generated.
            file_path = dl_manager.download(metadata["s3_uri"])
            labels = []
            bboxes = []
            for ann in metadata.get("annotation", list()):
                labels.append(self.id2cat[ann["category_id"]])
                bboxes.append(ann["bbox"])
            features = {
                "image_id": image_id,
                "image": PILImage.open(file_path),
                "file_path": file_path,
                "width": metadata["width"],
                "height": metadata["height"],
                "objects": {"bbox": bboxes, "category": labels},
            }
            yield image_id, features
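One alternative I’m considering is to skip the dl_manager entirely and stream each object from S3 inside _generate_examples. A sketch of that variant (assuming s3fs is installed, AWS credentials come from the environment, and gen_kwargs is reduced to {"data": samples}):

import s3fs

    def _generate_examples(self, data):
        # Sketch only: open each object straight from S3 instead of going
        # through dl_manager.download.
        fs = s3fs.S3FileSystem()  # picks up credentials from env/config
        for image_id, metadata in data.items():
            anns = metadata.get("annotation", [])
            with fs.open(metadata["s3_uri"], "rb") as f:
                image = PILImage.open(f)
                image.load()  # force the read while the file handle is still open
            yield image_id, {
                "image_id": image_id,
                "image": image,
                "file_path": metadata["s3_uri"],
                "width": metadata["width"],
                "height": metadata["height"],
                "objects": {
                    "bbox": [ann["bbox"] for ann in anns],
                    "category": [self.id2cat[ann["category_id"]] for ann in anns],
                },
            }

This sidesteps the unexpected hf storage option since s3fs is constructed directly, but it also bypasses the dl_manager’s download cache, so I’m not sure it’s the intended approach.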