Help making object detection dataset

I’ve created a small test dataset to attempt to get a general kwcoco-to-huggingface conversion tool working.

I tried to follow the Object detection guide to add standardized object detection information, but I’m not sure if Hugging Face is picking it up correctly. When I tried writing the column-wise “objects” dictionary to a webdataset.ShardWriter, it complained that it wasn’t a recognized field, so I had to add it to the “json” portion instead, but I’m not sure if I’ve just added generic metadata or if this really is the place to put annotation information.

If annotations are specified correctly, should I expect them to be drawn on the images in the web dataset viewer?

Is there even a standardized way to add a set of boxes / segmentations to a webdataset? Looking at other datasets on huggingface with detections / segmentations, I’m not really seeing much common structure. Is this use-case underdeveloped?
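
For reference, this is roughly the per-example layout I was trying to mirror from the Object detection guide (field names from its cppe-5 example; the values below are made up for illustration, and the [x, y, width, height] pixel-box reading is my interpretation of the docs):

example = {
    "image_id": 42,
    "width": 640,
    "height": 480,
    "objects": {
        "id": [1, 2],
        "area": [3200, 1500],
        "bbox": [[10.0, 20.0, 80.0, 40.0], [300.0, 120.0, 50.0, 30.0]],  # [x, y, w, h] in pixels
        "category": [0, 1],
    },
}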

My updated conversion script is in the details below.

#!/usr/bin/env python3
# /// script
# dependencies = [
#   "Pillow",
#   "huggingface_hub",
#   "kwcoco",
#   "kwutil",
#   "scriptconfig",
#   "ubelt",
#   "webdataset",
# ]
# requires-python = ">=3.11"
# ///
r"""
Convert a KWCoco dataset with train/vali/test splits to Hugging Face WebDataset format.

Example usage (locally):

    python kwcoco_to_hf_webdataset.py \
        --bundle_dir /data/joncrall/dvc-repos/shitspotter_dvc \
        --output_dir /data/joncrall/dvc-repos/shitspotter_dvc/webdataset_shards \
        --hf_repo erotemic/shitspotter

References:
    https://huggingface.co/datasets/erotemic/shitspotter
    https://discuss.huggingface.co/t/help-making-object-detection-dataset/152344
    https://discuss.huggingface.co/t/generating-croissant-metadata-for-custom-image-dataset/150255
    https://chatgpt.com/c/680be71a-4a0c-8002-a31e-bd9c17b5ac05

Example:
    >>> # Demo of full conversion
    >>> from kwcoco_to_hf_webdataset import *  # NOQA
    >>> import ubelt as ub
    >>> import kwcoco
    >>> dpath = ub.Path.appdir('kwcoco/demo/hf-convert').ensuredir()
    >>> full_dset = kwcoco.CocoDataset.demo('shapes32')
    >>> full_dset.reroot(absolute=True)
    >>> # Create splits
    >>> split_names = ['train', 'validation', 'test']
    >>> imgid_chunks = list(ub.chunks(full_dset.images(), nchunks=3))
    >>> for split_name, gids in zip(split_names, imgid_chunks):
    >>>     sub_dset = full_dset.subset(gids)
    >>>     sub_dset.fpath = dpath / (split_name + '.kwcoco.zip')
    >>>     sub_dset.dump()
    >>> # Call conversion script
    >>> config = KwcocoToHFConfig(
    >>>     bundle_dir=dpath,
    >>>     output_dir=dpath / 'webds',
    >>>     hf_repo=None,
    >>>     #hf_repo='erotemic/shapes',
    >>> )
    >>> KwcocoToHFConfig.main(argv=False, **config)
    >>> # Test conversion can be read by a torch dataloader
    >>> check_webdataset_as_torch(dpath / 'webds/train/*.tar')
    >>> # xdoctest: +REQUIRES(--upload)
    >>> # Test upload
    >>> hf_repo = 'erotemic/shapes'
    >>> upload_to_hub(hf_repo, config.bundle_dir, config.output_dir)
"""

import json
import kwcoco
import kwutil
import os
import ubelt as ub
import webdataset
from PIL import Image
from huggingface_hub import HfApi, upload_file
from io import BytesIO
import scriptconfig as scfg


class KwcocoToHFConfig(scfg.DataConfig):
    """
    Convert a KWCoco bundle (train/vali/test .kwcoco.zip files) to Hugging Face WebDataset format.
    """

    bundle_dir = scfg.Value(
        None,
        help=ub.paragraph(
            """
            Directory with train/vali/test .kwcoco.zip files
            """
        ),
    )
    output_dir = scfg.Value(
        None,
        help=ub.paragraph(
            """
            Output dir for WebDataset .tar files
            """
        ),
    )
    hf_repo = scfg.Value(
        None,
        help=ub.paragraph(
            """
            If specified, push to this huggingface repo.
            (e.g. erotemic/shitspotter)
            """
        ),
    )

    @classmethod
    def main(cls, argv=None, **kwargs):
        import rich
        from rich.markup import escape
        config = cls.cli(argv=argv, data=kwargs, strict=True)
        rich.print('config = ' + escape(ub.urepr(config, nl=1)))

        bundle_dir = ub.Path(config.bundle_dir)
        output_dir = ub.Path(config.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        splits = ["train", "validation", "test"]

        # Verify that all splits exist before doing any work
        for split in splits:
            coco_fpath = bundle_dir / f"{split}.kwcoco.zip"
            if not coco_fpath.exists():
                raise FileNotFoundError(f"Missing {split} split at {coco_fpath}")

        for split in splits:
            coco_fpath = bundle_dir / f"{split}.kwcoco.zip"
            out_tar = output_dir / f"{split}/{split}-%06d.tar"
            categories_out = output_dir / f"{split}_categories.json"
            convert_coco_to_webdataset(coco_fpath, out_tar, categories_out)

        if config.hf_repo is not None:
            hf_repo = config.hf_repo
            upload_to_hub(hf_repo, bundle_dir, output_dir)


def check_webdataset_as_torch(urls):
    """
    Args:
        urls (str):
            glob pattern matching the tar files or directory containing them.
    """
    # Once converted, test that we can use a pytorch dataloader:
    import webdataset as wds
    import torch
    from torchvision.transforms import ToTensor
    import kwutil

    urls = list(map(os.fspath, kwutil.util_path.coerce_patterned_paths(urls, expected_extension='.tar')))
    print(f'urls = {ub.urepr(urls, nl=1)}')
    assert urls

    # decode to PIL, then map PIL→Tensor
    dset = (
        wds.WebDataset(urls)
        .decode("pil")
        .to_tuple("jpg", "json")
        .map_tuple(ToTensor(), lambda meta: meta)
    )
    loader = torch.utils.data.DataLoader(dset.batched(2))
    for imgs, metas in loader:
        # imgs is a list of torch.Tensors, metas is a list of dicts
        print(imgs[0].shape, metas[0])
        break


def convert_coco_to_webdataset(coco_dset, out_tar, categories_out=None):
    """
    Convert a coco dataset to a webdataset suitable for huggingface.

    Args:
        coco_dset (str | PathLike | CocoDataset):
            path to the coco dataset or the coco dataset itself.

        out_tar (str | PathLike): this is the patterned path
            to write sharded tar files to.

        categories_out (str | PathLike | None):
            if given, write the category json file to this path

    Example:
        >>> from kwcoco_to_hf_webdataset import *  # NOQA
        >>> import ubelt as ub
        >>> import kwcoco
        >>> dpath = ub.Path.appdir('kwcoco/test/hf-convert').ensuredir()
        >>> coco_dset = kwcoco.CocoDataset.demo('shapes8')
        >>> out_tar = dpath / "test_wds/test-wds-%06d.tar"
        >>> categories_out = dpath / "test_wds_categories.json"
        >>> urls = written_files = convert_coco_to_webdataset(coco_dset, out_tar, categories_out)
        >>> check_webdataset_as_torch(urls)
    """
    dset = kwcoco.CocoDataset.coerce(coco_dset)
    print(f"[INFO] Loaded {coco_dset}")

    if categories_out is not None:
        categories_out = ub.Path(categories_out)
        if not categories_out.exists():
            cats = dset.dataset.get("categories", [])
            categories_out.write_text(json.dumps(cats, indent=2))
            print(f"[INFO] Wrote {categories_out.name} with {len(cats)} categories")

    ub.Path(out_tar).parent.ensuredir()
    sink = webdataset.ShardWriter(pattern=str(out_tar), maxcount=1000)

    dset.conform(legacy=True)

    written_files = ub.oset()

    pman = kwutil.ProgressManager()
    with pman:
        coco_images = dset.images().coco_images
        prog_iter = pman.progiter(coco_images, desc=f"Processing {dset.tag}")
        for coco_img in prog_iter:
            image_id = coco_img.img["id"]
            img_path = coco_img.image_filepath()
            img_pil = Image.open(img_path).convert("RGB")

            # Save image to bytes
            img_bytes = BytesIO()
            img_pil.save(img_bytes, format="jpeg")
            img_bytes = img_bytes.getvalue()

            # Convert annots to basic JSON-serializable format

            # Attempt to make dataset object detection ready.
            # https://huggingface.co/docs/datasets/v2.14.5/en/object_detection
            objects = {
                "area": [],
                "bbox": [],
                "category": [],
                "id": [],
            }
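            # NOTE: kwcoco stores boxes in COCO-style [x, y, width, height]
            # pixel coordinates, which I *think* matches the "bbox" convention
            # in the HF object detection guide; "category" is the integer
            # category_id and "id" is the annotation id.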
            for ann in coco_img.annots().objs:
                objects["area"].append(int(ann["area"]))
                objects["bbox"].append(ann["bbox"])
                objects["category"].append(ann["category_id"])
                objects["id"].append(ann["id"])

            anns = []
            for ann in coco_img.annots().objs:
                anns.append(
                    {
                        "bbox": ann["bbox"],
                        "category_id": ann["category_id"],
                        "segmentation": ann.get("segmentation", None),
                        "iscrowd": ann.get("iscrowd", 0),
                    }
                )
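            # Keep both layouts in the json payload: the column-wise "objects"
            # dict mirroring the HF object detection guide, and a COCO-style
            # "annotations" list, since it is unclear which one downstream
            # tools (or the dataset viewer) actually look for.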

            # Save JSON metadata
            sample = {
                "__key__": str(image_id),
                "jpg": img_bytes,
                # "image_id": image_id,
                # "width": coco_img.img["width"],
                # "height": coco_img.img["height"],
                # "objects": objects,
                "json": json.dumps(
                    {
                        "id": image_id,
                        "image_id": image_id,
                        "file_name": os.path.basename(img_path),
                        "width": coco_img.img["width"],
                        "height": coco_img.img["height"],
                        "objects": objects,
                        "annotations": anns,
                    }
                ),
            }

            sink.write(sample)
            written_files.append(sink.fname)

    sink.close()
    written_files = list(written_files)
    print(f"Saved {written_files}")
    return written_files


def upload_to_hub(hf_repo, bundle_dir, output_dir):
    api = HfApi()  # NOQA
    output_dir = ub.Path(output_dir)

    for file in output_dir.glob("*/*.tar"):
        print(f"[UPLOAD] Uploading {file.name} to {hf_repo}")
        upload_file(
            path_or_fileobj=str(file),
            path_in_repo=str(file.relative_to(bundle_dir)),
            repo_id=hf_repo,
            repo_type="dataset",
        )
    for categories_file in output_dir.glob("*categories.json"):
        upload_file(
            path_or_fileobj=str(categories_file),
            path_in_repo=str(categories_file.relative_to(bundle_dir)),
            repo_id=hf_repo,
            repo_type="dataset",
        )


if __name__ == "__main__":
    KwcocoToHFConfig.main()
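
For what it’s worth, one way I’ve been thinking about checking what schema actually gets inferred from the shards is to load them back through the generic “webdataset” builder in the datasets library. This is only a sketch; it assumes a recent datasets version that ships that builder, and the glob below points at my local output, not the Hub repo:

from datasets import load_dataset

# Load the local shards with the generic "webdataset" builder and inspect
# which columns/features get inferred (sketch, not a verified recipe).
ds = load_dataset(
    "webdataset",
    data_files={"train": "webdataset_shards/train/*.tar"},
    split="train",
    streaming=True,
)
sample = next(iter(ds))
print(sample.keys())   # expecting something like: __key__, __url__, jpg, json
print(sample["json"])  # the metadata dict, including the "objects" field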