It looks like I’ve got a variant of the dataset uploaded using WebDataset, and the dataset viewer seems to be able to extract metadata from it.
Here is the script I used to get things working:
# /// script
# dependencies = [
#     "Pillow",
#     "huggingface_hub",
#     "kwcoco",
#     "kwutil",
#     "scriptconfig",
#     "ubelt",
#     "webdataset",
# ]
# requires-python = ">=3.11"
# ///
"""
Convert a KWCoco dataset with train/vali/test splits to Hugging Face WebDataset format.
Example usage (locally):
python kwcoco_to_hf_webdataset.py \
--bundle-dir /path/to/dataset_bundle \
--output-dir /path/to/output/webdataset_shards
Optionally push to HF:
--push-to-hub --hf-repo erotemic/shitspotter
Example PyTorch DataLoader usage:
>>> import webdataset as wds
>>> import torch
>>> from pathlib import Path
>>> split = "train"
>>> root = Path("webdataset_shards") / split
>>> urls = str(root / f"{split}-{{000000..000008}}.tar")
>>> dset = wds.WebDataset(urls).decode("pil").to_tuple("jpg", "json")
>>> loader = torch.utils.data.DataLoader(dset.batched(2))
>>> for imgs, metas in loader:
>>> print(imgs[0].size, metas[0]) # doctest: +SKIP
>>> break # Only show the first batch
>>> import webdataset as wds
>>> import torch
>>> from torchvision.transforms import ToTensor
>>> from pathlib import Path
>>> split = "train"
>>> root = Path("webdataset_shards") / split
>>> urls = str(root / f"{split}-{{000000..000008}}.tar")
>>> # decode to PIL, then map PIL→Tensor
>>> dset = (
... wds.WebDataset(urls)
... .decode("pil")
... .to_tuple("jpg", "json")
... .map_tuple(ToTensor(), lambda meta: meta)
... )
>>> loader = torch.utils.data.DataLoader(dset.batched(2))
>>> for imgs, metas in loader:
... # imgs is a list of torch.Tensors, metas is a list of dicts
... print(imgs[0].shape, metas[0])
... break # only show first batch
References:
https://huggingface.co/datasets/erotemic/shitspotter
https://discuss.huggingface.co/t/help-making-object-detection-dataset/152344
https://discuss.huggingface.co/t/generating-croissant-metadata-for-custom-image-dataset/150255
"""
from PIL import Image
from huggingface_hub import HfApi, upload_file
from io import BytesIO
from pathlib import Path
from scriptconfig import DataConfig, Value
import json
import kwcoco
import kwutil
import os
import ubelt as ub
import webdataset as wds


class KwcocoToHFConfig(DataConfig):
    """
    Convert a KWCoco bundle (train/vali/test .kwcoco.zip files) to Hugging Face WebDataset format.
    """
    bundle_dir = Value(
        "/data/joncrall/dvc-repos/shitspotter_dvc",
        help="Directory with train/vali/test .kwcoco.zip files",
    )
    output_dir = Value(
        "/data/joncrall/dvc-repos/shitspotter_dvc/webdataset_shards",
        help="Output dir for WebDataset .tar files",
    )
    push_to_hub = Value(
        False, isflag=True, help="Push to Hugging Face hub"
    )
    hf_repo = Value(
        "erotemic/shitspotter", help="Optional HF repo (e.g. erotemic/shitspotter)"
    )


def convert_split(coco_fpath, out_tar, categories_out=None):
    dset = kwcoco.CocoDataset(coco_fpath)
    print(f"[INFO] Loaded {coco_fpath}: {len(dset.images())} images")

    if categories_out and not categories_out.exists():
        cats = dset.dataset.get("categories", [])
        categories_out.write_text(json.dumps(cats, indent=2))
        print(f"[INFO] Wrote categories.json with {len(cats)} categories")

    ub.Path(out_tar).parent.ensuredir()
    sink = wds.ShardWriter(str(out_tar), maxcount=1000)
    pman = kwutil.ProgressManager()
    with pman:
        for coco_img in pman.progiter(
            dset.images().coco_images, desc=f"Processing {coco_fpath}"
        ):
            image_id = coco_img.img["id"]
            img_path = coco_img.image_filepath()
            img_pil = Image.open(img_path).convert("RGB")

            # Save image to bytes
            img_bytes = BytesIO()
            img_pil.save(img_bytes, format="jpeg")
            img_bytes = img_bytes.getvalue()

            # Convert annots to basic JSON-serializable format
            anns = []
            for ann in coco_img.annots().objs:
                anns.append(
                    {
                        "bbox": ann["bbox"],
                        "category_id": ann["category_id"],
                        "segmentation": ann.get("segmentation", None),
                        "iscrowd": ann.get("iscrowd", 0),
                    }
                )

            # Save JSON metadata
            sample = {
                "__key__": str(image_id),
                "jpg": img_bytes,
                "json": json.dumps(
                    {
                        "id": image_id,
                        "file_name": os.path.basename(img_path),
                        "width": coco_img.img["width"],
                        "height": coco_img.img["height"],
                        "annotations": anns,
                    }
                ),
            }
            sink.write(sample)
    sink.close()
    print(f"Saved {out_tar}")


def upload_to_hub(hf_repo, bundle_dir, output_dir):
    api = HfApi()  # NOQA
    output_dir = Path(output_dir)
    # Shards live one level down, in per-split subdirectories
    for file in output_dir.glob("*/*.tar"):
        print(f"[UPLOAD] Uploading {file.name} to {hf_repo}")
        upload_file(
            path_or_fileobj=str(file),
            path_in_repo=str(file.relative_to(bundle_dir)),
            repo_id=hf_repo,
            repo_type="dataset",
        )
    for categories_file in output_dir.glob("*categories.json"):
        upload_file(
            path_or_fileobj=str(categories_file),
            path_in_repo=str(categories_file.relative_to(bundle_dir)),
            repo_id=hf_repo,
            repo_type="dataset",
        )


def main():
    config = KwcocoToHFConfig.cli()
    print(f"[CONFIG]\n{ub.urepr(config, nl=1)}")

    bundle_dir = Path(config.bundle_dir)
    output_dir = Path(config.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    splits = ["train", "vali", "test"]

    # Check that all splits exist before doing any work
    for split in splits:
        coco_fpath = bundle_dir / f"{split}.kwcoco.zip"
        if not coco_fpath.exists():
            raise Exception(f"Missing {split} split at {coco_fpath}")

    for split in splits:
        coco_fpath = bundle_dir / f"{split}.kwcoco.zip"
        out_tar = output_dir / f"{split}/{split}-%06d.tar"
        categories_out = output_dir / f"{split}_categories.json"
        convert_split(coco_fpath, out_tar, categories_out)

    if config.push_to_hub:
        hf_repo = config.hf_repo
        if not hf_repo:
            raise ValueError("Must specify --hf-repo when using --push-to-hub")
        upload_to_hub(hf_repo, bundle_dir, output_dir)


if __name__ == "__main__":
    main()
I’m not sure if I’ve specified all of the metadata correctly. I can see my annotation metadata in the “json” column, but I don’t see boxes or polygons drawn over the images, which makes me think the annotations aren’t encoded the way the viewer expects.
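As a sanity check on the encoding itself, something like the following sketch should read one shard back and draw the stored boxes with PIL. The shard path here is just an example, and it assumes the bbox values are COCO-style [x, y, w, h] pixel coordinates, which is what the kwcoco annotations store:

import webdataset as wds
from PIL import ImageDraw

# Path to one local shard produced by the script above (adjust as needed)
urls = "webdataset_shards/vali/vali-000000.tar"

dset = wds.WebDataset(urls).decode("pil").to_tuple("jpg", "json")
for img, meta in dset:
    draw = ImageDraw.Draw(img)
    for ann in meta["annotations"]:
        # COCO/kwcoco boxes are [x, y, width, height] in pixels
        x, y, w, h = ann["bbox"]
        draw.rectangle([x, y, x + w, y + h], outline="red", width=3)
    img.save("preview_with_boxes.jpg")
    break  # only check the first sample

If the boxes land in the right places there, the values in the json are probably fine and the question is more about how the viewer is told to interpret them.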
I’m also not sure how to go about updating this dataset if anything changes. If I change an annotation, it causes a huge LFS diff, which will make it difficult to tag multiple versions of the dataset as I continue to add to it or refine annotations. Any advice on how to handle that would be appreciated.