Load a COCO-format dataset from disk for DETR

I have a COCO dataset on my disk (with a JSON file in the annotations folder that contains the image paths) and I would like to load it as a Hugging Face dataset in order to use CV models.

Is there a function that allows that?

Hmm… This?

There is no COCO loader in the datasets library, but it would be a welcome contribution in my opinion.

All the existing data modules are listed here
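
In the meantime, the generic json builder can at least read one top-level key of the nested COCO annotation file via its field argument. A minimal sketch (the path is a placeholder, adjust it to your disk):

from datasets import load_dataset

# Read only the "annotations" array of a COCO instances file; "field"
# selects one top-level key of the nested JSON ("images", "annotations", ...).
annotations = load_dataset(
    "json",
    data_files="/path/to/coco/annotations/instances_train.json",
    field="annotations",
)
print(annotations["train"][0])  # one raw COCO annotation record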

I wrote this code for loading COCO datasets into Hugging Face datasets; it works with DETR.

Things you will need to adapt:

  • the features of your COCO JSON file
  • the path to your local COCO folder
import json
import os

# Requires the datasets library (pip install datasets)
from datasets import DatasetDict, Dataset, Features, Value, Sequence, ClassLabel, Image

class CocoDatasetLoader:
    def __init__(self, coco_folder):
        self.coco_folder = coco_folder

    def group_by_key_id(self, data, key_id, category_id_to_index):
        """
        Groups data by a specified key and maps category IDs to indices.
        
        Args:
            data (list): List of dictionaries containing the data.
            key_id (str): The key to group by.
            category_id_to_index (dict): Mapping from category IDs to indices.
            
        Returns:
            dict: Grouped data.
        """
        grouped_data = {}
        for item in data:
            key_value = item[key_id]
            if key_value not in grouped_data:
                grouped_data[key_value] = {k: [] for k in item.keys() if k != key_id}
            for k, v in item.items():
                if k != key_id:
                    grouped_data[key_value][k].append(v)
        # Map COCO category IDs to contiguous label indices once per group
        for group in grouped_data.values():
            group['category'] = [category_id_to_index[x] for x in group['category_id']]
        return grouped_data
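    # Illustrative example (hypothetical values): given
    #   data = [{'image_id': 1, 'id': 10, 'category_id': 3, 'bbox': [0, 0, 5, 5]},
    #           {'image_id': 1, 'id': 11, 'category_id': 7, 'bbox': [1, 1, 2, 2]}]
    #   category_id_to_index = {3: 0, 7: 1}
    # group_by_key_id(data, 'image_id', category_id_to_index) returns
    #   {1: {'id': [10, 11], 'category_id': [3, 7],
    #        'bbox': [[0, 0, 5, 5], [1, 1, 2, 2]], 'category': [0, 1]}}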
    
    def load_coco_hf_dataset(self, split):
        """
        Loads COCO dataset and processes it into a format suitable for Hugging Face datasets.
        
        Args:
            split (str): Dataset split (e.g., 'Train', 'Test', 'Validation').
            
        Returns:
            Dataset: Hugging Face Dataset for the split, or None if the annotation file is missing.
        """
        # Load the annotation JSON for this split
        json_file_path = os.path.join(self.coco_folder, 'annotations', f'instances_{split}.json')
        try:
            with open(json_file_path, 'r') as f:
                coco_data = json.load(f)
        except FileNotFoundError:
            print(f"File not found: {json_file_path}")
            return None

        # Extract category names and create a mapping from category IDs to indices
        category_names = [cat['name'] for cat in coco_data['categories']]
        category_id_to_index = {cat['id']: idx for idx, cat in enumerate(coco_data['categories'])}

        # Group annotations by 'image_id'
        grouped_annotations = self.group_by_key_id(coco_data['annotations'], 'image_id', category_id_to_index)

        # Create a dictionary of images
        grouped_images = {item['id']: item for item in coco_data['images']}

        # Initialize 'objects' field for every image (empty lists by default)
        annotations_keys = next(iter(grouped_annotations.values())).keys()
        for image in grouped_images.values():
            image['objects'] = {key: [] for key in annotations_keys}

        # Populate 'objects' field with the grouped annotations
        for image_id, annotations in grouped_annotations.items():
            grouped_images[image_id]['objects'] = annotations

        # Add image paths and IDs
        for image in grouped_images.values():
            image['image'] = os.path.join(self.coco_folder, 'images', split, image['file_name'])
            image['image_id'] = image['id']

        # Create a Hugging Face dataset from the custom data using from_list for efficiency
        hf_dataset = Dataset.from_list(list(grouped_images.values()))

        # Define the features for the main dataset
        features = Features({
            'id': Value('int64'),
            'image_id': Value('int64'),
            'image': Image(),
            'file_name': Value('string'),
            'license': Value('string'),
            'flickr_url': Value('string'),
            'coco_url': Value('string'),
            'date_captured': Value('string'),
            'width': Value('int64'),
            'height': Value('int64'),
            'objects': Sequence({
                'id': Value('int64'),
                'area': Value('float32'),
                'bbox': Sequence(Value('float32')),
                'category': ClassLabel(names=category_names),
                'attributes': {'occluded': Value('bool')},
                'category_id': Value('int64'),
                'iscrowd': Value('int64'),
                'segmentation': {
                    'counts': Sequence(Value('int64')),
                    'size': Sequence(Value('int64'))
                }
            })
        })

        # Cast the features for the Hugging Face dataset
        hf_dataset = hf_dataset.cast(features)

        return hf_dataset

# Initialize the CocoDatasetLoader class
coco_loader = CocoDatasetLoader('/path/to/coco/folder/')

hf_dataset_dict = DatasetDict()
for split in ['Train', 'Test', 'Validation']:
    # Load the COCO dataset for each split
    hf_dataset = coco_loader.load_coco_hf_dataset(split)
    if hf_dataset is None:
        continue  # skip splits whose annotation file is missing

    # Print the dataset
    print(f"Dataset for {split} split:")
    print(hf_dataset)
    
    # Create a DatasetDict with the split
    hf_dataset_dict[split.lower()] = hf_dataset
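
Once the splits are loaded, the grouped objects still need to be repacked into the per-image annotation dicts that DETR's image processor expects. A minimal sketch, assuming the facebook/detr-resnet-50 checkpoint and the hf_dataset_dict built above (format_for_detr is a hypothetical helper, not a library function):

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")

def format_for_detr(examples):
    # Repack each image's grouped annotation lists into COCO-style dicts
    targets = []
    for image_id, objects in zip(examples["image_id"], examples["objects"]):
        annotations = [
            {"image_id": image_id, "bbox": bbox, "category_id": category,
             "area": area, "iscrowd": iscrowd}
            for bbox, category, area, iscrowd in zip(
                objects["bbox"], objects["category"],
                objects["area"], objects["iscrowd"])
        ]
        targets.append({"image_id": image_id, "annotations": annotations})
    # The processor resizes and normalizes the images and converts the
    # boxes into the format DETR trains on
    return processor(images=examples["image"], annotations=targets, return_tensors="pt")

train_dataset = hf_dataset_dict["train"].with_transform(format_for_detr)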
