Handling decoding errors such as UnidentifiedImageError

Yeah, it is an issue. Iterating with the IterableDataset often fails altogether, and when it doesn't fail, it becomes extremely slow.

Edit: Ok, I finally found a solution and will attach it below if anyone else needs it in the future:

import logging
import warnings
from datasets import load_dataset

def validate_download():
    """Load every image of the "ILSVRC/imagenet-1k" dataset and log failures.

    Walks the "train", "validation" and "test" splits sample by sample,
    forces each image to be fully loaded, and writes one ERROR record
    (``"<index>: <exception message>"``) to the log file for every sample
    that raises. While the check runs, warnings are promoted to exceptions
    so that samples which only emit a warning are reported as well.

    The function returns nothing; its result is the log file configured
    below. Errors raised outside the per-sample ``try`` (for example by
    ``load_dataset`` or by a missing split) are not caught here.
    """
    dataset = load_dataset("ILSVRC/imagenet-1k")
    splits = ["train", "validation", "test"]

    logging.basicConfig(
        filename='/src/experiments/datasets/imagenet/validate_download.log',
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(name)s %(message)s'
    )
    logger = logging.getLogger(__name__)

    # Treat all warnings as errors, but only inside this block: on exit
    # catch_warnings() restores the filters that were active before, even if
    # an unexpected exception propagates, instead of wiping every filter the
    # way warnings.resetwarnings() does.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")

        for split in splits:
            logger.info("Validating all images of split '%s'...", split)
            ds = dataset[split]

            for idx in range(len(ds)):
                try:
                    # Look the sample up exactly once so that load() and
                    # close() act on the same image object. A second
                    # ds[idx] lookup presumably decodes a fresh image, which
                    # would leave the loaded one unclosed and double the
                    # work per sample.
                    image = ds[idx]['image']
                    try:
                        image.load()
                    finally:
                        image.close()
                except Exception as e:
                    # Deliberately broad: any failure (decode error or a
                    # warning promoted to an exception) marks the sample as
                    # bad, and the scan continues with the next index.
                    logger.error("%s: %s", idx, e)


# Script entry point: run the validation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    validate_download()
1 Like