Yeah, it is an issue. Iterating with the IterableDataset
oftentimes fails altogether, and if it doesn’t fail, it becomes extremely slow.
Edit: Ok, I finally found a solution and will attach it below if anyone else needs it in the future:
import logging
import warnings
from datasets import load_dataset
def validate_download(
    dataset_name: str = "ILSVRC/imagenet-1k",
    log_path: str = "/src/experiments/datasets/imagenet/validate_download.log",
    splits: tuple = ("train", "validation", "test"),
) -> None:
    """Validate that every image in a downloaded HF dataset can be decoded.

    Iterates over each split of the (map-style) dataset, fully loading and
    then closing each image. Any decode failure — or warning promoted to an
    error (e.g. a truncated-image warning) — is logged together with the
    offending index so the corrupt file can be identified and re-fetched.

    Args:
        dataset_name: Hugging Face dataset identifier passed to load_dataset.
        log_path: File that receives the validation log.
        splits: Names of the dataset splits to check.
    """
    dataset = load_dataset(dataset_name)
    logging.basicConfig(
        filename=log_path,
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(name)s %(message)s'
    )
    logger = logging.getLogger(__name__)
    # Promote all warnings to exceptions so partially-downloaded / corrupt
    # images surface in the except-branch below. catch_warnings() restores
    # the previous filter state even if this function raises, unlike the
    # original filterwarnings/resetwarnings pair.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")
        for split in splits:
            logger.info("Validating all images of split '%s'...", split)
            ds = dataset[split]
            for idx in range(len(ds)):
                try:
                    # Fetch the example exactly once: every ds[idx] access
                    # decodes the image again, so the original double lookup
                    # (load on one access, close on another) paid the
                    # expensive decode twice per example.
                    image = ds[idx]['image']
                    image.load()
                    image.close()
                except Exception as e:
                    logger.error("%s: %s", idx, e)
# Script entry point: run the full validation pass when executed directly.
if __name__ == '__main__':
    validate_download()