Loading a dataset doesn't actually memory map

Hi, I am experimenting with huggingface. I have created a dataset and uploaded it to the hub but now I am facing a very weird behavior.

Batching over the dataset has a huge memory overhead for no apparent reason: the raw data is smaller than the memory overhead itself.

from torch.utils.data import DataLoader
from datasets import load_dataset

from transformers import AutoImageProcessor
import albumentations
import numpy as np
import torch

# Hub identifiers: the image-processor checkpoint and the dataset to load.
checkpoint = "Yorai/yolos-tiny_finetuned_dataset"
dataset = "Yorai/detect-waste"

# Augmentation pipeline applied per image in transform_aug_ann.
# Compose takes a *list* of transforms; bbox_params tells albumentations that
# boxes are in COCO format and that the "category" keyword argument carries
# the per-box labels that must stay aligned with the boxes.
transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

def formatted_anns(image_id, category, area, bbox):
    """Build one annotation dict per object of a single image.

    Args:
        image_id: Value stored under "image_id" in every annotation.
        category: Sequence of category ids, one per object.
        area: Sequence of areas, indexed in parallel with ``category``.
        bbox: Sequence of bounding boxes, indexed in parallel with
            ``category``; each box is copied into a plain ``list``.

    Returns:
        A list of dicts with the keys "image_id", "category_id", "isCrowd",
        "area" and "bbox", one per entry of ``category`` (empty if there are
        no objects).
    """
    annotations = []
    for i in range(len(category)):
        new_ann = {
            "image_id": image_id,
            "category_id": category[i],
            "isCrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        # Without this append the function would always return an empty list.
        annotations.append(new_ann)

    return annotations

# transforming a batch
def transform_aug_ann(examples):
    """Augment a batch of examples and run it through the image processor.

    Args:
        examples: Batch dict with the keys "image_id", "image" and "objects".
            Each entry of "objects" is read for "bbox", "category" and
            "area" (assumed to be parallel per-object lists -- confirm
            against the dataset schema).

    Returns:
        Whatever the module-level ``image_processor`` returns for the
        augmented images and their annotations, with ``return_tensors="pt"``.
    """
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    for image, objects in zip(examples["image"], examples["objects"]):
        # Force 3 channels, then reverse the channel axis of the array.
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        out = transform(image=image, bboxes=objects["bbox"], category=objects["category"])

        # Collect the augmented outputs; without these appends the lists stay
        # empty and the processor would receive no images at all.
        # NOTE(review): area is taken from the un-augmented objects, as in the
        # Hugging Face object detection guide this script follows -- confirm.
        area.append(objects["area"])
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt")

def collate_fn(batch):
    """Merge per-sample dicts into one batch dict for the DataLoader.

    Args:
        batch: List of samples, each providing "pixel_values" and "labels".

    Returns:
        A dict with the padded "pixel_values", the "pixel_mask" when the
        image processor's ``pad`` produced one, and "labels" as a plain list
        with one entry per sample.
    """
    padded = image_processor.pad(
        [sample["pixel_values"] for sample in batch],
        return_tensors="pt",
    )
    targets = [sample["labels"] for sample in batch]

    collated = {"pixel_values": padded["pixel_values"]}
    if "pixel_mask" in padded:
        collated["pixel_mask"] = padded["pixel_mask"]
    collated["labels"] = targets
    return collated

if __name__ == "__main__":
    ds = load_dataset(dataset)

    # Drop the listed indices from the training split.
    remove_idx = [530]
    train_size = len(ds["train"])
    keep = [idx for idx in range(train_size) if idx not in remove_idx]
    ds["train"] = ds["train"].select(keep)

    # Read as a global by transform_aug_ann and collate_fn.
    image_processor = AutoImageProcessor.from_pretrained(checkpoint)

    # The transform is applied on access, not up front.
    augmented = ds["train"].with_transform(transform_aug_ann)
    val_dataloader = DataLoader(
        augmented,
        batch_size=256,
        shuffle=False,
        num_workers=2,
        collate_fn=collate_fn,
    )

    # Pull a single batch; this is the step whose memory use is being profiled.
    first_batch = next(iter(val_dataloader))

I ran `python -m memray run src/_dataloader_dataset.py` in an attempt to figure out where the memory goes. It reports that `memory_mapped_stream = pa.memory_map(filename)` at `.venv/lib/python3.10/site-packages/datasets/table.py`, line 50, in `_memory_mapped_record_batch_reader_from_file` is at fault, with 13.7 GiB in total.

This is really weird, because the entire dataset is about 20 GB at most. I do not see a reason for it to load 13+ GB for a 768 MB batch.
Also, if I increase the number of workers, it uses even more RAM! This is not how memory mapping is supposed to behave at all!

This is a complex issue - see https://github.com/huggingface/datasets/issues/4883.

PS: You can use import pyarrow as pa; pa.total_allocated_bytes() to check PyArrow’s RAM usage.

1 Like