Hi, I am experimenting with Hugging Face. I have created a dataset and uploaded it to the Hub, but now I am running into some very weird behavior.
Batching over the dataset has a huge memory overhead for no apparent reason; the overhead is larger than the raw data itself.
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoImageProcessor
import albumentations
import numpy as np
import torch
checkpoint = "Yorai/yolos-tiny_finetuned_dataset"
dataset = "Yorai/detect-waste"
transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=1.0),
        albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)
def formatted_anns(image_id, category, area, bbox):
    annotations = []
    for i in range(0, len(category)):
        new_ann = {
            "image_id": image_id,
            "category_id": category[i],
            "isCrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        annotations.append(new_ann)
    return annotations
# transforming a batch
def transform_aug_ann(examples):
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    for image, objects in zip(examples["image"], examples["objects"]):
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        out = transform(image=image, bboxes=objects["bbox"], category=objects["category"])
        area.append(objects["area"])
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])
    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
    ]
    return image_processor(images=images, annotations=targets, return_tensors="pt")
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = image_processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    if "pixel_mask" in encoding:
        batch["pixel_mask"] = encoding["pixel_mask"]
    batch["labels"] = labels
    return batch
if __name__ == "__main__":
    ds = load_dataset(dataset)
    # load_dataset("imagenet-1k", num_proc=8)
    remove_idx = [530]
    keep = [i for i in range(len(ds["train"])) if i not in remove_idx]
    ds["train"] = ds["train"].select(keep)
    image_processor = AutoImageProcessor.from_pretrained(checkpoint)
    augmented = ds["train"].with_transform(transform_aug_ann)
    val_dataloader = DataLoader(augmented, batch_size=256, shuffle=False, num_workers=2, collate_fn=collate_fn)
    i = next(iter(val_dataloader))
I have run python -m memray run src/_dataloader_dataset.py in an attempt to figure out where the memory comes from, and it basically points at
memory_mapped_stream = pa.memory_map(filename)
at .venv/lib/python3.10/site-packages/datasets/table.py, line 50, in _memory_mapped_record_batch_reader_from_file (13.7 GiB total)
as the culprit.
Really weird, because the entire dataset is about 20 GB at most. I see no reason for it to load 13+ GiB for a 768 MB batch.
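For reference, this is roughly how the on-disk Arrow size could be compared with what the process actually allocates (just a sketch; psutil is an extra dependency and not part of the script above):

import os
import psutil
import pyarrow as pa
from datasets import load_dataset

ds = load_dataset("Yorai/detect-waste")

# Total size of the memory-mapped Arrow cache files on disk.
cache_bytes = sum(os.path.getsize(f["filename"]) for f in ds["train"].cache_files)
print(f"Arrow cache on disk: {cache_bytes / 1e9:.2f} GB")

# Bytes Arrow has actually allocated in this process; this should stay small
# if the table is only memory-mapped rather than copied into RAM.
print(f"Arrow allocated: {pa.total_allocated_bytes() / 1e6:.2f} MB")

# Resident set size of the process, which also counts touched mmap pages.
print(f"Process RSS: {psutil.Process().memory_info().rss / 1e9:.2f} GB")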
Also, if I increase the number of workers it uses even more RAM! That is not how memory mapping is supposed to behave at all!
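In case it helps anyone reproduce this, here is a sketch of how the per-worker growth could be measured. It reuses augmented and collate_fn from the script above; psutil is my own addition, and memory_full_info() needs a platform where uss is reported (e.g. Linux):

import psutil
from torch.utils.data import DataLoader

def report_memory(num_workers):
    # Same dataloader as above, only the number of workers changes.
    loader = DataLoader(augmented, batch_size=256, shuffle=False,
                        num_workers=num_workers, collate_fn=collate_fn)
    it = iter(loader)
    batch = next(it)  # keep `it` alive so the worker processes are still running

    parent = psutil.Process()
    procs = [parent] + parent.children(recursive=True)
    rss = sum(p.memory_info().rss for p in procs)
    # uss only counts pages unique to each process, so shared mmap pages are
    # mostly excluded; if uss stays flat while rss grows with more workers,
    # the growth is the shared memory map being touched, not real copies.
    uss = sum(p.memory_full_info().uss for p in procs)
    print(f"workers={num_workers}  rss={rss / 1e9:.2f} GB  uss={uss / 1e9:.2f} GB")
    del it

for n in (0, 2, 4):
    report_memory(n)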