Why might we want to disable the mask? According to the DETR documentation:
Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding.
Anyhow, the preprocessed images look correct; at least the
sizes and the bounding boxes do.
Preprocessed batch without mask:
Preprocessed batch with mask:
Image processor:
from transformers import AutoImageProcessor
import albumentations
import numpy as np
import torch
# Single side length reused below as max_height/max_width for resizing and as
# the height/width of the padding target.
IMAGE_SIZE = 480
MAX_SIZE = IMAGE_SIZE

checkpoint = "facebook/detr-resnet-50"

# Image processor loaded from the checkpoint with resizing and padding enabled.
image_processor = AutoImageProcessor.from_pretrained(
    checkpoint,
    do_resize=True,
    size=dict(max_height=MAX_SIZE, max_width=MAX_SIZE),
    do_pad=True,
    pad_size=dict(height=MAX_SIZE, width=MAX_SIZE),
)

# Augmentation pipeline: currently a single no-op transform, with bounding
# boxes declared in COCO format and their labels carried in the "category" field.
train_transform = albumentations.Compose(
    [albumentations.NoOp()],
    bbox_params=albumentations.BboxParams(
        format="coco",
        label_fields=["category"],
    ),
)
Visualization of a preprocessed batch:
import numpy as np
import cv2
def pixel_values_to_img(pixel_values, mean=None, std=None):
    """Convert one normalized image tensor back into a displayable uint8 array.

    Args:
        pixel_values: Object exposing ``.numpy()`` whose array is indexed as
            (channels, height, width); the axes are permuted to
            (height, width, channels) below.
        mean: Per-channel mean added back after scaling. Defaults to
            ``image_processor.image_mean``.
        std: Per-channel standard deviation the values are multiplied by.
            Defaults to ``image_processor.image_std``.

    Returns:
        ``np.uint8`` array of shape (height, width, channels) with values
        limited to the 0..255 range.
    """
    if mean is None:
        mean = image_processor.image_mean
    if std is None:
        std = image_processor.image_std

    channels_last = np.transpose(pixel_values.numpy(), (1, 2, 0))
    restored = (channels_last * np.asarray(std) + np.asarray(mean)) * 255
    # Clip before casting: float values outside 0..255 would otherwise wrap
    # around when converted to uint8 and show up as wrong colors.
    return np.clip(restored, 0, 255).astype(np.uint8)
def pixel_mask_to_img(pixel_mask):
    """Render the first mask of a batch as a three-channel uint8 image.

    Args:
        pixel_mask: Object exposing ``.numpy()``; only index 0 along the
            first axis is used, and that slice is expected to be 2-D.

    Returns:
        ``np.uint8`` array of shape (height, width, 3) in which every channel
        holds the mask multiplied by 255 (so 1 becomes 255 and 0 stays 0).
    """
    first_mask = pixel_mask.numpy()[0]
    scaled = (first_mask * 255).astype(np.uint8)
    # Replicate the single plane into three identical channels.
    return np.dstack((scaled, scaled, scaled))
# Translate the preprocessed image batch into a PIL image.
transformed_img = collate_fn([train_dataset_transformed[0]])
pixel_values = transformed_img['pixel_values']
if pixel_values.ndim > 3:
    # More than three dimensions means a leading batch axis: keep the first image.
    pixel_values = pixel_values[0]
img = pixel_values_to_img(pixel_values)
# The array is (height, width, channels): rows (height) come first, then columns.
img_h, img_w = img.shape[0], img.shape[1]

# If there's a mask, add a magenta tint to the valid, non-padded pixels.
if 'pixel_mask' in transformed_img:
    mask = pixel_mask_to_img(transformed_img['pixel_mask'])
    # Paint every pixel where the mask is non-zero in solid magenta ...
    masked_image = np.where(
        mask.astype(bool),
        np.array([255, 0, 255], dtype='uint8'),
        img,
    ).astype(np.uint8)
    # ... then blend it at 40% over the original so the image stays visible.
    masked_image = cv2.addWeighted(img, 0.6, masked_image, 0.4, 0)
    img = masked_image

img = Image.fromarray(img)

# Draw bboxes. Boxes are treated as (center_x, center_y, width, height)
# fractions of the image size -- NOTE(review): confirm against the processor output.
draw = ImageDraw.Draw(img)
for box in transformed_img['labels'][0]['boxes']:
    # Convert to plain floats so PIL receives ordinary numbers, not tensor scalars.
    rel_cx, rel_cy, rel_w, rel_h = (float(v) for v in box)
    cx, cy, w, h = img_w * rel_cx, img_h * rel_cy, img_w * rel_w, img_h * rel_h
    x, y = cx - w / 2, cy - h / 2
    x2, y2 = x + w, y + h
    draw.rectangle((x, y, x2, y2), outline="white", width=1)

# Bare expression so a notebook cell displays the annotated image.
img