Hi gfatigati,
You can do this easily by cropping the padded image (with the bounding boxes already drawn on it) and resizing the crop back to the original size. However, if the original image is wider or taller than 960px, that resize will upsample it, so you may prefer to skip the resize and keep the cropped image, which already has the original aspect ratio.
Follow the updated visualization instructions after nielsr’s PR:
https://huggingface.co/docs/transformers/model_doc/owlv2#transformers.Owlv2ForObjectDetection.forward.example but save the original image’s dimensions after opening it. Then use those values to crop and resize the visualization image.
# Save original dimensions here
w, h = image.size
# Crop away the padding; the content was scaled by 960 / max(w, h),
# so this works for both landscape and portrait images
scale = 960 / max(w, h)
unnormalized_image = unnormalized_image.crop((0, 0, round(w * scale), round(h * scale)))
# Resize back to the original size
unnormalized_image = unnormalized_image.resize((w, h))
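If the original image is larger than 960px and you want to avoid the upsampling mentioned above, here is a minimal sketch of making that last resize conditional (my suggestion, not part of the docs example):
# Only resize back when the original fits within the 960px padded size;
# otherwise keep the cropped image at its native resolution
if max(w, h) <= 960:
    unnormalized_image = unnormalized_image.resize((w, h))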
Full example:
import requests
from PIL import Image, ImageDraw
import numpy as np
import torch
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
# Image and text queries taken from the docs example linked above;
# swap in your own image file and queries here
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
w, h = image.size  # Save dimensions here
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
# forward pass
with torch.no_grad():
    outputs = model(**inputs)
# Note: boxes need to be visualized on the padded, unnormalized image
# hence we'll set the target image sizes (height, width) based on that
def get_preprocessed_image(pixel_values):
    pixel_values = pixel_values.squeeze().numpy()
    # Undo the CLIP normalization, rescale to [0, 255] and move channels last
    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
    unnormalized_image = Image.fromarray(unnormalized_image)
    return unnormalized_image
unnormalized_image = get_preprocessed_image(inputs.pixel_values)
target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.2, target_sizes=target_sizes
)
i = 0 # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
# Draw on the image
draw = ImageDraw.Draw(unnormalized_image)
for box, score, label in zip(boxes, scores, labels):
    xmin, ymin, xmax, ymax = box
    draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1)
    # Note: index text (the queries for this image), not the batched texts
    draw.text((xmin, ymin), f"{text[label]}: {round(score.item(), 2)}", fill="white")
# Crop the padded image back to the original aspect ratio
# (the content occupies a region scaled by 960 / max(w, h))
scale = 960 / max(w, h)
unnormalized_image = unnormalized_image.crop((0, 0, round(w * scale), round(h * scale)))
# Resize it to the original size
unnormalized_image = unnormalized_image.resize((w, h))
unnormalized_image.show()
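As an alternative sketch (my own variation, assuming the same variables as above): instead of cropping and resizing the visualization, you could map the boxes back to the original image's coordinates and draw on the untouched original, which avoids resampling entirely. The processor pads the image to a square of side max(w, h) before resizing it to 960x960, so multiplying each coordinate by max(w, h) / 960 recovers the original scale:
# Sketch: draw predictions directly on the original image
scale = max(w, h) / 960
draw = ImageDraw.Draw(image)
for box, score, label in zip(boxes, scores, labels):
    xmin, ymin, xmax, ymax = [coord.item() * scale for coord in box]
    draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1)
    draw.text((xmin, ymin), f"{text[label]}: {round(score.item(), 2)}", fill="white")
image.show()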