In the "Post-processing model predictions" section, it is claimed:
> Notice how we printed the output predictions on the resized input image. This is because OWL-ViT outputs normalized box coordinates in [cx, cy, w, h] format assuming a fixed input image size. We can use the OwlViTProcessor's convenient post_process() method to convert the model outputs to **a COCO API** format and retrieve rescaled coordinates (with respect to the original image sizes) in [x0, y0, x1, y1] format.
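Reading that passage, I would expect post_process() to roughly convert the normalized [cx, cy, w, h] boxes to corner coordinates and rescale them to the original image size, along these lines (my own sketch with made-up numbers, not the actual library code):

```python
import torch

def center_to_corners(boxes_cxcywh):
    """[cx, cy, w, h] -> [x0, y0, x1, y1], both normalized to [0, 1]."""
    cx, cy, w, h = boxes_cxcywh.unbind(-1)
    return torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)

# One hypothetical normalized prediction and an arbitrary original image size
pred_box = torch.tensor([[0.5, 0.4, 0.2, 0.3]])   # [cx, cy, w, h] in [0, 1]
img_w, img_h = 1300.0, 1200.0
scale = torch.tensor([img_w, img_h, img_w, img_h])
print(center_to_corners(pred_box) * scale)         # tensor([[520., 300., 780., 660.]])
```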
Are you sure the bounding boxes returned are in COCO API format? From my tests, they seem to be [xmin, ymin, xmax, ymax], i.e. in Pascal VOC format.
I attach the code and a sample image. The following are the bounding boxes retrieved, with the first one having negative coordinates (see the short format-conversion sketch after the list):
```
Detected crown with confidence 0.319 at location [830.3, -1.3, 1243.46, 228.82]
Detected Christ with confidence 0.27 at location [8.79, 25.36, 1286.45, 1128.44]
Detected crown with confidence 0.209 at location [142.62, 783.75, 363.72, 1042.01]
```
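Just to make the distinction explicit, here is the arithmetic on the first box above (the COCO-style values are my own computation, not something returned by the model):

```python
# First box reported above, read as corners [xmin, ymin, xmax, ymax] (Pascal VOC convention)
xmin, ymin, xmax, ymax = 830.3, -1.3, 1243.46, 228.82

# The COCO convention would instead be [x, y, width, height]
coco_box = [xmin, ymin, round(xmax - xmin, 2), round(ymax - ymin, 2)]
print(coco_box)  # [830.3, -1.3, 413.16, 230.12]
```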
```python
import numpy as np
import torch
from PIL import Image
from transformers import OwlViTProcessor, OwlViTForObjectDetection

# Load the model and processor (the checkpoint lives under the google/ namespace on the Hub)
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-large-patch14")
processor = OwlViTProcessor.from_pretrained("google/owlvit-large-patch14")

# Use GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Text queries to search the image for
text_queries = [["crown", "Christ"]]

# Load the sample image and make sure it is RGB
image_orig = Image.open("SSPSAEPM_FI_25285UC.JPG")
img_tmp = np.asarray(image_orig)
image_rgb = Image.fromarray(np.uint8(img_tmp)).convert("RGB")
images = [image_rgb]

# Process image and text inputs
inputs = processor(text=text_queries, images=image_rgb, return_tensors="pt").to(device)

# Print input names and shapes
for key, val in inputs.items():
    print(f"{key}: {val.shape}")

# Set model in evaluation mode
model = model.to(device)
model.eval()

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)

for k, val in outputs.items():
    if k not in {"text_model_output", "vision_model_output"}:
        print(f"{k}: shape of {val.shape}")

print("\nText model outputs")
for k, val in outputs.text_model_output.items():
    print(f"{k}: shape of {val.shape}")

print("\nVision model outputs")
for k, val in outputs.vision_model_output.items():
    print(f"{k}: shape of {val.shape}")

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([img.size[::-1] for img in images]).to(device)

# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process(outputs=outputs, target_sizes=target_sizes)

# Loop over predictions for each image in the batch
for i in range(len(images)):
    print(f"\nProcessing image {i}")
    text = text_queries[i]
    boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

    # Print detections above a confidence threshold
    score_threshold = 0.20
    for box, score, label in zip(boxes, scores, labels):
        box = [round(coord, 2) for coord in box.tolist()]
        if score >= score_threshold:
            print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
```