The documentation for CLIPVisionModel says:
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using [AutoImageProcessor].
However, when I pad my image with image_transforms.pad and run a forward pass, the results are very different from those for the unpadded image.
from transformers.image_transforms import pad
from transformers import CLIPVisionModel
import numpy as np
import torch
# Load the CLIP vision tower
model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
# Example image as a NumPy array
image = np.random.rand(224, 224, 3) # Height x Width x Channels
# Define padding: ((before_height, after_height), (before_width, after_width))
padding = ((0, 0), (112, 112)) # Pads width to make it 448
# Apply padding
padded_image = pad(image, padding=padding)
print("Original Image Shape:", image.shape)
print("Padded Image Shape:", padded_image.shape)
# Convert to float32 CHW tensors with a batch dimension (the model weights are float32)
image_torch = torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float()
padded_image_torch = torch.tensor(padded_image).permute(2, 0, 1).unsqueeze(0).float()
print("Original Image Shape (Torch):", image_torch.shape)
print("Padded Image Shape (Torch):", padded_image_torch.shape)
# Pass the padded and the original image through the model
outputs_padded = model(pixel_values=padded_image_torch, interpolate_pos_encoding=True)
outputs_original = model(pixel_values=image_torch)
# Compare the pooled outputs
original = outputs_original.pooler_output
padded = outputs_padded.pooler_output
print(torch.mean(original - padded))
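For reference, here is a minimal sketch of how I would normally prepare the pixel values with the processor the documentation points to (the checkpoint name is just an example):
from PIL import Image
import numpy as np
import torch
from transformers import AutoImageProcessor, CLIPVisionModel
# Sketch only: substitute the checkpoint you actually use
model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Any RGB image; the processor handles resizing, rescaling and normalization
image = Image.fromarray((np.random.rand(224, 224, 3) * 255).astype(np.uint8))
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.pooler_output.shape)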
How should image padding be handled properly?
It’s a bug, no matter how we look at it. See the behavior below.
It looks like we should open an issue or PR on GitHub…
from transformers.image_transforms import pad
import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModel
model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Example image as a NumPy array
image = np.random.rand(224, 224, 3) # Height x Width x Channels
image_pil = np.array(Image.fromarray(image, 'RGB'))  # Round-trip the array through PIL and back to NumPy
# Define padding: ((before_height, after_height), (before_width, after_width))
padding = ((0, 0), (112, 112)) # Pads width to make it 448
# Apply padding
padded_image = pad(image, padding=padding)
padded_image_pil = pad(image_pil, padding=padding)
print("Original Image Shape:", image.shape)
print("Padded Image Shape:", padded_image.shape)
# Convert to float32 CHW tensors with a batch dimension (the model weights are float32)
image_torch = torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float()
padded_image_torch = torch.tensor(padded_image).permute(2, 0, 1).unsqueeze(0).float()
print("Original Image Shape (Torch):", image_torch.shape)
print("Padded Image Shape (Torch):", padded_image_torch.shape)
# Pass the padded and the original image through the model
outputs_padded = model(pixel_values=padded_image_torch, interpolate_pos_encoding=True)
outputs_original = model(pixel_values=image_torch)
# Compare the pooled outputs
original = outputs_original.pooler_output
padded = outputs_padded.pooler_output
print(torch.mean(original - padded))
# Save images
original_im = Image.fromarray(image, 'RGB')
padded_im = Image.fromarray(padded_image, 'RGB')
padded_im_pil = Image.fromarray(padded_image_pil, 'RGB')
original_im.save("_pad_original.png") # normal
padded_im.save("_pad_padded.png") # strange
padded_im_pil.save("_pad_padded_pil.png") # normal
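In the meantime, a possible workaround (a sketch, not an official fix) is to keep the array in uint8 and let CLIPImageProcessor build the pixel values for both the original and the padded image, so dtype, rescaling and normalization stay consistent before the forward pass:
import numpy as np
import torch
from transformers import CLIPImageProcessor, CLIPVisionModel
model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)  # H x W x C, uint8
padded = np.pad(image, ((0, 0), (112, 112), (0, 0)))  # zero-pad the width to 448 with np.pad
with torch.no_grad():
    out_original = model(**processor(images=image, return_tensors="pt"))
    out_padded = model(**processor(images=padded, return_tensors="pt"))
print(torch.mean(out_original.pooler_output - out_padded.pooler_output))
Note that with this checkpoint’s default preprocessing (resize so the shortest edge is 224, then a 224x224 center crop) the padded 224x448 image is cropped back to the original region, so the zero padding never reaches the model.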