CLIPVisionModel Padding Problem

The documentation on CLIPVisionModel says:

Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using [AutoImageProcessor]

However, when I pad my image with image_transforms.pad and run a forward pass, the results differ substantially from those for the original image.

from transformers.image_transforms import pad
import numpy as np
import torch
from transformers import CLIPVisionModel

model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")

# Example image as a NumPy array
image = np.random.rand(224, 224, 3)  # Height x Width x Channels

# Define padding: ((before_height, after_height), (before_width, after_width))
padding = ((0, 0), (112, 112))  # Pads width to make it 448

# Apply padding
padded_image = pad(image, padding=padding)
print("Original Image Shape:", image.shape)
print("Padded Image Shape:", padded_image.shape)

# Convert to NCHW float32 tensors (the model's weights are float32)
image_torch = torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float()
padded_image_torch = torch.tensor(padded_image).permute(2, 0, 1).unsqueeze(0).float()

print("Original Image Shape (Torch):", image_torch.shape)
print("Padded Image Shape (Torch):", padded_image_torch.shape)
# Pass both images through the model (interpolate_pos_encoding is needed
# because the padded image is no longer 224x224)
with torch.no_grad():
    outputs_padded = model(pixel_values=padded_image_torch, interpolate_pos_encoding=True)
    outputs_original = model(pixel_values=image_torch)

# Compare the pooled outputs
original = outputs_original.pooler_output
padded = outputs_padded.pooler_output

print(torch.mean(torch.abs(original - padded)))
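
For reference, this is how I would normally obtain pixel_values via the image processor the docs mention. A minimal sketch, continuing from the snippet above and assuming the openai/clip-vit-base-patch32 checkpoint; the processor resizes, center-crops and normalizes to 224x224 on its own:

from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

# The processor expects PIL images or uint8 arrays, so rescale the float image first
pil_image = Image.fromarray((image * 255).astype(np.uint8))
inputs = processor(images=pil_image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])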

How should image padding be handled properly?


It’s a bug, no matter how we look at it. Please see the behavior below.
It looks like we should submit an issue or PR on GitHub…

from transformers.image_transforms import pad
import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModel
model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Example image as a NumPy array
image = np.random.rand(224, 224, 3)  # Height x Width x Channels
image_uint8 = (image * 255).astype(np.uint8)  # uint8 copy so PIL can interpret it correctly
image_pil = np.array(Image.fromarray(image_uint8))  # round-trip through PIL

# Define padding: ((before_height, after_height), (before_width, after_width))
padding = ((0, 0), (112, 112))  # Pads width to make it 448

# Apply padding
padded_image = pad(image, padding=padding)
padded_image_pil = pad(image_pil, padding=padding)
print("Original Image Shape:", image.shape)
print("Padded Image Shape:", padded_image.shape)

# Convert to NCHW float32 tensors (the model's weights are float32)
image_torch = torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float()
padded_image_torch = torch.tensor(padded_image).permute(2, 0, 1).unsqueeze(0).float()

print("Original Image Shape (Torch):", image_torch.shape)
print("Padded Image Shape (Torch):", padded_image_torch.shape)
# Pass both images through the model (interpolate_pos_encoding is needed
# because the padded image is no longer 224x224)
with torch.no_grad():
    outputs_padded = model(pixel_values=padded_image_torch, interpolate_pos_encoding=True)
    outputs_original = model(pixel_values=image_torch)

# Compare the pooled outputs
original = outputs_original.pooler_output
padded = outputs_padded.pooler_output

print(torch.mean(torch.abs(original - padded)))

# Save the images for visual inspection (rescale the float arrays to uint8 first,
# since PIL cannot interpret float64 data as 'RGB')
original_im = Image.fromarray(image_uint8)
padded_im = Image.fromarray((np.clip(padded_image, 0, 1) * 255).astype(np.uint8))
padded_im_pil = Image.fromarray(padded_image_pil)
original_im.save("_pad_original.png")       # original image
padded_im.save("_pad_padded.png")           # padded float image
padded_im_pil.save("_pad_padded_pil.png")   # padded PIL round-trip image

I opened an issue.
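
In the meantime, a possible workaround is to let the image processor build pixel_values for both the original and the padded image, so that both go through the same resizing and normalization instead of being fed to the model as raw arrays. A minimal sketch, reusing the model, processor and arrays from the snippet above (note the processor resizes the padded image back to 224x224, so this is not a pixel-exact comparison):

# Build model inputs with the processor instead of raw tensors
inputs_original = processor(images=Image.fromarray(image_uint8), return_tensors="pt")
inputs_padded = processor(images=Image.fromarray(padded_image_pil), return_tensors="pt")

with torch.no_grad():
    pooled_original = model(**inputs_original).pooler_output
    pooled_padded = model(**inputs_padded).pooler_output

# The embeddings will still differ somewhat, since padding changes the image content,
# but at least both inputs are preprocessed the same way.
print(torch.mean(torch.abs(pooled_original - pooled_padded)))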