I’m trying to obtain ViT image embeddings, but I get completely different embeddings for the same image across multiple inference runs. Shouldn’t the embedding be deterministic when I run inference on the same image?

Here is my code:
import torch
from PIL import Image
from transformers import ViTImageProcessor, ViTModel

device = torch.device('cuda')
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTModel.from_pretrained('google/vit-base-patch16-224')
model.to(device)
model.eval()

vit_vectors = []
for file_path in file_paths:
    # Force RGB so grayscale/RGBA files don't trip up the processor
    image = Image.open(file_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    # pooler_output has shape (batch, hidden); take the single image's vector
    embedding = outputs.pooler_output[0].detach().cpu().numpy().copy()
    vit_vectors.append(embedding)
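For concreteness, this is roughly how I compare two passes over the same image. Note that embed_image is just the loop body above factored into a helper for this repro (not part of my actual pipeline), and it reuses the processor, model, and device set up earlier:

import numpy as np

def embed_image(file_path):
    # Same steps as the loop above, factored out for the repro
    image = Image.open(file_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output[0].detach().cpu().numpy().copy()

emb_a = embed_image(file_paths[0])
emb_b = embed_image(file_paths[0])
print(np.allclose(emb_a, emb_b))  # I expected True for the same image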