from transformers import BlipForConditionalGeneration, AutoProcessor, AutoTokenizer
from PIL import Image
import requests
import torch
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "A picture of"
inputs = processor(images=image, text=text, return_tensors="pt")  # image + prompt -> pixel_values and input_ids
decoder_input_ids = [model.decoder_input_ids]
print(decoder_input_ids)
inputs.input_ids = decoder_input_ids = torch.tensor([decoder_input_ids])
predicted_ids = []
for i in range(20):
    outputs = model(**inputs)
    logits = outputs.logits[:, i, :]
    print(outputs.logits.shape)
    predicted_id = logits.argmax(-1)
    predicted_ids.append(predicted_id.item())
    print(tokenizer.decode([predicted_id.squeeze()]))
    # add predicted id to decoder_input_ids
    inputs.input_ids = torch.cat([inputs.input_ids, predicted_id.unsqueeze(0)], dim=1)
With this code, for every image I get the same logits shape, [1, 5, 30524], and the logits are identical in every iteration: appending the predicted id to inputs.input_ids has no effect on the model output. Why does this happen, and how can I solve it?
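For reference, here is a small debugging sketch of the check I have in mind (assuming inputs comes from the processor call above); the prints would go inside the loop, right after the torch.cat line, and only report the shapes the loop touches:

# Debugging sketch (inside the loop, after the torch.cat update):
# compare the dict entry that **inputs unpacks into model(**inputs)
# with the .input_ids attribute that the loop keeps reassigning.
print("inputs['input_ids'] shape:", inputs["input_ids"].shape)
print("inputs.input_ids shape:", inputs.input_ids.shape)
print("outputs.logits shape:", outputs.logits.shape)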