from PIL import Image
import requests
from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer

processor_base = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model_base = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTrp7yKuY1NxcXlHQX10JtTlECuna-xWv-jetxnv73WBw&s"
image = Image.open(requests.get(url, stream=True).raw)

inputs_base = processor_base(images=image, return_tensors="pt")
pixel_values = inputs_base.pixel_values

outputs_base = model_base.generate(**inputs_base, output_scores=True, output_logits=True, return_dict_in_generate=True, max_length=50)
logits = outputs_base.logits
How do I generate the caption from these logits by applying a softmax?
Hey!
You can get the generated caption from outputs_base.sequences:

out_text = tokenizer.batch_decode(outputs_base.sequences)
print(out_text)
If you want to work with the scores, you should be able to just take a softmax over them. Note that the scores are the values obtained after applying whatever processing (temperature, normalization, etc.) was passed into generate; in your code snippet there is no such processing.
scores = torch.stack(outputs_base.scores, dim=1)  # generate returns scores as a tuple of per-step tensors
gen_token_ids = torch.argmax(scores, dim=-1)
out_text = tokenizer.batch_decode(gen_token_ids)
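And if you want the actual softmax probabilities of the generated tokens, here is a small self-contained sketch (reusing outputs_base from the snippet above, with a tokenizer loaded from the same checkpoint; since there are no logits processors in your call, outputs_base.logits should give the same values as the scores):

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")

# outputs_base.scores is a tuple with one (batch, vocab_size) tensor per generated step
scores = torch.stack(outputs_base.scores, dim=1)     # (batch, generated_len, vocab_size)
probs = torch.softmax(scores, dim=-1)                # probability distribution at each step
token_probs, token_ids = probs.max(dim=-1)           # confidence and id of each greedy token

for tok_id, p in zip(token_ids[0].tolist(), token_probs[0].tolist()):
    print(tokenizer.decode([tok_id]), round(p, 3))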
from transformers import BlipForConditionalGeneration, AutoProcessor, AutoTokenizer
from PIL import Image
import requests
import torch
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text = "A picture of"
inputs = processor(images=image, text=text, return_tensors="pt")

decoder_input_ids = [model.decoder_input_ids]
print(decoder_input_ids)
decoder_input_ids = torch.tensor([decoder_input_ids])
inputs.input_ids = decoder_input_ids
predicted_ids = []
for i in range(20):
    outputs = model(**inputs)
    logits = outputs.logits[:, i, :]
    print(outputs.logits.shape)
    predicted_id = logits.argmax(-1)
    predicted_ids.append(predicted_id.item())
    print(tokenizer.decode([predicted_id.squeeze()]))
    # add predicted id to decoder_input_ids
    inputs.input_ids = torch.cat([inputs.input_ids, predicted_id.unsqueeze(0)], dim=1)
With this code I get the same logits shape, [1, 5, 30524], for every image, and the logits stay the same on every iteration; changing inputs.input_ids has no effect on them. Why does this happen, and how can I solve it?
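For what it is worth, here is a minimal sketch of how such a manual greedy loop could look (same checkpoint as above; the BOS replacement and the stop on the tokenizer's [SEP] token are my own guesses at what generate does internally). As far as I can tell, assigning inputs.input_ids = ... only sets an attribute on the BatchEncoding and does not change the underlying dict that **inputs unpacks, so the model keeps seeing the original 5 prompt tokens; the sketch therefore keeps input_ids as a plain tensor, passes it to the model explicitly each step, and always reads the logits at the last position:

from PIL import Image
import requests
import torch
from transformers import AutoProcessor, BlipForConditionalGeneration

processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, text="a picture of", return_tensors="pt")
pixel_values = inputs.pixel_values
input_ids = inputs.input_ids.clone()                        # plain tensor, grows by one token per step
input_ids[:, 0] = model.config.text_config.bos_token_id     # generate() starts from the decoder BOS token

with torch.no_grad():
    for _ in range(20):
        outputs = model(pixel_values=pixel_values, input_ids=input_ids)
        next_id = outputs.logits[:, -1, :].argmax(dim=-1)    # always take the last position
        if next_id.item() == processor.tokenizer.sep_token_id:   # [SEP] acts as end-of-sequence
            break
        input_ids = torch.cat([input_ids, next_id.unsqueeze(-1)], dim=1)

print(processor.tokenizer.decode(input_ids[0], skip_special_tokens=True))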