After fine-tuning the BLIP model, it's not generating any output; the output tensor is filled with zeros

I'm trying to fine-tune BLIP on the VisDial dataset (an image question-answering dataset). Just for testing purposes, I took 200 image-question-answer pairs and fine-tuned on them; the training loss ends up at around 1.40.
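
For context, this is roughly how I built the 200-pair subset (loading details simplified; the keys are the ones my dataset class below expects):

# each entry pairs one PIL image with one question/answer string
modified_val_set = [
    {
        "images": pil_img_list[i],     # PIL image for pair i
        "questions": questionList[i],  # question string
        "answers": ansList[i],         # ground-truth answer string
    }
    for i in range(200)
]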

But after fine-tuning completes, whenever I give the model an image and a question from the very set it was fine-tuned on, it produces no output. When I print the output variable, I see the tensor is filled with zeros (see the raw output after the inference snippet at the end).
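
For completeness, here is roughly how the model, processor, and device are set up (the model class and checkpoint name below are stand-ins for whatever I actually loaded):

import torch
from transformers import BlipProcessor, BlipForQuestionAnswering

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "Salesforce/blip-vqa-base" is a stand-in for my actual checkpoint
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)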

Here is my fine-tuning code:

from torch.utils.data import Dataset

class visDial_dataset(Dataset):
    def __init__(self, dataset, processor):
        self.processor = processor
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]

        # encode the image together with the question; pad to the tokenizer's
        # max length so that every item in a batch has the same shape
        encodings = self.processor(images=item["images"], text=item["questions"],
                                   padding="max_length", return_tensors="pt")
        labels = self.processor(text=item["answers"], padding="max_length",
                                return_tensors="pt").input_ids

        encodings["labels"] = labels
        # drop the batch dimension the processor adds so the DataLoader
        # can stack the items itself
        encodings = {k: v.squeeze() for k, v in encodings.items()}

        return encodings


from torch.utils.data import DataLoader

train_dataset_object = visDial_dataset(modified_val_set, processor)
train_dataloader = DataLoader(train_dataset_object, shuffle=True, batch_size=2)

import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)


model.train()

for epoch in range(3):
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):

        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        labels = batch.pop("labels").to(device)
        attention_mask = batch.pop("attention_mask").to(device)

        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            labels=labels,
            attention_mask=attention_mask,
        )

        loss = outputs.loss
        print("Loss:", loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()

rand = 48
print(questionList[rand])
print(ansList[rand])
pil_img_list[rand]  # displays the image when run in a notebook cell

# inference

encoding = processor(images=pil_img_list[rand], text=questionList[rand], return_tensors="pt").to(device)
outputs = model.generate(**encoding)
print(processor.decode(outputs[0], skip_special_tokens=True))
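
Printing the raw generated ids before decoding is where I see the zeros (id 0 is the tokenizer's pad token, so skip_special_tokens=True strips everything and the decoded string is empty):

print(outputs)  # tensor of all zeros, i.e. nothing but pad tokens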

Please help me identify where I'm making a mistake; I'm new to the Transformers library.