I’m trying to fine-tune BLIP on the VisDial dataset (an image question-answering dataset). For testing purposes, I took just 200 image–question–answer triples and fine-tuned on them while printing the loss, which ends up at around 1.40.
However, after fine-tuning has completed, whenever I give the model an image and a question from the same set it was fine-tuned on, it does not produce any output. When I print the output variable, I see that the tensor is filled with zeros.
Here is my fine-tuning code:
class visDial_dataset(Dataset):
    """Map-style dataset that turns image/question/answer records into tensors.

    Each record of ``dataset`` is read through the keys ``"images"``,
    ``"questions"`` and ``"answers"``. ``processor`` is called once on the
    image/question pair and once on the answer text; the answer's
    ``input_ids`` are stored under ``"labels"``.
    """

    def __init__(self, dataset, processor):
        """Keep references to the indexable records and the processor callable."""
        self.processor = processor
        self.dataset = dataset

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed tensors for record ``idx`` as a plain dict.

        The dict holds every entry the processor produced for the
        image/question pair plus a ``"labels"`` entry built from the answer.
        """
        item = self.dataset[idx]
        encodings = self.processor(
            images=item["images"],
            text=item["questions"],
            padding="max_length",
            return_tensors="pt",
        )
        labels = self.processor(
            text=item["answers"],
            padding="max_length",
            return_tensors="pt",
        ).input_ids
        encodings["labels"] = labels
        # Drop only the leading batch dimension. A bare ``squeeze()`` would
        # also remove any other size-1 dimension (e.g. a length-1 sequence),
        # yielding tensors the DataLoader cannot stack consistently.
        return {k: v.squeeze(0) for k, v in encodings.items()}
from torch.utils.data import DataLoader
import torch

# Wrap the raw records so the DataLoader can batch processed tensors.
train_dataset_object = visDial_dataset(modified_val_set, processor)
train_dataloader = DataLoader(train_dataset_object, shuffle=True, batch_size=2)

# A learning rate of 1e-2 is far too large for fine-tuning a pretrained
# transformer with AdamW: the first few updates wipe out the pretrained
# weights and generation degenerates. 5e-5 is a typical fine-tuning value.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# Fine-tune for three passes over the training loader, printing the loss of
# every batch.
# NOTE(review): the labels arrive padded to the maximum length; confirm that
# the model's loss ignores padding positions, otherwise training rewards
# predicting pad tokens.
model.train()

for epoch in range(3):
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):
        # Pull the four model inputs out of the batch and move them to the
        # target device in one step.
        model_inputs = {
            key: batch.pop(key).to(device)
            for key in ("input_ids", "pixel_values", "labels", "attention_mask")
        }

        outputs = model(**model_inputs)
        loss = outputs.loss
        print("Loss:", loss.item())

        # Backpropagate, apply the update, then clear gradients for the next
        # batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
# Put the model in evaluation mode before generating.
model.eval()

# Fixed sample index used for this spot check.
rand = 48
sample_question = questionList[rand]
print(sample_question)
print(ansList[rand])
pil_img_list[rand]

# inference
encoding = processor(
    images=pil_img_list[rand],
    text=sample_question,
    return_tensors="pt",
)
encoding = encoding.to(device)
outputs = model.generate(**encoding)
decoded_answer = processor.decode(outputs[0], skip_special_tokens=True)
print(decoded_answer)
Please help me identify where I’m making a mistake. I’m new to the Transformers library.