Finetune BLIP on customer dataset #20893

@ybelkada
Can you help me fine-tune blip-vqa-base on this dataset:

It would be a great help for my study of LLMs, as I'm still new to this field.
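For context, this is roughly how I expect to load the VQA checkpoint itself (just a sketch using the standard `transformers` classes; the code further down actually fine-tunes the captioning checkpoint instead):

```python
# Sketch only: load the BLIP VQA variant with its matching processor.
from transformers import BlipProcessor, BlipForQuestionAnswering

vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
```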

Hello! I have edited the previous code for my problem statement, and I am getting a NaN loss for every batch. Could you please help me find what's wrong with this code? I have a CSV file named Annotations.csv with two columns, 'image path' and 'caption'.
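For reference, this is the quick sanity check I run on the CSV before training (just a sketch; the file name and columns are the ones described above):

```python
# Sketch only: sanity-check Annotations.csv before training.
import os
import pandas as pd

df = pd.read_csv("Annotations.csv")
df.columns = ["image_path", "caption"]

# Make sure every image path exists and no caption is empty.
missing = [p for p in df["image_path"] if not os.path.exists(p)]
print(f"{len(df)} rows, {len(missing)} missing image files")
print(f"{df['caption'].isna().sum()} empty captions")
```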

```python
import os
import gc
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm
import albumentations as A
import cv2

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer, BlipProcessor, AutoProcessor, BlipForConditionalGeneration

# Load the dataset from the CSV file
pd_padma_existing = pd.read_csv("Annotations.csv")      # load 'Annotations.csv'
pd_padma_existing.columns = ['image_path', 'caption']   # ensure the correct column names


class CFG:
    # text length
    max_length = 500
    # image size
    size = 224


# PyTorch Dataset class for image captioning
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset.iloc[idx]
        encoding = self.processor(images=item["image_path"], text=item["caption"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding


class BLIPDataset(Dataset):
    def __init__(self, image_filenames, captions, processor):
        """
        image_filenames and captions must have the same length.
        """
        self.image_filenames = image_filenames
        self.captions = captions
        self.processor = processor
        self.transforms = A.Compose([
            A.Resize(CFG.size, CFG.size, always_apply=True)
        ])

    def __getitem__(self, idx):
        # Load image and process it
        image = cv2.imread(self.image_filenames[idx])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transforms(image=image)['image']
        item_image = torch.tensor(image).permute(2, 0, 1).float() / 255.0

        # Truncate the caption to 300 characters
        item_text = self.captions[idx][:300]

        # Use the processor to encode the image and text
        encoding = self.processor(images=item_image, text=item_text, padding="max_length", return_tensors="pt").to("cuda", torch.float16)
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        # Return the processed items
        return {
            'encoding': encoding,
            'item_image': item_image,
            'item_text': item_text
        }

    def __len__(self):
        return len(self.captions)


# Load the processor
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Create the dataset
blip_dataset = BLIPDataset(
    pd_padma_existing["image_path"].values,
    pd_padma_existing["caption"].values,
    processor
)

# DataLoader for batching
train_dataloader = DataLoader(blip_dataset, shuffle=False, batch_size=32)

# Load the model and move it to the correct device
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Training loop
model.train()

for epoch in range(1):
    print("Epoch:", epoch)
    for idx, b in enumerate(train_dataloader):
        batch, item_image, item_text = b['encoding'], b['item_image'], b['item_text']

        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        print(f"idx={idx}, Loss: {loss.item()}")

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Move tensors back to CPU to free up GPU memory
        input_ids = input_ids.to("cpu")
        pixel_values = pixel_values.to("cpu")
```
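In case it helps narrow things down, this is the kind of check I can wrap around the forward pass to see where the NaN first appears (just a sketch reusing the objects defined above; `torch.isnan` is the standard PyTorch check):

```python
# Sketch only: find the first batch where the inputs or the loss become NaN.
with torch.no_grad():
    for idx, b in enumerate(train_dataloader):
        input_ids = b["encoding"]["input_ids"].to(device)
        pixel_values = b["encoding"]["pixel_values"].to(device)

        if torch.isnan(pixel_values).any():
            print(f"batch {idx}: NaN already present in pixel_values")

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        if torch.isnan(outputs.loss):
            print(f"batch {idx}: loss is NaN")
            break
```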

Could you please point out any errors here?