Finetune BLIP on customer dataset #20893

@ybelkada
Can you help me fine-tune blip-vqa-base on this dataset:

It would be a great help for my study of LLMs, as I'm still new to this field.
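For context, this is roughly how I expect to load the VQA checkpoint itself (just a sketch using the standard `transformers` classes; the code further down actually fine-tunes the captioning checkpoint instead):

```python
# Sketch only: load the BLIP VQA variant with its matching processor.
from transformers import BlipProcessor, BlipForQuestionAnswering

vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
```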

Hello! I have edited the previous code for my problem statement, and I am getting a NaN loss for every batch. Could you please help me find what's wrong with this code? I have a CSV file named Annotations.csv with two columns, 'image path' and 'caption'.
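For reference, this is the quick sanity check I run on the CSV before training (just a sketch; the file name and columns are the ones described above):

```python
# Sketch only: sanity-check Annotations.csv before training.
import os
import pandas as pd

df = pd.read_csv("Annotations.csv")
df.columns = ["image_path", "caption"]

# Make sure every image path exists and no caption is empty.
missing = [p for p in df["image_path"] if not os.path.exists(p)]
print(f"{len(df)} rows, {len(missing)} missing image files")
print(f"{df['caption'].isna().sum()} empty captions")
```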

```python
import os
import gc
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm
import albumentations as A
import cv2

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer, BlipProcessor, AutoProcessor, BlipForConditionalGeneration

# Load the dataset from the CSV file
pd_padma_existing = pd.read_csv("Annotations.csv")      # load 'Annotations.csv'
pd_padma_existing.columns = ['image_path', 'caption']   # ensure the correct column names


class CFG:
    # text length
    max_length = 500
    # image size
    size = 224


# PyTorch Dataset class for image captioning
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset.iloc[idx]
        encoding = self.processor(images=item["image_path"], text=item["caption"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding


class BLIPDataset(Dataset):
    def __init__(self, image_filenames, captions, processor):
        """
        image_filenames and captions must have the same length.
        """
        self.image_filenames = image_filenames
        self.captions = captions
        self.processor = processor
        self.transforms = A.Compose([
            A.Resize(CFG.size, CFG.size, always_apply=True)
        ])

    def __getitem__(self, idx):
        # Load image and process it
        image = cv2.imread(self.image_filenames[idx])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transforms(image=image)['image']
        item_image = torch.tensor(image).permute(2, 0, 1).float() / 255.0

        # Truncate the caption to 300 characters
        item_text = self.captions[idx][:300]

        # Use the processor to encode the image and text
        encoding = self.processor(images=item_image, text=item_text, padding="max_length", return_tensors="pt").to("cuda", torch.float16)
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        # Return the processed items
        return {
            'encoding': encoding,
            'item_image': item_image,
            'item_text': item_text
        }

    def __len__(self):
        return len(self.captions)


# Load the processor
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Create the dataset
blip_dataset = BLIPDataset(
    pd_padma_existing["image_path"].values,
    pd_padma_existing["caption"].values,
    processor
)

# DataLoader for batching
train_dataloader = DataLoader(blip_dataset, shuffle=False, batch_size=32)

# Load the model and move it to the correct device
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Training loop
model.train()

for epoch in range(1):
    print("Epoch:", epoch)
    for idx, b in enumerate(train_dataloader):
        batch, item_image, item_text = b['encoding'], b['item_image'], b['item_text']

        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        print(f"idx={idx}, Loss: {loss.item()}")

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Move tensors back to CPU to free up GPU memory
        input_ids = input_ids.to("cpu")
        pixel_values = pixel_values.to("cpu")
```
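In case it helps narrow things down, this is the kind of check I can wrap around the forward pass to see where the NaN first appears (just a sketch reusing the objects defined above; `torch.isnan` is the standard PyTorch check):

```python
# Sketch only: find the first batch where the inputs or the loss become NaN.
with torch.no_grad():
    for idx, b in enumerate(train_dataloader):
        input_ids = b["encoding"]["input_ids"].to(device)
        pixel_values = b["encoding"]["pixel_values"].to(device)

        if torch.isnan(pixel_values).any():
            print(f"batch {idx}: NaN already present in pixel_values")

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        if torch.isnan(outputs.loss):
            print(f"batch {idx}: loss is NaN")
            break
```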

Could you please point out any errors here?