@ybelkada
Can you help me fine-tune the blip-vqa-base for this dataset:
It would be very helpful for my study of LLMs, as I'm just getting started in this field.
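For reference, this is roughly how I am loading the model and processor, going by the transformers documentation (a minimal sketch; I am assuming the "Salesforce/blip-vqa-base" checkpoint is what is meant by blip-vqa-base):

from transformers import BlipProcessor, BlipForQuestionAnswering

# Minimal loading sketch (assumed setup, not tested end to end)
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")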
Hello! I have edited the previous code for this problem statement, and the loss comes out as nan for every batch. Could you please look at what's wrong with this code? I have a CSV file named Annotations.csv with two columns, 'image path' and 'caption'.
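For context, here is a quick sanity check I can run on the CSV before training (a minimal sketch; it assumes the image paths in the CSV are reachable from the working directory):

import os
import pandas as pd

# Sanity-check Annotations.csv before training (a sketch, separate from the training code below)
df = pd.read_csv("Annotations.csv")
df.columns = ['image_path', 'caption']

# Rows with a missing caption or a non-existent image file would make
# cv2.imread return None later, so it is worth checking them up front.
missing_files = [p for p in df['image_path'] if not os.path.exists(p)]
empty_captions = df['caption'].isna().sum()
print(f"rows: {len(df)}, missing image files: {len(missing_files)}, empty captions: {empty_captions}")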
import os
import gc
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm
import albumentations as A
import cv2
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer, BlipProcessor, AutoProcessor, BlipForConditionalGeneration

# Load the 'Annotations.csv' file and ensure the correct column names
pd_padma_existing = pd.read_csv("Annotations.csv")
pd_padma_existing.columns = ['image_path', 'caption']

class CFG:
    # maximum text length
    max_length = 500
    # image size
    size = 224

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset.iloc[idx]
        encoding = self.processor(images=item["image_path"], text=item["caption"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding

class BLIPDataset(Dataset):
    def __init__(self, image_filenames, captions, processor):
        """
        image_filenames and captions must have the same length.
        """
        self.image_filenames = image_filenames
        self.captions = captions
        self.processor = processor
        self.transforms = A.Compose([
            A.Resize(CFG.size, CFG.size, always_apply=True)
        ])

    def __getitem__(self, idx):
        # Load the image and convert BGR (OpenCV default) to RGB
        image = cv2.imread(self.image_filenames[idx])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transforms(image=image)['image']
        item_image = torch.tensor(image).permute(2, 0, 1).float() / 255.0
        # Truncate the caption to 300 characters
        item_text = self.captions[idx][:300]
        # Use the processor to encode the image and text
        encoding = self.processor(images=item_image, text=item_text, padding="max_length", return_tensors="pt").to("cuda", torch.float16)
        # Remove the batch dimension added by the processor
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        # Return the processed items
        return {
            'encoding': encoding,
            'item_image': item_image,
            'item_text': item_text
        }

    def __len__(self):
        return len(self.captions)

processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_dataset = BLIPDataset(
    pd_padma_existing["image_path"].values,
    pd_padma_existing["caption"].values,
    processor
)
train_dataloader = DataLoader(blip_dataset, shuffle=False, batch_size=32)

model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()

for epoch in range(1):
    print("Epoch:", epoch)
    for idx, b in enumerate(train_dataloader):
        batch, item_image, item_text = b['encoding'], b['item_image'], b['item_text']
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        # Use the input_ids as labels so the model computes the captioning loss
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        print(f"idx={idx}, Loss: {loss.item()}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Move tensors back to CPU to free up GPU memory
        input_ids = input_ids.to("cpu")
        pixel_values = pixel_values.to("cpu")
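Before the training loop, I can run a quick check like this to see whether the NaNs are already in the inputs or only appear after the forward pass (just a diagnostic sketch on one batch, assuming a CUDA GPU; it does not change the training itself):

# Diagnostic sketch: check whether NaNs appear in the inputs or only in the model outputs
with torch.no_grad():
    sample = next(iter(train_dataloader))
    ids = sample['encoding']['input_ids'].to(device)
    pix = sample['encoding']['pixel_values'].to(device)
    print("NaNs in pixel_values:", torch.isnan(pix).any().item())
    out = model(input_ids=ids, pixel_values=pix, labels=ids)
    print("NaNs in logits:", torch.isnan(out.logits).any().item())
    print("loss:", out.loss.item())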
Could you please point out any errors here?