Hi, I am trying to fine-tune the OWL-ViT model on a personal dataset, since the pretrained model does not find the bounding boxes I need.
I used the code below and I consistently get the error shown at the end.
(Yes, I am aware that I passed the same dataset for training and evaluation to the Trainer; this was just a quick hello-world test of OWL-ViT fine-tuning.)
The code:
import numpy as np
from datasets import load_dataset, load_metric
from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor
from transformers import AutoTokenizer
from transformers import OwlViTProcessor, OwlViTForObjectDetection, OwlViTFeatureExtractor
from transformers import TrainingArguments, Trainer
feature_extractor = OwlViTFeatureExtractor.from_pretrained("google/owlvit-base-patch32")
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
######### PREPROCESS DATASET ############
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
_transforms = Compose(
    [RandomResizedCrop(feature_extractor.size), ColorJitter(brightness=0.5, hue=0.5), ToTensor(), normalize]
)
# Create the dataset from the folder structure
dataset_valves = load_dataset("imagefolder", data_dir="../my_dataset_2")
# Normalize/augment the image, then pass it together with the label text to the OwlViTProcessor
def pre_process_dataset(example):
    return processor(images=_transforms(example['image'].convert("RGB")), text=[str(example['label'])])
dataset_valves_ = dataset_valves['train']
preprocessed_datasets = dataset_valves_.map(pre_process_dataset)
preprocessed_datasets2 = preprocessed_datasets.remove_columns(['image', 'label'])  # remove columns the model does not expect
######### PREPARE AND RUN TRAINER ############
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_datasets2,
    eval_dataset=preprocessed_datasets2,
    compute_metrics=compute_metrics,
)
trainer.train()
And the error:
RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 1, 3, 768, 768]
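From the shape in the error, my guess (unconfirmed) is that OwlViTProcessor already returns pixel_values with a batch dimension, i.e. [1, 3, 768, 768] for a single image, and the Trainer's default collator then stacks the examples and adds a second batch dimension, producing the 5D input [1, 1, 3, 768, 768] that conv2d rejects. A quick check on one preprocessed example (using the variables from my code above):

sample = preprocessed_datasets2[0]
# if my guess is right, this prints (1, 3, 768, 768) rather than (3, 768, 768)
print(np.array(sample['pixel_values']).shape)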
Could anyone please help me?
PS: I tried to follow this tutorial, adapted for OWL-ViT: Google Colab
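Edit: for completeness, this is the kind of change I was considering for the preprocessing function. It is untested, just a sketch that drops the extra batch dimension the processor adds (and lets the processor handle resizing and normalization instead of my torchvision transforms); I have not checked whether input_ids needs the same squeeze:

def pre_process_dataset(example):
    inputs = processor(images=example['image'].convert("RGB"), text=[str(example['label'])])
    # pixel_values comes back batched as [1, 3, 768, 768]; keep only the single image
    inputs['pixel_values'] = inputs['pixel_values'][0]
    return inputs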