I am trying to train a BlipForImageTextRetrieval model on a custom dataset. I'm very new to using transformers, so I'm mostly following along with example notebooks.
When I try to train the model with no labels provided, I get the following error:
ValueError: The model did not return a loss from the inputs, only the following keys: itm_score,last_hidden_state,question_embeds. For reference, the inputs it received are pixel_values,input_ids,attention_mask.
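I can reproduce the missing loss outside the Trainer with a minimal forward pass (sketched below using a blank placeholder image, just for illustration): the model's output simply has no loss field, which is what the Trainer looks for.

import torch
from PIL import Image
from transformers import AutoProcessor, BlipForImageTextRetrieval

checkpoint = "Salesforce/blip-itm-base-coco"
processor = AutoProcessor.from_pretrained(checkpoint)
model = BlipForImageTextRetrieval.from_pretrained(checkpoint)

image = Image.new("RGB", (384, 384))  # blank placeholder image, just for illustration
inputs = processor(images=image, text="a caption", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.keys())  # itm_score, last_hidden_state, question_embeds -- no loss key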
On the other hand, when I try to provide labels (by uncommenting the two labels-related lines in the code below), I get this error:
TypeError: BlipForImageTextRetrieval.forward() got an unexpected keyword argument 'labels'
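As a sanity check, printing the forward signature confirms this model class has no labels argument at all:

import inspect
from transformers import BlipForImageTextRetrieval

print(inspect.signature(BlipForImageTextRetrieval.forward))
# no `labels` parameter, hence the TypeError when the Trainer passes one through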
This is the code I have written; any help would be greatly appreciated!
ds = ds.train_test_split(test_size=0.1)
train_ds = ds["train"]
test_ds = ds["test"]
from transformers import AutoProcessor
checkpoint = "Salesforce/blip-itm-base-coco"
processor = AutoProcessor.from_pretrained(checkpoint)
def transforms(example_batch):
    images = example_batch["image"]
    meanings = example_batch["meaning"]
    inputs = processor(images=images, text=meanings, padding="max_length")
    # inputs.update({"labels": inputs["input_ids"]})  # uncommenting this triggers the TypeError above
    return inputs
train_ds_1 = train_ds.map(transforms, batched=True, remove_columns=["image", "meaning"])
test_ds_1 = test_ds.map(transforms, batched=True, remove_columns=["image", "meaning"])
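# Sanity check I added while debugging: after mapping, the columns are exactly
# the inputs listed in the first error message
print(train_ds_1.column_names)  # ['pixel_values', 'input_ids', 'attention_mask']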
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()
from transformers import BlipForImageTextRetrieval
model = BlipForImageTextRetrieval.from_pretrained(checkpoint)
from evaluate import load
import torch
recall_metric = load("recall")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    # decode token ids back to text so recall can compare strings
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    # recall_metric.compute returns a dict like {"recall": ...}, so unwrap the value
    recall_score = recall_metric.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"recall_score": recall_score["recall"]}
!pip install -U accelerate
!pip install -U transformers
from transformers import TrainingArguments, Trainer
model_name = checkpoint.split("/")[1]
training_args = TrainingArguments(
    output_dir=f"{model_name}-mydataset",
    learning_rate=5e-5,
    num_train_epochs=50,
    fp16=True,
    per_device_train_batch_size=8,  # decreased for RAM
    per_device_eval_batch_size=8,   # decreased for RAM
    gradient_accumulation_steps=2,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    remove_unused_columns=False,
    # label_names=["labels"],
    load_best_model_at_end=True,
    gradient_checkpointing=True,  # added for RAM
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_1,
    eval_dataset=test_ds_1,
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # added data collator
)
trainer.train()