Issues Training BlipForImageTextRetrieval

I am trying to train a BlipForImageTextRetrieval model on a custom dataset. I'm very new to the transformers library, so I'm mostly following along with the example notebooks.

When I try to train the model with no labels provided, I get the following error:
ValueError: The model did not return a loss from the inputs, only the following keys: itm_score,last_hidden_state,question_embeds. For reference, the inputs it received are pixel_values,input_ids,attention_mask.
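To double-check that it is not something about my dataset, I ran a single forward pass outside the Trainer, using the COCO demo image from the docs, and the output really has no loss key:

import requests
from PIL import Image
from transformers import AutoProcessor, BlipForImageTextRetrieval

checkpoint = "Salesforce/blip-itm-base-coco"
processor = AutoProcessor.from_pretrained(checkpoint)
model = BlipForImageTextRetrieval.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="two cats on a couch", return_tensors="pt")

outputs = model(**inputs)
print(outputs.keys())
# odict_keys(['itm_score', 'last_hidden_state', 'question_embeds']), no 'loss'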

On the other hand, when I do provide labels (by uncommenting the labels line in transforms below), I get this error:
TypeError: BlipForImageTextRetrieval.forward() got an unexpected keyword argument 'labels'
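Checking the forward signature confirms there is no labels argument (unlike, say, BlipForConditionalGeneration, which the captioning notebooks use):

import inspect
from transformers import BlipForImageTextRetrieval

print(inspect.signature(BlipForImageTextRetrieval.forward))
# (self, input_ids, pixel_values, use_itm_head=True, attention_mask=None, ...)
# no 'labels' parameter anywhere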

This is the code that I have written. Your help would be greatly appreciated!

# ds is my custom dataset with "image" and "meaning" columns
ds = ds.train_test_split(test_size=0.1)
train_ds = ds["train"]
test_ds = ds["test"]

from transformers import AutoProcessor

checkpoint = "Salesforce/blip-itm-base-coco"
processor = AutoProcessor.from_pretrained(checkpoint)

def transforms(example_batch):
  images = example_batch["image"]
  meanings = example_batch["meaning"]
  inputs = processor(images=images, text=meanings, padding="max_length")
  # inputs.update({"labels": inputs["input_ids"]})  # uncommenting this triggers the TypeError above
  return inputs

train_ds_1 = train_ds.map(transforms, batched=True, remove_columns=["image", "meaning"])
test_ds_1 = test_ds.map(transforms, batched=True, remove_columns=["image", "meaning"])
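
For reference, after the map the datasets only contain the processor outputs, which matches the inputs listed in the first error:

print(train_ds_1.column_names)
# ['pixel_values', 'input_ids', 'attention_mask']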

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

from transformers import BlipForImageTextRetrieval

model = BlipForImageTextRetrieval.from_pretrained(checkpoint)

from evaluate import load

recall_metric = load("recall")

# adapted from a captioning notebook; I'm not sure decoding the ITM logits
# as token ids makes sense here, but training fails before this ever runs
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predicted = logits.argmax(-1)
  decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
  decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
  # recall_metric.compute already returns a dict, e.g. {"recall": ...}
  return recall_metric.compute(predictions=decoded_predictions, references=decoded_labels)

!pip install -U accelerate
!pip install -U transformers

from transformers import TrainingArguments, Trainer

model_name = checkpoint.split("/")[1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-mydataset",
    learning_rate=5e-5,
    num_train_epochs=50,
    fp16=True,
    per_device_train_batch_size=8, # decreased for RAM
    per_device_eval_batch_size=8, # decreased for RAM
    gradient_accumulation_steps=2,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    remove_unused_columns=False,
    # label_names=["labels"],
    load_best_model_at_end=True,
    # added for RAM
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_1,
    eval_dataset=test_ds_1,
    compute_metrics=compute_metrics,
    # added data collator
    data_collator=data_collator
)

trainer.train()
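
From the first error, my best guess is that I have to compute the ITM loss myself, for example with a custom compute_loss on a Trainer subclass. Below is an untested sketch of what I have in mind; the rolled in-batch negatives and the assumption that itm_score class 1 means "matched" are my own guesses, not something from a notebook:

import torch
import torch.nn.functional as F
from transformers import Trainer

class ItmTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # positives: each image paired with its own caption
        pos = model(pixel_values=inputs["pixel_values"],
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"])
        # negatives: each image paired with the next example's caption
        # (assumes batch size > 1, otherwise the "negative" is the positive)
        neg = model(pixel_values=inputs["pixel_values"],
                    input_ids=inputs["input_ids"].roll(1, dims=0),
                    attention_mask=inputs["attention_mask"].roll(1, dims=0))
        logits = torch.cat([pos.itm_score, neg.itm_score], dim=0)  # (2 * batch, 2)
        # assuming class 1 = "matched" and class 0 = "not matched"
        labels = torch.cat([torch.ones(pos.itm_score.size(0)),
                            torch.zeros(neg.itm_score.size(0))]).long().to(logits.device)
        loss = F.cross_entropy(logits, labels)
        return (loss, pos) if return_outputs else loss

Is this the right direction (swapping Trainer for ItmTrainer above), or is there a built-in way to fine-tune BlipForImageTextRetrieval?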