Kosmos-2 Fine tuning

Based on "Finetune BLIP on customer dataset #20893" (reply #2 by dxlong2000), I pass the `input_ids` to the model as its `labels`, e.g.:

class CustomTrainer(Trainer):
    """Trainer whose loss uses the batch's own input tokens as the targets."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass with labels derived from ``input_ids``.

        Args:
            model: The model being trained; called with keyword arguments
                ``input_ids``, ``pixel_values``, ``attention_mask``,
                ``image_embeds_position_mask`` and ``labels``.
            inputs: Batch mapping that must contain the keys ``input_ids``,
                ``pixel_values``, ``attention_mask`` and
                ``image_embeds_position_mask``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted and ignored, so that callers which
                pass this keyword (newer ``Trainer`` versions do) keep
                working.

        Returns:
            ``outputs.loss``, or the tuple ``(outputs.loss, outputs)`` when
            ``return_outputs`` is true.

        Raises:
            KeyError: If one of the required keys is missing from ``inputs``.
        """
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        # Targets are the input tokens themselves, but padding positions
        # (attention_mask == 0) are set to -100 so they do not contribute to
        # the loss. NOTE(review): relies on the model's loss ignoring -100,
        # the usual Hugging Face convention -- confirm for the model in use.
        # masked_fill returns a new tensor, so the batch is left untouched.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(
            input_ids=input_ids,
            pixel_values=inputs["pixel_values"],
            attention_mask=attention_mask,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            labels=labels,
        )

        # The loss is returned as-is; converting it to a Python float here
        # would force a device-to-host read on every call for no benefit.
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss