Unable to finetune DETR

Hi all.
I want to fine-tune the DETR model for object detection. My dataset was originally in YOLO format, i.e. the folder structure was:

.
├── train
│   ├── images
│   │   ├── ima1.png
│   │   ├── ima2.png
│   │   └── ...
│   └── labels
│       ├── ima1.txt
│       ├── ima2.txt
│       └── ...
├── val
│   ├── images
│   │   ├── ima3.png
│   │   ├── ima4.png
│   │   └── ...
│   └── labels
│       ├── ima3.txt
│       ├── ima4.txt
│       └── ...
└── test
    ├── images
    │   ├── ima5.png
    │   ├── ima6.png
    │   └── ...
    └── labels
        ├── ima5.txt
        ├── ima6.txt
        └── ...

Each txt file contains the labels in YOLO format, i.e. one line per box of the form "class x_center y_center width height", with coordinates normalized to [0, 1].
I wrote a parser from this format to the COCO-style one defined in the Hugging Face docs.
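
Roughly, the conversion does this (a simplified sketch of my actual parser; the real script also walks the val and test splits, and the helper name here is just for illustration):

import json
from pathlib import Path
from PIL import Image

def yolo_to_coco_annotation(line, img_w, img_h, image_id):
	# YOLO line: "class x_center y_center width height", all normalized to [0, 1]
	cls, cx, cy, w, h = map(float, line.split())
	w_abs, h_abs = w * img_w, h * img_h
	return {
		"image_id": image_id,
		"category_id": int(cls),
		"isCrowd": 0,
		"area": w_abs * h_abs,
		# COCO-style box: top-left corner plus width/height, in pixels
		"bbox": [cx * img_w - w_abs / 2, cy * img_h - h_abs / 2, w_abs, h_abs],
	}

with open("train.json", "w") as out:
	for image_id, img_path in enumerate(sorted(Path("train/images").glob("*.png"))):
		# (in my real script the image_id is taken from the file name, e.g. ima33 -> 33)
		img_w, img_h = Image.open(img_path).size
		label_file = Path("train/labels") / (img_path.stem + ".txt")
		anns = []
		for l in label_file.read_text().splitlines():
			if l.strip():
				anns.append(yolo_to_coco_annotation(l, img_w, img_h, image_id))
		row = {"images": str(img_path), "labels": {"image_id": image_id, "annotations": anns}}
		out.write(json.dumps(row) + "\n")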

Part of the content of my train.json file is:

{"images": "/home/alberto/Dataset/train/images/ima33.png", "labels": {"image_id": 33, "annotations": [{"image_id": 33, "category_id": 0, "isCrowd": 0, "area": 836.3844989732237, "bbox": [467.6039173574827, 320.7359824929699, 16.193362718773813, 51.64983416344768]}]}}
{"images": "/home/alberto/Dataset/train/images/ima34.png", "labels": {"image_id": 34, "annotations": [{"image_id": 34, "category_id": 0, "isCrowd": 0, "area": 155.90997473673505, "bbox": [302.4758230493843, 331.4863415890505, 6.941832607149081, 22.459483476477146]}]}}
{"images": "/home/alberto/Dataset/train/images/ima35.png", "labels": {"image_id": 35, "annotations": [{"image_id": 35, "category_id": 0, "isCrowd": 0, "area": 192.56241287696886, "bbox": [275.00035776371584, 330.68727201610824, 7.221299481967947, 26.66589487914325]}]}}
{"images": "/home/alberto/Dataset/train/images/ima36.png", "labels": {"image_id": 36, "annotations": [{"image_id": 36, "category_id": 0, "isCrowd": 0, "area": 340.96618997872207, "bbox": [258.6175375390202, 331.2602621260946, 14.103082789030736, 24.176713352624066]}]}}
{"images": "/home/alberto/Dataset/train/images/ima37.png", "labels": {"image_id": 37, "annotations": [{"image_id": 37, "category_id": 0, "isCrowd": 0, "area": 184.83137819588285, "bbox": [249.30923914377175, 330.7909414770311, 7.403844315195045, 24.96424429354221]}, {"image_id": 37, "category_id": 0, "isCrowd": 0, "area": 193.20158194586713, "bbox": [255.19273430554105, 330.15024228403854, 7.4734051644021235, 25.851881129921768]}]}}
{"images": "/home/alberto/Dataset/train/images/ima38.png", "labels": {"image_id": 38, "annotations": [{"image_id": 38, "category_id": 0, "isCrowd": 0, "area": 265.78462504828593, "bbox": [226.90301445368266, 329.4439993775726, 9.569004844741528, 27.775576390719777]}, {"image_id": 38, "category_id": 0, "isCrowd": 0, "area": 256.31172715433235, "bbox": [238.493850090304, 329.62981902564036, 9.450719483020208, 27.1208692221623]}]}}
{"images": "/home/alberto/Dataset/train/images/ima39.png", "labels": {"image_id": 39, "annotations": [{"image_id": 39, "category_id": 0, "isCrowd": 0, "area": 309.6746487524533, "bbox": [222.14579024587414, 328.51104201166305, 11.106078840660189, 27.883346876551172]}, {"image_id": 39, "category_id": 0, "isCrowd": 0, "area": 323.8573679704702, "bbox": [234.43336683554077, 329.8106895355701, 11.814977490064024, 27.4107477769486]}]}}

With all of this in place, my script looks like this:

from datasets import load_dataset
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import DetrForObjectDetection, DetrImageProcessor, TrainingArguments

class CustomDataset(Dataset):
	def __init__(self, subset, processor):
		self.subset = subset
		self.processor = processor

	def __len__(self):
		return len(self.subset)

	def __getitem__(self, index):
		image, target = super(CustomDataset, self).__getitem__(index)

		img_path = self.subset[index]['images']
		image = Image.open(img_path).convert("RGB")
		target = self.subset[index]['labels']
		encoding = self.processor(images=image, annotations=target, return_tensors="pt")
		pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension
		target = encoding["labels"][0] # remove batch dimension
		
		return pixel_values, target

def collate_fn(batch):
	pixel_values = [item["pixel_values"] for item in batch]
	encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt")
	labels = [item["labels"] for item in batch]
	batch = {}
	batch["pixel_values"] = encoding["pixel_values"]
	batch["pixel_mask"] = encoding["pixel_mask"]
	batch["labels"] = labels
	return batch



data_files = {
	"train": '/path/to/train.json',
	"validation": '/path/to/val.json',
	"test": '/path/to/test.json'
}

sr_dataset = load_dataset("json", data_files=data_files)
###### sr_dataset output ######
DatasetDict({
    train: Dataset({
        features: ['images', 'labels'],
        num_rows: 9681
    })
    validation: Dataset({
        features: ['images', 'labels'],
        num_rows: 1382
    })
    test: Dataset({
        features: ['images', 'labels'],
        num_rows: 2768
    })
})
###### sr_dataset output ######
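
# Note: with load_dataset("json") the 'images' column is just a path string,
# not a decoded image, hence the Image.open() in __getitem__ above. E.g.:
example = sr_dataset["train"][0]
print(example["images"])                             # "/home/alberto/Dataset/train/images/..."
print(example["labels"]["annotations"][0]["bbox"])   # [x, y, w, h] floats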

image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

train_dataset = CustomDataset(subset=sr_dataset['train'], processor=image_processor)
val_dataset = CustomDataset(subset=sr_dataset['validation'], processor=image_processor)
test_dataset = CustomDataset(subset=sr_dataset['test'], processor=image_processor)


training_args = TrainingArguments(
	output_dir="detr-finetuned",
	per_device_train_batch_size=4,
	per_device_eval_batch_size=4,
	num_train_epochs=10,
	fp16=True,
	save_steps=200,
	logging_steps=50,
	learning_rate=1e-5,
	weight_decay=1e-4,
	save_total_limit=2,
	remove_unused_columns=False,
	push_to_hub=False,
)


model = DetrForObjectDetection.from_pretrained(
	"facebook/detr-resnet-50",
	id2label={"0": "person"},
	label2id={"person":"0"},
	ignore_mismatched_sizes=True,
)
train_loader = DataLoader(train_dataset, batch_size=training_args.per_device_train_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=training_args.per_device_eval_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=training_args.per_device_eval_batch_size, shuffle=False)

from transformers import Trainer

trainer = Trainer(
	model=model,
	args=training_args,
	data_collator=collate_fn,
	train_dataset=train_dataset,
	tokenizer=image_processor,
)

trainer.train()

I get the following error:

image, target = super(CustomDataset, self).__getitem__(index)
Exception has occurred: NotImplementedError
exception: no description

Any help is appreciated.