Hi all.
I want to fine-tune the DETR model for object detection. At the beginning, my dataset was in YOLO format, i.e., the folder structure was:
.
├── train
│   ├── images
│   │   ├── ima1.png
│   │   ├── ima2.png
│   │   └── ...
│   └── labels
│       ├── ima1.txt
│       ├── ima2.txt
│       └── ...
├── val
│   ├── images
│   │   ├── ima3.png
│   │   ├── ima4.png
│   │   └── ...
│   └── labels
│       ├── ima3.txt
│       ├── ima4.txt
│       └── ...
└── test
    ├── images
    │   ├── ima5.png
    │   ├── ima6.png
    │   └── ...
    └── labels
        ├── ima5.txt
        ├── ima6.txt
        └── ...
Each txt file contains the labels in YOLO format, one object per line as class_id x_center y_center width height, with all coordinates normalized to the image size, as in this made-up example line:
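0 0.4716 0.6415 0.0156 0.0960
(values invented purely for illustration: class 0, box centered at 47% / 64% of the image, 1.6% wide and 9.6% tall)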
I wrote a parser from this format to the one defined in the Hugging Face docs; a sketch of the conversion follows the JSON samples below.
Part of the content of my train.json file is:
{"images": "/home/alberto/Dataset/train/images/ima33.png", "labels": {"image_id": 33, "annotations": [{"image_id": 33, "category_id": 0, "isCrowd": 0, "area": 836.3844989732237, "bbox": [467.6039173574827, 320.7359824929699, 16.193362718773813, 51.64983416344768]}]}}
{"images": "/home/alberto/Dataset/train/images/ima34.png", "labels": {"image_id": 34, "annotations": [{"image_id": 34, "category_id": 0, "isCrowd": 0, "area": 155.90997473673505, "bbox": [302.4758230493843, 331.4863415890505, 6.941832607149081, 22.459483476477146]}]}}
{"images": "/home/alberto/Dataset/train/images/ima35.png", "labels": {"image_id": 35, "annotations": [{"image_id": 35, "category_id": 0, "isCrowd": 0, "area": 192.56241287696886, "bbox": [275.00035776371584, 330.68727201610824, 7.221299481967947, 26.66589487914325]}]}}
{"images": "/home/alberto/Dataset/train/images/ima36.png", "labels": {"image_id": 36, "annotations": [{"image_id": 36, "category_id": 0, "isCrowd": 0, "area": 340.96618997872207, "bbox": [258.6175375390202, 331.2602621260946, 14.103082789030736, 24.176713352624066]}]}}
{"images": "/home/alberto/Dataset/train/images/ima37.png", "labels": {"image_id": 37, "annotations": [{"image_id": 37, "category_id": 0, "isCrowd": 0, "area": 184.83137819588285, "bbox": [249.30923914377175, 330.7909414770311, 7.403844315195045, 24.96424429354221]}, {"image_id": 37, "category_id": 0, "isCrowd": 0, "area": 193.20158194586713, "bbox": [255.19273430554105, 330.15024228403854, 7.4734051644021235, 25.851881129921768]}]}}
{"images": "/home/alberto/Dataset/train/images/ima38.png", "labels": {"image_id": 38, "annotations": [{"image_id": 38, "category_id": 0, "isCrowd": 0, "area": 265.78462504828593, "bbox": [226.90301445368266, 329.4439993775726, 9.569004844741528, 27.775576390719777]}, {"image_id": 38, "category_id": 0, "isCrowd": 0, "area": 256.31172715433235, "bbox": [238.493850090304, 329.62981902564036, 9.450719483020208, 27.1208692221623]}]}}
{"images": "/home/alberto/Dataset/train/images/ima39.png", "labels": {"image_id": 39, "annotations": [{"image_id": 39, "category_id": 0, "isCrowd": 0, "area": 309.6746487524533, "bbox": [222.14579024587414, 328.51104201166305, 11.106078840660189, 27.883346876551172]}, {"image_id": 39, "category_id": 0, "isCrowd": 0, "area": 323.8573679704702, "bbox": [234.43336683554077, 329.8106895355701, 11.814977490064024, 27.4107477769486]}]}}
Given all this context, my script looks like this:
from datasets import load_dataset
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from transformers import DetrForObjectDetection, DetrImageProcessor, TrainingArguments

class CustomDataset(Dataset):
    def __init__(self, subset, processor):
        self.subset = subset
        self.processor = processor

    def __len__(self):
        return len(self.subset)

    def __getitem__(self, index):
        image, target = super(CustomDataset, self).__getitem__(index)  # <- the line that fails
        img_path = self.subset[index]['images']
        image = Image.open(img_path).convert("RGB")
        target = self.subset[index]['labels']
        encoding = self.processor(images=image, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()  # remove batch dimension
        target = encoding["labels"][0]  # remove batch dimension
        return pixel_values, target
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    # pad every image to the largest size in the batch and build the matching pixel_mask
    encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["pixel_mask"] = encoding["pixel_mask"]
    batch["labels"] = labels
    return batch
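Side note in case it bites later: on recent transformers releases, pad_and_create_pixel_mask is deprecated in favor of pad, which returns the same two tensors. Assuming a reasonably new version, the call inside collate_fn would instead be:

# returns a dict with both "pixel_values" and "pixel_mask"
encoding = image_processor.pad(pixel_values, return_tensors="pt")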
data_files = {
    "train": '/path/to/train.json',
    "validation": '/path/to/val.json',
    "test": '/path/to/test.json',
}
sr_dataset = load_dataset("json", data_files=data_files)
###### sr_dataset output ######
DatasetDict({
    train: Dataset({
        features: ['images', 'labels'],
        num_rows: 9681
    })
    validation: Dataset({
        features: ['images', 'labels'],
        num_rows: 1382
    })
    test: Dataset({
        features: ['images', 'labels'],
        num_rows: 2768
    })
})
###### sr_dataset output ######
image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
train_dataset = CustomDataset(subset=sr_dataset['train'], processor=image_processor)
val_dataset = CustomDataset(subset=sr_dataset['validation'], processor=image_processor)
test_dataset = CustomDataset(subset=sr_dataset['test'], processor=image_processor)
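For context on what a single processor call produces, here is a quick probe on the first raw sample; if I understand the docs correctly, it returns a batched pixel_values tensor plus a list of label dicts containing normalized (cx, cy, w, h) boxes:

sample = sr_dataset["train"][0]
enc = image_processor(images=Image.open(sample["images"]).convert("RGB"),
                      annotations=sample["labels"], return_tensors="pt")
print(enc["pixel_values"].shape)         # e.g. torch.Size([1, 3, 800, 1066])
print(enc["labels"][0]["class_labels"])  # tensor of category ids, e.g. tensor([0])
print(enc["labels"][0]["boxes"])         # boxes rescaled to normalized (cx, cy, w, h)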
training_args = TrainingArguments(
    output_dir="detr-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    fp16=True,
    save_steps=200,
    logging_steps=50,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
)
model = DetrForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50",
    id2label={"0": "person"},
    label2id={"person": "0"},
    ignore_mismatched_sizes=True,
)
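One thing I noticed while writing this up: the examples in the docs usually build these mappings with integer ids rather than strings, i.e. roughly the following, though I have not confirmed whether the string form actually breaks anything here:

id2label = {0: "person"}  # int key, per the usual convention in the docs
label2id = {"person": 0}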
train_loader = DataLoader(train_dataset, batch_size=training_args.per_device_train_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=training_args.per_device_eval_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=training_args.per_device_eval_batch_size, shuffle=False)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    tokenizer=image_processor,
)
trainer.train()
I get the following error:
image, target = super(CustomDataset, self).__getitem__(index)
Exception has occurred: NotImplementedError
exception: no description
Any help is appreciated.