I’m working on an open source project. The goal is to use Hugging Face to train tasks so that adapting to a variety of models only requires modifying a YAML file.
I’ve already completed image classification and am currently working on object detection.
The object detection part is still unfinished: training itself runs fine, but there is a bug where the model does not converge. So I checked the Hugging Face object detection tutorial, ran it on Colab, and there it converged. Then I ran the same code on my own computer, a MacBook, and the model did not converge. I can’t figure out what’s causing this. Can anyone tell me how to fix it?
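For context, here is a minimal check of which accelerator PyTorch sees on each machine (a sketch, assuming a PyTorch build recent enough to have the MPS backend). The transformers Trainer picks the available device automatically, so the Colab run and the MacBook run are most likely not using the same backend:

import torch
# On Colab this typically prints cuda: True; on an Apple-silicon MacBook
# it prints mps: True (or both False, falling back to CPU).
print("cuda:", torch.cuda.is_available())
print("mps:", torch.backends.mps.is_available())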
MODEL_NAME = "microsoft/conditional-detr-resnet-50" # or "facebook/detr-resnet-50"
IMAGE_SIZE = 480
from datasets import load_dataset
cppe5 = load_dataset("cppe-5")
import numpy as np
categories = cppe5["train"].features["objects"].feature["category"].names
id2label = {index: x for index, x in enumerate(categories, start=0)}
label2id = {v: k for k, v in id2label.items()}
from transformers import AutoImageProcessor
MAX_SIZE = IMAGE_SIZE
image_processor = AutoImageProcessor.from_pretrained(
MODEL_NAME,
do_resize=True,
size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
do_pad=True,
pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
)
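# Optional sanity check (my addition, not from the tutorial): with the settings
# above, every image should come out resized and padded to 480x480, i.e.
# pixel_values of shape torch.Size([1, 3, 480, 480]) for a single image.
# print(image_processor(images=cppe5["train"][0]["image"].convert("RGB"),
#     return_tensors="pt")["pixel_values"].shape)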
# import albumentations as A
# train_augment_and_transform = A.Compose(
# [
# A.Perspective(p=0.1),
# A.HorizontalFlip(p=0.5),
# A.RandomBrightnessContrast(p=0.5),
# A.HueSaturationValue(p=0.1),
# ],
# bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
# )
# validation_transform = A.Compose(
# [A.NoOp()],
# bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
# )
# Augmentations are disabled for now: with transform=None, only the image
# processor's resize / rescale / normalize runs.
train_augment_and_transform = None
validation_transform = None
def format_image_annotations_as_coco(image_id, categories, areas, bboxes):
"""Format one set of image annotations to the COCO format
Args:
image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
([x_min, y_min, width, height] in absolute coordinates)
Returns:
dict: {
"image_id": image id,
"annotations": list of formatted annotations
}
"""
annotations = []
for category, area, bbox in zip(categories, areas, bboxes):
formatted_annotation = {
"image_id": image_id,
"category_id": category,
"iscrowd": 0,
"area": area,
"bbox": list(bbox),
}
annotations.append(formatted_annotation)
return {
"image_id": image_id,
"annotations": annotations,
}
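# For reference, a single 32x32 box of class 2 on image 15, e.g.
# format_image_annotations_as_coco(15, [2], [1024.0], [[10.0, 20.0, 32.0, 32.0]]),
# returns {"image_id": 15, "annotations": [{"image_id": 15, "category_id": 2,
# "iscrowd": 0, "area": 1024.0, "bbox": [10.0, 20.0, 32.0, 32.0]}]},
# which is the per-image structure the image processor's annotations argument expects.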
def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
"""Apply augmentations and format annotations in COCO format for object detection task"""
images = []
annotations = []
for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]):
image = np.array(image.convert("RGB"))
# apply augmentations (currently bypassed: transform is None, so the raw
# image goes straight to the image processor)
# output = transform(image=image, bboxes=objects["bbox"], category=objects["category"])
images.append(image)
# format annotations in COCO format
formatted_annotations = format_image_annotations_as_coco(
image_id, objects["category"], objects["area"], objects["bbox"]
)
annotations.append(formatted_annotations)
# Apply the image processor transformations: resizing, rescaling, normalization
result = image_processor(images=images, annotations=annotations, return_tensors="pt")
if not return_pixel_mask:
result.pop("pixel_mask", None)
return result
from functools import partial
# Make transform functions for batch and apply for dataset splits
train_transform_batch = partial(
augment_and_transform_batch, transform=train_augment_and_transform, image_processor=image_processor
)
validation_transform_batch = partial(
augment_and_transform_batch, transform=validation_transform, image_processor=image_processor
)
cppe5["train"] = cppe5["train"].with_transform(train_transform_batch)
cppe5["test"] = cppe5["test"].with_transform(validation_transform_batch)
import torch
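# Note on the collate below: pixel_values can be stacked into one tensor
# because do_pad made every image the same 480x480 size, while "labels" must
# stay a plain Python list since the number of boxes differs per image.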
def collate_fn(batch):
data = {}
data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
data["labels"] = [x["labels"] for x in batch]
if "pixel_mask" in batch[0]:
data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])
return data
from transformers.image_transforms import center_to_corners_format
def convert_bbox_yolo_to_pascal(boxes, image_size):
"""
Convert bounding boxes from YOLO format (x_center, y_center, width, height) in range [0, 1]
to Pascal VOC format (x_min, y_min, x_max, y_max) in absolute coordinates.
Args:
boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width)
Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
"""
# convert center to corners format
boxes = center_to_corners_format(boxes)
# convert to absolute coordinates
height, width = image_size
boxes = boxes * torch.tensor([[width, height, width, height]])
return boxes
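# Worked example: for a 100x200 (height x width) image, the normalized YOLO box
# (0.5, 0.5, 0.2, 0.2) becomes corners (0.4, 0.4, 0.6, 0.6), and after scaling
# by (width, height, width, height) the absolute Pascal VOC box (80, 40, 120, 60).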
from dataclasses import dataclass
from torchmetrics.detection.mean_ap import MeanAveragePrecision
@dataclass
class ModelOutput:
logits: torch.Tensor
pred_boxes: torch.Tensor
@torch.no_grad()
def compute_metrics(evaluation_results, image_processor, threshold=0.0, id2label=None):
"""
Compute mean average precision (mAP), mean average recall (mAR) and their variants for the object detection task.
Args:
evaluation_results (EvalPrediction): Predictions and targets from evaluation.
threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0.
id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None.
Returns:
Mapping[str, float]: Metrics in a form of dictionary {<metric_name>: <metric_value>}
"""
predictions, targets = evaluation_results.predictions, evaluation_results.label_ids
# For metric computation we need to provide:
# - targets in a form of list of dictionaries with keys "boxes", "labels"
# - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels"
image_sizes = []
post_processed_targets = []
post_processed_predictions = []
# Collect targets in the required format for metric computation
for batch in targets:
# collect image sizes, we will need them for predictions post processing
batch_image_sizes = torch.tensor(np.array([x["orig_size"] for x in batch]))
image_sizes.append(batch_image_sizes)
# collect targets in the required format for metric computation
# boxes were converted to YOLO format needed for model training
# here we will convert them to Pascal VOC format (x_min, y_min, x_max, y_max)
for image_target in batch:
boxes = torch.tensor(image_target["boxes"])
boxes = convert_bbox_yolo_to_pascal(boxes, image_target["orig_size"])
labels = torch.tensor(image_target["class_labels"])
post_processed_targets.append({"boxes": boxes, "labels": labels})
# Collect predictions in the required format for metric computation:
# the model produces boxes in YOLO format, then image_processor converts them to Pascal VOC format
for batch, target_sizes in zip(predictions, image_sizes):
batch_logits, batch_boxes = batch[1], batch[2]
output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
post_processed_output = image_processor.post_process_object_detection(
output, threshold=threshold, target_sizes=target_sizes
)
post_processed_predictions.extend(post_processed_output)
# Compute metrics
metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
metric.update(post_processed_predictions, post_processed_targets)
metrics = metric.compute()
# Replace list of per class metrics with separate metric for each class
classes = metrics.pop("classes")
map_per_class = metrics.pop("map_per_class")
mar_100_per_class = metrics.pop("mar_100_per_class")
for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
metrics[f"map_{class_name}"] = class_map
metrics[f"mar_100_{class_name}"] = class_mar
metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
return metrics
eval_compute_metrics_fn = partial(
compute_metrics, image_processor=image_processor, id2label=id2label, threshold=0.0
)
from transformers import AutoModelForObjectDetection
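# Note: ignore_mismatched_sizes=True below swaps the checkpoint's COCO-pretrained
# classification head for a freshly initialized head sized to CPPE-5's 5 labels.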
model = AutoModelForObjectDetection.from_pretrained(
MODEL_NAME,
id2label=id2label,
label2id=label2id,
ignore_mismatched_sizes=True,
)
from transformers import TrainingArguments
training_args = TrainingArguments(
output_dir="detr_finetuned_cppe5",
num_train_epochs=30,
fp16=False,
per_device_train_batch_size=4,
dataloader_num_workers=0,
learning_rate=5e-5,
lr_scheduler_type="cosine",
weight_decay=1e-4,
max_grad_norm=0.01,
metric_for_best_model="eval_map",
greater_is_better=True,
load_best_model_at_end=True,
eval_strategy="epoch",
save_strategy="epoch",
# save_total_limit=2,
remove_unused_columns=False,
eval_do_concat_batches=False,
# push_to_hub=True,
)
from transformers import Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=cppe5["train"],
eval_dataset=cppe5["test"],
tokenizer=image_processor, # deprecated on newer transformers releases; use processing_class=image_processor there
data_collator=collate_fn,
compute_metrics=eval_compute_metrics_fn,
)
trainer.train()
This is the code that works in Colab but not on my computer.
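If it helps to narrow things down: the Trainer can be pinned to the CPU to test whether the MPS backend is what differs between the two machines (a sketch, assuming a transformers release that has the use_cpu flag; older releases call it no_cuda):

training_args = TrainingArguments(
    output_dir="detr_finetuned_cppe5",
    use_cpu=True,  # bypass MPS/CUDA entirely; slow, but isolates the backend
    # ...all other arguments unchanged from above...
)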