Hi,
I’m trying to fine-tune a Conditional DETR object detection model on my own dataset. Training seemed to work fine for 8 epochs, and then I got an unexpected error:
File "/home/ubuntu/detr/scripts/train.py", line 246, in <module>
trainer.fit(model, train_val_dataloaders)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 582, in fit
call._call_and_handle_interrupt(
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 624, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1061, in _run
results = self._run_stage()
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1140, in _run_stage
self._run_train()
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1163, in _run_train
self.fit_loop.run()
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 214, in advance
batch_output = self.batch_loop.run(kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 200, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 247, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 357, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1305, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1661, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 281, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 234, in optimizer_step
return self.precision_plugin.optimizer_step(
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 147, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 133, in closure
step_output = self._step_fn()
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 406, in _training_step
training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1443, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 352, in training_step
return self.model(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/pytorch_lightning/overrides/base.py", line 98, in forward
output = self._forward_module.training_step(*inputs, **kwargs)
File "/home/ubuntu/detr/scripts/Detr_model.py", line 147, in training_step
loss, loss_dict, _ = self.common_step(batch, batch_idx) #
File "/home/ubuntu/detr/scripts/Detr_model.py", line 134, in common_step
outputs = self.model(
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/transformers/models/conditional_detr/modeling_conditional_detr.py", line 1779, in forward
loss_dict = criterion(outputs_loss, labels)
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/transformers/models/conditional_detr/modeling_conditional_detr.py", line 2397, in forward
indices = self.matcher(outputs_without_aux, targets)
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/home/ubuntu/venv/lib/python3.10/site-packages/transformers/models/conditional_detr/modeling_conditional_detr.py", line 2520, in forward
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
File "/home/ubuntu/venv/lib/python3.10/site-packages/transformers/models/conditional_detr/modeling_conditional_detr.py", line 2585, in generalized_box_iou
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
ValueError: boxes1 must be in [x0, y0, x1, y1] (corner) format, but got tensor([[0.5889, 0.6191, 0.7930, 0.6475],
[0.1045, 0.1271, 0.9492, 0.2052],
[0.2373, 0.2783, 0.9424, 0.5625],
...,
[0.1133, 0.4707, 0.5112, 0.5547],
[0.5215, 0.4292, 0.8887, 0.5220],
[0.1031, 0.1813, 0.2440, 0.1986]], device='cuda:2',
dtype=torch.float16)
I’m training on a 4-GPU instance using DDP.
It looks like the invalid bounding box is not coming from the targets (which seem to be boxes2) but from the predicted boxes (boxes1). I’m confused about how this can happen. Doesn’t that mean the width or height of a predicted box is negative? Or could this be caused by a numerical rounding error (something like 1.00000005 instead of 1)?
I found this post which seems related: custom training asserts with "degenerate bboxes" over and over - but bboxes look correct, any debugging insight? · Issue #28 · facebookresearch/detr · GitHub
I double-checked my class labels after this, and the maximum label value is smaller than the total number of labels (13 classes with labels 0-12).
Any help would be much appreciated!
Thanks
Environment:
PyTorch version: 1.13.0+cu117
Is debug build: False
CUDA used to build PyTorch: 11.7
ROCM used to build PyTorch: N/A
OS: Ubuntu 18.04.6 LTS (x86_64)
GCC version: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Clang version: Could not collect
CMake version: version 3.21.3
Libc version: glibc-2.27
Python version: 3.10.8 (main, Nov 17 2022, 14:23:45) [GCC 7.5.0] (64-bit runtime)
Python platform: Linux-5.4.0-1089-aws-x86_64-with-glibc2.27
Is CUDA available: True
CUDA runtime version: 10.0.130
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration:
GPU 0: Tesla T4
GPU 1: Tesla T4
GPU 2: Tesla T4
GPU 3: Tesla T4
Nvidia driver version: 450.142.00
cuDNN version: Probably one of the following:
/usr/local/cuda-10.1/targets/x86_64-linux/lib/libcudnn.so.7.6.5
/usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudnn.so.7.6.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn.so.8.0.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn_adv_infer.so.8.0.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn_adv_train.so.8.0.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn_cnn_infer.so.8.0.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn_cnn_train.so.8.0.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn_ops_infer.so.8.0.5
/usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudnn_ops_train.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn_adv_infer.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn_adv_train.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn_cnn_infer.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn_cnn_train.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn_ops_infer.so.8.0.5
/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudnn_ops_train.so.8.0.5
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
Versions of relevant libraries:
[pip3] numpy==1.23.4
[pip3] pytorch-lightning==1.8.1
[pip3] torch==1.13.0
[pip3] torchmetrics==0.10.3
[pip3] torchvision==0.14.0
[pip3] transformers==4.24.0