Hi, I'm using the facebook/detr-resnet-50 model for object detection with two different datasets. The first dataset has 2 classes and the second has 24; I'm using a learning rate of 5e-5 for both. I hit the same error in both cases, just at different points: with the first dataset, training runs fine for 10 epochs but fails if I increase the epochs to 20, while with the second dataset the first epoch runs smoothly and the error appears in the second. How can I resolve this?
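For context, my training setup looks roughly like the sketch below. The label maps, output directory, batch size, `train_dataset`, and `collate_fn` are placeholders standing in for code defined elsewhere in my notebook; the checkpoint and learning rate are the real values.

```python
from transformers import AutoModelForObjectDetection, Trainer, TrainingArguments

# Placeholder label maps -- in my real code these come from the dataset
# (2 classes for the first dataset, 24 for the second).
id2label = {0: "class_0", 1: "class_1"}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # classification head is re-initialized for my classes
)

training_args = TrainingArguments(
    output_dir="detr-finetuned",    # placeholder path
    learning_rate=5e-5,
    num_train_epochs=20,
    per_device_train_batch_size=8,  # placeholder value
    fp16=True,                      # mixed precision; the traceback below goes through autocast
    remove_unused_columns=False,    # DETR needs the full feature dict, not just model args
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # placeholder: my preprocessed dataset
    data_collator=collate_fn,     # placeholder: pads pixel_values / pixel_mask
)
trainer.train()
```

This is the error raised from `trainer.train()`: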
ValueError Traceback (most recent call last)
Cell In[63], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2052, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2050 hf_hub_utils.enable_progress_bars()
2051 else:
--> 2052 return inner_training_loop(
2053 args=args,
2054 resume_from_checkpoint=resume_from_checkpoint,
2055 trial=trial,
2056 ignore_keys_for_eval=ignore_keys_for_eval,
2057 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2388, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2385 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
2387 with self.accelerator.accumulate(model):
--> 2388 tr_loss_step = self.training_step(model, inputs)
2390 if (
2391 args.logging_nan_inf_filter
2392 and not is_torch_xla_available()
2393 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2394 ):
2395 # if loss is nan or inf simply add the average of previous logged losses
2396 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3485, in Trainer.training_step(self, model, inputs)
3482 return loss_mb.reduce_mean().detach().to(self.args.device)
3484 with self.compute_loss_context_manager():
--> 3485 loss = self.compute_loss(model, inputs)
3487 del inputs
3488 if (
3489 self.args.torch_empty_cache_steps is not None
3490 and self.state.global_step % self.args.torch_empty_cache_steps == 0
3491 ):
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3532, in Trainer.compute_loss(self, model, inputs, return_outputs)
3530 else:
3531 labels = None
--> 3532 outputs = model(**inputs)
3533 # Save past state if it exists
3534 # TODO: this needs to be fixed and made cleaner later.
3535 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
--> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
--> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:820, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
819 def forward(*args, **kwargs):
--> 820 return model_forward(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py:808, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
807 def __call__(self, *args, **kwargs):
--> 808 return convert_to_fp32(self.model_forward(*args, **kwargs))
File /opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py:43, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
40 @functools.wraps(func)
41 def decorate_autocast(*args, **kwargs):
42 with autocast_instance:
--> 43 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/transformers/models/detr/modeling_detr.py:1485, in DetrForObjectDetection.forward(self, pixel_values, pixel_mask, decoder_attention_mask, encoder_outputs, inputs_embeds, decoder_inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
1482 auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
1483 outputs_loss["auxiliary_outputs"] = auxiliary_outputs
--> 1485 loss_dict = criterion(outputs_loss, labels)
1486 # Fourth: compute total loss, as a weighted sum of the various losses
1487 weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
--> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
--> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/detr/modeling_detr.py:2084, in DetrLoss.forward(self, outputs, targets)
2081 outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
2083 # Retrieve the matching between the outputs of the last layer and the targets
--> 2084 indices = self.matcher(outputs_without_aux, targets)
2086 # Compute the average number of target boxes across all nodes, for normalization purposes
2087 num_boxes = sum(len(t["class_labels"]) for t in targets)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
--> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
--> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/transformers/models/detr/modeling_detr.py:2206, in DetrHungarianMatcher.forward(self, outputs, targets)
2203 bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
2205 # Compute the giou cost between boxes
--> 2206 giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
2208 # Final cost matrix
2209 cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
File /opt/conda/lib/python3.10/site-packages/transformers/models/detr/modeling_detr.py:2271, in generalized_box_iou(boxes1, boxes2)
2268 # degenerate boxes gives inf / nan results
2269 # so do an early check
2270 if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
--> 2271 raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
2272 if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
2273 raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
ValueError: boxes1 must be in [x0, y0, x1, y1] (corner) format, but got tensor([[nan, nan, nan, nan],
[nan, nan, nan, nan],
[nan, nan, nan, nan],
...,
[nan, nan, nan, nan],
[nan, nan, nan, nan],
[nan, nan, nan, nan]], device='cuda:0', dtype=torch.float16)
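The predicted boxes reaching the matcher are all NaN, and since the tensor dtype is torch.float16 and the trace passes through accelerate's fp16 autocast wrapper, I assume this is mixed-precision instability rather than bad annotations, but I'm not sure. For what it's worth, this is the kind of change I was planning to try next (a sketch against the TrainingArguments above; the values are untested guesses, not a known fix):

```python
# Untested guesses at stabilizing training (same placeholders as above):
training_args = TrainingArguments(
    output_dir="detr-finetuned",
    learning_rate=1e-5,             # lower LR, in case 5e-5 blows up later in training
    num_train_epochs=20,
    per_device_train_batch_size=8,
    max_grad_norm=0.1,              # tighter gradient clipping (the DETR paper clips at 0.1)
    fp16=False,                     # drop fp16, or use bf16=True on GPUs that support it
    remove_unused_columns=False,
)
```

Is that the right direction, or is something else going on?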