## 🐛 Bug
Metrics (both those predefined in the library and custom implementations) that use concatenation (`dist_reduce_fx="cat"`) together with CPU computation (`compute_on_cpu=True`) raise an error when training on multiple GPUs (`ddp`). The concrete error is `RuntimeError: No backend type associated with device type cpu`.
### To Reproduce
**Code sample:**
```py
import torch
from lightning import Trainer, LightningModule
from torch.utils.data import DataLoader
from torchmetrics import AUROC
class LitModel(LightningModule):
    """Minimal module reproducing the reported failure under multi-GPU (`ddp`) training."""

    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(1, 1)
        # `compute_on_cpu=True` is the option this report is about.
        self.auroc = AUROC(task="binary", compute_on_cpu=True)

    def training_step(self, x):
        """Update and log the AUROC metric on fixed CUDA tensors, then return a dummy loss."""
        preds = torch.tensor([0.13, 0.26, 0.08, 0.19, 0.34]).cuda()
        target = torch.tensor([0, 0, 1, 1, 1]).cuda()
        self.auroc(preds, target)
        # The metric object itself is logged; per the stack trace, its
        # `compute()` (and the distributed sync) runs at the end of the epoch.
        self.log("train_auroc", self.auroc, on_step=True, on_epoch=True)
        loss = self.layer(x).mean()
        return loss

    def configure_optimizers(self):
        """Return a plain SGD optimizer over the module's parameters."""
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        """Return a loader over 32 random samples of shape (1,), one sample per batch."""
        return DataLoader(torch.randn(32, 1), batch_size=1)
```
**Stacktrace**
```
Traceback (most recent call last):
File "/home/ruben/Documents/PhD/Research/Topological Deep Learning/lightning/pythonProject/main.py", line 35, in <module>
main()
File "/home/ruben/Documents/PhD/Research/Topological Deep Learning/lightning/pythonProject/main.py", line 31, in main
trainer.fit(model)
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 987, in _run
results = self._run_stage()
^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 1033, in _run_stage
self.fit_loop.run()
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py", line 206, in run
self.on_advance_end()
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py", line 376, in on_advance_end
call._call_callback_hooks(trainer, "on_train_epoch_end", monitoring_callbacks=False)
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py", line 208, in _call_callback_hooks
fn(trainer, trainer.lightning_module, *args, **kwargs)
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/callbacks/progress/tqdm_progress.py", line 281, in on_train_epoch_end
self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/callbacks/progress/progress_bar.py", line 198, in get_metrics
pbar_metrics = trainer.progress_bar_metrics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 1651, in progress_bar_metrics
return self._logger_connector.progress_bar_metrics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py", line 253, in progress_bar_metrics
metrics = self.metrics["pbar"]
^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py", line 234, in metrics
return self.trainer._results.metrics(on_step)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 483, in metrics
value = self._get_cache(result_metric, on_step)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 447, in _get_cache
result_metric.compute()
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 289, in wrapped_func
self._computed = compute(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py", line 254, in compute
return self.value.compute()
^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torchmetrics/metric.py", line 611, in wrapped_func
with self.sync_context(
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/contextlib.py", line 137, in __enter__
return next(self.gen)
^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torchmetrics/metric.py", line 582, in sync_context
self.sync(
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torchmetrics/metric.py", line 531, in sync
self._sync_dist(dist_sync_fn, process_group=process_group)
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torchmetrics/metric.py", line 435, in _sync_dist
output_dict = apply_to_collection(
^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py", line 72, in apply_to_collection
return _apply_to_collection_slow(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py", line 104, in _apply_to_collection_slow
v = _apply_to_collection_slow(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py", line 125, in _apply_to_collection_slow
v = _apply_to_collection_slow(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/lightning_utilities/core/apply_func.py", line 96, in _apply_to_collection_slow
return function(data, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torchmetrics/utilities/distributed.py", line 127, in gather_all_tensors
torch.distributed.all_gather(local_sizes, local_size, group=group)
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ruben/miniconda3/envs/sct/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 2808, in all_gather
work = group.allgather([tensor_list], [tensor])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: No backend type associated with device type cpu
```
### Expected behavior
The metric is computed correctly, merging the lists from the different processes in multi-GPU training scenarios.
### Environment
- TorchMetrics version 1.3.2. Installed using `pip`
- Python & PyTorch Version: 3.11 and 2.1.2, respectively.
- Any other relevant information such as OS (e.g., Linux): Ubuntu 23.10
### Additional context
Related bug in PyTorch Lightning
https://github.com/Lightning-AI/pytorch-lightning/issues/18803