Problem with model inference using accelerate

I am trying to fine-tune a model for multi-label text classification on my own dataset using the Accelerate module. With a single GPU, everything is OK, but on two GPUs, I am getting the following error:

Traceback (most recent call last):
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 187, in <module>
    main()
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 184, in main
    training_function()
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 169, in training_function
Traceback (most recent call last):
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 187, in <module>
    pred, ref = accelerator.gather_for_metrics((pred, batch['labels']))
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2221, in gather_for_metrics
    main()
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 184, in main
    training_function()
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 169, in training_function
    pred, ref = accelerator.gather_for_metrics((pred, batch['labels']))
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2221, in gather_for_metrics
    data = self.gather(input_data)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2184, in gather
        data = self.gather(input_data)return gather(tensor)

  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2184, in gather
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 337, in wrapper
    return function(*args, **kwargs)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 393, in gather
    return _gpu_gather(tensor)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 317, in _gpu_gather
    return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 109, in recursively_apply
        return gather(tensor)return honor_type(

  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 337, in wrapper
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 83, in honor_type
    return type(obj)(generator)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 112, in <genexpr>
    return function(*args, **kwargs)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 393, in gather
    recursively_apply(
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 128, in recursively_apply
    return func(data, *args, **kwargs)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 307, in _gpu_gather_one
    return _gpu_gather(tensor)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 317, in _gpu_gather
    gather_op(output_tensors, tensor)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)
    return func(*args, **kwargs)  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 109, in recursively_apply

  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2897, in all_gather_into_tensor
    return honor_type(
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 83, in honor_type
    return type(obj)(generator)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 112, in <genexpr>
    recursively_apply(
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 128, in recursively_apply
    return func(data, *args, **kwargs)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 307, in _gpu_gather_one
    gather_op(output_tensors, tensor)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2897, in all_gather_into_tensor
    work = group._allgather_base(output_tensor, input_tensor)
RuntimeError: Tensors must be CUDA and dense
    work = group._allgather_base(output_tensor, input_tensor)
RuntimeError: Tensors must be CUDA and dense
[2023-11-21 21:23:18,202] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1495163) of binary: /home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/bin/python
Traceback (most recent call last):
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 47, in main
    args.func(args)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/commands/launch.py", line 985, in launch_command
    multi_gpu_launcher(args)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/commands/launch.py", line 654, in multi_gpu_launcher
    distrib_run.run(args)
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
test_2.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2023-11-21_21:23:18
  host      : SCS-GPU-Fall2023
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 1495164)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-11-21_21:23:18
  host      : SCS-GPU-Fall2023
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1495163)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================```
**My code is as follows:**

import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType
from copy import deepcopy
import torch.nn as nn

def get_dataloaders(checkPoint_model,checkPoint_data,accelerator,batch_size):

def strToDict(s):
    s = s.replace("'",'"')
    x = eval(s)
    return x

def processData(example):
    text = example['text']
    encoding = tokenizer(text,padding='max_length',truncation=True)
    encoding['labels'] = example['labels']
    return encoding
def collate_fn(examples):
    max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
    if accelerator.mixed_precision == "fp8":
        pad_to_multiple_of = 16
    elif accelerator.mixed_precision != "no":
        pad_to_multiple_of = 8
    else:
        pad_to_multiple_of = None

    return tokenizer.pad(
        examples,
        padding="longest",
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors="pt",
    )

tokenizer = AutoTokenizer.from_pretrained(checkPoint_model)
dataset = load_dataset(checkPoint_data)
label2idx = strToDict(dataset['train'][0]['label2idx'])
idx2label = strToDict(dataset['train'][0]['idx2label'])
with accelerator.main_process_first():
    encodedDataset = dataset.map(processData,batched=True,remove_columns=dataset['train'].column_names)
trainSet = encodedDataset['train'].shuffle(seed=1234)
testSet = encodedDataset['test']
trainLoader = DataLoader(trainSet,shuffle=True,batch_size=batch_size,collate_fn=collate_fn)
testLoader = DataLoader(testSet,batch_size=batch_size,collate_fn=collate_fn)
return trainLoader, testLoader, label2idx,idx2label

def training_function():

accelerator = Accelerator()
lr = 2e-5
num_epochs = 1
seed = 1234
batch_size = 2

metric = evaluate.combine([
    evaluate.load('accuracy'),
    evaluate.load('precision',average='weighted'),
    evaluate.load('recall',average='weighted'),
    evaluate.load('f1',avergae='weighted')
])
gradient_accumulation_steps = 1
set_seed(seed)
checkPoint_model = "bert-base-cased"
checkPoint_data = "akkasi/ethos"
train_dataloader, eval_dataloader,label2idx,idx2label = get_dataloaders(checkPoint_model,checkPoint_data,accelerator,batch_size)

model = AutoModelForSequenceClassification.from_pretrained(checkPoint_model,
                                                       problem_type='multi_label_classification',
                                                       num_labels = len(label2idx),
                                                       ignore_mismatched_sizes=True,
                                                       label2id = label2idx,
                                                       id2label = idx2label,
                                                       return_dict=True)


model = model.to(accelerator.device)

optimizer = AdamW(params=model.parameters(), lr=lr)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
)


cp_model = deepcopy(model)
cp_model.classifier = nn.Sequential(
    nn.Linear(in_features=768, out_features=len(label2idx), bias=True),
    nn.Sigmoid()
)
model = deepcopy(cp_model)

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

threshold = 0.5
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch.to(accelerator.device)
        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
           
    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        batch.to(accelerator.device)
        with torch.no_grad():
            outputs = model(**batch)
        pred = torch.zeros(outputs['logits'].shape)
        pred[torch.where(outputs['logits'] >= threshold)] = 1
        pred = pred.type(torch.int32)
        batch['labels'] = batch['labels'].type(torch.int32)
        pred, ref = accelerator.gather_for_metrics((pred, batch['labels']))
        for predictions,references in zip(pred,ref):                
            metric.add_batch(
                predictions=predictions,
                references=references,
            )
        break
    eval_metric = metric.compute()
    accelerator.print(f"epoch {epoch}:", eval_metric)

if name == “main”:
training_function()

@muellerzr

You need to move this to the GPU

Thank you @muellerzr the problem is solved.