I am trying to fine-tune a model for multi-label text classification on my own dataset using the Accelerate module. With a single GPU, everything is OK, but on two GPUs, I am getting the following error:
Traceback (most recent call last):
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 187, in <module>
main()
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 184, in main
training_function()
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 169, in training_function
Traceback (most recent call last):
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 187, in <module>
pred, ref = accelerator.gather_for_metrics((pred, batch['labels']))
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2221, in gather_for_metrics
main()
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 184, in main
training_function()
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/test_2.py", line 169, in training_function
pred, ref = accelerator.gather_for_metrics((pred, batch['labels']))
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2221, in gather_for_metrics
data = self.gather(input_data)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2184, in gather
data = self.gather(input_data)return gather(tensor)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/accelerator.py", line 2184, in gather
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 337, in wrapper
return function(*args, **kwargs)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 393, in gather
return _gpu_gather(tensor)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 317, in _gpu_gather
return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 109, in recursively_apply
return gather(tensor)return honor_type(
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 337, in wrapper
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 83, in honor_type
return type(obj)(generator)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 112, in <genexpr>
return function(*args, **kwargs)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 393, in gather
recursively_apply(
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 128, in recursively_apply
return func(data, *args, **kwargs)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 307, in _gpu_gather_one
return _gpu_gather(tensor)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 317, in _gpu_gather
gather_op(output_tensors, tensor)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)
return func(*args, **kwargs) File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 109, in recursively_apply
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2897, in all_gather_into_tensor
return honor_type(
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 83, in honor_type
return type(obj)(generator)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 112, in <genexpr>
recursively_apply(
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 128, in recursively_apply
return func(data, *args, **kwargs)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/utils/operations.py", line 307, in _gpu_gather_one
gather_op(output_tensors, tensor)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2897, in all_gather_into_tensor
work = group._allgather_base(output_tensor, input_tensor)
RuntimeError: Tensors must be CUDA and dense
work = group._allgather_base(output_tensor, input_tensor)
RuntimeError: Tensors must be CUDA and dense
[2023-11-21 21:23:18,202] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1495163) of binary: /home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/bin/python
Traceback (most recent call last):
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/bin/accelerate", line 8, in <module>
sys.exit(main())
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 47, in main
args.func(args)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/commands/launch.py", line 985, in launch_command
multi_gpu_launcher(args)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/accelerate/commands/launch.py", line 654, in multi_gpu_launcher
distrib_run.run(args)
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/student/Experiemnts/MultiLabelClassification_LLMs/.mlc/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
test_2.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-11-21_21:23:18
host : SCS-GPU-Fall2023
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1495164)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-11-21_21:23:18
host : SCS-GPU-Fall2023
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1495163)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================```
**My code is as follows:**
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from accelerate import Accelerator, DistributedType
from copy import deepcopy
import torch.nn as nn
def get_dataloaders(checkPoint_model,checkPoint_data,accelerator,batch_size):
def strToDict(s):
s = s.replace("'",'"')
x = eval(s)
return x
def processData(example):
text = example['text']
encoding = tokenizer(text,padding='max_length',truncation=True)
encoding['labels'] = example['labels']
return encoding
def collate_fn(examples):
max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
if accelerator.mixed_precision == "fp8":
pad_to_multiple_of = 16
elif accelerator.mixed_precision != "no":
pad_to_multiple_of = 8
else:
pad_to_multiple_of = None
return tokenizer.pad(
examples,
padding="longest",
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors="pt",
)
tokenizer = AutoTokenizer.from_pretrained(checkPoint_model)
dataset = load_dataset(checkPoint_data)
label2idx = strToDict(dataset['train'][0]['label2idx'])
idx2label = strToDict(dataset['train'][0]['idx2label'])
with accelerator.main_process_first():
encodedDataset = dataset.map(processData,batched=True,remove_columns=dataset['train'].column_names)
trainSet = encodedDataset['train'].shuffle(seed=1234)
testSet = encodedDataset['test']
trainLoader = DataLoader(trainSet,shuffle=True,batch_size=batch_size,collate_fn=collate_fn)
testLoader = DataLoader(testSet,batch_size=batch_size,collate_fn=collate_fn)
return trainLoader, testLoader, label2idx,idx2label
def training_function():
accelerator = Accelerator()
lr = 2e-5
num_epochs = 1
seed = 1234
batch_size = 2
metric = evaluate.combine([
evaluate.load('accuracy'),
evaluate.load('precision',average='weighted'),
evaluate.load('recall',average='weighted'),
evaluate.load('f1',avergae='weighted')
])
gradient_accumulation_steps = 1
set_seed(seed)
checkPoint_model = "bert-base-cased"
checkPoint_data = "akkasi/ethos"
train_dataloader, eval_dataloader,label2idx,idx2label = get_dataloaders(checkPoint_model,checkPoint_data,accelerator,batch_size)
model = AutoModelForSequenceClassification.from_pretrained(checkPoint_model,
problem_type='multi_label_classification',
num_labels = len(label2idx),
ignore_mismatched_sizes=True,
label2id = label2idx,
id2label = idx2label,
return_dict=True)
model = model.to(accelerator.device)
optimizer = AdamW(params=model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=100,
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
)
cp_model = deepcopy(model)
cp_model.classifier = nn.Sequential(
nn.Linear(in_features=768, out_features=len(label2idx), bias=True),
nn.Sigmoid()
)
model = deepcopy(cp_model)
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
threshold = 0.5
for epoch in range(num_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
batch.to(accelerator.device)
outputs = model(**batch)
loss = outputs.loss
loss = loss / gradient_accumulation_steps
accelerator.backward(loss)
if step % gradient_accumulation_steps == 0:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
model.eval()
samples_seen = 0
for step, batch in enumerate(eval_dataloader):
batch.to(accelerator.device)
with torch.no_grad():
outputs = model(**batch)
pred = torch.zeros(outputs['logits'].shape)
pred[torch.where(outputs['logits'] >= threshold)] = 1
pred = pred.type(torch.int32)
batch['labels'] = batch['labels'].type(torch.int32)
pred, ref = accelerator.gather_for_metrics((pred, batch['labels']))
for predictions,references in zip(pred,ref):
metric.add_batch(
predictions=predictions,
references=references,
)
break
eval_metric = metric.compute()
accelerator.print(f"epoch {epoch}:", eval_metric)
if name == âmainâ:
training_function()