Error when fine-tuning on multi-GPU

I'm fine-tuning meta-llama/Llama-3.2-3B with QLoRA (4-bit quantization + LoRA) using TRL's SFTTrainer on two GPUs on Kaggle, launched through accelerate. This is my script:

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from transformers import BitsAndBytesConfig
import torch
from huggingface_hub import login
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from trl import setup_chat_format
from trl import SFTConfig, SFTTrainer
import re

def preprocess_dataset(examples):
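  # Strip the "<s>[INST] ... [/INST] ... </s>" wrapper from each raw row and
  # rebuild it as a [user, assistant] chat message pair for the chat template.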

  messages = []

  for sentence in examples['text']:
    sentence = re.sub(r"<s>\[INST\]", "", sentence)
    sentence = sentence.split("[/INST]")
    question = sentence[0]
    answer = re.sub(r"</s>", "", sentence[1])

    messages.append([
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ])

  examples['messages'] = messages
  return examples



def model_finetune():
  device = "cuda" if torch.cuda.is_available() else "cpu"

  dataset = load_dataset("Alok2304/Indian_Law_Final_Dataset",split="train[:20%]")
  dataset = dataset.train_test_split(test_size=0.1)

  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  model = AutoModelForCausalLM.from_pretrained(
      "meta-llama/Llama-3.2-3B",
      quantization_config = bnb_config,
      trust_remote_code = True,
      device_map={"": torch.cuda.current_device()})

  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")

  model = prepare_model_for_kbit_training(model)


  lora_config = LoraConfig(
      r=8,
      lora_alpha=16,
      target_modules= ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
      lora_dropout=0.05,
      bias="none",
  )

  model = get_peft_model(model, lora_config)

  dataset = dataset.map(preprocess_dataset,batched=True,remove_columns=['text'])

  model,tokenizer = setup_chat_format(model,tokenizer)

  args = SFTConfig(
      output_dir = "lora_model/",
      per_device_train_batch_size = 4,
      per_device_eval_batch_size = 4,
      learning_rate = 2e-05,
      gradient_accumulation_steps = 1,
      max_steps = 300,
      logging_strategy = "steps",
      logging_steps = 25,
      save_strategy = "steps",
      save_steps = 25,
      eval_strategy = "steps",
      eval_steps = 25,
      fp16 = True,
      data_seed=42,
      max_seq_length = 2048,
      gradient_checkpointing=True,
      report_to = "none",
  )

  trainer = SFTTrainer(
      model = model,
      args = args,
      tokenizer = tokenizer,
      train_dataset = dataset['train'],
      eval_dataset = dataset['test'],)

  trainer.train()

if __name__ == "__main__":
  model_finetune()
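
For reference, the preprocessing assumes each raw row wraps the question and answer in the "<s>[INST] ... [/INST] ... </s>" format. An illustrative (made-up) row and what the function returns:

# Made-up row in the format preprocess_dataset expects; not an actual row
# from the Alok2304/Indian_Law_Final_Dataset dataset.
row = {"text": ["<s>[INST] What is Section 302 of the IPC? [/INST] It prescribes the punishment for murder. </s>"]}
out = preprocess_dataset(row)
# out["messages"][0] ->
#   [{"role": "user", "content": " What is Section 302 of the IPC? "},
#    {"role": "assistant", "content": " It prescribes the punishment for murder. "}]
# (note the leftover whitespace around the question and answer)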

Below is my accelerate config:

compute_environment: LOCAL_MACHINE
debug: true
distributed_type: FSDP
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
enable_cpu_affinity: false
fsdp_config:
  fsdp_activation_checkpointing: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: HYBRID_SHARD_ZERO2
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

I don't understand why I'm getting the error below:

2025-02-17 04:08:18.969448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-17 04:08:18.991472: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-17 04:08:18.998034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-17 04:08:19.500859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-17 04:08:19.522521: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-17 04:08:19.529942: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2025-02-17 04:08:21,443] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-17 04:08:21,924] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:10<00:00,  5.42s/it]
Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:10<00:00,  5.37s/it]
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13564/13564 [00:00<00:00, 73704.68 examples/s]
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 66152.20 examples/s]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13564/13564 [00:00<00:00, 77073.56 examples/s]
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 68360.02 examples/s]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
/kaggle/working/script.py:94: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
  trainer = SFTTrainer(
Applying chat template to train dataset:   6%| | 753/13564 [00:00<00:01, 7466.19/kaggle/working/script.py:94: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
  trainer = SFTTrainer(
[rank1]:[W217 04:08:47.864649445 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Applying chat template to train dataset: 100%|β–ˆ| 13564/13564 [00:01<00:00, 8013.
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:05<00:00, 2375.18 examples/s]
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:02<00:00, 5795.84 examples/s]
[rank0]:[W217 04:08:57.596706401 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Applying chat template to eval dataset: 100%|β–ˆ| 1508/1508 [00:00<00:00, 7935.59 
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 2350.65 examples/s]
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 5680.32 examples/s]
Applying chat template to train dataset: 100%|β–ˆ| 13564/13564 [00:01<00:00, 8649.
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:05<00:00, 2318.37 examples/s]
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:02<00:00, 5582.17 examples/s]
Applying chat template to eval dataset: 100%|β–ˆ| 1508/1508 [00:00<00:00, 8248.39 
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 1831.95 examples/s]
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 4390.37 examples/s]
[rank1]: Traceback (most recent call last):
[rank1]:   File "/kaggle/working/script.py", line 104, in <module>
[rank1]:     model_finetune()
[rank1]:   File "/kaggle/working/script.py", line 101, in model_finetune
[rank1]:     trainer.train()
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2171, in train
[rank1]:     return inner_training_loop(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2320, in _inner_training_loop
[rank1]:     self.model = self.accelerator.prepare(self.model)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1339, in prepare
[rank1]:     result = tuple(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1340, in <genexpr>
[rank1]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1215, in _prepare_one
[rank1]:     return self.prepare_model(obj, device_placement=device_placement)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1441, in prepare_model
[rank1]:     raise ValueError(
[rank1]: ValueError: You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`
[rank0]: Traceback (most recent call last):
[rank0]:   File "/kaggle/working/script.py", line 104, in <module>
[rank0]:     model_finetune()
[rank0]:   File "/kaggle/working/script.py", line 101, in model_finetune
[rank0]:     trainer.train()
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2171, in train
[rank0]:     return inner_training_loop(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2320, in _inner_training_loop
[rank0]:     self.model = self.accelerator.prepare(self.model)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1339, in prepare
[rank0]:     result = tuple(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1340, in <genexpr>
[rank0]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1215, in _prepare_one
[rank0]:     return self.prepare_model(obj, device_placement=device_placement)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1512, in prepare_model
[rank0]:     model = FSDP(model, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 483, in __init__
[rank0]:     _auto_wrap(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_wrap_utils.py", line 101, in _auto_wrap
[rank0]:     _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs)  # type: ignore[arg-type]
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 545, in _recursive_wrap
[rank0]:     wrapped_child, num_wrapped_params = _recursive_wrap(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 545, in _recursive_wrap
[rank0]:     wrapped_child, num_wrapped_params = _recursive_wrap(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 545, in _recursive_wrap
[rank0]:     wrapped_child, num_wrapped_params = _recursive_wrap(
[rank0]:   [Previous line repeated 6 more times]
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 563, in _recursive_wrap
[rank0]:     return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 492, in _wrap
[rank0]:     return wrapper_cls(module, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 509, in __init__
[rank0]:     _init_param_handle_from_module(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py", line 629, in _init_param_handle_from_module
[rank0]:     _sync_module_params_and_buffers(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py", line 1126, in _sync_module_params_and_buffers
[rank0]:     _sync_params_and_buffers(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/utils.py", line 328, in _sync_params_and_buffers
[rank0]:     dist._broadcast_coalesced(
[rank0]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/NCCLUtils.cpp:81, remote process exited or there was a network error, NCCL version 2.21.5
[rank0]: ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely.
[rank0]: Last error:
[rank0]: socketProgress: Connection closed by remote peer 162ca5b0f93a<58938>
W0217 04:09:12.267000 327 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 336 closing signal SIGTERM
E0217 04:09:12.582000 327 torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 1 (pid: 337) of binary: /usr/bin/python3
Traceback (most recent call last):
  File "/usr/local/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 1159, in launch_command
    multi_gpu_launcher(args)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 792, in multi_gpu_launcher
    distrib_run.run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 910, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
script.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-02-17_04:09:12
  host      : 162ca5b0f93a
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 337)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

This error is probably difficult to pin down; it may be version related, so changing the torch version could fix it.
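
If a version change doesn't help, another thing worth trying is the loading pattern commonly used for FSDP + QLoRA. One possible reason rank 1 fails the device check is that the model is loaded before the trainer (and therefore the distributed state) is initialized, so torch.cuda.current_device() may still resolve to GPU 0 on every rank. The FSDP + QLoRA recipe sidesteps this by dropping the explicit device_map and letting accelerate place the weights (your config already has fsdp_cpu_ram_efficient_loading: true), while keeping the 4-bit weights in bf16 storage so FSDP can shard them. A minimal sketch of the changed loading code, untested on this exact setup:

# Sketch of the FSDP + QLoRA loading pattern; only the parts that differ from
# the script above are shown. Untested on the exact Kaggle setup in question.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,  # store quantized weights as bf16 so FSDP can wrap and shard them
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    # No device_map here: with fsdp_cpu_ram_efficient_loading: true in the
    # accelerate config, rank 0 loads the weights and they are synced to the
    # other ranks when FSDP wraps the model.
)

It may also be worth aligning the precision settings: the accelerate config requests bf16 mixed precision while the SFTConfig sets fp16=True.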