Error when fine-tuning on multi-GPU

I'm fine-tuning meta-llama/Llama-3.2-3B with QLoRA (4-bit quantization + LoRA) using TRL's SFTTrainer on two GPUs on Kaggle, launched through accelerate. This is my script:

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from transformers import BitsAndBytesConfig
import torch
from huggingface_hub import login
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from trl import setup_chat_format
from trl import SFTConfig, SFTTrainer
import re

def preprocess_dataset(examples):
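  # Strip the "<s>[INST] ... [/INST] ... </s>" wrapper from each raw row and
  # rebuild it as a [user, assistant] chat message pair for the chat template.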

  messages = []

  for sentence in examples['text']:
    sentence = re.sub(r"<s>\[INST\]", "", sentence)
    sentence = sentence.split("[/INST]")
    question = sentence[0]
    answer = re.sub(r"</s>", "", sentence[1])

    messages.append([
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ])

  examples['messages'] = messages
  return examples



def model_finetune():
  device = "cuda" if torch.cuda.is_available() else "cpu"

  dataset = load_dataset("Alok2304/Indian_Law_Final_Dataset",split="train[:20%]")
  dataset = dataset.train_test_split(test_size=0.1)

  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  model = AutoModelForCausalLM.from_pretrained(
      "meta-llama/Llama-3.2-3B",
      quantization_config = bnb_config,
      trust_remote_code = True,
      device_map={"": torch.cuda.current_device()})

  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")

  model = prepare_model_for_kbit_training(model)


  lora_config = LoraConfig(
      r=8,
      lora_alpha=16,
      target_modules= ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
      lora_dropout=0.05,
      bias="none",
  )

  model = get_peft_model(model, lora_config)

  dataset = dataset.map(preprocess_dataset,batched=True,remove_columns=['text'])

  model,tokenizer = setup_chat_format(model,tokenizer)

  args = SFTConfig(
      output_dir = "lora_model/",
      per_device_train_batch_size = 4,
      per_device_eval_batch_size = 4,
      learning_rate = 2e-05,
      gradient_accumulation_steps = 1,
      max_steps = 300,
      logging_strategy = "steps",
      logging_steps = 25,
      save_strategy = "steps",
      save_steps = 25,
      eval_strategy = "steps",
      eval_steps = 25,
      fp16 = True,
      data_seed=42,
      max_seq_length = 2048,
      gradient_checkpointing=True,
      report_to = "none",
  )

  trainer = SFTTrainer(
      model = model,
      args = args,
      tokenizer = tokenizer,
      train_dataset = dataset['train'],
      eval_dataset = dataset['test'],)

  trainer.train()

if __name__ == "__main__":
  model_finetune()
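
For reference, the preprocessing assumes each raw row wraps the question and answer in the "<s>[INST] ... [/INST] ... </s>" format. An illustrative (made-up) row and what the function returns:

# Made-up row in the format preprocess_dataset expects; not an actual row
# from the Alok2304/Indian_Law_Final_Dataset dataset.
row = {"text": ["<s>[INST] What is Section 302 of the IPC? [/INST] It prescribes the punishment for murder. </s>"]}
out = preprocess_dataset(row)
# out["messages"][0] ->
#   [{"role": "user", "content": " What is Section 302 of the IPC? "},
#    {"role": "assistant", "content": " It prescribes the punishment for murder. "}]
# (note the leftover whitespace around the question and answer)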

Below is my accelerate config:

compute_environment: LOCAL_MACHINE
debug: true
distributed_type: FSDP
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
enable_cpu_affinity: false
fsdp_config:
  fsdp_activation_checkpointing: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: HYBRID_SHARD_ZERO2
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

I don't understand why I'm getting the error below:

2025-02-17 04:08:18.969448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-17 04:08:18.991472: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-17 04:08:18.998034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-17 04:08:19.500859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-17 04:08:19.522521: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-17 04:08:19.529942: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2025-02-17 04:08:21,443] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-17 04:08:21,924] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:10<00:00,  5.42s/it]
Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:10<00:00,  5.37s/it]
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13564/13564 [00:00<00:00, 73704.68 examples/s]
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 66152.20 examples/s]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13564/13564 [00:00<00:00, 77073.56 examples/s]
Map: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 68360.02 examples/s]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
/kaggle/working/script.py:94: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
  trainer = SFTTrainer(
Applying chat template to train dataset:   6%| | 753/13564 [00:00<00:01, 7466.19/kaggle/working/script.py:94: FutureWarning: `tokenizer` is deprecated and removed starting from version 0.16.0 for `SFTTrainer.__init__`. Use `processing_class` instead.
  trainer = SFTTrainer(
[rank1]:[W217 04:08:47.864649445 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Applying chat template to train dataset: 100%|β–ˆ| 13564/13564 [00:01<00:00, 8013.
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:05<00:00, 2375.18 examples/s]
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:02<00:00, 5795.84 examples/s]
[rank0]:[W217 04:08:57.596706401 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Applying chat template to eval dataset: 100%|β–ˆ| 1508/1508 [00:00<00:00, 7935.59 
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 2350.65 examples/s]
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 5680.32 examples/s]
Applying chat template to train dataset: 100%|β–ˆ| 13564/13564 [00:01<00:00, 8649.
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:05<00:00, 2318.37 examples/s]
Tokenizing train dataset: 100%|β–ˆβ–ˆ| 13564/13564 [00:02<00:00, 5582.17 examples/s]
Applying chat template to eval dataset: 100%|β–ˆ| 1508/1508 [00:00<00:00, 8248.39 
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 1831.95 examples/s]
Tokenizing eval dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1508/1508 [00:00<00:00, 4390.37 examples/s]
[rank1]: Traceback (most recent call last):
[rank1]:   File "/kaggle/working/script.py", line 104, in <module>
[rank1]:     model_finetune()
[rank1]:   File "/kaggle/working/script.py", line 101, in model_finetune
[rank1]:     trainer.train()
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2171, in train
[rank1]:     return inner_training_loop(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2320, in _inner_training_loop
[rank1]:     self.model = self.accelerator.prepare(self.model)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1339, in prepare
[rank1]:     result = tuple(
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1340, in <genexpr>
[rank1]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1215, in _prepare_one
[rank1]:     return self.prepare_model(obj, device_placement=device_placement)
[rank1]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1441, in prepare_model
[rank1]:     raise ValueError(
[rank1]: ValueError: You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`
[rank0]: Traceback (most recent call last):
[rank0]:   File "/kaggle/working/script.py", line 104, in <module>
[rank0]:     model_finetune()
[rank0]:   File "/kaggle/working/script.py", line 101, in model_finetune
[rank0]:     trainer.train()
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2171, in train
[rank0]:     return inner_training_loop(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2320, in _inner_training_loop
[rank0]:     self.model = self.accelerator.prepare(self.model)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1339, in prepare
[rank0]:     result = tuple(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1340, in <genexpr>
[rank0]:     self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1215, in _prepare_one
[rank0]:     return self.prepare_model(obj, device_placement=device_placement)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 1512, in prepare_model
[rank0]:     model = FSDP(model, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 483, in __init__
[rank0]:     _auto_wrap(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_wrap_utils.py", line 101, in _auto_wrap
[rank0]:     _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs)  # type: ignore[arg-type]
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 545, in _recursive_wrap
[rank0]:     wrapped_child, num_wrapped_params = _recursive_wrap(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 545, in _recursive_wrap
[rank0]:     wrapped_child, num_wrapped_params = _recursive_wrap(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 545, in _recursive_wrap
[rank0]:     wrapped_child, num_wrapped_params = _recursive_wrap(
[rank0]:   [Previous line repeated 6 more times]
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 563, in _recursive_wrap
[rank0]:     return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/wrap.py", line 492, in _wrap
[rank0]:     return wrapper_cls(module, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 509, in __init__
[rank0]:     _init_param_handle_from_module(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py", line 629, in _init_param_handle_from_module
[rank0]:     _sync_module_params_and_buffers(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py", line 1126, in _sync_module_params_and_buffers
[rank0]:     _sync_params_and_buffers(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/utils.py", line 328, in _sync_params_and_buffers
[rank0]:     dist._broadcast_coalesced(
[rank0]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/NCCLUtils.cpp:81, remote process exited or there was a network error, NCCL version 2.21.5
[rank0]: ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely.
[rank0]: Last error:
[rank0]: socketProgress: Connection closed by remote peer 162ca5b0f93a<58938>
W0217 04:09:12.267000 327 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 336 closing signal SIGTERM
E0217 04:09:12.582000 327 torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 1 (pid: 337) of binary: /usr/bin/python3
Traceback (most recent call last):
  File "/usr/local/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 1159, in launch_command
    multi_gpu_launcher(args)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 792, in multi_gpu_launcher
    distrib_run.run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 910, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
script.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-02-17_04:09:12
  host      : 162ca5b0f93a
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 337)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

This error is probably difficult to pin down; it may be version related, so changing the torch version could fix it.
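
If a version change doesn't help, another thing worth trying is the loading pattern commonly used for FSDP + QLoRA. One possible reason rank 1 fails the device check is that the model is loaded before the trainer (and therefore the distributed state) is initialized, so torch.cuda.current_device() may still resolve to GPU 0 on every rank. The FSDP + QLoRA recipe sidesteps this by dropping the explicit device_map and letting accelerate place the weights (your config already has fsdp_cpu_ram_efficient_loading: true), while keeping the 4-bit weights in bf16 storage so FSDP can shard them. A minimal sketch of the changed loading code, untested on this exact setup:

# Sketch of the FSDP + QLoRA loading pattern; only the parts that differ from
# the script above are shown. Untested on the exact Kaggle setup in question.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,  # store quantized weights as bf16 so FSDP can wrap and shard them
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    # No device_map here: with fsdp_cpu_ram_efficient_loading: true in the
    # accelerate config, rank 0 loads the weights and they are synced to the
    # other ranks when FSDP wraps the model.
)

It may also be worth aligning the precision settings: the accelerate config requests bf16 mixed precision while the SFTConfig sets fp16=True.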