Error while fine-tuning with PEFT, LoRA, Accelerate, SFTConfig and SFTTrainer

I tried to apply some of the fixes discussed in your link (https://discuss.huggingface.co/t/multiple-gpu-in-sfttrainer/91899).

Unfortunately, I still get an error:

[rank1]: Traceback (most recent call last):
[rank1]:   File "/project/6045847/user/project/script.py", line 178, in <module>
[rank1]:     main(args.model_path, data_filename, args.output)
[rank1]:   File "/project/6045847/user/project/script.py", line 150, in main
[rank1]:     trainer.train()
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 434, in train
[rank1]:     output = super().train(*args, **kwargs)
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/transformers/trainer.py", line 2052, in train
[rank1]:     return inner_training_loop(
[rank1]:            ^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
[rank1]:     tr_loss_step = self.training_step(model, inputs)
[rank1]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/transformers/trainer.py", line 3518, in training_step
[rank1]:     self.accelerator.backward(loss, **kwargs)
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/accelerator.py", line 2196, in backward
[rank1]:     loss.backward(**kwargs)
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/_tensor.py", line 525, in backward
[rank1]:     torch.autograd.backward(
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/__init__.py", line 267, in backward
[rank1]:     _engine_run_backward(
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
[rank1]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/function.py", line 301, in apply
[rank1]:     return user_fn(self, *args)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 320, in backward
[rank1]:     torch.autograd.backward(outputs_with_grad, args_with_grad)
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/__init__.py", line 267, in backward
[rank1]:     _engine_run_backward(
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
[rank1]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
[rank1]: Parameter at index 127 with name base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight has been marked as ready twice. This means that multiple autograd engine  hooks have fired for this particular parameter during this iteration.
W1106 12:34:58.957000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3711548 closing signal SIGTERM
W1106 12:34:58.958000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3711550 closing signal SIGTERM
W1106 12:34:58.958000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3711551 closing signal SIGTERM
E1106 12:34:59.240000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 3711549) of binary: /project/6045847/user/project/env/bin/python
Traceback (most recent call last):
  File "/project/6045847/user/project/env/bin/accelerate", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1165, in launch_command
    multi_gpu_launcher(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 799, in multi_gpu_launcher
    distrib_run.run(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
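
If I read reason (2) in the error correctly, the failure comes from reentrant gradient checkpointing under DDP: the same LoRA parameter (v_proj.lora_B at index 127) has its gradient hook fired twice in one backward pass. My understanding, and please correct me if this is wrong, is that the non-reentrant variant can also be enabled directly on the model, roughly like this (a sketch, not my exact code; model stands for the PEFT-wrapped model returned by get_model below):

# Sketch only: switch gradient checkpointing to the non-reentrant implementation
# before building the trainer.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
model.config.use_cache = False  # the KV cache is incompatible with checkpointing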

This is my code now:


def get_model(model_id):

    ....

    device_string = PartialState().process_index

    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map=device_string
    )

    ....


.....

def main(model_id, data_file, dir_output):

    ....

    training_arguments = SFTConfig(
        ....
        gradient_checkpointing_kwargs={'use_reentrant': False},
        gradient_checkpointing=False
    )

    ....
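
For context, here is a simplified, self-contained sketch of how the pieces fit together in my script. The BitsAndBytesConfig values, the LoRA hyperparameters (r, lora_alpha, target_modules), the dataset_text_field name and the dataset loading are illustrative placeholders, not my exact values:

import torch
from accelerate import PartialState
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer


def get_model(model_id):
    # Illustrative 4-bit quantization config (placeholder values).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # One device per process so each DDP rank holds its own replica.
    device_string = PartialState().process_index
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map=device_string
    )
    return model


def main(model_id, data_file, dir_output):
    model = get_model(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    dataset = load_dataset("json", data_files=data_file, split="train")

    # Illustrative LoRA settings (placeholders).
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM",
    )

    training_arguments = SFTConfig(
        output_dir=dir_output,
        per_device_train_batch_size=1,
        dataset_text_field="text",  # placeholder field name
        gradient_checkpointing_kwargs={"use_reentrant": False},
        gradient_checkpointing=False,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_arguments,
        train_dataset=dataset,
        peft_config=peft_config,
        tokenizer=tokenizer,
    )
    trainer.train()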

I removed these lines:


accelerator = Accelerator()
model = accelerator.prepare_model(model)


and I modified my bash script:


accelerate config
accelerate launch --multi_gpu --num_processes 4 script.py --model_path $1 --output $2
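
In case it helps, a quick sanity check along these lines (not part of my script verbatim, just for debugging) can confirm that each of the 4 processes gets its own device under this launch command:

from accelerate import PartialState

state = PartialState()
# Each rank should print a distinct process_index and CUDA device.
print(f"rank {state.process_index}/{state.num_processes} -> {state.device}")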