Error while fine-tuning with PEFT, LoRA, accelerate, SFTConfig and SFTTrainer

Hi,

I am trying to parallelize training across 4 GPUs (V100, 32 GB VRAM each). I have working single-GPU code using LoRA, PEFT, SFTConfig and SFTTrainer. I tried to add a few lines from accelerate (the library), as shown in some tutorials, to achieve this, but without success.

This is the error I get (it appears 4 times due to the parallelization, but for clarity I include only one occurrence):

 [rank0]: Traceback (most recent call last):
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/tuners/lora/model.py", line 360, in __getattr__
 [rank0]:     return super().__getattr__(name)  # defer to nn.Module's logic
 [rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1709, in __getattr__
 [rank0]:     raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
 [rank0]: AttributeError: 'LoraModel' object has no attribute 'prepare_inputs_for_generation'
 
 [rank0]: During handling of the above exception, another exception occurred:
 
 [rank0]: Traceback (most recent call last):
 [rank0]:   File "/project/6045847/user/project/env.py", line 179, in <module>
 [rank0]:     main(args.model_path, data_filename, args.output)
 [rank0]:   File "/project/6045847/user/project/env.py", line 136, in main
 [rank0]:     trainer = SFTTrainer(
 [rank0]:               ^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
 [rank0]:     return f(*args, **kwargs)
 [rank0]:            ^^^^^^^^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 268, in __init__
 [rank0]:     model = get_peft_model(model, peft_config)
 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/mapping.py", line 193, in get_peft_model
 [rank0]:     return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
 [rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/peft_model.py", line 1610, in __init__
 [rank0]:     self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
 [rank0]:                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/tuners/lora/model.py", line 364, in __getattr__
 [rank0]:     return getattr(self.model, name)
 [rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
 [rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1709, in __getattr__
 [rank0]:     raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
 [rank0]: AttributeError: 'DistributedDataParallel' object has no attribute 'prepare_inputs_for_generation'

This is my code (I don't include everything, just the parts related to the model, for clarity):

def get_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

    return tokenizer


def get_model(model_id):

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )

    device_index = Accelerator().process_index
    device_map = {"": device_index}  # load the whole quantized model on this process's GPU

    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map=device_map
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    return model



def main(model_id, data_file, dir_output):

    print("get tokenizer")
    tokenizer = get_tokenizer(model_id)

    print("data")
    raw_data = load_data(data_file)
    training_data, test_data = train_test_split(raw_data, test_size=0.2, random_state=12)

    data = prepare_train_datav2(training_data)

    print("model")
    model = get_model(model_id)

    print("lora")
    peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, peft_config)

    print("preparation entrainement")
    model.train()

    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    accelerator = Accelerator()                 # code added
    model = accelerator.prepare_model(model)    # code added

    training_arguments = SFTConfig(
        output_dir=dir_output,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=64,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=250,
        bf16=True,
        push_to_hub=False
    )

    trainer = SFTTrainer( # the error occurs here
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024
    )

    print("train")

    trainer.train()

The only code I added between the 1 GPU and 4 GPU versions is:

 accelerator = Accelerator()
 model = accelerator.prepare_model(model)
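
Reading the traceback again, my understanding is that accelerator.prepare_model wraps the model in DistributedDataParallel, and SFTTrainer then calls get_peft_model a second time (because I also pass peft_config); PeftModel.__init__ tries to read prepare_inputs_for_generation from its base model, but DDP does not forward arbitrary attributes of the wrapped module (they are only reachable through .module). A minimal standalone sketch of that attribute behaviour (single process, CPU, the tiny model name is just a placeholder for illustration):

import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoModelForCausalLM

# single-process "distributed" setup, only so that a DDP wrapper can be built
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")  # placeholder model
ddp_model = DDP(model)

print(hasattr(model, "prepare_inputs_for_generation"))             # True
print(hasattr(ddp_model, "prepare_inputs_for_generation"))         # False -> the AttributeError above
print(hasattr(ddp_model.module, "prepare_inputs_for_generation"))  # True again on the unwrapped model

dist.destroy_process_group()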

And I run the code via a bash script:

 accelerate config
 accelerate launch --multi_gpu script.py --model_path $1 --output $2

The 1 GPU version of this script was:

 python script.py --model_path $1 --output $2

I also tried the following at the end of the main function (after deleting 'model = accelerator.prepare_model(model)'):

 accelerator = Accelerator()
 trainer = accelerator.prepare_model(trainer)

 trainer.train()

But this time I have this error:

[rank0]: Traceback (most recent call last):
[rank0]:   File "/project/6045847/user/project/env.py", line 179, in <module>
[rank0]:     main(args.model_path, data_filename, args.output)
[rank0]:   File "/project/6045847/user/project/env.py", line 149, in main
[rank0]:     trainer = accelerator.prepare_model(trainer)
[rank0]:               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/accelerator.py", line 1380, in prepare_model
[rank0]:     self.verify_device_map(model)
[rank0]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/accelerator.py", line 3534, in verify_device_map
[rank0]:     for m in model.modules():
[rank0]:              ^^^^^^^^^^^^^
[rank0]: AttributeError: 'SFTTrainer' object has no attribute 'modules'
Map:   0%|          | 0/960 [00:00<?, ? examples/s]W1105 11:53:02.954000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2716513 closing signal SIGTERM
W1105 11:53:02.955000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2716514 closing signal SIGTERM
W1105 11:53:02.955000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2716515 closing signal SIGTERM
E1105 11:53:03.434000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2716512) of binary: /project/6045847/user/project/env/bin/python
Traceback (most recent call last):
  File "/project/6045847/user/project/env/bin/accelerate", line 8, in <module>
    sys.exit(main())
                     ^^^^^^
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1165, in launch_command
    multi_gpu_launcher(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 799, in multi_gpu_launcher
    distrib_run.run(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
script.py FAILED
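
If I understand this second error correctly, accelerator.prepare_model (and prepare) expects objects like an nn.Module, an optimizer or a DataLoader, not a Trainer, which is why verify_device_map fails when it looks for trainer.modules(). A minimal sketch of how I understand the accelerate API is meant to be used (plain PyTorch objects, purely for illustration):

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()

model = torch.nn.Linear(10, 2)                              # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = TensorDataset(torch.randn(32, 10), torch.randint(0, 2, (32,)))
dataloader = DataLoader(dataset, batch_size=8)

# prepare() accepts models / optimizers / dataloaders / schedulers, not a Trainer
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

And as far as I can tell, Trainer/SFTTrainer already create their own Accelerator internally when launched with accelerate launch, so maybe the explicit prepare call is not needed at all?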

So my question is: how can I use the accelerate library together with my existing code? If you need more information, feel free to ask.


Multi-GPU, PEFT and quantization are a combination that seems quite likely to cause errors…
It's probably not a simple problem, so I think it's better to ask on the HF Discord than on the forum.
I didn't get much useful information from the search either.

I tried some of the fixes discussed in the thread you linked (https://discuss.huggingface.co/t/multiple-gpu-in-sfttrainer/91899).

Unfortunately, I still get errors:

[rank1]: Traceback (most recent call last):
[rank1]:   File "/project/6045847/user/project/script.py", line 178, in <module>
[rank1]:     main(args.model_path, data_filename, args.output)
[rank1]:   File "/project/6045847/user/project/script.py", line 150, in main
[rank1]:     trainer.train()
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 434, in train
[rank1]:     output = super().train(*args, **kwargs)
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/transformers/trainer.py", line 2052, in train
[rank1]:     return inner_training_loop(
[rank1]:            ^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
[rank1]:     tr_loss_step = self.training_step(model, inputs)
[rank1]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/transformers/trainer.py", line 3518, in training_step
[rank1]:     self.accelerator.backward(loss, **kwargs)
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/accelerator.py", line 2196, in backward
[rank1]:     loss.backward(**kwargs)
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/_tensor.py", line 525, in backward
[rank1]:     torch.autograd.backward(
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/__init__.py", line 267, in backward
[rank1]:     _engine_run_backward(
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
[rank1]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/function.py", line 301, in apply
[rank1]:     return user_fn(self, *args)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 320, in backward
[rank1]:     torch.autograd.backward(outputs_with_grad, args_with_grad)
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/__init__.py", line 267, in backward
[rank1]:     _engine_run_backward(
[rank1]:   File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
[rank1]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
[rank1]: Parameter at index 127 with name base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight has been marked as ready twice. This means that multiple autograd engine  hooks have fired for this particular parameter during this iteration.
W1106 12:34:58.957000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3711548 closing signal SIGTERM
W1106 12:34:58.958000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3711550 closing signal SIGTERM
W1106 12:34:58.958000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3711551 closing signal SIGTERM
E1106 12:34:59.240000 22647939338304 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 3711549) of binary: /project/6045847/user/project/env/bin/python
Traceback (most recent call last):
  File "/project/6045847/user/project/env/bin/accelerate", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1165, in launch_command
    multi_gpu_launcher(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 799, in multi_gpu_launcher
    distrib_run.run(args)
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================

This is my code now:


def get_model(model_id):

    ....

    device_string = PartialState().process_index

    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map=device_string
    )

    ....


.....

def main(model_id, data_file, dir_output):

    ....

    training_arguments = SFTConfig(
        ....
        gradient_checkpointing_kwargs={'use_reentrant': False},
        gradient_checkpointing=False
    )

    ....

I removed

 accelerator = Accelerator()
 model = accelerator.prepare_model(model)


and I modified my bash script:

 accelerate config
 accelerate launch --multi_gpu --num_processes 4 script.py --model_path $1 --output $2

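One thing I am still wondering about: the new error ("Expected to mark a variable ready only once", parameter marked ready twice) seems to be the typical symptom of reentrant gradient checkpointing under DDP. I pass use_reentrant: False in SFTConfig, but main still calls model.gradient_checkpointing_enable() directly (I only removed the accelerator lines), and depending on the transformers version that call may still enable the reentrant variant. A minimal sketch of passing the kwargs to the model call itself (placeholder model name, just to show the call):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")  # placeholder model

# enable the non-reentrant checkpointing variant (available in recent transformers versions)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.enable_input_require_grads()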

By searching for the error messages, I found an issue that is still in progress.