Hi,
I am trying to parallelize training across 4 GPUs (V100, 32 GB VRAM each). I have working single-GPU code using LoRA, PEFT, SFTConfig, and SFTTrainer. Following some tutorials, I added a few lines from the accelerate library to achieve this, but without success.
This is the error I get (it actually appears 4 times because of the parallelization, but for clarity I only include one occurrence):
[rank0]: Traceback (most recent call last):
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/tuners/lora/model.py", line 360, in __getattr__
[rank0]: return super().__getattr__(name) # defer to nn.Module's logic
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1709, in __getattr__
[rank0]: raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
[rank0]: AttributeError: 'LoraModel' object has no attribute 'prepare_inputs_for_generation'
[rank0]: During handling of the above exception, another exception occurred:
[rank0]: Traceback (most recent call last):
[rank0]: File "/project/6045847/user/project/env.py", line 179, in <module>
[rank0]: main(args.model_path, data_filename, args.output)
[rank0]: File "/project/6045847/user/project/env.py", line 136, in main
[rank0]: trainer = SFTTrainer(
[rank0]: ^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
[rank0]: return f(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 268, in __init__
[rank0]: model = get_peft_model(model, peft_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/mapping.py", line 193, in get_peft_model
[rank0]: return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/peft_model.py", line 1610, in __init__
[rank0]: self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/peft/tuners/lora/model.py", line 364, in __getattr__
[rank0]: return getattr(self.model, name)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1709, in __getattr__
[rank0]: raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
[rank0]: AttributeError: 'DistributedDataParallel' object has no attribute 'prepare_inputs_for_generation'
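If I read the traceback correctly, SFTTrainer calls get_peft_model internally (because I pass it a peft_config), and at that point the model has already been wrapped in DistributedDataParallel by accelerator.prepare_model, so the lookup of prepare_inputs_for_generation on the base model fails.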
This is my code (I don't include everything, just the parts concerning the model itself, for clarity):
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

# load_data and prepare_train_datav2 are my own helpers (omitted here)


def get_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def get_model(model_id):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    # one process per GPU: load the full quantized model on this rank's device
    device_index = Accelerator().process_index
    device_map = {"": device_index}
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map=device_map
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    return model


def main(model_id, data_file, dir_output):
    print("get tokenizer")
    tokenizer = get_tokenizer(model_id)
    print("data")
    raw_data = load_data(data_file)
    training_data, test_data = train_test_split(raw_data, test_size=0.2, random_state=12)
    data = prepare_train_datav2(training_data)
    print("model")
    model = get_model(model_id)
    print("lora")
    peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, peft_config)
    print("training preparation")
    model.train()
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    accelerator = Accelerator()  # code added
    model = accelerator.prepare_model(model)  # code added
    training_arguments = SFTConfig(
        output_dir=dir_output,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=64,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=250,
        bf16=True,
        push_to_hub=False,
    )
    trainer = SFTTrainer(  # the error occurs here
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024,
    )
    print("train")
    trainer.train()
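One thing I notice in the traceback: SFTTrainer calls get_peft_model itself when it receives a peft_config, even though I already call get_peft_model in main(), so the LoRA adapter may be getting applied twice. I am not sure whether that is related to the error.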
The only code I added between the 1-GPU and 4-GPU versions is:
accelerator = Accelerator()
model = accelerator.prepare_model(model)
And I run the code via a bash script:
accelerate config
accelerate launch --multi_gpu script.py --model_path $1 --output $2
The 1-GPU version of this script was:
python script.py --model_path $1 --output $2
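In case it's useful: a minimal script along these lines (a sketch with a hypothetical file name, not my actual training script) should print one line per process when run with the same accelerate launch --multi_gpu command, which would at least confirm that the launcher spawns 4 processes:

# check_launch.py -- minimal sketch (hypothetical file name) to verify that
# `accelerate launch --multi_gpu check_launch.py` spawns one process per GPU
from accelerate import Accelerator

accelerator = Accelerator()

# each of the 4 processes should report its own rank and CUDA device
print(f"process {accelerator.process_index}/{accelerator.num_processes} "
      f"on device {accelerator.device}")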
I also tried, at the end of the main function (deleting the "model = accelerator.prepare_model(model)" line):
accelerator = Accelerator()
trainer = accelerator.prepare_model(trainer)
trainer.train()
But this time I get this error:
[rank0]: Traceback (most recent call last):
[rank0]: File "/project/6045847/user/project/env.py", line 179, in <module>
[rank0]: main(args.model_path, data_filename, args.output)
[rank0]: File "/project/6045847/user/project/env.py", line 149, in main
[rank0]: trainer = accelerator.prepare_model(trainer)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/accelerator.py", line 1380, in prepare_model
[rank0]: self.verify_device_map(model)
[rank0]: File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/accelerator.py", line 3534, in verify_device_map
[rank0]: for m in model.modules():
[rank0]: ^^^^^^^^^^^^^
[rank0]: AttributeError: 'SFTTrainer' object has no attribute 'modules'
Map: 0%| | 0/960 [00:00<?, ? examples/s]
W1105 11:53:02.954000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2716513 closing signal SIGTERM
W1105 11:53:02.955000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2716514 closing signal SIGTERM
W1105 11:53:02.955000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2716515 closing signal SIGTERM
E1105 11:53:03.434000 23261697507392 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2716512) of binary: /project/6045847/user/project/env/bin/python
Traceback (most recent call last):
File "/project/6045847/user/project/env/bin/accelerate", line 8, in <module>
sys.exit(main())
^^^^^^
File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
args.func(args)
File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1165, in launch_command
multi_gpu_launcher(args)
File "/project/6045847/user/project/env/lib/python3.11/site-packages/accelerate/commands/launch.py", line 799, in multi_gpu_launcher
distrib_run.run(args)
File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/project/6045847/user/project/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
script.py FAILED
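From this second traceback, accelerator.prepare_model apparently expects a torch.nn.Module (it iterates over model.modules()), so passing the SFTTrainer to it cannot work. My current guess, based on the TRL documentation, is that the trainer is supposed to handle the distributed wrapping itself, so both the manual get_peft_model call and the prepare_model call should be dropped, roughly like this (an untested sketch reusing the helpers and variables defined above):

from peft import LoraConfig
from trl import SFTTrainer

# sketch: build the quantized model as in get_model() above, but do NOT call
# get_peft_model() or accelerator.prepare_model() manually
model = get_model(model_id)
peft_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)

trainer = SFTTrainer(
    model=model,               # raw model; the trainer applies LoRA via peft_config
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1024,
)
trainer.train()  # still launched via `accelerate launch --multi_gpu`

But I am not sure this is the intended usage, which brings me to my question.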
So my question is: how can I use the accelerate library together with all my previous code? If you need more information, feel free to ask.