When fine-tuning Llama 2 with DeepSpeed and QLoRA on a single node with multiple GPUs, I use ZeRO-3 to partition the model parameters, but each GPU always loads the full set of parameters first and only partitions them right before training, instead of loading the parameters already partitioned. After checking the Hugging Face documentation, I found that `TrainingArguments` needs to be created before calling `from_pretrained`. I did that and the ZeRO-3 init indeed worked, but then a confusing problem arose: `NotImplementedError: Cannot copy out of meta tensor; no data!`
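For reference, this is the ordering the Hugging Face DeepSpeed docs describe, reduced to a minimal sketch (placeholder model id and arguments, not my actual script): creating `TrainingArguments` first is what lets `from_pretrained` detect the ZeRO-3 setup and initialize the weights already partitioned instead of materializing the full model on every GPU.
```python
# Minimal sketch of the ordering from the HF DeepSpeed docs (placeholder
# model id and arguments, not my real script). With a ZeRO-3 DeepSpeed
# config active (e.g. launched through `accelerate launch`), creating
# TrainingArguments BEFORE from_pretrained lets from_pretrained initialize
# parameters under deepspeed.zero.Init, i.e. already partitioned per GPU.
from transformers import AutoModelForCausalLM, TrainingArguments

training_args = TrainingArguments(output_dir="tmp", bf16=True)  # must come first

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
```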
Here is my code:
```python
import os

from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
import bitsandbytes as bnb
from peft import LoraConfig
from trl import SFTTrainer
from accelerate import Accelerator
import deepspeed

accelerator = Accelerator()
dataset = load_dataset("json", data_files="Belle_open_source_0.5M_changed.json", split="train")
result_dir = "tmp"
training_args = TrainingArguments(
    report_to="none",
    output_dir=result_dir,
    # per_device_train_batch_size * gradient_accumulation_steps = batch_size
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    logging_steps=10,
    # max_steps=520,
    num_train_epochs=0.016,
    save_steps=500,  # 65
    bf16=True,  # set bf16 to True with an A100
    # optim='paged_adamw_32bit',
    gradient_checkpointing=True,
    # group_by_length=True,
    # remove_unused_columns=False,
    # warmup_ratio=0.03,
    # lr_scheduler_type='constant',
    # max_grad_norm=0.3
)
current_device = accelerator.process_index
print("current_device:", current_device)
# print(type(current_device))
base_model_name = "/home/yangtong/data/llama2-hf/llama2-13b-chat_hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # bnb_4bit_quant_storage=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True
)
base_model.tie_weights()
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1
def find_all_linear_names(model):
    # Collect the names of all 4-bit linear layers so LoRA can target them.
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
models = find_all_linear_names(base_model)
# print(models)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=models
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
tokenizer.pad_token = tokenizer.eos_token
max_seq_length = 512
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args
)
trainer.train()
output_dir = os.path.join(result_dir, "final_checkpoint")
trainer.model.save_pretrained(output_dir)
```
Here is my accelerate config:
```
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_config_file: /home/yangtong/ft_dis/ds_config/3.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 4
rdzv_backend: 'c10d'
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```
And here is the error:
```
Traceback (most recent call last):
File "/home/yangtong/ft_dis/ft_acc_new.py", line 58, in <module>
base_model = AutoModelForCausalLM.from_pretrained(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 565, in from_pretrained
return model_class.from_pretrained(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3307, in from_pretrained
) = cls._load_pretrained_model(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3695, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/modeling_utils.py", line 749, in _load_state_dict_into_meta_model
set_module_quantized_tensor_to_device(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/integrations/bitsandbytes.py", line 108, in set_module_quantized_tensor_to_device
new_value = value.to(device)
NotImplementedError: Cannot copy out of meta tensor; no data!
```
After setting `low_cpu_mem_usage=False` in `from_pretrained`, I get a different error:
```
Traceback (most recent call last):
File "/home/yangtong/ft_dis/ft_acc_new.py", line 58, in <module>
base_model = AutoModelForCausalLM.from_pretrained(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 565, in from_pretrained
return model_class.from_pretrained(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3366, in from_pretrained
dispatch_model(model, **device_map_kwargs)
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/accelerate/big_modeling.py", line 419, in dispatch_model
attach_align_device_hook_on_blocks(
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/accelerate/hooks.py", line 608, in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/accelerate/hooks.py", line 157, in add_hook_to_module
module = hook.init_hook(module)
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/accelerate/hooks.py", line 275, in init_hook
set_module_tensor_to_device(module, name, self.execution_device, tied_params_map=self.tied_params_map)
File "/home/yangtong/anaconda3/envs/llama2/lib/python3.10/site-packages/accelerate/utils/modeling.py", line 354, in set_module_tensor_to_device
raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")
ValueError: weight is on the meta device, we need a `value` to put in on 0.
```
I also tried setting `empty_init=False`, but that just fails with an error saying `LlamaForCausalLM.from_pretrained` doesn't have this parameter.
I would really appreciate it if anyone could help me solve this!