I’m trying to finetune Llama 2 model using DPO on a multi GPU set up. The setup has two V100s. The code’s structure is following while omitting unnecessary code
# Imports
device = {"": Accelerator().local_process_index}
model_name = "meta-llama/Llama-2-7b-chat-hf"
system_prompt = "You're an AI assistant that tries to help the user as much as you can."
# Configs
# DPO config
training_args = TrainingArguments(
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
output_dir="args/",
evaluation_strategy="no",
do_eval=False,
use_cpu=False,
logging_steps=1,
num_train_epochs=3,
)
# Peft config
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.05
)
# BnB config
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16
)
# LLMs and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Settings training amount
number_of_rounds = 10 # How many times each model is finetuned once
minibatch_size = 5 # How many data entries there are for DPO update
num_guesses = 5
test_size = 3
def main():
model_a = AutoModelForCausalLM.from_pretrained(
model_name,
device_map=device,
quantization_config=bnb_config,
temperature=0.1
)
model_b = AutoModelForCausalLM.from_pretrained(
model_name,
device_map=device,
quantization_config=bnb_config,
temperature=0.1
)
dir_a = "finetuned/a"
dir_b = "finetuned/b"
try:
os.makedirs(dir_a)
os.makedirs(dir_b)
except:
print("Couldn't make a directory")
explainer = model_a
guesser = model_b
"""
explainer.save_pretrained(dir_a)
guesser.save_pretrained(dir_b)
"""
round = 0
for i in tqdm(range(number_of_rounds), desc="Playing Alias"):
# Make models play the game
explainer.add_adapter(peft_config)
guesser.add_adapter(peft_config)
dataset, ex_dataset, ex_skipped = generate_dataset(minibatch_size, num_guesses, explainer, guesser)
testdata, ex_testdata, _ = generate_dataset(test_size, num_guesses, explainer, guesser)
# FT the explainer
guesser = get_peft_model(
guesser,
peft_config,
)
explainer = get_peft_model(
explainer,
peft_config,
)
guesser = get_peft_model(
guesser,
peft_config,
)
#print(str(dataset))
#print(device)
trainer = DPOTrainer(
guesser,
explainer,
args=training_args,
beta=0.1,
train_dataset=dataset,
eval_dataset=testdata,
tokenizer=tokenizer,
)
trainer.train()
wandb.log({"round": round, "explainer_skipped_frac": ex_skipped / minibatch_size})
wandb.finish()
ex_trainer = DPOTrainer(
explainer,
guesser,
args=training_args,
beta=0.1,
train_dataset=ex_dataset,
eval_dataset=ex_testdata,
tokenizer=tokenizer,
)
ex_trainer.train()
wandb.log({"round": round, "isExplainer": 1})
wandb.finish()
# Save the model and switch roles by loading them
guesser, explainer = explainer, guesser
if (round%2 == 0):
#print(f"\nSaved to dir. {dir_a}\n")
guesser.save_pretrained(dir_a)
guesser = AutoModelForCausalLM.from_pretrained(
dir_b,
device_map=device,
local_files_only=True
)
explainer = AutoModelForCausalLM.from_pretrained(
dir_a,
device_map=device,
local_files_only=True
)
else:
guesser.save_pretrained(dir_b)
guesser = AutoModelForCausalLM.from_pretrained(
dir_a,
device_map=device,
local_files_only=True
)
explainer = AutoModelForCausalLM.from_pretrained(
dir_b,
device_map=device,
local_files_only=True
)
round += 1
# Automatic evaluation for the latest model
eval.test_model(guesser, tokenizer, device)
if __name__ == "__main__":
main()
where generate_dataset
generates datasets for both guesser and explainer. When I try to run using the command
accelerate launch script.py
ValueError: DistributedDataParallel’s input module must be on the same type of devices, but input module parameters locate in {‘cpu’, ‘cuda’}.