ValueError when training with DPO on a multi-GPU setup

I’m trying to finetune a Llama 2 model with DPO on a multi-GPU setup (two V100s). The structure of the code is as follows, with irrelevant parts omitted:

# Imports
import os

import torch
import wandb
from accelerate import Accelerator
from peft import LoraConfig, TaskType, get_peft_model
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import DPOTrainer
# (local helpers omitted: generate_dataset and the eval module)

device = {"": Accelerator().local_process_index}
model_name = "meta-llama/Llama-2-7b-chat-hf"
system_prompt = "You're an AI assistant that tries to help the user as much as you can."

# Configs
# DPO config
training_args = TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        output_dir="args/",
        evaluation_strategy="no",
        do_eval=False,
        use_cpu=False,
        logging_steps=1,
        num_train_epochs=3,
)

# Peft config
peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05
)

# BnB config
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
)

# LLMs and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


# Training settings
number_of_rounds = 10 # Number of rounds; each model is finetuned once per round
minibatch_size = 5 # Number of data entries per DPO update
num_guesses = 5
test_size = 3

def main():
        model_a = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map=device,
                quantization_config=bnb_config,
                temperature=0.1
        )
        model_b = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map=device,
                quantization_config=bnb_config,
                temperature=0.1
        )

        dir_a = "finetuned/a"
        dir_b = "finetuned/b"

        os.makedirs(dir_a, exist_ok=True)
        os.makedirs(dir_b, exist_ok=True)

        explainer = model_a
        guesser = model_b

        """
        explainer.save_pretrained(dir_a)
        guesser.save_pretrained(dir_b)
        """

        round = 0

        for i in tqdm(range(number_of_rounds), desc="Playing Alias"):
                # Make models play the game

                explainer.add_adapter(peft_config)
                guesser.add_adapter(peft_config)

                dataset, ex_dataset, ex_skipped = generate_dataset(minibatch_size, num_guesses, explainer, guesser)
                testdata, ex_testdata, _ = generate_dataset(test_size, num_guesses, explainer, guesser)

                # Wrap both models with LoRA adapters before finetuning
                explainer = get_peft_model(
                        explainer,
                        peft_config,
                )
                guesser = get_peft_model(
                        guesser,
                        peft_config,
                )

                #print(str(dataset))

                #print(device)

                trainer = DPOTrainer(
                        guesser,
                        explainer,
                        args=training_args,
                        beta=0.1,
                        train_dataset=dataset,
                        eval_dataset=testdata,
                        tokenizer=tokenizer,
                )
                trainer.train()
                wandb.log({"round": round, "explainer_skipped_frac": ex_skipped / minibatch_size})
                wandb.finish()

                ex_trainer = DPOTrainer(
                        explainer,
                        guesser,
                        args=training_args,
                        beta=0.1,
                        train_dataset=ex_dataset,
                        eval_dataset=ex_testdata,
                        tokenizer=tokenizer,
                )
                ex_trainer.train()
                wandb.log({"round": round, "isExplainer": 1})
                wandb.finish()

                # Save the model and switch roles by loading them
                guesser, explainer = explainer, guesser
                if (round%2 == 0):
                        #print(f"\nSaved to dir. {dir_a}\n")
                        guesser.save_pretrained(dir_a)
                        guesser = AutoModelForCausalLM.from_pretrained(
                                dir_b,
                                device_map=device,
                                local_files_only=True
                        )
                        explainer = AutoModelForCausalLM.from_pretrained(
                                dir_a,
                                device_map=device,
                                local_files_only=True
                        )
                else:
                        guesser.save_pretrained(dir_b)
                        guesser = AutoModelForCausalLM.from_pretrained(
                                dir_a,
                                device_map=device,
                                local_files_only=True
                        )
                        explainer = AutoModelForCausalLM.from_pretrained(
                                dir_b,
                                device_map=device,
                                local_files_only=True
                        )

                round += 1

        # Automatic evaluation for the latest model
        eval.test_model(guesser, tokenizer, device)

if __name__ == "__main__":
        main()

where generate_dataset makes the two models play a round of the game and builds DPO datasets for both the guesser and the explainer (a sketch of the dataset format is at the end of this post). When I run the script with

accelerate launch script.py

I get the following error:

ValueError: DistributedDataParallel's input module must be on the same type of devices, but input module parameters locate in {'cpu', 'cuda'}.
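
In case it’s relevant, DPOTrainer expects the prompt/chosen/rejected format, and the datasets coming out of generate_dataset look roughly like this (a minimal sketch with made-up entries; the real strings come from the Alias rounds the models play):

from datasets import Dataset

# Hypothetical example entries, only to show the column layout
dataset = Dataset.from_dict({
        "prompt": ["Explain the word 'apple' without saying it."],
        "chosen": ["A red or green fruit that grows on a tree."],
        "rejected": ["apple"],
})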