How to run inference on multiple GPUs

I am using Llama 3.1 70B for inference. I have 4 NVIDIA L4 GPUs (24 GB each). Here is my code:

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization so the 70B model fits across the four 24 GB L4 GPUs
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)

# device_map="auto" lets accelerate shard the layers across all visible GPUs
llm_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    quantization_config=nf4_config,
    device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct", use_fast=True)

# Add padding in case we need to use batch_size > 1
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
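For reference, the layer placement can be checked like this (just a small sketch; hf_device_map is the mapping accelerate fills in when device_map="auto" is used):

from collections import Counter

# Sketch: see how many modules device_map="auto" placed on each GPU
device_map = llm_model.hf_device_map
print(Counter(device_map.values()))

# First few entries, e.g. "model.layers.0" -> 0
for module_name, device in list(device_map.items())[:5]:
    print(module_name, "->", device)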


def run_llm(llm_model, tokenizer, prompt_messages: list[str],
            batch_size: int, tokenizer_config: dict, generation_config: dict,
            temperature: float = 0.001) -> list[str]:
    """Run batched generation over prompt_messages and return the decoded outputs."""
    data_loader = torch.utils.data.DataLoader(
        prompt_messages, batch_size=batch_size)
    tqdm_iterator = tqdm(data_loader, desc="Inference LLM model")

    outputs = []
    with torch.no_grad():
        for batch in tqdm_iterator:
            inputs_model = tokenizer(
                batch, return_tensors="pt", **tokenizer_config)
            inputs_model = inputs_model.to(llm_model.device)

            # Length of the (padded) prompt so it can be stripped from the output
            model_input_length = inputs_model["input_ids"].shape[1]
            output_encode = llm_model.generate(
                **inputs_model, **generation_config,
                pad_token_id=tokenizer.eos_token_id,
                temperature=temperature)

            # Keep only the newly generated tokens
            output_encode = output_encode[:, model_input_length:]
            output = tokenizer.batch_decode(
                output_encode, skip_special_tokens=True)
            outputs.extend(output)
    return outputs
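For completeness, this is roughly how run_llm is called; the tokenizer_config and generation_config values below are only illustrative placeholders, not fixed settings:

prompts = [
    "Summarize the plot of Hamlet in two sentences.",
    "Translate 'good morning' into French.",
]

# Illustrative settings only
tokenizer_config = {"padding": True, "truncation": True, "max_length": 2048}
generation_config = {"max_new_tokens": 256, "do_sample": True, "top_p": 0.9}

answers = run_llm(
    llm_model,
    tokenizer,
    prompts,
    batch_size=2,
    tokenizer_config=tokenizer_config,
    generation_config=generation_config,
    temperature=0.001,
)
print(answers)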

I notice that the model is split across all 4 GPUs, but during inference only one GPU is actually busy at a time (see the attached GPU utilization screenshot).
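For reference, a rough sketch of how the per-GPU utilization can be polled from Python while generation runs (torch.cuda.utilization requires the pynvml package; this is illustrative, not the code that produced the screenshot):

import threading
import time

import torch

def log_gpu_utilization(stop_event, interval=1.0):
    # Poll NVML utilization for every visible GPU once per interval
    while not stop_event.is_set():
        usage = [torch.cuda.utilization(i) for i in range(torch.cuda.device_count())]
        print("GPU utilization (%):", usage)
        time.sleep(interval)

stop_event = threading.Event()
monitor = threading.Thread(target=log_gpu_utilization, args=(stop_event,), daemon=True)
monitor.start()

# ... call run_llm(...) here and watch which GPUs show load ...

stop_event.set()
monitor.join()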
How can I optimize the code so that inference actually runs on all 4 GPUs?
