I am using Llama 3.1 70B for inference. I have 4 NVIDIA L4 GPUs (24 GB each). Here is my code:
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
llm_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    quantization_config=nf4_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct", use_fast=True)
# Use left padding so batch_size > 1 works with decoder-only generation
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
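For reference, the layer placement can be inspected through the hf_device_map attribute that accelerate sets when device_map="auto" is used; a minimal check (not part of my pipeline) looks like this:

# Count how many modules were placed on each device by accelerate
from collections import Counter

device_counts = Counter(llm_model.hf_device_map.values())
print(device_counts)  # shows modules spread over cuda:0 .. cuda:3

This is how I know the weights themselves are sharded across the four cards.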
def run_llm(llm_model, tokenizer, prompt_messages: list[str], batch_size: int,
            tokenizer_config: dict, generation_config: dict,
            temperature: float = 0.001) -> list[str]:
    """Run batched generation over prompt_messages and return the decoded outputs."""
    data_loader = torch.utils.data.DataLoader(
        prompt_messages, batch_size=batch_size)
    tqdm_iterator = tqdm(data_loader, desc="Inference LLM model")
    outputs = []
    with torch.no_grad():
        for batch in tqdm_iterator:
            inputs_model = tokenizer(
                batch, return_tensors="pt", **tokenizer_config)
            inputs_model = inputs_model.to(llm_model.device)
            # With left padding every row has the same length, so slice off the prompt tokens
            model_input_length = inputs_model["input_ids"].shape[1]
            output_encode = llm_model.generate(
                **inputs_model, **generation_config,
                pad_token_id=tokenizer.eos_token_id,
                temperature=temperature)
            output_encode = output_encode[:, model_input_length:]
            output = tokenizer.batch_decode(
                output_encode, skip_special_tokens=True)
            outputs.extend(output)
    return outputs
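For context, here is roughly how I call run_llm; the prompts, tokenizer_config, and generation_config values below are illustrative placeholders rather than my exact settings:

# Illustrative call only; real prompts and configs differ
prompts = ["Summarize the following text: ...", "Translate to French: ..."]
tokenizer_config = {"padding": True, "truncation": True, "max_length": 2048}
generation_config = {"max_new_tokens": 256, "do_sample": True, "top_p": 0.9}

results = run_llm(llm_model, tokenizer, prompts,
                  batch_size=2,
                  tokenizer_config=tokenizer_config,
                  generation_config=generation_config)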
I notice that the model weights are split across all 4 GPUs, but inference runs on only one GPU at a time, as shown in the screenshot below:
How can I optimize the code so that inference runs on all 4 GPUs?