This is my current code to load the Llama 3.1 8B Instruct model on a local Windows 10 PC. I have tried many methods to get it to run on multiple GPUs (in order to increase tokens per second), but without success: the model loads onto GPU 0 while GPU 1 stays idle, and generation averages about 12-13 tokens per second. If I use device_map="auto", the model is spread across both GPUs but also offloaded to the CPU, and throughput drops to roughly 5 tokens per second.
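One thing I am not sure about for the device_map="auto" case: as far as I understand, it can be restricted to the GPUs by passing a max_memory budget, so that nothing gets offloaded to the CPU. A rough sketch of what I mean is below; the 22GiB values are my own guess for 24 GB RTX 3090 cards, and I have not confirmed that this avoids the slowdown.

import torch
import transformers

# Sketch only: give device_map="auto" a per-GPU memory budget and no CPU budget,
# so the weights should be split across GPU 0 and GPU 1 only.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_memory={0: "22GiB", 1: "22GiB"},  # assumed budgets; leaves headroom for activations / KV cache
)
print(model.hf_device_map)  # check which layers landed on which device

Anyway, here is the actual code I am running: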
import transformers
import torch

try:
    tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model = transformers.AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="cuda",
    )

    # Wrap the model in PyTorch's DataParallel when more than one GPU is available.
    # The idea was to split the input data across both GPUs to improve performance.
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        print(f"Using {torch.cuda.device_count()} GPUs for inference.")
    else:
        print("Using a single GPU for inference.")

    # CUDA optimization settings
    torch.backends.cuda.cufft_plan_cache.clear()  # Clear the cuFFT plan cache to avoid memory issues
    torch.backends.cuda.matmul.allow_tf32 = True  # Enable TensorFloat-32 for faster matmuls on the RTX 3090
    torch.backends.cudnn.benchmark = True         # Enable cuDNN benchmark mode for speed
    torch.backends.cudnn.deterministic = False    # Disable cuDNN deterministic mode for speed

    message_history = [{"role": "user", "content": "hello"}]

    # With tokenize=False this returns the formatted prompt as a plain string;
    # the actual tokenization happens in the tokenizer call below.
    prompt = tokenizer.apply_chat_template(
        message_history, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt], truncation=True, padding=True, return_tensors="pt").to("cuda")
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    with torch.no_grad():
        # If the model is wrapped with DataParallel, generate() lives on model.module
        model_to_use = model.module if isinstance(model, torch.nn.DataParallel) else model
        response_tensor = model_to_use.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=2048,
            do_sample=True,
            top_k=150,
            top_p=0.95,
            temperature=0.75,
            num_beams=1,
        )

    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(response_tensor[0][input_ids.shape[-1]:], skip_special_tokens=True)
    print(f"Response: {response}")

except Exception as err:
    print(f"Error occurred while generating response: {err}")
Any ideas would be appreciated; a code sample would be even better.
Thanks!