I was running Llama 3 Instruct on my CPU using transformers and getting reasonable generation times of around 1-2 minutes. After trying to update CUDA so I could run it on my GPU, it now takes around 40-50 minutes when running on the CPU, even though I didn't change anything about the original environment. I've even tried reinstalling the model and all libraries in a new env. Does anybody have any idea what could cause this?
Here's my code:
import math
import transformers
import torch
import time
ts = time.time()
lt = time.localtime()
print(f"Loading Model {lt.tm_hour}:{lt.tm_min}:{lt.tm_sec}")
device = "cpu"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "low_cpu_mem_usage": True,
    },
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "What be the best way to find buried treasure?"},
]
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
lt = time.localtime()
print(f"Model loaded in {math.floor((time.time() - ts)/60):02d}:{round((time.time() - ts)%60):02d} seconds at {lt.tm_hour}:{lt.tm_min}:{lt.tm_sec}")
ts = time.time()
print(f"Generating response {lt.tm_hour}:{lt.tm_min}:{lt.tm_sec}")
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][-1])
lt = time.localtime()
print(f"Time taken {math.floor((time.time() - ts)/60):02d}:{round((time.time() - ts)%60):02d} seconds at {lt.tm_hour}:{lt.tm_min}:{lt.tm_sec}")
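In case it helps narrow things down, here is a small sketch (just standard PyTorch/transformers introspection calls, nothing specific to my setup) that I can run in both the old and the new environment to compare the active torch build and CPU thread settings:

import torch
import transformers

# Which builds are active (a CUDA wheel can behave differently on CPU than a CPU-only wheel)
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA build:", torch.version.cuda)            # None for CPU-only wheels
print("CUDA available:", torch.cuda.is_available())

# How many CPU threads torch is allowed to use
print("intra-op threads:", torch.get_num_threads())
print("inter-op threads:", torch.get_num_interop_threads())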