Llama 3 Instruct taking too long all of a sudden

I was running Llama 3 Instruct on my CPU using transformers and getting reasonable generation times of around 1-2 minutes. After I tried to update CUDA to run it on my GPU, it now takes around 40-50 minutes when running on CPU, even though I didn't change anything about the original environment. I've even tried reinstalling the model and all libraries in a new env. Anybody have any idea what could cause this?
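
For what it's worth, here is the quick sanity check I'd run in both the old and the new env; these are standard torch calls, and the comments describe what I'd expect a CPU-only setup to print:

import torch

print(torch.__version__)          # e.g. "2.3.0+cpu" vs "2.3.0+cu121": a CUDA wheel may have silently replaced the CPU build
print(torch.cuda.is_available())  # should be False if generation really runs on CPU
print(torch.get_num_threads())    # intra-op thread count; if this dropped to 1, that alone could explain a 20-40x slowdown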


Here's my code:

import math
import transformers
import torch
import time

ts = time.time()

print(f"Loading model at {time.strftime('%H:%M:%S')}")  # zero-padded wall-clock time

device = "cpu"

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Build the text-generation pipeline; bfloat16 halves memory use vs. float32
# and low_cpu_mem_usage avoids materializing the weights twice while loading.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "low_cpu_mem_usage": True,
    },
    device=device,  # pin to CPU explicitly instead of letting device_map="auto" decide
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "What be the best way to find buried treasure?"},
]

# Stop on either the default EOS token or Llama 3's <|eot_id|> end-of-turn token
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

elapsed = time.time() - ts
print(f"Model loaded in {math.floor(elapsed/60):02d}:{math.floor(elapsed%60):02d} (mm:ss) at {time.strftime('%H:%M:%S')}")

ts = time.time()
print(f"Generating response at {time.strftime('%H:%M:%S')}")

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][-1])  # last entry of the returned chat is the assistant's reply

elapsed = time.time() - ts
print(f"Time taken {math.floor(elapsed/60):02d}:{math.floor(elapsed%60):02d} (mm:ss) at {time.strftime('%H:%M:%S')}")
