Hi,
I am building a chatbot on top of an LLM (lmsys/fastchat-t5-3b-v1.0) and want to reduce my inference time.
I am loading the model onto the GPU via the device_map parameter and monitoring GPU and CPU usage during the entire execution. I am also specifying device=0 (the first GPU) for the Hugging Face pipeline.
Although the model is loaded on the GPU, I can see that at inference time, i.e. when I query the model through the Hugging Face pipeline, it also makes use of the CPU.
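For context, this is roughly how I am watching the GPU side from Python while a query runs (I also keep nvidia-smi and top open in parallel; the lines below only show PyTorch's view of allocated memory on device 0):
import torch
# PyTorch's view of GPU 0 memory; overall GPU/CPU utilisation is watched via nvidia-smi / top
print(f"GPU 0 allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
print(f"GPU 0 reserved:  {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")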
Below is the code I am using to run inference on the FastChat LLM.
from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex, PromptHelper, LLMPredictor
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from accelerate import init_empty_weights, infer_auto_device_map
import torch
model_name = 'lmsys/fastchat-t5-3b-v1.0'
config = T5Config.from_pretrained(model_name)
with init_empty_weights():
    model_layer = T5ForConditionalGeneration(config=config)
device_map = infer_auto_device_map(model_layer, max_memory={0: "12GiB", 1: "12GiB", "cpu": "0GiB"}, no_split_module_classes=["T5Block"])
# the resulting device_map is {'': 0}, i.e. the whole model is placed on the first GPU
model = T5ForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, device_map=device_map, offload_folder="offload", offload_state_dict=True)
tokenizer = T5Tokenizer.from_pretrained(model_name)
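To double-check where the weights actually ended up, I print the device placement after loading (as far as I understand, hf_device_map is populated when from_pretrained is called with a device_map):
# Confirm where accelerate placed the modules; expected to show {'': 0} here
print(model.hf_device_map)
# Sanity check on the device of the first parameter
print(next(model.parameters()).device)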
from transformers import pipeline
pipe = pipeline(
    "text2text-generation", model=model, tokenizer=tokenizer, device=0,
    max_length=1536, temperature=0, top_p=1, num_beams=1, early_stopping=False
)
from langchain.llms import HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)
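To see how much of the latency comes from text generation itself, as opposed to retrieval and embedding inside llama_index, I also time a bare pipeline call; a minimal sketch with a made-up prompt:
import time
start = time.time()
# Direct call to the transformers pipeline, bypassing langchain / llama_index
out = pipe("What is the capital of France?")
print(f"raw pipeline latency: {time.time() - start:.2f}s")
print(out[0]["generated_text"])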
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
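One thing I am unsure about: HuggingFaceEmbeddings seems to default to CPU, so the embedding step at query time may account for part of the CPU usage I am seeing. If so, I assume it can be pinned to the GPU roughly like this (a sketch; the model name is just langchain's default sentence-transformers embedder):
# Assumption: model_kwargs is forwarded to sentence-transformers, which accepts a device argument
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cuda:0"},
    )
)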
# set maximum input size
max_input_size = 2048
# set number of output tokens
num_outputs = 512
# set maximum chunk overlap
max_chunk_overlap = 20
# set chunk size limit
chunk_size_limit = 300
prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap)
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm_predictor=LLMPredictor(llm), prompt_helper=prompt_helper, chunk_size_limit=chunk_size_limit)
# build index
documents = SimpleDirectoryReader('data').load_data()
new_index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
# query with embed_model specified
query_engine = new_index.as_query_engine(
    verbose=True,
    similarity_top_k=2
)
response = query_engine.query("sample query question?")
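For comparison with the raw pipeline timing above, I also time the same query end to end:
import time
start = time.time()
# Same query as above, timed end to end (retrieval + embedding + generation)
_ = query_engine.query("sample query question?")
print(f"end-to-end query latency: {time.time() - start:.2f}s")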
Please have a look and let me know whether this is the expected behavior.
How can I make sure the GPU is also used for query execution, so that my inference response time goes down?