@fifthwheel you can do something like this:
from typing import List, Optional
from langchain_community.llms import DeepSparse
from langchain.llms.utils import enforce_stop_tokens
class LLMService:
    """Minimal LangChain-style wrapper around a DeepSparse LLM.

    Usage: construct the service, call ``load_model()`` once, then call
    ``_call(prompt)`` for completions. Until ``load_model()`` runs,
    ``model`` is ``None`` and ``_call`` raises ``RuntimeError``.
    """

    # Loaded DeepSparse pipeline; None until load_model() is called.
    model: Optional[object] = None

    @property
    def _llm_type(self) -> str:
        """Identifier for this LLM backend (LangChain convention)."""
        return "deepsparse"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Input text passed to the underlying model.
            stop: Optional stop sequences; output is truncated at the
                first occurrence via ``enforce_stop_tokens``.

        Returns:
            The generated text.

        Raises:
            RuntimeError: If ``load_model()`` has not been called yet.
        """
        if self.model is None:
            # Fail clearly instead of "'NoneType' object is not callable".
            raise RuntimeError("Model not loaded; call load_model() first.")
        response = self.model(prompt)
        if stop is not None:
            response = enforce_stop_tokens(response, stop)
        return response

    def load_model(self, model_name_or_path: str = "hf:neuralmagic/mpt-7b-chat-pruned50-quant") -> None:
        """Instantiate the DeepSparse pipeline and store it on ``self.model``.

        Args:
            model_name_or_path: SparseZoo/HF stub or local path of the
                model to load.
        """
        self.model = DeepSparse(
            model=model_name_or_path,
            model_config={"sequence_length": 2048},
            generation_config={"max_new_tokens": 300},
        )
if __name__ == '__main__':
    # Build the service and eagerly load the default DeepSparse model.
    service = LLMService()
    service.load_model()