I want to provide some samples of what to generate, probably as chat history, so that the model knows what to generate. Here is an example of what I am thinking:
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache, GenerationConfig

model = AutoModelForCausalLM.from_pretrained(
    current_directory, device_map="cuda", torch_dtype="auto", trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(current_directory)

# print(model.generation_config)
generation_config = GenerationConfig(
    bos_token_id=151643, eos_token_id=151645, pad_token_id=151665, cache_implementation="quantized",
)

model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
sc = StaticCache(config=model.config, batch_size=2, max_cache_len=8192, device="cuda", dtype=model.dtype)

def query(request):
    generation_config = GenerationConfig.from_pretrained(current_directory)
    msg_tmp = messages  # messages is a list of example user/assistant interactions
    text = request.lower()
    msg_tmp.append({"role": "user", "content": text})  # + ". Do not give any other text other than the mongoDB query for this text specifically"
    inputs = tokenizer(msg_tmp, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs, generation_config=generation_config, past_key_values=sc, use_cache=True, cache_implementation="quantized",
    )
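Here messages is a list of example turns I build beforehand. The real ones are natural-language request / MongoDB query pairs, but roughly like this (the exact content is just an illustration):

messages = [  # illustrative examples only, the real list is longer
    {"role": "system", "content": "Return only the MongoDB query for the request."},
    {"role": "user", "content": "find all orders with status pending"},
    {"role": "assistant", "content": 'db.orders.find({"status": "pending"})'},
]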
But this doesn’t work: I can’t pass a list of message dictionaries to the tokenizer directly, and I don’t think the example messages actually get cached between calls.
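From what I understand, the first problem probably needs tokenizer.apply_chat_template to flatten the chat history into a single prompt string before tokenizing, something like this (untested sketch):

# flatten the chat history into one prompt string, then tokenize that
prompt = tokenizer.apply_chat_template(msg_tmp, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

But even if that fixes the tokenizer call, I still don’t see how the example messages would be reused from the cache across calls instead of being re-processed every time.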