Provide examples to the model before inference and how to cache the examples

I want to provide some samples of what to generate, probably as chat history, so that the model knows what kind of output to produce. Here is an example of what I am thinking:

import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, GenerationConfig, StaticCache

model = AutoModelForCausalLM.from_pretrained(
    current_directory,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
# print(model.generation_config)

tokenizer = AutoTokenizer.from_pretrained(current_directory)

generation_config = GenerationConfig(
    bos_token_id=151643, eos_token_id=151645, pad_token_id=151665, cache_implementation="quantized"
)

model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

sc = StaticCache(
    config=model.config, batch_size=2, max_cache_len=8192, device="cuda", dtype=model.dtype
)

def query(request):
    generation_config = GenerationConfig.from_pretrained(current_directory)
    msg_tmp = messages  # messages is a list of example user/assistant turns
    text = request.lower()
    msg_tmp.append({"role": "user", "content": text})
    # optionally: text + ". Do not give any other text other than the mongoDB query for this text specifically"

    inputs = tokenizer(msg_tmp, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs, generation_config=generation_config, past_key_values=sc, use_cache=True,
        cache_implementation="quantized"
    )

But this doesn't work: I can't pass a list of message dictionaries straight to the tokenizer, and I don't think the example messages actually get cached between requests.
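For the tokenizer part, what I think I am missing is apply_chat_template, which flattens the chat history into a single prompt string before tokenizing. A rough, untested sketch of what I mean, reusing the tokenizer, model, and messages objects from above (build_inputs is just a name I made up):

def build_inputs(request):
    # Copy the shared few-shot examples so each request doesn't mutate them.
    msg_tmp = list(messages)
    msg_tmp.append({"role": "user", "content": request.lower()})

    # apply_chat_template takes the list of {"role", "content"} dicts and
    # returns one formatted prompt string; add_generation_prompt=True appends
    # the assistant header so the model starts answering.
    prompt = tokenizer.apply_chat_template(
        msg_tmp, tokenize=False, add_generation_prompt=True
    )
    return tokenizer(prompt, return_tensors="pt").to(model.device)

That still leaves the caching question, though: even with this, the example turns get re-processed on every call.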

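For the caching side, what I am imagining (based on the prompt-reuse pattern in the transformers cache docs, untested) is prefilling a cache over the example turns once and copying it for each request. DynamicCache, copy.deepcopy, and the names below are my assumptions, not working code, and this builds on the model/tokenizer/messages defined above:

import copy
from transformers import DynamicCache

# Prefill a cache over the few-shot examples once, so they are not
# re-processed for every request.
prefix_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=False
)
prefix_inputs = tokenizer(prefix_text, return_tensors="pt").to(model.device)

prefix_cache = DynamicCache()
with torch.no_grad():
    prefix_cache = model(
        **prefix_inputs, past_key_values=prefix_cache, use_cache=True
    ).past_key_values

def query_cached(request):
    # Reuse a copy of the prefilled cache so each request starts from the
    # same example prefix instead of recomputing it.
    past = copy.deepcopy(prefix_cache)
    full_text = tokenizer.apply_chat_template(
        messages + [{"role": "user", "content": request.lower()}],
        tokenize=False, add_generation_prompt=True,
    )
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, past_key_values=past, use_cache=True)
    return tokenizer.decode(
        outputs[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

Is something like this the right way to cache the example messages, and would it play nicely with the compiled forward, the StaticCache, and the quantized cache setting in my code above?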