Hey! Yes, we recently added the ability to copy cache objects, so you can now fill a cache once and re-use it across different generations. Just make sure you don't pass the same cache object into two generations, because we modify it in place while generating.
import copy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

ckpt = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # any causal LM checkpoint works here
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(ckpt)

INITIAL_PROMPT = "You are a helpful assistant. "  # the shared prefix (example text)
prompt = "What is the capital of France?"  # the per-request suffix (example text)

# Run the shared prefix through the model once to fill the cache
prompt_cache = DynamicCache()
inputs = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
prompt_cache = model(**inputs, past_key_values=prompt_cache).past_key_values  # this is the common prompt, cached

# Re-use the cached prefix for a new generation; deepcopy it so the original cache stays untouched
new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
past_key_values = copy.deepcopy(prompt_cache)
outputs = model.generate(**new_inputs, past_key_values=past_key_values, max_new_tokens=20)
response = tokenizer.batch_decode(outputs)[0]
print(response)
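And if it helps, this is how the same idea extends to several continuations of one prefix: a small sketch that assumes the `model`, `tokenizer`, `prompt_cache`, and `INITIAL_PROMPT` defined above, with a hypothetical `prompts` list as example input. The key point is to deepcopy the cache before every `generate()` call, since generation mutates the cache in place.

# Re-use the same cached prefix for several different prompts (example list below)
prompts = [
    "Help me write a blog post about travelling.",
    "What is the capital of France?",
]
for p in prompts:
    new_inputs = tokenizer(INITIAL_PROMPT + p, return_tensors="pt").to("cuda")
    cache_copy = copy.deepcopy(prompt_cache)  # fresh copy each time; prompt_cache itself is never mutated
    out = model.generate(**new_inputs, past_key_values=cache_copy, max_new_tokens=20)
    print(tokenizer.batch_decode(out)[0])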