Using the Hugging Face transformers library, I see different outputs when generating text with model.generate with and without the use_cache argument. Is this intended, and how can I work around it?
The scores when I use the cache differ (from the second generated token onwards). AFAIK use_cache is an optimization that shouldn't affect the outputs. I see the same discrepancy on GPU (in the code below I use 'cpu'). Code to reproduce:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
model.eval()

device = 'cpu'  # 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Example text to generate from
prompt = "Tell me something that is very exciting"

# Format the prompt using the chat template
formatted_prompt = tokenizer.apply_chat_template([{"role": "user", "content": prompt}], tokenize=False)

# Tokenize the input
inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True)
def generate(model, tokenizer, inputs, use_cache, output_attentions=False, device='cpu'):
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            max_new_tokens=2,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,
            return_dict_in_generate=True,
            use_cache=use_cache,
            output_scores=True,
            # output_hidden_states=True,
            output_attentions=output_attentions,
        )
    return outputs
outputs_cache = generate(model, tokenizer, inputs, use_cache=True, device=device)
outputs_no_cache = generate(model, tokenizer, inputs, use_cache=False, device=device)
outputs_cache_attentions = generate(model, tokenizer, inputs, use_cache=True, output_attentions=True, device=device)
outputs_no_cache_attentions = generate(model, tokenizer, inputs, use_cache=False, output_attentions=True, device=device)
for i in range(2):
    print(f"Cache {i}: {outputs_cache.scores[i]}")
    print(f"No Cache {i}: {outputs_no_cache.scores[i]}")
    print(f"Cache Att {i}: {outputs_cache_attentions.scores[i]}")
    print(f"No Cache Att {i}: {outputs_no_cache_attentions.scores[i]}")