What is the best way to use a model like Llama 3.1 (meta-llama/Llama-3.1-8B-Instruct · Hugging Face) with AutoModel, AutoTokenizer, and chat templates (I can't use pipelines for my use case) for batch generation, and eventually with DDP as well?
For multiple conversations and batch decoding, do I just need to apply the chat template with padding=True? When I try that, I get the error "Asking to pad but the tokenizer does not have a padding token".
That's essentially correct, with two additions: the Llama tokenizer doesn't define a padding token, so set one yourself (reusing the EOS token is fine for inference, together with left padding for decoder-only generation), and pass return_dict=True so apply_chat_template returns the attention mask along with the input IDs. Full example:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id # inference-safe
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
messages = [
    [
        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
        {"role": "user", "content": "Who are you?"},
    ],
    [
        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
        {"role": "user", "content": "How old are you?"},
    ],
]
# Return a BatchEncoding with input_ids **and** attention_mask, already padded on the left
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,             # explicit
    return_tensors="pt",
    return_dict=True,          # crucial for batched generate
    padding=True,
).to(model.device)
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
outputs = model.generate(
    **inputs,                  # pass the dict, not a single tensor
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    eos_token_id=terminators,  # stop on EOS or EOT
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt, then decode the new tokens only
new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
texts = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
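For the DDP part of the question: for pure inference you don't actually need the DistributedDataParallel wrapper. It's enough to launch one process per GPU, give each rank its own copy of the model plus a shard of the conversations, and gather the generations at the end. Below is a minimal sketch with torch.distributed, assuming a single node launched via torchrun; `all_conversations` is a placeholder for your full list of chat-message lists, and the generation settings are just examples.

import os
import torch
import torch.distributed as dist
from transformers import AutoModelForCausalLM, AutoTokenizer

dist.init_process_group("nccl")
rank = dist.get_rank()
world_size = dist.get_world_size()
local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
torch.cuda.set_device(local_rank)
device = torch.device(f"cuda:{local_rank}")

model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Each rank holds its own full copy of the model on its own GPU
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
model.eval()

# `all_conversations` is assumed to be a list of chat-message lists like `messages` above
shard = all_conversations[rank::world_size]  # round-robin shard of the prompts

inputs = tokenizer.apply_chat_template(
    shard,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
    padding=True,
).to(device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id,
    )
texts = tokenizer.batch_decode(
    outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)

# Collect every rank's generations on all ranks (order follows the round-robin sharding)
gathered = [None] * world_size
dist.all_gather_object(gathered, texts)
if rank == 0:
    results = [t for shard_texts in gathered for t in shard_texts]
    print(results)
dist.destroy_process_group()

Launch it with something like `torchrun --nproc_per_node=4 generate_ddp.py` (the script name is just an example). If you later wrap the model in DistributedDataParallel for training, call generation through ddp_model.module.generate(...), since the wrapper only proxies forward().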