Batch generation Llama 3 Instruct | Tokenizer has no padding token

Hello everyone,

What is the best way to use a model like Llama 3.1 (meta-llama/Llama-3.1-8B-Instruct · Hugging Face) with AutoModel, AutoTokenizer, and chat templates (I can’t use pipelines for my use case) for batch generation, and eventually also with DDP?

This works for a single conversation:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [
    {
        "role": "system",
        "content": "You are a pirate chatbot who always responds in pirate speak!",
    },
    {"role": "user", "content": "Who are you?"},
]

input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

response = outputs[0][input_ids.shape[-1] :]
print(tokenizer.decode(response, skip_special_tokens=True))

For multiple conversations and batch decoding, do I just need to apply the chat template with padding=True? When I try that, I get the error “Asking to pad but the tokenizer does not have a padding token”.


Actually, could this be the solution?

  1. Set padding_side="left" on the tokenizer

  2. Set the pad token to the eos token

  3. Pass pad_token_id=tokenizer.eos_token_id to generate

  4. Use tokenizer.batch_decode on the outputs

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    [
        {
            "role": "system",
            "content": "You are a pirate chatbot who always responds in pirate speak!",
        },
        {"role": "user", "content": "Who are you?"},
    ],
    [
        {
            "role": "system",
            "content": "You are a pirate chatbot who always responds in pirate speak!",
        },
        {"role": "user", "content": "How old are you?"},
    ],
]

input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt", padding=True
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
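
For reference, a quick way to inspect the decoded batch (each string here still includes the prompt text, since these are the full output sequences):

# One decoded string per conversation; skip_special_tokens=True drops the
# padding and eos tokens, but the prompt text itself is still included.
for text in tokenizer.batch_decode(outputs, skip_special_tokens=True):
    print(text)
    print("=" * 40)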


I think that’s correct. The main thing I’d add is return_dict=True in apply_chat_template, so that generate also receives the attention_mask:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id  # inference-safe

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    [
        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
        {"role": "user", "content": "Who are you?"},
    ],
    [
        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
        {"role": "user", "content": "How old are you?"},
    ],
]

# Return a BatchEncoding with input_ids and attention_mask, already padded on the left
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,                # explicit
    return_tensors="pt",
    return_dict=True,             # crucial for batched generate
    padding=True,
).to(model.device)

terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

outputs = model.generate(
    **inputs,                     # pass dict, not a single tensor
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    eos_token_id=terminators,     # stop on EOS or EOT
    pad_token_id=tokenizer.eos_token_id,
)

# Drop the prompt, then decode the new tokens only
new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
texts = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
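
If there are more conversations than fit in one batch, you can loop over chunks with the same pattern. A minimal sketch, reusing tokenizer, model, and terminators from above; conversations (a list of chat-message lists shaped like messages) and batch_size are just placeholders:

# Sketch: generate for a long list of conversations in chunks of batch_size.
batch_size = 8  # illustrative value
all_texts = []
for start in range(0, len(conversations), batch_size):
    chunk = conversations[start : start + batch_size]
    inputs = tokenizer.apply_chat_template(
        chunk,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
        padding=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Keep only the newly generated tokens for each row, then decode.
    new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
    all_texts.extend(tokenizer.batch_decode(new_tokens, skip_special_tokens=True))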

That’s awesome, thank you!

