Model fine-tuning not respecting <|endoftext|> stop tokens during training

I am fine-tuning a small language model for domain-specific Q&A.

The dataset follows the JSONL structure where each line contains:

{"question": "How do I terminate an FX transaction in SAP Fiori?", "answer": "Log on to the SAP Fiori launchpad as a Treasury Specialist - Front Office... <|endoftext|>"}

I’ve explicitly added <|endoftext|> at the end of every answer (with a space before it) and also included short “stop examples” like:

{"question": "Stop example?", "answer": " <|endoftext|>"}

However, during inference, the model ignores the <|endoftext|> marker, continues generating text beyond the correct answer, and does not produce an EOS token (EOS token found: False).

I suspect either:

  1. The tokenizer is not correctly recognizing <|endoftext|> as the stop sequence, or

  2. The fine-tuning script or configuration doesn’t properly register it in the eos_token parameter.

I want to confirm the correct way to:

  • Encode the <|endoftext|> token so the model learns to stop after the correct answer

  • Include a few minimal “stop” examples that statistically teach the model to emit EOS

  • Verify the tokenizer’s handling of <|endoftext|> before training

If anyone has successfully implemented explicit EOS stopping during domain-specific fine-tuning (e.g., for closed QA models), please share configuration examples or troubleshooting tips.
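For the third bullet, this is the pre-training check I have in mind (a minimal sketch against the stock Qwen2.5 tokenizer; the expected ids in the comments are my assumption about Qwen2.5's vocab):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", trust_remote_code=True)
print(tok.eos_token, tok.eos_token_id)                        # expecting <|endoftext|> / 151643
print(tok.encode("<|endoftext|>", add_special_tokens=False))  # expecting a single id, [151643]
print(tok.encode(" <|endoftext|>", add_special_tokens=False)) # does the leading space add a token?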

Below is my code:

# ==============================================================
# Qwen-2.5-0.5B Full Fine-Tuning on SAP Q/A Dataset (Colab + Drive Safe)
# ==============================================================

!nvidia-smi

!pip install -q -U pyarrow==15.0.2 transformers datasets peft accelerate bitsandbytes huggingface_hub

import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments,
)
from google.colab import files, drive

# ==============================================================

# 1. Clean + Mount Google Drive Safely

# ==============================================================

!fusermount -u /content/drive || true
!rm -rf /content/drive
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/qwen2_5b_finetuned_sap"
os.makedirs(SAVE_DIR, exist_ok=True)

# ==============================================================

# 2. Upload your JSONL dataset

# ==============================================================

print("Please upload your dataset file (e.g. sft_data.jsonl)…")
uploaded = files.upload()
DATA_FILE = list(uploaded.keys())[0]

# ==============================================================

# 3. Load dataset

# ==============================================================

ds = load_dataset("json", data_files=DATA_FILE)
train_ds = ds["train"]
print(f"Loaded {len(train_ds)} examples")
print(f"Columns in dataset: {train_ds.column_names} (expected: ['question', 'answer'])")

# ==============================================================

# 4. Load tokenizer and model (use <|endoftext|> for pad/eos)

# ==============================================================

MODEL_NAME = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Use <|endoftext|> as pad/eos (preferred for the GPT/Qwen family)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"})
tokenizer.eos_token = "<|endoftext|>"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
model.resize_token_embeddings(len(tokenizer))
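# (Sanity check I added while debugging, using only the objects above:
# <|endoftext|> should already be a single token in Qwen's vocab, and the
# eos/pad ids should agree between tokenizer and model config. The expected
# id, 151643, is my assumption for Qwen2.5.)
print("eos_token_id:", tokenizer.eos_token_id)
print("pad_token_id:", tokenizer.pad_token_id)
print("model config eos:", model.config.eos_token_id)
print("encoded:", tokenizer.encode("<|endoftext|>", add_special_tokens=False))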

# ==============================================================

# 5. Tokenize dataset (prompt + labels, add EOS at end)

# ==============================================================

def build_prompt(q, a):
    return f"Q: {q.strip()}\nA: {a.strip()} <|endoftext|>"

def tokenize_function(batch):
    prompts = [build_prompt(q, a) for q, a in zip(batch["question"], batch["answer"])]
    tokens = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
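# (Another debugging check, assuming right-padding, which I believe is the
# tokenizer default here: the last attended position of an example should
# hold the eos id if " <|endoftext|>" was encoded as the special token and
# the sequence wasn't cut off by max_length.)
sample = tokenize_function({"question": [train_ds[0]["question"]],
                            "answer": [train_ds[0]["answer"]]})
mask = sample["attention_mask"][0]
last = max(i for i, m in enumerate(mask) if m == 1)
print("last real token is eos:",
      sample["input_ids"][0][last] == tokenizer.eos_token_id)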

tokenized = train_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=train_ds.column_names,
)

# Ensure pad tokens are ignored in loss
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
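# (Suspicion I'm testing: with mlm=False this collator rebuilds labels from
# input_ids and sets every position whose id equals pad_token_id to -100.
# Because pad and eos are the same <|endoftext|> id here, I believe that also
# masks the genuine trailing EOS, so the model never gets a loss signal on it.
# A minimal alternative sketch, assuming right-padded fixed-length batches:
# derive labels from the attention_mask instead and use a plain collator.
# mask_pad_only is my own helper name, not a library function.)
from transformers import default_data_collator

def mask_pad_only(batch):
    # Keep the label wherever the token is attended (including the real EOS);
    # only mask true padding positions with -100.
    batch["labels"] = [
        [tok if m == 1 else -100 for tok, m in zip(ids, mask)]
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]
    return batch

# Possible usage instead of DataCollatorForLanguageModeling:
# tokenized = tokenized.map(mask_pad_only, batched=True)
# data_collator = default_data_collator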

# ==============================================================

# 6. Training configuration (auto-save to Drive)

# ==============================================================

training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    overwrite_output_dir=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)

# ==============================================================

# 7. Resume or train from scratch

# ==============================================================

last_checkpoint = None
if any("checkpoint" in d for d in os.listdir(SAVE_DIR)):
    checkpoints = [os.path.join(SAVE_DIR, d) for d in os.listdir(SAVE_DIR) if "checkpoint" in d]
    if checkpoints:
        last_checkpoint = max(checkpoints, key=os.path.getmtime)
        print(f"Resuming from checkpoint: {last_checkpoint}")
else:
    print("Starting fresh fine-tuning")

trainer.train(resume_from_checkpoint=last_checkpoint)

# ==============================================================

# 8. Save model + tokenizer

# ==============================================================

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"Final model and tokenizer saved to: {SAVE_DIR}")

# ==============================================================

# 9. Verify reload

# ==============================================================

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "/content/drive/MyDrive/qwen2_5b_finetuned_sap"
print("Loading model from Google Drive…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    trust_remote_code=True,
)

# Use same EOS/PAD
tokenizer.pad_token = tokenizer.eos_token

qa_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "Q: How do I terminate an FX transaction in SAP Fiori?\nA:"
out = qa_pipe(
    prompt,
    max_new_tokens=150,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Cleanly stop at <|endoftext|>
text = out[0]["generated_text"].split("<|endoftext|>")[0]
print("\n🧾 Model output:\n", text)
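# (How I compute "EOS token found": generate at the token level with the same
# prompt and look for eos_token_id among the newly generated ids. I believe
# the text-generation pipeline decodes with skip_special_tokens=True, so
# string-matching "<|endoftext|>" in generated_text can never find it.)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
gen = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
new_ids = gen[0][inputs["input_ids"].shape[1]:]
print("EOS token found:", tokenizer.eos_token_id in new_ids)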


Making a slight change to how EOS tokens are handled during training seems to improve things somewhat.