I am fine-tuning a small language model for domain-specific Q&A.
The dataset follows the JSONL structure where each line contains:
{"question": "How do I terminate an FX transaction in SAP Fiori?", "answer": "Log on to the SAP Fiori launchpad as a Treasury Specialist - Front Office... <|endoftext|>"}
I've explicitly added <|endoftext|> at the end of every answer (with a space before it) and also included short "stop examples" like:
{"question": "Stop example?", "answer": " <|endoftext|>"}
However, during inference, the model ignores the <|endoftext|> marker, continues generating text beyond the correct answer, and does not produce an EOS token (EOS token found: False).
I suspect either:
- The tokenizer is not correctly recognizing <|endoftext|> as the stop sequence, or
- The fine-tuning script or configuration doesn't properly register it in the eos_token parameter.
I want to confirm the correct way to:
- Encode the <|endoftext|> token so the model learns to stop after the correct answer
- Include a few minimal "stop" examples that statistically teach EOS
- Verify the tokenizer's handling of <|endoftext|> before training (see the quick check sketch right after this list)
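For the third point, this is the quick check I plan to run before training (a minimal sketch, assuming the stock Qwen/Qwen2.5-0.5B tokenizer already registers <|endoftext|> as a special token):

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", trust_remote_code=True)
print("eos_token:", tok.eos_token, "| eos_token_id:", tok.eos_token_id)
print("'<|endoftext|>' ->", tok.encode("<|endoftext|>", add_special_tokens=False))
print("' <|endoftext|>' (leading space) ->", tok.encode(" <|endoftext|>", add_special_tokens=False))

If the marker does not come back as a single id equal to eos_token_id, that would explain why the model never sees a clean EOS target.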
If anyone has successfully implemented explicit EOS stopping during domain-specific fine-tuning (e.g., for closed QA models), please share configuration examples or troubleshooting tips.
Below is my code:
# ==============================================================
# Qwen-2.5-0.5B Full Fine-Tuning on SAP Q/A Dataset (Colab + Drive Safe)
# ==============================================================
!nvidia-smi
!pip install -q -U pyarrow==15.0.2 transformers datasets peft accelerate bitsandbytes huggingface_hub
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)
from google.colab import files, drive
# ==============================================================
# Clean + Mount Google Drive Safely
# ==============================================================
!fusermount -u /content/drive || true
!rm -rf /content/drive
drive.mount("/content/drive")
SAVE_DIR = "/content/drive/MyDrive/qwen2_5b_finetuned_sap"
os.makedirs(SAVE_DIR, exist_ok=True)
# ==============================================================
# Upload your JSONL dataset
# ==============================================================
print("Please upload your dataset file (e.g. sft_data.jsonl)...")
uploaded = files.upload()
DATA_FILE = list(uploaded.keys())[0]
# ==============================================================
# Load dataset
# ==============================================================
ds = load_dataset("json", data_files=DATA_FILE)
train_ds = ds["train"]
print(f"Loaded {len(train_ds)} examples")
print(f"Columns in dataset: {train_ds.column_names} (expected: ['question', 'answer'])")
# ==============================================================
# Load tokenizer and model (use <|endoftext|> for pad/eos)
# ==============================================================
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Use <|endoftext|> as pad/eos (preferred for the GPT/Qwen family)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"})
tokenizer.eos_token = "<|endoftext|>"
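# Note to self: <|endoftext|> should already be a registered special token in the
# Qwen2.5 vocab, so the add_special_tokens call above only re-labels an existing id.
print("pad:", tokenizer.pad_token, tokenizer.pad_token_id,
      "| eos:", tokenizer.eos_token, tokenizer.eos_token_id)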
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
model.resize_token_embeddings(len(tokenizer))
# ==============================================================
# Tokenize dataset (prompt + labels, add EOS at end)
# ==============================================================
def build_prompt(q, a):
    return f"Q: {q.strip()}\nA: {a.strip()} <|endoftext|>"
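# Variant I'm considering (hypothetical helper, not what I trained with yet):
# append tokenizer.eos_token programmatically so there is no stray space in
# front of the marker and no reliance on the literal string in the answer text.
def build_prompt_eos(q, a):
    return f"Q: {q.strip()}\nA: {a.strip()}{tokenizer.eos_token}"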
def tokenize_function(batch):
    prompts = [build_prompt(q, a) for q, a in zip(batch["question"], batch["answer"])]
    tokens = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized = train_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=train_ds.column_names
)
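# Spot check I added while debugging: does the EOS id actually sit at the end of
# an un-padded encoding of one training prompt?
sample_ids = tokenizer(
    build_prompt(train_ds[0]["question"], train_ds[0]["answer"]),
    truncation=True, max_length=512
)["input_ids"]
print("last 5 ids:", sample_ids[-5:], "| eos_token_id:", tokenizer.eos_token_id)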
# Ensure pad tokens are ignored in loss
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
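# Open question on my side: pad_token and eos_token share the same id here, and I
# believe DataCollatorForLanguageModeling with mlm=False rebuilds the labels from
# input_ids and sets every pad-id position to -100. If so, the real EOS at the end
# of each answer would also be masked out of the loss, which might be part of the
# problem; confirmation would be appreciated.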
# ==============================================================
# Training configuration (auto-save to Drive)
# ==============================================================
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    overwrite_output_dir=True,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)
# ==============================================================
# Resume or train from scratch
# ==============================================================
last_checkpoint = None
if any("checkpoint" in d for d in os.listdir(SAVE_DIR)):
    checkpoints = [os.path.join(SAVE_DIR, d) for d in os.listdir(SAVE_DIR) if "checkpoint" in d]
    if checkpoints:
        last_checkpoint = max(checkpoints, key=os.path.getmtime)
        print(f"Resuming from checkpoint: {last_checkpoint}")
else:
    print("Starting fresh fine-tuning")
trainer.train(resume_from_checkpoint=last_checkpoint)
# ==============================================================
# Save model + tokenizer
# ==============================================================
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"Final model and tokenizer saved to: {SAVE_DIR}")
# ==============================================================
# Verify reload
# ==============================================================
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
MODEL_PATH = "/content/drive/MyDrive/qwen2_5b_finetuned_sap"
print("Loading model from Google Drive...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    trust_remote_code=True,
)
# Use same EOS/PAD
tokenizer.pad_token = tokenizer.eos_token
qa_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "Q: How do I terminate an FX transaction in SAP Fiori?\nA:"
out = qa_pipe(
    prompt,
    max_new_tokens=150,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)
# Cleanly stop at <|endoftext|>
text = out[0]["generated_text"].split("<|endoftext|>")[0]
print("\nModel output:\n", text)
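# ==============================================================
# Diagnostic sketch: how I reproduce the "EOS token found: False" result above
# ==============================================================
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
gen_ids = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
new_ids = gen_ids[0][inputs["input_ids"].shape[1]:].tolist()
print("EOS token found:", tokenizer.eos_token_id in new_ids)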