I’m fine-tuning a LLaMA-based model (Llama-3.3-70B-Instruct) to generate Overpass Turbo queries (a query language for extracting specific geographic data from OpenStreetMap) from natural-language prompts. For experimental reasons, I call .generate() inside trainer.evaluate() to track the model’s predictions during evaluation and compare them with its direct logits output. However, I notice that the model’s raw predictions (argmax of the logits) are highly repetitive, while .generate() produces much more coherent output.
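To make the comparison concrete, this is roughly what I mean by the two decoding paths (a minimal sketch with placeholder model/tokenizer, not my actual eval code):

import torch

prompt = "Generate overpass turbo query: find all basketball courts in Montreal."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Path 1: a single teacher-forced forward pass; the argmax at each position is the
# next-token guess given the gold prefix. This is roughly what I argmax in
# compute_metrics (there it runs over the collated eval batch).
with torch.no_grad():
    logits = model(**inputs).logits          # (batch, seq_len, vocab)
logits_ids = logits.argmax(dim=-1)

# Path 2: autoregressive decoding; each step is conditioned on the model's
# own previous outputs.
gen_ids = model.generate(**inputs, max_new_tokens=128)

print(tokenizer.decode(logits_ids[0], skip_special_tokens=True))
print(tokenizer.decode(gen_ids[0], skip_special_tokens=True))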
I wanted to check whether there is an issue somewhere: in how evaluation is handled, in my setup, or something else. Any insights would be appreciated!
My evaluation function:
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Convert logits to token IDs
    predictions = np.argmax(logits, axis=-1)

    # Debug: check raw logits and token IDs
    print("Raw Logits:", logits.shape)
    print("Predictions Token IDs:", predictions)

    # Remove ignored index (-128004) from labels
    labels = [[token for token in label if token != -128004] for label in labels]

    # Convert token IDs back to text
    predictions_text = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    labels_text = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    # Debug: print the model's generate() output and compare it with the logits-based predictions
    DEVICE, _, _ = get_backend()
    test_inputs = tokenizer(
        ["Generate an Overpass Turbo query to find all basketball courts in Montreal."],
        return_tensors="pt",
    ).to(DEVICE)
    test_output_ids = model.generate(**test_inputs)
    test_output_text = [tokenizer.decode(output, skip_special_tokens=True) for output in test_output_ids]

    for i in range(len(test_output_text)):
        print(f"✅ test prediction {i}: {test_output_text[i]}")
        print("*" * 50)

    for i in range(len(predictions_text)):
        print(f"✅ Prediction {i}: {predictions_text[i]}")
        print(f"✅ Label {i}: {labels_text[i]}")
        print("-" * 50)

    # Compute and return metrics
    ...
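(For completeness: my Trainer call below does not pass a preprocess_logits_for_metrics hook, so compute_metrics receives the full raw logits. If it's relevant, this is the kind of hook I could add to argmax on the device before the logits are gathered; hypothetical, not in my script:)

def preprocess_logits_for_metrics(logits, labels):
    # Reduce (batch, seq_len, vocab) logits to token IDs before the Trainer
    # gathers and stores them for compute_metrics.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)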
Example output of model.generate (top) vs. the decoded eval-logits predictions (bottom):
Other key parts of my fine-tuning script:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    use_safetensors=True,
    torch_dtype=torch.bfloat16,
)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
)

model.enable_input_require_grads()
model = get_peft_model(model, config)
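(A quick sanity check I can run right after attaching the adapters, not shown in the snippet above:)

model.print_trainable_parameters()  # should report only the LoRA q_proj/v_proj weights as trainable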
def preprocess(batch):
    inputs = [
        f"Using this data {batch['system'][i]}, generate overpass turbo query: {batch['prompt'][i]}"
        if batch['system'][i]
        else f"Generate overpass turbo query: {batch['prompt'][i]}"
        for i in range(len(batch['prompt']))
    ]
    model_inputs = tokenizer(
        inputs,
        text_target=batch["completion"],
        padding="max_length",
        max_length=256,
    )
    return model_inputs

tokenized_op_train = op_train.map(
    preprocess, batched=True, remove_columns=op_train.column_names)
tokenized_op_test = op_test.map(
    preprocess, batched=True, remove_columns=op_test.column_names)

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)
training_args = TrainingArguments(
    num_train_epochs=epochs,
    output_dir=str(VOL_MOUNT_PATH / "model"),
    logging_dir=str(VOL_MOUNT_PATH / "logs"),
    metric_for_best_model="exact_match",
    logging_strategy="steps",
    logging_steps=10,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    bf16=True,
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_op_train,
    eval_dataset=tokenized_op_test,
    compute_metrics=compute_metrics,
)
try:
    resume = restarts > 1
    if resume:
        print("resuming from checkpoint")
    trainer.train(resume_from_checkpoint=False)
except KeyboardInterrupt:  # handle possible preemption
    print("received interrupt; saving state and model")
    trainer.save_state()
    trainer.save_model()
    raise

model.save_pretrained(str(VOL_MOUNT_PATH / MODEL_NAME), safe_serialization=True)
tokenizer.save_pretrained(str(VOL_MOUNT_PATH / FINETUNED_MODEL_NAME))
output_vol.commit()
...