Repetitive Token Generation During Evaluation in Fine-Tuned LLaMA Model

I’m fine-tuning a LLaMA-based model (Llama-3.3-70B-Instruct) to generate Overpass Turbo queries (a query language for extracting specific geographic data from OpenStreetMap) from natural language prompts. For experimental reasons, I call .generate() inside trainer.evaluate() to track the model’s predictions during evaluation and compare them with its direct logits output. However, I notice that the model’s raw predictions (the argmax over the evaluation logits) are highly repetitive, while .generate() produces much more coherent output.

I wanted to check if there is an issue somewhere, whether in how evaluation is handled, my setup, or something else. Any insights would be appreciated!
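For context, my current understanding of why the two can legitimately differ (a minimal sketch, not my actual eval code; model and input_ids stand in for any causal LM and a tokenized prompt): the logits that reach compute_metrics are teacher-forced, i.e. the logit at position i is a one-step prediction conditioned on the ground-truth prefix, while .generate() conditions every new token on the model’s own previous outputs.

import torch

@torch.no_grad()
def teacher_forced_argmax(model, input_ids):
    # One forward pass over the full (prompt + target) sequence:
    # logits[:, i] is the prediction for position i + 1 given the ground-truth prefix.
    logits = model(input_ids=input_ids).logits
    return logits[:, :-1].argmax(dim=-1)  # compare against input_ids[:, 1:]

@torch.no_grad()
def greedy_decode(model, input_ids, max_new_tokens=64):
    # Autoregressive decoding: every new token is conditioned on the model's
    # own previously generated tokens, which is roughly what greedy .generate() does.
    for _ in range(max_new_tokens):
        next_id = model(input_ids=input_ids).logits[:, -1].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_id], dim=-1)
    return input_ids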

My evaluation function:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    # Convert logits to token IDs
    predictions = np.argmax(logits, axis=-1)
    
    # Debug: Check raw logits and token IDs
    print("Raw Logits:", logits.shape)
    print("Predictions Token IDs:", predictions)
    
    # Remove ignored index (-128004) from labels
    labels = [[token for token in label if token != -128004] for label in labels]

    # Convert token IDs back to text
    predictions_text = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
    labels_text = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]
    
    # Debug: print the model's predictions and compare them with the model.generate output
    DEVICE, _, _ = get_backend() 
    test_inputs = tokenizer(["Generate an Overpass Turbo query to find all basketball courts in Montreal."], return_tensors="pt").to(DEVICE)
    test_output_ids = model.generate(**test_inputs)
    test_output_text = [tokenizer.decode(output, skip_special_tokens=True) for output in test_output_ids]
    for i in range(len(test_output_text)):
        print(f"âś… test prediction {i}: {test_output_text[i]}")
        print("*" * 50)

    for i in range(len(predictions_text)):
        print(f"âś… Prediction {i}: {predictions_text[i]}")
        print(f"âś… Label {i}: {labels_text[i]}")
        print("-" * 50)

    # Compute and return metrics 
    ...

Example output of model.generate (top) vs. the predictions decoded from the eval logits (bottom):
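As a side note (a sketch, not something in my current script): I believe Trainer accepts a preprocess_logits_for_metrics hook that can reduce the logits to token IDs per eval step, so compute_metrics doesn’t have to hold the full-vocab logits; the body below is just what I assume should be kept.

def preprocess_logits_for_metrics(logits, labels):
    # Runs on-device for each eval step; whatever is returned here is what
    # compute_metrics later receives in place of the raw logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

(With this hook passed via Trainer(..., preprocess_logits_for_metrics=preprocess_logits_for_metrics), compute_metrics would receive token IDs directly and could skip its own np.argmax.)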

Other key parts of my fine-tuning script:

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL, 
    use_safetensors=True,
    torch_dtype=torch.bfloat16)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)

model.enable_input_require_grads()
model = get_peft_model(model, config)
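
A sanity check I find useful right after wrapping the model (assuming the installed peft version exposes it), just to confirm that only the adapter weights are trainable:

model.print_trainable_parameters()
# prints something like: trainable params: ... || all params: ... || trainable%: ...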

def preprocess(batch):
    inputs = [
        f"Using this data {batch['system'][i]}, generate overpass turbo query: {batch['prompt'][i]}"
        if batch['system'][i] else f"Generate overpass turbo query: {batch['prompt'][i]}"
        for i in range(len(batch['prompt']))
    ]
    
    model_inputs = tokenizer(
        inputs,
        text_target=batch["completion"],
        padding="max_length",
        max_length=256, 
    )
    
    return model_inputs

tokenized_op_train = op_train.map(
    preprocess, batched=True, remove_columns=op_train.column_names)
tokenized_op_test = op_test.map(
    preprocess, batched=True, remove_columns=op_test.column_names)
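
To check the preprocessing, I sometimes decode one processed example and eyeball it (my own debugging snippet, not part of the training flow; index 0 is arbitrary):

sample = tokenized_op_train[0]
print("INPUT :", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
# Filter out any negative ignore-index values defensively before decoding the labels.
print("LABELS:", tokenizer.decode([t for t in sample["labels"] if t >= 0], skip_special_tokens=True))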

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)

training_args = TrainingArguments(
    num_train_epochs = epochs,
    output_dir=str(VOL_MOUNT_PATH / "model"),
    logging_dir=str(VOL_MOUNT_PATH / "logs"),
    metric_for_best_model="exact_match",
    logging_strategy="steps",
    logging_steps=10,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    bf16=True,
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    label_names=["labels"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_op_train,
    eval_dataset=tokenized_op_test,
    compute_metrics=compute_metrics
)
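
I also looked at Seq2SeqTrainer with predict_with_generate=True as an alternative way of getting generated sequences into compute_metrics instead of calling model.generate() by hand, though I’m not sure how cleanly that works with a decoder-only model. A rough sketch of what I mean (the argument names exist in Seq2SeqTrainingArguments; the values are placeholders):

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

gen_args = Seq2SeqTrainingArguments(
    output_dir=str(VOL_MOUNT_PATH / "model"),
    per_device_eval_batch_size=1,
    bf16=True,
    predict_with_generate=True,    # compute_metrics then receives generated token IDs
    generation_max_length=256,
)
gen_trainer = Seq2SeqTrainer(
    model=model,
    args=gen_args,
    data_collator=data_collator,
    eval_dataset=tokenized_op_test,
    compute_metrics=compute_metrics,
)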

try:
    resume = restarts > 1
    if resume:
        print("resuming from checkpoint")
    trainer.train(resume_from_checkpoint=resume)
except KeyboardInterrupt:  # handle possible preemption
    print("received interrupt; saving state and model")
    trainer.save_state()
    trainer.save_model()
    raise

model.save_pretrained(str(VOL_MOUNT_PATH / MODEL_NAME), safe_serialization=True)
tokenizer.save_pretrained(str(VOL_MOUNT_PATH / FINETUNED_MODEL_NAME))
output_vol.commit()

... 

This is a problem we often hear about when fine-tuning Llama 3, but this time it doesn’t seem to stem from the base model, so I wonder whether that’s really the cause here.