Reduced inference F1 score with QLoRA fine-tuned model

Hello, I am fine-tuning Llama-2-7b with QLoRA for a sentence classification task. During training, the F1 score on the validation set reaches 87%. But when I save the fine-tuned model and reload it for inference, it only achieves ~20% F1 on the same evaluation dataset.

For finetuning the model, here is how I load the dataset, prepare the model with adapters, and train the model.

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch
import pandas as pd
import evaluate
import numpy as np  
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
import bitsandbytes as bnb
# Select the device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True, trust_remote_code=True)
tokenizer.add_special_tokens({"pad_token":"<pad>"}) 

train = pd.read_csv('train_dataset.csv')
# .tolist() so the tokenizer receives plain Python lists rather than pandas Series
train_sentences, train_labels = train['sentences'].tolist(), train['enc'].tolist()
val = pd.read_csv('validation_dataset.csv')
val_sentences, val_labels = val['sentences'].tolist(), val['enc'].tolist()

# Create torch dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Tokenize the sentences (the train/validation split already comes from the separate CSV files)
train_encodings = tokenizer(train_sentences, truncation=True, max_length=512, padding='max_length', return_attention_mask=True)
val_encodings = tokenizer(val_sentences, truncation=True, max_length=512, padding='max_length', return_attention_mask=True)

# Prepare datasets
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

### Prepare model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForSequenceClassification.from_pretrained(model_id, 
                                                        quantization_config=bnb_config,
                                                        use_auth_token=True, 
                                                        trust_remote_code=True,
                                                        num_labels=len(set(train_labels)))
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules=modules, 
    lora_dropout=0.1, 
    bias="none", 
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, config)

def compute_metrics(eval_pred):
    metric = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
    return {"f1": f1}

training_args = TrainingArguments(
    logging_dir='./logs',
    output_dir='./models',        
    evaluation_strategy='steps'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
model.config.use_cache = False  # silence the warnings. please re-enable for inference!

trainer.train()

Here is how I load the saved model at inference time.

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification
PEFT_MODEL = "best-checkpoint"

config = PeftConfig.from_pretrained(PEFT_MODEL)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    num_labels=4
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.add_special_tokens({"pad_token":"<pad>"}) 
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

model = PeftModel.from_pretrained(model, PEFT_MODEL)

To run inference on the same validation set used in training, I use the following code:

from tqdm import tqdm
import evaluate
metric = evaluate.load("f1")

model.eval()
preds = []
with torch.no_grad():
    for sentence in val_sentences:
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, max_length=512).to(model.device)
        pred = model(**inputs)
        preds.append(pred.logits.argmax().item())

metric.compute(predictions=preds, references=val_labels, average='macro')["f1"]

Is this the correct way to load the model for inference? I am confused how it could perform so well during the validation steps while training and so poorly on the same dataset after loading the saved model. Thanks in advance for your help!
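
To rule out differences between the Trainer's evaluation loop and the per-sentence loop above, one check is to push the reloaded PEFT model through the same Trainer evaluation path used during training. This is only a sketch: it assumes the validation CSV, tokenizer settings, and the CustomDataset / compute_metrics definitions from the training script are also available in the inference script, and ./eval_tmp is just a scratch output directory.

from transformers import Trainer, TrainingArguments

# Rebuild the validation dataset exactly as during training (max_length padding, attention masks)
val_encodings = tokenizer(val_sentences, truncation=True, max_length=512, padding='max_length', return_attention_mask=True)
val_dataset = CustomDataset(val_encodings, val_labels)

eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./eval_tmp'),
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
# If the adapter and classification head loaded correctly, this should be close to the ~87% F1 seen during training
print(eval_trainer.evaluate())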

This appears to be a problem with older versions of peft; the issue was resolved by upgrading to peft 0.5.0.
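
For anyone hitting the same symptom, a quick check (a sketch, assuming a pip-managed environment) is to confirm the installed peft version before re-saving and reloading the adapter:

import peft
print(peft.__version__)  # versions older than 0.5.0 were affected here

# Upgrade if needed, then save the adapter again and reload it:
#   pip install -U "peft>=0.5.0"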
