Why does my fine-tuned model behave randomly at inference?

I have fine-tuned "bert-base-uncased" for paraphrase detection. During training I get 99.02% accuracy, but at inference the accuracy looks random. I've included the training code and inference code below. Please help me.
### Training Code
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from peft import LoraConfig, get_peft_model ,TaskType

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print_trainable_parameters(model)

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="lora_only",
    modules_to_save=["decode_head"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

raw_datasets = load_dataset("gokuls/glue_augmented_mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    "test-trainer-lora",
    eval_strategy="epoch",
    num_train_epochs=20,  # 20 epochs; reduce for faster training
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    save_strategy="no",
    save_steps=0,
)

trainer = Trainer(
    lora_model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

lora_model.save_pretrained("./lora_model")

predictions = trainer.predict(tokenized_datasets["validation"])

preds = np.argmax(predictions.predictions[1], axis=1)
labels = np.array(raw_datasets["validation"]["label"])

accuracy = (preds == labels).mean()
print(f"Model accuracy: {accuracy*100:.2f}%")
### Inference Code
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig
import numpy as np
import torch
from tqdm import tqdm
from datasets import load_dataset

config = PeftConfig.from_pretrained("./lora_model")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
inference_model = PeftModel.from_pretrained(model, "./lora_model")

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

raw_datasets = load_dataset("gokuls/glue_augmented_mrpc")

def is_paraphrase(sentence1, sentence2):
    inputs = tokenizer(sentence1, sentence2, return_tensors="pt")
    with torch.no_grad():
        outputs = inference_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return predicted_class

if __name__ == "__main__":
    raw_datasets = raw_datasets["validation"]
    print(raw_datasets.shape)
    correct = 0
    eval_len = len(raw_datasets)
    for i in tqdm(range(eval_len)):
        sentence1 = raw_datasets[i]["sentence1"]
        sentence2 = raw_datasets[i]["sentence2"]
        if is_paraphrase(sentence1, sentence2) == int(raw_datasets[i]["label"]):
            correct += 1
    print(f"Accuracy := {100*correct/eval_len:.2f}%")


It seems to become non-deterministic if you forget to call .eval(): in training mode the dropout layers stay active, so each forward pass can produce slightly different logits for the same input.
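As a minimal sketch, using the same inference setup as in your question (and assuming the adapter really was saved to ./lora_model by the training script):

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel
import torch

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
inference_model = PeftModel.from_pretrained(model, "./lora_model")
inference_model.eval()  # disable dropout so repeated forward passes give identical logits

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def is_paraphrase(sentence1, sentence2):
    inputs = tokenizer(sentence1, sentence2, return_tensors="pt")
    with torch.no_grad():  # no gradients needed at inference time
        outputs = inference_model(**inputs)
    return torch.argmax(outputs.logits, dim=-1).item()

Trainer.evaluate()/predict() put the model into eval mode for you, which is why the accuracy computed inside the training script looks fine while the hand-rolled inference loop does not.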