Hi, I am trying to save a model with
model_path = "./deberta-v3-large-5"
trainer.save_model(model_path)
and load it with
reloaded_model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=2, id2label=id2label, label2id=label2id)
This is my full code:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
# Initial load of the model
model_checkpoint = 'microsoft/deberta-v3-large'
# model_checkpoint = 'roberta-base'  # you can alternatively use roberta-base; deberta-v3-large is bigger, so training will take longer
# Define label maps specific to your task
id2label = {0: "Human", 1: "AI"}
label2id = {"Human": 0, "AI": 1}
# Generate classification model from model_checkpoint with the defined labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
def tokenize_function(examples):
    # Extract text entries from the 'texts' field
    texts = examples["texts"]
    # Tokenize and truncate the text, handling a batch of texts
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",  # ensure all sequences are padded to the same length for batch processing
        return_tensors="pt"  # return PyTorch tensors
    )
    return tokenized_inputs
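(I left the dataset loading itself out; tokenized_datasets, which the Trainer uses below, is built roughly like this. This is just a sketch: it assumes a DatasetDict with "texts" and "labels" columns and train/validation splits, and the file names here are placeholders.)

# Sketch: build tokenized_datasets from local JSON files (file names are placeholders)
dataset = load_dataset("json", data_files={"train": "train.json", "validation": "validation.json"})
tokenized_datasets = dataset.map(tokenize_function, batched=True)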
from transformers import EarlyStoppingCallback
# Adjust these parameters to fit your needs
early_stopping_patience = 2 # Number of evaluations with no improvement after which training will be stopped
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=1,
                         lora_alpha=16,
                         lora_dropout=0.2)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# hyperparameters
lr = 1e-4
batch_size = 16
num_epochs = 1
# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.02,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",  # this ensures that logs are generated at the end of each epoch
    load_best_model_at_end=True,
    logging_steps=50,
)
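The data collator and metric function the Trainer references are not shown above; they look roughly like this (a sketch: DataCollatorWithPadding is already imported, and accuracy via the evaluate library is my assumption for the metric):

# Sketch of the helpers the Trainer references below
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # eval_pred is a (predictions, labels) tuple of numpy arrays
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)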
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience)]
)
# train model
trainer.train()
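After training finishes, I save exactly as shown at the top:

model_path = "./deberta-v3-large-5"
trainer.save_model(model_path)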
And to test, I am using
import json
import torch
import numpy as np
from sklearn.metrics import confusion_matrix
# Assuming the tokenizer and model have been already loaded
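# For completeness, this is how they get reloaded here (a sketch; the same
# call as at the top. I believe trainer.save_model also wrote the tokenizer
# to model_path, since one was passed to the Trainer)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_path = "./deberta-v3-large-5"
id2label = {0: "Human", 1: "AI"}
label2id = {"Human": 0, "AI": 1}
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
reloaded_model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=2, id2label=id2label, label2id=label2id)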
# Path to the JSON file
json_file_path = "/home/mario/bittensor/miguel_32/bittensor32/custom/model_training/new_test_data/query_results__gpu0__01bc13c3aaa94fca94f433c548102e92.json"
# Lists to store all texts and true labels
all_texts = []
all_true_labels = []
# Load data from the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)
    all_texts.extend(data['texts'])  # adjust field names based on your JSON structure
    all_true_labels.extend(data['labels'])  # adjust field names based on your JSON structure
# Lists to store predictions
all_predicted_labels = []
print("Model predictions:")
print("-------------------")
# Make prediction for each text
for text in all_texts[:100]:
    print("prediction")
    # Tokenize the text
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding="max_length",  # ensure all sequences are padded to the same length for batch processing
        return_tensors="pt"  # return PyTorch tensors
    )
    # Move tensors to the same device as the model
    inputs = {k: v.to(reloaded_model.device) for k, v in inputs.items()}
    # Make prediction
    with torch.no_grad():
        outputs = reloaded_model(**inputs)
    # Handle different output types
    if isinstance(outputs, tuple):
        logits = outputs[0]
    else:
        logits = outputs.logits
    # Convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    # Get the most likely class
    predicted_class_id = probabilities.argmax().item()
    # Collect the predicted label
    all_predicted_labels.append(predicted_class_id)
# Calculate the confusion matrix
conf_matrix = confusion_matrix(all_true_labels[:100], all_predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)
My problem is that the confusion matrix completely changes after saving and loading, and I keep getting this warning:
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
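A quick way to see the mismatch is to compare logits for a single input between the two models (a sketch; model here is the PEFT-wrapped model still in memory after trainer.train()):

# Sanity-check sketch: logits from the still-in-memory trained model vs. the reloaded one
sample = tokenizer("test sentence", truncation=True, return_tensors="pt")
model.eval()
reloaded_model.eval()
with torch.no_grad():
    logits_trained = model(**{k: v.to(model.device) for k, v in sample.items()}).logits
    logits_reloaded = reloaded_model(**{k: v.to(reloaded_model.device) for k, v in sample.items()}).logits
print(logits_trained)
print(logits_reloaded)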
Thank you very much