Hi,
I am using the Hugging Face Trainer and want to load the best model at the end of training. However, this does not seem to work as expected. Here is my code:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, recall_score, precision_score,
    f1_score, precision_recall_fscore_support, matthews_corrcoef,
)
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
for train_index in train_test_number:
    print(f"Fold Number: {train_index}")

    # Read data (s3, bucket_name and the file names are defined elsewhere)
    with s3.open(f"{bucket_name}/KFOLD{train_index}/{train_file_name}", 'r') as file:
        data = pd.read_csv(file)
    with s3.open(f"{bucket_name}/KFOLD{train_index}/{test_file_name}", 'r') as file:
        test_data = pd.read_csv(file)
    data = data[["Text", "majority_vote"]]
    test_data = test_data[["Text", "majority_vote"]]
    data.rename(columns={'Text': 'text', 'majority_vote': 'labels'}, inplace=True)
    test_data.rename(columns={'Text': 'text', 'majority_vote': 'labels'}, inplace=True)
    # Define pretrained tokenizer and model
    model_name = "deepset/gbert-base"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # ----- 1. Preprocess data -----#
    X = list(data["text"])
    y = list(data["labels"])
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
    # Create torch dataset
    class Dataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels=None):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            if self.labels:
                item["labels"] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.encodings["input_ids"])

    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    # ----- 2. Fine-tune pretrained model -----#
    # Define Trainer parameters
    def compute_metrics(p):
        pred, labels = p
        pred = np.argmax(pred, axis=1)
        accuracy = accuracy_score(y_true=labels, y_pred=pred)
        recall = recall_score(y_true=labels, y_pred=pred, average="micro")
        precision = precision_score(y_true=labels, y_pred=pred, average="micro")
        f1 = f1_score(y_true=labels, y_pred=pred, average="micro")
        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    args = TrainingArguments(
        # output_dir: directory where the model checkpoints will be saved
        output_dir=f"{ansatz}_res",
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        eval_steps=50,
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="steps",
        save_steps=200,
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    # Train pre-trained model
    trainer.train()
    print("Saving Best Model")
    trainer.save_model()
    # ----- 3. Predict -----#
    # Tokenize test data
    X_test = list(test_data["text"])
    X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

    # Create torch dataset
    test_dataset = Dataset(X_test_tokenized)

    # Load trained model
    best_ckpt_path = trainer.state.best_model_checkpoint
    # model_path = "gbert_res/checkpoint-200"
    model = BertForSequenceClassification.from_pretrained(best_ckpt_path, num_labels=2)

    # Define test trainer
    test_trainer = Trainer(model)
    # test_trainer = trainer

    # Make prediction
    raw_pred, _, _ = test_trainer.predict(test_dataset)
    # raw_pred, _, _ = trainer.predict(test_dataset)
    # Preprocess raw predictions
    y_pred = np.argmax(raw_pred, axis=1)
    test_list = np.array(test_data["labels"].tolist())

    # Compute evaluation metrics
    # (f1_multiclass, p_multiclass, recall_multiclass, prepare_results and
    #  dict_to_df are helper functions defined elsewhere in my code)
    f = f1_multiclass(test_list, y_pred)
    p = p_multiclass(test_list, y_pred)
    r = recall_multiclass(test_list, y_pred)
    a = accuracy_score(test_list, y_pred)
    ba = balanced_accuracy_score(test_list, y_pred)
    prs = precision_recall_fscore_support(test_list, y_pred)
    m = matthews_corrcoef(test_list, y_pred)
    results = {}
    results["acc"] = a
    results["f1"] = f
    results["precision"] = p
    results["recall"] = r
    results["bal_acc"] = ba
    results["prfs"] = prs
    results["mcc"] = m
    prepare_results(results)
    result_df = dict_to_df(results)
    result_df.to_csv(f"{ansatz}{train_index}.csv")
    print(result_df)
    print(f"Finished Fold Number: {train_index}")
But I always get the following output:
Saving model checkpoint to gBERTbase_balanced_germeval_res/checkpoint-200
Configuration saved in gBERTbase_balanced_germeval_res/checkpoint-200/config.json
Model weights saved in gBERTbase_balanced_germeval_res/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
Num examples = 186
Batch size = 8
Training completed. Do not forget to share your model on huggingface.co/models =)
Loading best model from gBERTbase_balanced_germeval_res/checkpoint-200 (score: 0.7903225806451614).
Saving model checkpoint to gBERTbase_balanced_germeval_res
Configuration saved in gBERTbase_balanced_germeval_res/config.json
So it looks like checkpoint-200 is loaded. However, I can see from the evaluation logs that e.g. step 300 is more accurate (higher F1). So how do I get the best-performing model for the evaluation?
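My guess is that only the steps where a checkpoint is actually written (every 200 steps here) can be picked as the best model, even though evaluation runs every 50 steps. Would aligning save_steps with eval_steps be the right fix? Just a sketch of what I have in mind, not sure it is correct:

args = TrainingArguments(
    output_dir=f"{ansatz}_res",
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,               # save a checkpoint at every evaluation step
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,      # make explicit that a higher F1 is better
    # ... remaining arguments as above
)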
Also, is there a way to store the output_dir directly in an S3 bucket?
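Right now everything is only written to local disk. I could imagine copying the output directory to the bucket after training with the same s3 filesystem object, roughly like this (assuming s3 is an s3fs filesystem; I am not sure whether this is the recommended way or whether the Trainer can write to S3 directly):

output_dir = f"{ansatz}_res"
# recursively upload the local checkpoint directory to the S3 bucket
s3.put(output_dir, f"{bucket_name}/{output_dir}", recursive=True)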
Thanks in advance!