Trainer "load_best_model_at_end" doesn't load the best model

Hi,

I am using Huggingface Trainer and want to load the best model at the end. However this does not seem to work. Here is my code:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback


# One training run per cross-validation fold.
# NOTE(review): train_test_number, s3, bucket_name, train_file_name,
# test_file_name and pd (pandas) are not defined in this snippet --
# presumably set up earlier in the full script; confirm.
for train_index in train_test_number:
    print(f"Fold Number: {train_index}")
    # Read data: one train and one test CSV per fold from the S3 bucket.
    with s3.open(f"{bucket_name}/KFOLD{train_index}/{train_file_name}",'r') as file:
        data = pd.read_csv(file)
    with s3.open(f"{bucket_name}/KFOLD{train_index}/{test_file_name}",'r') as file:
        test_data = pd.read_csv(file)
    # Keep only the text and label columns, renamed to what the tokenizer /
    # Trainer pipeline below expects ("text", "labels").
    data = data[["Text", "majority_vote"]]
    test_data = test_data[["Text", "majority_vote"]]
    data.rename(columns={'Text': 'text', 'majority_vote': 'labels'}, inplace=True)
    test_data.rename(columns={'Text': 'text', 'majority_vote': 'labels'}, inplace=True)

    # Define pretrained tokenizer and model (fresh per fold so weights do not
    # leak across folds). Binary classification head: num_labels=2.
    model_name = "deepset/gbert-base"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # ----- 1. Preprocess data -----#
    # Split the fold's training CSV into train/validation (80/20).
    # NOTE(review): no random_state or stratify -- the split differs on every
    # run, which makes fold results non-reproducible; consider fixing a seed.
    X = list(data["text"])
    y = list(data["labels"])
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    # padding=True pads to the longest sequence in the batch; truncation at
    # BERT's 512-token limit.
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

    # Create torch dataset
    # Create torch dataset
    class Dataset(torch.utils.data.Dataset):
        """Wrap tokenizer output (dict of per-token lists) as a torch Dataset.

        encodings: mapping like {"input_ids": [...], "attention_mask": [...]}
                   where each value is indexable per example.
        labels:    optional sequence of integer class labels; when omitted the
                   items carry no "labels" key (inference mode).
        """

        def __init__(self, encodings, labels=None):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            # BUG FIX: test for None explicitly. The old truthiness check
            # (`if self.labels:`) silently dropped the labels for an empty
            # list and raises for array-like label containers.
            if self.labels is not None:
                item["labels"] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.encodings["input_ids"])

    # Wrap the tokenized splits (with their labels) for the Trainer.
    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    
    # ----- 2. Fine-tune pretrained model -----#
    # Define Trainer parameters
    def compute_metrics(p):
        """Convert an EvalPrediction-style (logits, labels) pair into a dict
        of scalar metrics for the Trainer's evaluation loop.

        NOTE(review): with average="micro" on a single-label task, precision,
        recall and f1 all collapse to plain accuracy -- so choosing the best
        checkpoint by "f1" is the same as choosing it by "accuracy". If a
        distinct F1 signal is wanted, "macro" or "binary" averaging would be
        needed (behavior kept as-is here).
        """
        logits, labels = p
        preds = np.argmax(logits, axis=1)

        return {
            "accuracy": accuracy_score(y_true=labels, y_pred=preds),
            "precision": precision_score(y_true=labels, y_pred=preds, average="micro"),
            "recall": recall_score(y_true=labels, y_pred=preds, average="micro"),
            "f1": f1_score(y_true=labels, y_pred=preds, average="micro"),
        }

    args = TrainingArguments(
        # output_dir: directory where the model checkpoints will be saved.
        output_dir=f"{ansatz}_res",
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        eval_steps=50,
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="steps",
        # BUG FIX: with load_best_model_at_end=True the "best model" can only
        # be chosen among checkpoints that were actually SAVED. Evaluating
        # every 50 steps but saving only every 200 meant a better evaluation
        # at e.g. step 300 had no checkpoint on disk, so the Trainer fell back
        # to the best of steps 200, 400, ... Saving at every evaluation step
        # (save_steps == eval_steps) guarantees the true best eval is on disk.
        save_steps=50,
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # Explicit: "f1" is a score, so "best" means highest (this is already
        # the default for non-loss metrics, stated here for clarity).
        greater_is_better=True,
    )

    # Fine-tune with periodic evaluation; early stopping halts training after
    # 10 consecutive evaluations without improvement on metric_for_best_model.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    # Train pre-trained model
    trainer.train()
    
    # With load_best_model_at_end=True, trainer.model now already holds the
    # best checkpoint's weights; save_model() writes them to output_dir.
    print("Saving Best Model")
    trainer.save_model()

    # ----- 3. Predict -----#
    # Tokenize the held-out test split of this fold.
    X_test = list(test_data["text"])
    X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

    # Create torch dataset (no labels passed: items carry no "labels" key).
    test_dataset = Dataset(X_test_tokenized)

    # Load trained model
    # NOTE(review): best_model_checkpoint points at the best *saved*
    # checkpoint; if save_steps > eval_steps the truly best evaluation step
    # may never have been saved. Reloading here is also redundant --
    # load_best_model_at_end=True already placed the best weights into
    # trainer.model before save_model() above.
    best_ckpt_path = trainer.state.best_model_checkpoint
    #model_path = "gbert_res/checkpoint-200"
    model = BertForSequenceClassification.from_pretrained(best_ckpt_path, num_labels=2)

    # Define test trainer: a fresh Trainer used purely for batched inference.
    test_trainer = Trainer(model)
    #test_trainer = trainer

    # Make prediction
    raw_pred, _, _ = test_trainer.predict(test_dataset)
    #raw_pred, _, _ = trainer.predict(test_dataset)

    # Preprocess raw predictions: logits -> predicted class ids.
    y_pred = np.argmax(raw_pred, axis=1)

    # NOTE(review): redundant -- f1_score is already imported at file top.
    from sklearn.metrics import f1_score

    test_list = test_data["labels"].tolist()
    test_list = np.array(test_list)

    # Compute Evaluation Metrics
    # NOTE(review): f1_multiclass, p_multiclass, recall_multiclass,
    # balanced_accuracy_score, precision_recall_fscore_support and
    # matthews_corrcoef are not defined/imported in this snippet --
    # presumably project helpers / earlier imports in the full script; verify.
    f = f1_multiclass(test_list, y_pred)
    p = p_multiclass(test_list, y_pred)
    r = recall_multiclass(test_list, y_pred)
    a = accuracy_score(test_list, y_pred)
    ba = balanced_accuracy_score(test_list, y_pred)
    prs = precision_recall_fscore_support(test_list, y_pred)
    m = matthews_corrcoef(test_list,y_pred)

    # Collect per-fold results and persist them as one CSV per fold.
    results = {}
    results["acc"] = a
    results["f1"] = f
    results["precision"] = p
    results["recall"] = r
    results["bal_acc"] = ba
    results["prfs"] = prs
    results["mcc"] = m

    # NOTE(review): prepare_results and dict_to_df are also undefined in this
    # snippet -- confirm they exist in the full script.
    prepare_results(results)
    result_df = dict_to_df(results)
    result_df.to_csv(f"{ansatz}{train_index}.csv")
    print(result_df)
    print(f"Finished Fold Number: {train_index}")

But I always get the following output in the logs:

Saving model checkpoint to gBERTbase_balanced_germeval_res/checkpoint-200
Configuration saved in gBERTbase_balanced_germeval_res/checkpoint-200/config.json
Model weights saved in gBERTbase_balanced_germeval_res/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from gBERTbase_balanced_germeval_res/checkpoint-200 (score: 0.7903225806451614).
Saving model checkpoint to gBERTbase_balanced_germeval_res
Configuration saved in gBERTbase_balanced_germeval_res/config.json

So it looks like checkpoint-200 is loaded. However, I can see that, for example, checkpoint-300 achieves a higher F1 score. So how do I get the best-performing model for the evaluation?
Also, is there a way to store the output_dir in an S3 bucket?

Thanks in advance!

4 Likes