How to resume training from a checkpoint using the Hugging Face Trainer

main.py:

import json
import os
import sys

from transformers import TimesformerForVideoClassification


def configure_parameters(model):
    # Freeze every parameter first.
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last 6 of the 12 encoder layers.
    for i in [6, 7, 8, 9, 10, 11]:
        for param in model.timesformer.encoder.layer[i].parameters():
            param.requires_grad = True

    # Unfreeze the final layer norm and the classification head.
    model.timesformer.layernorm.weight.requires_grad = True
    model.timesformer.layernorm.bias.requires_grad = True
    model.classifier.weight.requires_grad = True
    model.classifier.bias.requires_grad = True

    return model

        base_model = None
        resume_path = None
        num_epochs = 5
        warmup_epochs = 1
        
        resume_choice = input("Do you want to resume training from a checkpoint? (y/n, default: n): ").strip().lower()
        if resume_choice in ["y", "yes"]:
            target_epoch = int(input("Enter the epoch number to resume from (e.g., 3): ").strip())
            checkpoint_folders = sorted([f for f in os.listdir("./ckpt") if f.startswith("checkpoint-")], 
                                        key=lambda x: int(x.split("-")[1]))
            for folder in checkpoint_folders:
                trainer_state_path = os.path.join("./ckpt", folder, "trainer_state.json")
                if os.path.exists(trainer_state_path):
                    with open(trainer_state_path, 'r') as f:
                        trainer_state = json.load(f)
                    current_epoch = int(trainer_state.get("epoch", 0))
                    if current_epoch == target_epoch:
                        resume_path = os.path.join("./ckpt", folder)
                        print(f"Resuming from epoch {target_epoch} at {resume_path}")
                        break

            if resume_path:
                base_model = TimesformerForVideoClassification.from_pretrained(resume_path)
                # config = TimesformerConfig.from_pretrained(resume_path)
                # base_model = TimesformerForVideoClassification(config)
                base_model = configure_parameters(base_model)

                while True:
                    try:
                        total_epochs = int(input(f"Enter total epochs (including current {target_epoch}): "))
                        if total_epochs <= target_epoch:
                            print(f"Total epochs must be greater than {target_epoch}.")
                        else:
                            remaining_epochs = total_epochs - target_epoch
                            num_epochs = remaining_epochs
                            break
                    except ValueError:
                        print("Invalid input. Enter a number.")
            else:
                print(f"No checkpoint for epoch {target_epoch}")
                resume_choice = "n"

        if not resume_path:
            base_model = load_model("./model")
            if not base_model:
                print("Failed to load model")
                sys.exit(1)
            base_model = configure_parameters(base_model)

            try:
                num_epochs = int(input("Training epochs (default 5): ") or 5)
            except ValueError:
                print("Invalid input, using default 5")
                num_epochs = 5

        while True:
            try:
                warmup_input = input(f"Enter number of warmup epochs (default: 1, max: {int(num_epochs)}): ").strip()
                warmup_epochs = int(warmup_input) if warmup_input else 1
                if warmup_epochs < 0:
                    print("Warmup epochs cannot be negative. Using default 1.")
                    warmup_epochs = 1
                    break
                elif warmup_epochs > num_epochs:
                    print(f"Warmup epochs cannot exceed total epochs. Setting to {int(num_epochs)}.")
                    warmup_epochs = int(num_epochs)
                    break
                else:
                    break
            except ValueError:
                print("Invalid input. Please enter an integer.")

        trained_model = train_model(
            base_model,
            train_dataset,
            val_dataset,
            num_epochs=num_epochs,
            warmup_epochs=warmup_epochs,
            resume_from_checkpoint=resume_path
        )
        
        save_finetuned_model(trained_model, "./weights")
        print("Training complete. Model saved.")

train_model.py:

import os

import evaluate
import numpy as np
import torch
import torch.nn.functional as F
import wandb
from transformers import (
    Trainer,
    TrainerCallback,
    TrainingArguments,
    default_data_collator,
)

os.environ["WANDB_PROJECT"] = "deepfake-detection"

accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
roc_auc_metric = evaluate.load("roc_auc")

class EpochProgressCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        current = int(state.epoch) + 1
        total = int(args.num_train_epochs)
        print(f"\n\n>>> Starting epoch {current}/{total}")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(references=labels, predictions=predictions)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="binary")
    recall = recall_metric.compute(references=labels, predictions=predictions, average="binary", zero_division=0)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="binary")
    
    logits_t = torch.from_numpy(logits)
    probs = F.softmax(logits_t.float(), dim=1).cpu().numpy()
    auc = roc_auc_metric.compute(prediction_scores=probs[:, 1], references=labels)

    try:
        wandb.log({"roc": wandb.plot.roc_curve(labels, probs, labels=["real", "fake"])})
        wandb.log({"pr": wandb.plot.pr_curve(labels, probs, labels=["real", "fake"])})
    except Exception as e:
        print(f"Warning: Failed to log to wandb: {e}")

    metrics = {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
        "auc": auc["roc_auc"]
    }

    return metrics

def train_model(
        model, 
        train_dataset, 
        val_dataset, 
        num_epochs, 
        warmup_epochs,
        resume_from_checkpoint=None
):
    per_device_batch_size = 8
    total_steps = num_epochs * (len(train_dataset) // per_device_batch_size)
    warmup_steps = warmup_epochs * (len(train_dataset) // per_device_batch_size)

    training_args = TrainingArguments(
        output_dir="./ckpt",
        overwrite_output_dir=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        
        optim="adamw_torch",
        learning_rate=1.5e-5,
        weight_decay=0.01,
        label_smoothing_factor=0.1,
        max_grad_norm=1.0,
        gradient_accumulation_steps=1,
        lr_scheduler_type="cosine",

        num_train_epochs=num_epochs,
        warmup_steps=warmup_steps,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",

        dataloader_num_workers=4,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=True,
        dataloader_prefetch_factor=4,
        
        fp16=True,
        disable_tqdm=False,
        report_to='wandb',
        run_name="TALL-TimeSformer-Tesla V100-Dropout(0.2)"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EpochProgressCallback()]
    )

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    model.save_pretrained('./weights/best_model')
    return model

ckpt_dir:


These are the checkpoints for 10 epochs. The Trainer saves each checkpoint in a folder named by the global step (checkpoint-<global_step>), not by the epoch number.
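Since the folder names encode the optimizer step rather than the epoch, the step count has to be translated when picking a folder by epoch. A minimal sketch of that arithmetic (checkpoint_for_epoch is just a hypothetical helper; it assumes a single GPU, no gradient accumulation, and the default drop_last=False), although reading trainer_state.json as main.py above does is more robust:

import math
import os

# Hypothetical helper: map a 1-based epoch number to the Trainer's
# checkpoint-<global_step> folder at that epoch boundary.
def checkpoint_for_epoch(ckpt_dir, epoch, dataset_len, batch_size=8):
    # Update steps per epoch on one device, no gradient accumulation,
    # drop_last=False (the Trainer defaults).
    steps_per_epoch = math.ceil(dataset_len / batch_size)
    return os.path.join(ckpt_dir, f"checkpoint-{epoch * steps_per_epoch}")

# e.g. with 800 training clips and batch size 8, epoch 3 -> ./ckpt/checkpoint-300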

When training from scratch, we need to specify the model in the Trainer and pass model.safetensors along with the corresponding config.json, using:

configuration = TimesformerConfig()
model = TimesformerModel(configuration)

My question is: when resuming training from a checkpoint, do I only need to pass resume_from_checkpoint, or do I have to manually load model.safetensors from the checkpoint folder and pass it separately?

Does the model specified in the Trainer’s model parameter refer to the initial model, or does it load the model from the checkpoint folder?

I’m a little confused about this. I hope I’ve explained my query well enough. If anything is unclear, please ask. I need to resume training from the checkpoint.


If the hyperparameters etc. remain unchanged, it seems easier to use resume_from_checkpoint=True, but specifying the folder explicitly may be more reliable; both calls are sketched after the links below.

https://stackoverflow.com/questions/76217781/how-to-continue-training-with-huggingface-trainer
https://stackoverflow.com/questions/72672281/does-huggingfaces-resume-from-checkpoint-work
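In code, the two variants look roughly like this (a minimal sketch; trainer is the same object built from your TrainingArguments, and checkpoint-500 is only an example folder name):

# (a) Resume from the most recent checkpoint found in training_args.output_dir
trainer.train(resume_from_checkpoint=True)

# (b) Resume from one specific checkpoint folder (example path)
trainer.train(resume_from_checkpoint="./ckpt/checkpoint-500")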

Do I only need to pass the resume_from_checkpoint parameter, or do I also need to manually load the model from the checkpoint directory and pass it to the model parameter of the Trainer?


If you want to continue training under the same conditions, resume_from_checkpoint seems suitable; but if that is not the case, or if you want to avoid bugs in specific environments, specifying the checkpoint directory explicitly may be more reliable. That way, there is no room for error.
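If you prefer the explicit path but do not want to hard-code it, transformers ships a small utility for locating the newest checkpoint in an output directory. A minimal sketch, assuming ./ckpt is the output_dir used above:

from transformers.trainer_utils import get_last_checkpoint

# Returns the checkpoint-<global_step> folder with the highest step,
# or None if the directory contains no checkpoints yet.
last_ckpt = get_last_checkpoint("./ckpt")
if last_ckpt is not None:
    trainer.train(resume_from_checkpoint=last_ckpt)
else:
    trainer.train()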

I am fine-tuning TimeSformer, which has a total of 12 encoder layers. I initially unfroze the last 6 encoder layers and fine-tuned the model for 10 epochs. Now I want to resume training for 10 more epochs from where I left off, with the same hyperparameters.

My question :

When using resume_from_checkpoint, do I need to explicitly specify which encoder layers to unfreeze, or will the Trainer automatically retain the previous layer settings?

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EpochProgressCallback()]
)

Before resuming from the checkpoint, I initially loaded the TimeSformer model, unfroze the last 6 encoder layers, froze the rest, and passed the model to the Trainer.

Follow-up clarification:

When resuming from a checkpoint, do I need to explicitly load the model from the checkpoint directory, unfreeze the last 6 encoder layers again, and pass it to the model parameter? Or can I simply pass the resume_from_checkpoint parameter, and the Trainer will automatically use the previously fine-tuned model with the same layer configurations?


Hello Muhammad,

Whether you load the model explicitly from the checkpoint directory or you pass True to the resume_from_checkpoint parameter in the Trainer, you still need to configure the model’s layers again.

This happens because the checkpoints saved during training do not keep track of which layers are frozen or unfrozen; they store the model weights together with the optimizer and scheduler states, not the requires_grad flags.

You can check which of your model's parameters are trainable, and verify this behavior, with a function such as the following:

def check_all_grads(model):
    print(f"{'Parameter':<60} | {'requires_grad'}")
    print("-" * 75)
    for name, param in model.named_parameters():
        print(f"{name:<60} | {param.requires_grad}")

Daniela Brenes
https://www.ridgerun.ai/
Contact us: support@ridgerun.ai
