What is the behaviour of the cosine scheduler and warmup steps when the run length is set in epochs?

How does the HF `Trainer` behave if I set `num_train_epochs` together with `warmup_ratio`? Will it compute the total number of optimizer steps as epochs * (number of sequences / effective batch size) and then apply the warmup ratio I give it to that total?
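For context, here is my current mental model of how those numbers get combined, written out as a tiny arithmetic sketch. The dataset size and batch settings below are made up for illustration, and this is my reading of the Trainer's behaviour, not a verified quote of its source:

```python
import math

# Hypothetical numbers purely for illustration.
num_train_examples = 200            # size of the training split after preprocessing
per_device_train_batch_size = 2
gradient_accumulation_steps = 1
num_devices = 1
num_train_epochs = 3
warmup_ratio = 0.1

# Optimizer updates per epoch = number of batches divided by gradient accumulation.
steps_per_epoch = math.ceil(
    num_train_examples / (per_device_train_batch_size * num_devices * gradient_accumulation_steps)
)

# Total optimizer updates for the whole run (this is what epochs get converted into).
max_steps = steps_per_epoch * num_train_epochs

# Warmup is a fraction of the *total* steps, not of each epoch.
warmup_steps = math.ceil(warmup_ratio * max_steps)

print(f"{steps_per_epoch=}, {max_steps=}, {warmup_steps=}")
# 100 steps/epoch -> 300 total steps -> 30 warmup steps, then cosine decay over the remaining 270.
```

If that is right, the linear warmup covers the first `warmup_ratio * max_steps` updates and the cosine decay runs from there to `max_steps`, so the schedule is defined over the whole multi-epoch run rather than restarting each epoch. Please correct me if the Trainer does something different.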

e.g., see the full code below:

import os
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from typing import Dict, Tuple, Optional
from pathlib import Path

# Clear CUDA cache to free up memory
torch.cuda.empty_cache()

# Load the accuracy metric from the datasets library
metric = load_metric('accuracy')

def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
    """
    Compute next-token prediction accuracy of the model.

    Args:
    eval_pred: A tuple containing the model predictions (logits) and labels.

    Returns:
    A dictionary with the accuracy score.
    """
    predictions, labels = eval_pred
    # Logits have shape (batch, seq_len, vocab_size); take the argmax over the vocab dimension.
    predictions = np.argmax(predictions, axis=-1)
    # Shift so that tokens < n predict token n, and drop positions whose label is -100 (padding).
    predictions, labels = predictions[:, :-1], labels[:, 1:]
    mask = labels != -100
    return metric.compute(predictions=predictions[mask], references=labels[mask])

def preprocess_function_proofnet(examples: Dict[str, list], tokenizer: GPT2Tokenizer) -> Dict[str, torch.Tensor]:
    """
    Preprocess the input data for the proofnet dataset.

    Args:
    examples: The examples to preprocess.
    tokenizer: The tokenizer for encoding the texts.

    Returns:
    The processed model inputs.
    """
    inputs = [f"{examples['nl_statement'][i]}{tokenizer.eos_token}{examples['formal_statement'][i]}" for i in range(len(examples['nl_statement']))]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    labels = model_inputs.input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

def setup_and_train_proofnet(pretrained_model_name_or_path: str = "gpt2", 
                            path: str = "hoskinson-center/proofnet",
                            output_dir_val: str = '~/tmp/proofnet/validation',  # Path.expanduser() expands '~' but not '$HOME'
                            output_dir_test: str = '~/tmp/proofnet/test',
                            path_to_save_model: Optional[str] = None,  # suggested path: '~/tmp/proofnet/model'
                            num_train_epochs: int = 3,
                            per_device_train_batch_size: Optional[int] = 2,
                            per_device_eval_batch_size: Optional[int] = 2,
                            save_total_limit: Optional[int] = None,
                            evaluation_strategy: str = 'epoch',
                            learning_rate: float = 5e-5,
                            weight_decay: float = 0.01,
                            max_grad_norm: float = 1.0, 
                            optim: str = 'paged_adamw_32bit',
                    ) -> None:
    """
    Set up the environment, preprocess the dataset, and train the model.

    Args:
    tokenizer_name: The name of the tokenizer.
    model_name: The name of the model.
    dataset_path: The path to the dataset.
    """
    # Load tokenizer and model
    if pretrained_model_name_or_path == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, max_length=1024)
        # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token = tokenizer.eos_token
            print(f'{tokenizer.pad_token=}')
        model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path)
        # model.resize_token_embeddings(len(tokenizer))  # leaving for reference, not needed since pad = eos for us
        device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        block_size: int = tokenizer.model_max_length
        print(f'{block_size=}')
    else:
        raise ValueError(f'Unsupported model: {pretrained_model_name_or_path=}')

    # Load the dataset
    dataset_val = load_dataset(path, split='validation')
    dataset_test = load_dataset(path, split='test')

    # Preprocess the dataset
    if path == "hoskinson-center/proofnet":
        preprocess_function = preprocess_function_proofnet
        val_dataset = dataset_val.map(lambda examples: preprocess_function(examples, tokenizer), batched=True, remove_columns=["nl_statement", "formal_statement"])
        test_dataset = dataset_test.map(lambda examples: preprocess_function(examples, tokenizer), batched=True, remove_columns=["nl_statement", "formal_statement"])
    else:
        raise ValueError(f'Unsupported dataset: {path=}')

    # Training arguments
    output_dir_val: Path = Path(output_dir_val).expanduser()
    output_dir_val.mkdir(parents=True, exist_ok=True)
    training_args = TrainingArguments(
        output_dir=output_dir_val,
        evaluation_strategy=evaluation_strategy,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        weight_decay=weight_decay,
        save_total_limit=save_total_limit,
        num_train_epochs=num_train_epochs,
        max_grad_norm=max_grad_norm,
        optim=optim,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        fp16=False,  # avoid fp16 here; prefer bf16 when the hardware supports it
        bf16=torch.cuda.is_available() and torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8,  # bfloat16 requires compute capability >= 8 (Ampere or newer); otherwise fall back to fp32
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=val_dataset,   # the ProofNet validation split is used for training here
        eval_dataset=test_dataset,   # and the test split for evaluation
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    # Train the model
    trainer.train()

    # Evaluate the model
    output_dir_test: Path = Path(output_dir_test).expanduser()
    output_dir_test.mkdir(parents=True, exist_ok=True)
    results = trainer.evaluate(test_dataset)
    print(results)

    # Save the trained model
    if path_to_save_model is not None:
        path_to_save_model: Path = Path(path_to_save_model).expanduser()
        path_to_save_model.mkdir(parents=True, exist_ok=True)
        model.save_pretrained(path_to_save_model)

def main() -> None:
    """
    Main function to execute the model training and evaluation.
    """
    setup_and_train_proofnet()

if __name__ == "__main__":
    import time
    start_time = time.time()
    main()
    elapsed = time.time() - start_time
    print(f"Time taken: {elapsed:.2f} seconds, or {elapsed / 60:.2f} minutes, or {elapsed / 3600:.2f} hours.\a")

related: Is it possible to set epoch less than 1 when using Trainer - #2 by nielsr


My best advice (it's late here, so if you're still unsure I'll check in the AM): try logging with wandb and see what the LR graph looks like :wink:
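e.g. a minimal way to wire that up (the project name below is just a placeholder) is to turn on W&B reporting and per-step logging in the `TrainingArguments`, which should put the schedule under `train/learning_rate` in the W&B UI:

```python
import os
from transformers import TrainingArguments

os.environ["WANDB_PROJECT"] = "proofnet-debug"  # placeholder project name

# Same idea as the script above, with W&B reporting and per-step logging added
# so the learning-rate curve is visible in the dashboard.
training_args = TrainingArguments(
    output_dir="tmp_proofnet_debug",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=1,        # log every optimizer step
    report_to="wandb",      # send logs (including the learning rate) to Weights & Biases
)
```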