What is the behaviour of the cosine scheduler and warmup steps when the run length is set in epochs?

How does the HF `Trainer` behave if I set `num_train_epochs` together with `warmup_ratio`? Will it compute the total number of optimizer steps as epochs * (number of sequences / effective batch size) and then apply the warmup ratio I give it to that total?
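For context, here is my current mental model of how those numbers get combined, written out as a tiny arithmetic sketch. The dataset size and batch settings below are made up for illustration, and this is my reading of the Trainer's behaviour, not a verified quote of its source:

```python
import math

# Hypothetical numbers purely for illustration.
num_train_examples = 200            # size of the training split after preprocessing
per_device_train_batch_size = 2
gradient_accumulation_steps = 1
num_devices = 1
num_train_epochs = 3
warmup_ratio = 0.1

# Optimizer updates per epoch = number of batches divided by gradient accumulation.
steps_per_epoch = math.ceil(
    num_train_examples / (per_device_train_batch_size * num_devices * gradient_accumulation_steps)
)

# Total optimizer updates for the whole run (this is what epochs get converted into).
max_steps = steps_per_epoch * num_train_epochs

# Warmup is a fraction of the *total* steps, not of each epoch.
warmup_steps = math.ceil(warmup_ratio * max_steps)

print(f"{steps_per_epoch=}, {max_steps=}, {warmup_steps=}")
# 100 steps/epoch -> 300 total steps -> 30 warmup steps, then cosine decay over the remaining 270.
```

If that is right, the linear warmup covers the first `warmup_ratio * max_steps` updates and the cosine decay runs from there to `max_steps`, so the schedule is defined over the whole multi-epoch run rather than restarting each epoch. Please correct me if the Trainer does something different.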

e.g., see the full code below:

import os
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from typing import Dict, Tuple, Optional
from pathlib import Path

# Clear CUDA cache to free up memory
torch.cuda.empty_cache()

# Load the accuracy metric from the datasets library
metric = load_metric('accuracy')

def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
    """
    Compute next-token prediction accuracy of the model.

    Args:
    eval_pred: A tuple containing the model predictions (logits) and labels.

    Returns:
    A dictionary with the accuracy score.
    """
    predictions, labels = eval_pred
    # Logits have shape (batch, seq_len, vocab_size); take the argmax over the vocab dimension.
    predictions = np.argmax(predictions, axis=-1)
    # Shift so that tokens < n predict token n, and drop positions whose label is -100 (padding).
    predictions, labels = predictions[:, :-1], labels[:, 1:]
    mask = labels != -100
    return metric.compute(predictions=predictions[mask], references=labels[mask])

def preprocess_function_proofnet(examples: Dict[str, list], tokenizer: GPT2Tokenizer) -> Dict[str, torch.Tensor]:
    """
    Preprocess the input data for the proofnet dataset.

    Args:
    examples: The examples to preprocess.
    tokenizer: The tokenizer for encoding the texts.

    Returns:
    The processed model inputs.
    """
    inputs = [f"{examples['nl_statement'][i]}{tokenizer.eos_token}{examples['formal_statement'][i]}" for i in range(len(examples['nl_statement']))]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    labels = model_inputs.input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

def setup_and_train_proofnet(pretrained_model_name_or_path: str = "gpt2", 
                            path: str = "hoskinson-center/proofnet",
                            output_dir_val: str = '~/tmp/proofnet/validation',  # Path.expanduser() expands '~' but not '$HOME'
                            output_dir_test: str = '~/tmp/proofnet/test',
                            path_to_save_model: Optional[str] = None,  # suggested path: '~/tmp/proofnet/model'
                            num_train_epochs: int = 3,
                            per_device_train_batch_size: Optional[int] = 2,
                            per_device_eval_batch_size: Optional[int] = 2,
                            save_total_limit: Optional[int] = None,
                            evaluation_strategy: str = 'epoch',
                            learning_rate: float = 5e-5,
                            weight_decay: float = 0.01,
                            max_grad_norm: float = 1.0, 
                            optim: str = 'paged_adamw_32bit',
                    ) -> None:
    """
    Set up the environment, preprocess the dataset, and train the model.

    Args:
    tokenizer_name: The name of the tokenizer.
    model_name: The name of the model.
    dataset_path: The path to the dataset.
    """
    # Load tokenizer and model
    if pretrained_model_name_or_path == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, max_length=1024)
        # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token = tokenizer.eos_token
            print(f'{tokenizer.pad_token=}')
        model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path)
        # model.resize_token_embeddings(len(tokenizer))  # leaving for reference, not needed since pad = eos for us
        device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        block_size: int = tokenizer.model_max_length
        print(f'{block_size=}')
    else:
        raise ValueError(f'Unsupported model: {pretrained_model_name_or_path=}')

    # Load the dataset
    dataset_val = load_dataset(path, split='validation')
    dataset_test = load_dataset(path, split='test')

    # Preprocess the dataset
    if path == "hoskinson-center/proofnet":
        preprocess_function = preprocess_function_proofnet
        val_dataset = dataset_val.map(lambda examples: preprocess_function(examples, tokenizer), batched=True, remove_columns=["nl_statement", "formal_statement"])
        test_dataset = dataset_test.map(lambda examples: preprocess_function(examples, tokenizer), batched=True, remove_columns=["nl_statement", "formal_statement"])
    else:
        raise ValueError(f'Unsupported dataset: {path=}')

    # Training arguments
    output_dir_val: Path = Path(output_dir_val).expanduser()
    output_dir_val.mkdir(parents=True, exist_ok=True)
    training_args = TrainingArguments(
        output_dir=output_dir_val,
        evaluation_strategy=evaluation_strategy,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        weight_decay=weight_decay,
        save_total_limit=save_total_limit,
        num_train_epochs=num_train_epochs,
        max_grad_norm=max_grad_norm,
        optim=optim,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        fp16=False,  # avoid fp16 here; prefer bf16 when the hardware supports it
        bf16=torch.cuda.is_available() and torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8,  # bfloat16 requires compute capability >= 8 (Ampere or newer); otherwise fall back to fp32
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=val_dataset,   # the ProofNet validation split is used for training here
        eval_dataset=test_dataset,   # and the test split for evaluation
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    # Train the model
    trainer.train()

    # Evaluate the model
    output_dir_test: Path = Path(output_dir_test).expanduser()
    output_dir_test.mkdir(parents=True, exist_ok=True)
    results = trainer.evaluate(test_dataset)
    print(results)

    # Save the trained model
    if path_to_save_model is not None:
        path_to_save_model: Path = Path(path_to_save_model).expanduser()
        path_to_save_model.mkdir(parents=True, exist_ok=True)
        model.save_pretrained(path_to_save_model)

def main() -> None:
    """
    Main function to execute the model training and evaluation.
    """
    setup_and_train_proofnet()

if __name__ == "__main__":
    import time
    start_time = time.time()
    main()
    elapsed = time.time() - start_time
    print(f"Time taken: {elapsed:.2f} seconds, or {elapsed / 60:.2f} minutes, or {elapsed / 3600:.2f} hours.\a")

related: Is it possible to set epoch less than 1 when using Trainer - #2 by nielsr


My best advice (it's late here, so if you're still unsure I'll check in the AM): try logging with wandb and see what the LR graph looks like :wink:
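e.g. a minimal way to wire that up (the project name below is just a placeholder) is to turn on W&B reporting and per-step logging in the `TrainingArguments`, which should put the schedule under `train/learning_rate` in the W&B UI:

```python
import os
from transformers import TrainingArguments

os.environ["WANDB_PROJECT"] = "proofnet-debug"  # placeholder project name

# Same idea as the script above, with W&B reporting and per-step logging added
# so the learning-rate curve is visible in the dashboard.
training_args = TrainingArguments(
    output_dir="tmp_proofnet_debug",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=1,        # log every optimizer step
    report_to="wandb",      # send logs (including the learning rate) to Weights & Biases
)
```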