Using Cosine LR scheduler via TrainingArguments in Trainer

Hi, can anyone confirm whether my approach is correct? I’m fine-tuning Wav2Vec2 on a large dataset, so I need to make sure the process is right before I start:

I want to use an LR scheduler, specifically a cosine scheduler with warmup, which in a plain training loop can be created with transformers.get_cosine_schedule_with_warmup. What I’m not clear on is how to use this with TrainingArguments so it gets passed on to the Trainer. TrainingArguments has an lr_scheduler_type argument: should I pass an instance created by transformers.get_cosine_schedule_with_warmup as lr_scheduler_type, or is there something else I’m missing?

Any help is appreciated, Thanks!

1 Like

You have to set lr_scheduler_type to "cosine".
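For example, a minimal sketch (the output directory, warmup and learning-rate values here are placeholders to adjust for your run):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned",
    learning_rate=3e-5,
    lr_scheduler_type="cosine",   # the Trainer builds get_cosine_schedule_with_warmup internally
    warmup_steps=500,             # or use warmup_ratio instead
    num_train_epochs=3,
)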

Is there a list somewhere with the respective strings?

1 Like

@sgugger What if I don’t want the learning rate to decay to 0, but instead to, say, 50% of the peak LR? Is there any way to do this?

1 Like

You can pass your own learning rate scheduler to the Trainer.
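As a rough sketch (not from this thread; the 0.5 floor, the LambdaLR formulation, and the step counts are my own assumptions, and model/training_args/train_dataset are assumed from your existing setup), a cosine schedule that bottoms out at 50% of the peak LR could be passed in via the optimizers argument:

import math
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from transformers import Trainer

optimizer = AdamW(model.parameters(), lr=3e-5)

num_warmup_steps = 500        # placeholder values, tune for your run
num_training_steps = 10_000
min_lr_ratio = 0.5            # floor at 50% of the peak LR instead of 0

def lr_lambda(step):
    # linear warmup, then cosine decay from 1.0 down to min_lr_ratio
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    return min_lr_ratio + (1.0 - min_lr_ratio) * 0.5 * (1.0 + math.cos(math.pi * progress))

scheduler = LambdaLR(optimizer, lr_lambda)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optimizer, scheduler),
)

Newer transformers releases also include a "cosine_with_min_lr" scheduler type configurable through lr_scheduler_kwargs, if your version has it.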

I found this, hope it helps.

https://huggingface.co/transformers/v4.7.0/_modules/transformers/trainer_utils.html#:~:text=class-,SchedulerType,-(ExplicitEnum)%3A
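You can also print the accepted strings directly from the SchedulerType enum (the exact set depends on your transformers version):

from transformers.trainer_utils import SchedulerType

# every value here is a string accepted by TrainingArguments(lr_scheduler_type=...)
print([s.value for s in SchedulerType])
# e.g. ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', ...]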

2 Likes

@sgugger is this the right way to do it?

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers.optimization import get_cosine_schedule_with_warmup
from bitsandbytes.optim import PagedAdamW32bit  # paged AdamW lives in bitsandbytes, not transformers
from datasets import load_dataset

def train_gpt2_model(dataset_path: str, model_name: str = "gpt2", num_train_epochs: int = 3):
    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Load and preprocess the dataset
    dataset = load_dataset("text", data_files=dataset_path)
    tokenized_dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=512), batched=True)

    # Initialize the optimizer
    optimizer = PagedAdamW32bit(model.parameters())

    # Training arguments including the learning rate scheduler
    training_args = TrainingArguments(
        output_dir="./gpt2_trained",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=2,  # Adjust based on your GPU memory
        warmup_steps=500,  # Number of warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Create the Trainer; the causal-LM collator builds the labels GPT2LMHeadModel needs
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        data_collator=data_collator,
        optimizers=(optimizer, None),  # Scheduler is None, will be set below
    )

    # Set the scheduler; num_training_steps must cover all epochs, not just one
    num_training_steps = len(trainer.get_train_dataloader()) * num_train_epochs
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=num_training_steps,
    )
    trainer.lr_scheduler = scheduler

    # Train the model
    trainer.train()

    # Save the model
    model.save_pretrained("./gpt2_trained")

if __name__ == "__main__":
    dataset_path = "path/to/your/dataset.json"
    train_gpt2_model(dataset_path)

You can set it in the Trainer:

    # -- max steps manually decided depending on how many tokens we want to train on
    per_device_train_batch_size = batch_size
    print(f'{per_device_train_batch_size=}')
    print(f'{num_epochs=} {max_steps=}')

    # -- Get Optimizer & Scheduler
    # - Get Optimizer
    if optim == 'paged_adamw_32bit':
        from bitsandbytes.optim import PagedAdamW32bit  # paged AdamW comes from bitsandbytes, not transformers
        optimizer = PagedAdamW32bit(model.parameters())
    elif optim == 'adamw_manual':
        optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    print(f'{optimizer=}')
    # - Get Scheduler
    if lr_scheduler_type == 'cosine_with_warmup_manual':
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(max_steps*warmup_ratio),
            num_training_steps=max_steps,
        )
    else:
        lr_scheduler = None
    print(f'{lr_scheduler=}')

    # -- Training arguments and trainer instantiation ref: https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments
    output_dir = Path(f'~/data/results_{today}/').expanduser() if not debug else Path(f'~/data/results/').expanduser()
    # output_dir = '.'
    # print(f'{debug=} {output_dir=} \n {report_to=}')
    training_args = TrainingArguments(
        output_dir=output_dir,  # The output directory where the model predictions and checkpoints will be written.
        # output_dir='.',  # The output directory where the model predictions and checkpoints will be written.
        # num_train_epochs = num_train_epochs, 
        max_steps=max_steps,  # TODO: hard to fix, see above
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,  # based on alpaca https://github.com/tatsu-lab/stanford_alpaca, allows to process effective_batch_size = gradient_accumulation_steps * batch_size, num its to accumulate before opt update step
        gradient_checkpointing = gradient_checkpointing,  # TODO depending on hardware set to true?
        # optim=optim,
        # warmup_steps=int(max_steps*warmup_ratio),  # TODO: once real training starts we can select this number for llama v2, what does llama v2 do to make it stable while v1 didn't?
        # warmup_ratio=warmup_ratio,  # copying alpaca for now, number of steps for a linear warmup, TODO once real training starts change? 
        # weight_decay=0.01,  # TODO once real training change?
        weight_decay=weight_decay,  # TODO once real training change?
        learning_rate = learning_rate,  # TODO once real training change? anything larger than -3 I've had terrible experiences with
        max_grad_norm=1.0, # TODO once real training change?
        # lr_scheduler_type=lr_scheduler_type,  # TODO once real training change? using what I've seen most in vision 
        # lr_scheduler_kwargs=lr_scheduler_kwargs,  # ref: https://huggingface.co/docs/transformers/v4.37.0/en/main_classes/optimizer_schedules#transformers.SchedulerType 
        logging_dir=Path('~/data/maf/logs').expanduser(),
        # save_steps=4000,  # alpaca does 2000, other defaults were 500
        save_steps=max_steps//3,  # alpaca does 2000, other defaults were 500
        # save_steps=1,  # alpaca does 2000, other defaults were 500
        # logging_steps=250,
        # logging_steps=50,  
        logging_first_step=True,
        # logging_steps=3,
        logging_steps=1,
        remove_unused_columns=False,  # TODO don't get why https://stackoverflow.com/questions/76879872/how-to-use-huggingface-hf-trainer-train-with-custom-collate-function/76929999#76929999 , https://claude.ai/chat/475a4638-cee3-4ce0-af64-c8b8d1dc0d90
        report_to=report_to,  # change to wandb!
        fp16=False,  # never ever set to True
        bf16=torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8,  # bfloat16 is only available on GPUs with compute capability >= 8 (Ampere and newer)
    )
    print(f'{pretrained_model_name_or_path=}\n{optim=}\n{learning_rate=}')

    # TODO: might be nice to figure our how llamav2 counts the number of token's they've trained on
    print(f'{train_dataset=}')
    # print(f'{eval_dataset=}')
    trainer = Trainer(
        model=model,
        args=training_args,  
        train_dataset=train_dataset,
        optimizers=(optimizer, lr_scheduler),
    )

    # - Train
    cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if cuda_visible_devices is not None:
        print(f"CUDA_VISIBLE_DEVICES = {cuda_visible_devices}")
    trainer.train()
    trainer.save_model(output_dir=output_dir)  # TODO is this really needed? https://discuss.huggingface.co/t/do-we-need-to-explicity-save-the-model-if-the-save-steps-is-not-a-multiple-of-the-num-steps-with-hf/56745

Related: How do use lr_scheduler - #12 by brando

1 Like

Much easier way:

lr_scheduler_type = "cosine_with_restarts",
lr_scheduler_kwargs = { "num_cycles": 5 },
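In context this would look something like the sketch below (the surrounding arguments are placeholders; lr_scheduler_kwargs needs a reasonably recent transformers version):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./out",
    learning_rate=2e-5,
    lr_scheduler_type="cosine_with_restarts",
    lr_scheduler_kwargs={"num_cycles": 5},  # forwarded to the cosine-with-hard-restarts schedule
    warmup_ratio=0.03,
    num_train_epochs=3,
)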
3 Likes

Thank you so much @edwarddgao. It’s working perfectly.

Is this documented somewhere?