You can set it manually and pass it to the `Trainer` via its `optimizers=(optimizer, lr_scheduler)` argument, e.g.:
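Roughly the imports the snippet assumes (a sketch; variables like `batch_size`, `max_steps`, `optim`, `learning_rate`, `warmup_ratio`, `today`, `debug`, `report_to`, `model`, and `train_dataset` come from my config elsewhere):

import os
from pathlib import Path

import torch
from torch.optim import AdamW
from transformers import Trainer, TrainingArguments, get_cosine_schedule_with_warmup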
# -- max steps manually decided depending on how many tokens we want to train on
per_device_train_batch_size = batch_size
print(f'{per_device_train_batch_size=}')
print(f'{num_epochs=} {max_steps=}')
# -- Get Optimizer & Scheduler
# - Get Optimizer
if optim == 'paged_adamw_32bit':
    # note: the paged 32-bit AdamW comes from bitsandbytes, not transformers (assumes bitsandbytes is installed)
    from bitsandbytes.optim import PagedAdamW32bit
    optimizer = PagedAdamW32bit(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
elif optim == 'adamw_manual':
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
else:
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
print(f'{optimizer=}')
# - Get Scheduler
if lr_scheduler_type == 'cosine_with_warmup_manual':
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(max_steps * warmup_ratio),
        num_training_steps=max_steps,
    )
else:
    lr_scheduler = None
print(f'{lr_scheduler=}')
# -- Training arguments and trainer instantiation ref: https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments
output_dir = Path(f'~/data/results_{today}/').expanduser() if not debug else Path('~/data/results/').expanduser()
# output_dir = '.'
# print(f'{debug=} {output_dir=} \n {report_to=}')
training_args = TrainingArguments(
    output_dir=output_dir,  # The output directory where the model predictions and checkpoints will be written.
    # output_dir='.',
    # num_train_epochs=num_train_epochs,
    max_steps=max_steps,  # TODO: hard to fix, see above
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,  # based on alpaca https://github.com/tatsu-lab/stanford_alpaca ; effective_batch_size = gradient_accumulation_steps * batch_size, i.e. the number of iterations to accumulate before each optimizer update step
    gradient_checkpointing=gradient_checkpointing,  # TODO: depending on hardware, set to True?
    # optim=optim,
    # warmup_steps=int(max_steps*warmup_ratio),  # TODO: once real training starts we can select this number for llama v2; what does llama v2 do to make it stable while v1 didn't?
    # warmup_ratio=warmup_ratio,  # copying alpaca for now, number of steps for a linear warmup, TODO once real training starts change?
    # weight_decay=0.01,  # TODO once real training change?
    weight_decay=weight_decay,  # TODO once real training change?
    learning_rate=learning_rate,  # TODO once real training change? anything larger than 1e-3 has given me terrible results
    max_grad_norm=1.0,  # TODO once real training change?
    # lr_scheduler_type=lr_scheduler_type,  # TODO once real training change? using what I've seen most in vision
    # lr_scheduler_kwargs=lr_scheduler_kwargs,  # ref: https://huggingface.co/docs/transformers/v4.37.0/en/main_classes/optimizer_schedules#transformers.SchedulerType
    logging_dir=Path('~/data/maf/logs').expanduser(),
    # save_steps=4000,  # alpaca does 2000, other defaults were 500
    save_steps=max_steps // 3,  # alpaca does 2000, other defaults were 500
    # save_steps=1,
    # logging_steps=250,
    # logging_steps=50,
    logging_first_step=True,
    # logging_steps=3,
    logging_steps=1,
    remove_unused_columns=False,  # TODO: don't fully get why this is needed, see https://stackoverflow.com/questions/76879872/how-to-use-huggingface-hf-trainer-train-with-custom-collate-function/76929999#76929999 , https://claude.ai/chat/475a4638-cee3-4ce0-af64-c8b8d1dc0d90
    report_to=report_to,  # change to wandb!
    fp16=False,  # never ever set to True
    bf16=torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8,  # bfloat16 is available on compute capability >= 8 (Ampere or newer); otherwise this stays False and training runs in fp32
)
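# not in the original snippet: if you flip report_to to 'wandb' above, you can also pre-create
# the run with a custom name before trainer.train() (assumes the wandb package is installed and
# you are logged in; 'my_project' is a placeholder):
if report_to == 'wandb':
    import wandb
    wandb.init(project='my_project', name=f'run_{today}')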
print(f'{pretrained_model_name_or_path=}\n{optim=}\n{learning_rate=}')
# TODO: might be nice to figure out how llama v2 counts the number of tokens they've trained on
print(f'{train_dataset=}')
# print(f'{eval_dataset=}')
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optimizer, lr_scheduler),
)
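# not in the original snippet: as far as I understand, the optimizers=(optimizer, lr_scheduler)
# tuple takes precedence over optim / lr_scheduler_type in TrainingArguments; quick sanity check
# that the Trainer picked them up:
print(f'{(trainer.optimizer is optimizer)=} {(trainer.lr_scheduler is lr_scheduler)=}')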
# - Train
cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
if cuda_visible_devices is not None:
    print(f"CUDA_VISIBLE_DEVICES = {cuda_visible_devices}")
trainer.train()
trainer.save_model(output_dir=output_dir) # TODO is this really needed? https://discuss.huggingface.co/t/do-we-need-to-explicity-save-the-model-if-the-save-steps-is-not-a-multiple-of-the-num-steps-with-hf/56745
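Re the "max steps manually decided depending on how many tokens we want to train on" comment at the top, a rough way to derive it (a sketch; target_train_tokens, world_size, and max_seq_length are placeholder names, not variables from the snippet above):

target_train_tokens = 1_000_000_000  # placeholder token budget
world_size = 1  # number of GPUs / processes
max_seq_length = 4096  # placeholder: sequence length the batches are packed/padded to
tokens_per_step = per_device_train_batch_size * gradient_accumulation_steps * world_size * max_seq_length
max_steps = target_train_tokens // tokens_per_step
print(f'{max_steps=}')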
Related: Using Cosine LR scheduler via TrainingArguments in Trainer - #8 by brando
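And if you want to eyeball the warmup/cosine shape without consuming steps of the real scheduler, a throwaway probe like this works (a sketch on a dummy optimizer, not part of the training code above):

probe_optimizer = AdamW([torch.nn.Parameter(torch.zeros(1))], lr=learning_rate)
probe_scheduler = get_cosine_schedule_with_warmup(
    probe_optimizer,
    num_warmup_steps=int(max_steps * warmup_ratio),
    num_training_steps=max_steps,
)
probe_lrs = []
for _ in range(max_steps):
    probe_lrs.append(probe_scheduler.get_last_lr()[0])
    probe_optimizer.step()  # step the dummy optimizer first to avoid the scheduler-order warning
    probe_scheduler.step()
print(f'{probe_lrs[:5]=} {probe_lrs[-5:]=}')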