I saw:
def get_original_training_args(output_dir="./results",
                               per_device_train_batch_size=1,
                               gradient_accumulation_steps=16,  # num its to accumulate before opt update step
                               optim="paged_adamw_32bit",
                               save_steps=10,  # how often to save, if <1 -> % of train steps
                               logging_steps=10,  # how often to log, if <1 -> % of train steps
                               learning_rate=2e-4,
                               max_grad_norm=0.3,
                               max_steps=500,  # number of training steps/its
                               warmup_ratio=0.03,  # number of steps for a linear warmup
                               lr_scheduler_type="constant",
                               bf16=False,  # use bfloat16 mixed precision instead of fp16
                               ):
    """Build a ``transformers.TrainingArguments`` for a short fine-tuning run.

    Args:
        output_dir: Directory where checkpoints and logs are written.
        per_device_train_batch_size: Micro-batch size per device.
        gradient_accumulation_steps: Micro-batches accumulated before each
            optimizer update (effective batch = batch_size * this value).
        optim: Optimizer name passed through to ``TrainingArguments``.
            Note: ``paged_adamw_32bit`` keeps optimizer *state* in 32-bit
            regardless of the mixed-precision setting below.
        save_steps: Checkpoint interval; values < 1 are a fraction of total steps.
        logging_steps: Logging interval; values < 1 are a fraction of total steps.
        learning_rate: Peak learning rate.
        max_grad_norm: Gradient-clipping threshold.
        max_steps: Total number of optimizer steps to train for.
        warmup_ratio: Fraction of steps used for linear LR warmup.
        lr_scheduler_type: LR schedule name (e.g. ``"constant"``).
        bf16: If True, train with bfloat16 mixed precision (``bf16=True``)
            instead of fp16. bf16 has the same exponent range as fp32, so it
            is generally more numerically stable, but requires Ampere-or-newer
            GPUs / TPUs. Defaults to False, preserving the original fp16
            behavior.

    Returns:
        A configured ``transformers.TrainingArguments`` instance.
    """
    from transformers import TrainingArguments

    training_arguments = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        # fp16 and bf16 are mutually exclusive in TrainingArguments;
        # exactly one of them is enabled here.
        fp16=not bf16,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=True,  # bucket samples of similar length to reduce padding
        lr_scheduler_type=lr_scheduler_type,
    )
    return training_arguments
Is there a bfloat16 (brain float 16) option for this? Or should the optimizer state always stay in 32-bit precision?