Right now I'm basically doing the following:
model1, model2 = accelerator.prepare(model1, model2)
optimizer1 = optimizer_cls(
    model1.parameters(),
    lr=config.train.learning_rate_1,
    betas=(config.train.adam_beta1, config.train.adam_beta2),
    weight_decay=config.train.adam_weight_decay,
    eps=config.train.adam_epsilon,
)
optimizer2 = optimizer_cls(
    model2.parameters(),
    lr=config.train.learning_rate_2,
    betas=(config.train.adam_beta1, config.train.adam_beta2),
    weight_decay=config.train.adam_weight_decay,
    eps=config.train.adam_epsilon,
)
optimizer1, optimizer2 = accelerator.prepare(optimizer1, optimizer2)
for epoch in range(config.num_epochs):
    output1 = model1(input)
    output2 = model2(input)
    # combined loss so a single backward covers both models
    loss = loss1(output1) + loss2(output2)
    accelerator.backward(loss)
    if accelerator.sync_gradients:
        accelerator.clip_grad_norm_(model1.parameters(), config.train.max_grad_norm)
        accelerator.clip_grad_norm_(model2.parameters(), config.train.max_grad_norm)
    optimizer1.step()
    optimizer2.step()
    optimizer1.zero_grad()
    optimizer2.zero_grad()
    accelerator.save_state()
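
In case that condensed snippet is hard to read, here is a self-contained toy version of the same pattern that runs end to end; the Linear models, the random-tensor dataloader, the MSE losses, and the hard-coded hyperparameters are placeholders just for illustration, not my real setup:

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()

# toy stand-ins so the sketch is runnable
model1 = torch.nn.Linear(16, 4)
model2 = torch.nn.Linear(16, 4)
dataset = TensorDataset(torch.randn(64, 16), torch.randn(64, 4))
dataloader = DataLoader(dataset, batch_size=8)

# same ordering as above: prepare the models first, then build the
# optimizers from their parameters and prepare those separately
model1, model2 = accelerator.prepare(model1, model2)
optimizer1 = torch.optim.AdamW(model1.parameters(), lr=1e-4)
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=1e-5)
optimizer1, optimizer2, dataloader = accelerator.prepare(optimizer1, optimizer2, dataloader)

max_grad_norm = 1.0
for epoch in range(3):
    for inputs, targets in dataloader:
        # one combined loss, one backward pass for both models
        loss = (
            torch.nn.functional.mse_loss(model1(inputs), targets)
            + torch.nn.functional.mse_loss(model2(inputs), targets)
        )
        accelerator.backward(loss)
        if accelerator.sync_gradients:
            accelerator.clip_grad_norm_(model1.parameters(), max_grad_norm)
            accelerator.clip_grad_norm_(model2.parameters(), max_grad_norm)
        optimizer1.step()
        optimizer2.step()
        optimizer1.zero_grad()
        optimizer2.zero_grad()
    accelerator.save_state(f"checkpoints/epoch_{epoch}")
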
Am I doing this right? Thanks!