Same number of optimization steps with 1 GPU or 4 GPUs?

I am getting the same number of optimization steps whether I launch the Trainer on 1 GPU or on 4 GPUs via Accelerate's notebook_launcher. What might be wrong with my settings? My training function is below, followed by a sketch of how I would expect the step count to change.


# Assumes train_set, val_set and num_classes are defined earlier in the notebook.
from transformers import AlbertForSequenceClassification, Trainer, TrainingArguments
from accelerate import notebook_launcher

def train_trainer_ddp():

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        fp16=True,
        dataloader_num_workers=20,
        disable_tqdm=False,
        fsdp=False,
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=1000,
    )

    model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=num_classes)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=val_set 
    )

    trainer.train()


notebook_launcher(train_trainer_ddp, args=(), num_processes=1, mixed_precision="fp16")  # num_processes=4 when launching on 4 GPUs
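
For reference, here is how I would expect the optimizer step count to scale with the number of processes. This is only a rough sketch: it assumes plain data-parallel sharding with no gradient accumulation, and the dataset size of 100_000 is a made-up number for illustration.

import math

# With data parallelism, each optimizer step consumes
# per_device_train_batch_size * num_processes samples,
# so steps per epoch should shrink as processes are added.
def expected_steps_per_epoch(dataset_len, per_device_batch_size, num_processes,
                             gradient_accumulation_steps=1):
    samples_per_step = per_device_batch_size * num_processes * gradient_accumulation_steps
    return math.ceil(dataset_len / samples_per_step)

print(expected_steps_per_epoch(100_000, 16, num_processes=1))  # 6250
print(expected_steps_per_epoch(100_000, 16, num_processes=4))  # 1563

Printing trainer.args.world_size (or len(trainer.get_train_dataloader())) inside train_trainer_ddp should show how many processes the Trainer actually sees; if it still reports 1 with num_processes=4, the launcher does not seem to be distributing the run.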