Multinode worse performance than single node with same settings

Hi I am using the Trainer to train a sequence classification model. I get worse performance when I have more than one node.

This happens even though the settings and the effective total batch size are the same. What could be happening here?

I am using the Huggingface Trainer and torchrun to run this distributed.

It's not a batch size mistake, because all runs report:

Total train batch size (w. parallel, distributed & accumulation) = 256

The multi-node setup is 4 nodes, each with 4 A100 GPUs. The single-node setup is 1 node with 4 A100 GPUs, but with gradient accumulation bumped up to 4 to prevent OOM. They both have the same settings, including a per-device batch size of 16 for the 256 total batch size example.

Here is the config. The only difference is gradient accumulation steps: 1 for the multi-node version → 4 for the single-node version. Both were launched using torchrun.

{
  "bf16": {
    "desc": null,
    "value": false
  },
  "fp16": {
    "desc": null,
    "value": true
  },
  "fsdp": {
    "desc": null,
    "value": []
  },
  "seed": {
    "desc": null,
    "value": 100
  },
  "tf32": {
    "desc": null,
    "value": null
  },
  "debug": {
    "desc": null,
    "value": []
  },
  "n_ctx": {
    "desc": null,
    "value": 1024
  },
  "optim": {
    "desc": null,
    "value": "adamw_hf"
  },
  "top_k": {
    "desc": null,
    "value": 50
  },
  "top_p": {
    "desc": null,
    "value": 1
  },
  "_wandb": {
    "desc": null,
    "value": {
      "m": [
        {
          "1": "train/global_step",
          "6": [
            3
          ]
        },
        {
          "1": "eval/loss",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "eval/pairwise_accuracy",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "eval/runtime",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "eval/samples_per_second",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "eval/steps_per_second",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "train/loss",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "train/grad_norm",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "train/learning_rate",
          "5": 1,
          "6": [
            1
          ]
        },
        {
          "1": "train/epoch",
          "5": 1,
          "6": [
            1
          ]
        }
      ],
      "t": {
        "1": [
          1,
          11,
          49,
          55
        ],
        "2": [
          1,
          5,
          11,
          49,
          51,
          53,
          55,
          71,
          98
        ],
        "3": [
          7,
          13,
          23,
          66
        ],
        "4": "3.10.12",
        "5": "0.17.0",
        "6": "4.41.2",
        "8": [
          5
        ],
        "9": {
          "1": "transformers_trainer"
        },
        "13": "linux-x86_64"
      },
      "framework": "huggingface",
      "start_time": 1717389249,
      "cli_version": "0.17.0",
      "is_jupyter_run": false,
      "python_version": "3.10.12",
      "is_kaggle_kernel": false,
      "huggingface_version": "4.41.2"
    }
  },
  "n_embd": {
    "desc": null,
    "value": 1024
  },
  "n_head": {
    "desc": null,
    "value": 16
  },
  "prefix": {
    "desc": null,
    "value": null
  },
  "do_eval": {
    "desc": null,
    "value": true
  },
  "n_inner": {
    "desc": null,
    "value": null
  },
  "n_layer": {
    "desc": null,
    "value": 24
  },
  "no_cuda": {
    "desc": null,
    "value": false
  },
  "use_cpu": {
    "desc": null,
    "value": false
  },
  "do_train": {
    "desc": null,
    "value": false
  },
  "id2label": {
    "desc": null,
    "value": {
      "0": "LABEL_0"
    }
  },
  "label2id": {
    "desc": null,
    "value": {
      "LABEL_0": 0
    }
  },
  "run_name": {
    "desc": null,
    "value": "/tmp/reward_model_100_gpt2-medium_433860e1"
  },
  "use_ipex": {
    "desc": null,
    "value": false
  },
  "adafactor": {
    "desc": null,
    "value": false
  },
  "data_seed": {
    "desc": null,
    "value": null
  },
  "deepspeed": {
    "desc": null,
    "value": null
  },
  "do_sample": {
    "desc": null,
    "value": false
  },
  "hub_token": {
    "desc": null,
    "value": "<HUB_TOKEN>"
  },
  "log_level": {
    "desc": null,
    "value": "passive"
  },
  "max_steps": {
    "desc": null,
    "value": -1
  },
  "n_special": {
    "desc": null,
    "value": 0
  },
  "num_beams": {
    "desc": null,
    "value": 1
  },
  "ray_scope": {
    "desc": null,
    "value": "last"
  },
  "report_to": {
    "desc": null,
    "value": [
      "wandb"
    ]
  },
  "typical_p": {
    "desc": null,
    "value": 1
  },
  "use_cache": {
    "desc": null,
    "value": true
  },
  "adam_beta1": {
    "desc": null,
    "value": 0.9
  },
  "adam_beta2": {
    "desc": null,
    "value": 0.999
  },
  "attn_pdrop": {
    "desc": null,
    "value": 0.1
  },
  "do_predict": {
    "desc": null,
    "value": false
  },
  "embd_pdrop": {
    "desc": null,
    "value": 0.1
  },
  "eval_delay": {
    "desc": null,
    "value": 0
  },
  "eval_steps": {
    "desc": null,
    "value": 1953
  },
  "is_decoder": {
    "desc": null,
    "value": false
  },
  "local_rank": {
    "desc": null,
    "value": 0
  },
  "max_length": {
    "desc": null,
    "value": 20
  },
  "min_length": {
    "desc": null,
    "value": 0
  },
  "model_type": {
    "desc": null,
    "value": "gpt2"
  },
  "optim_args": {
    "desc": null,
    "value": null
  },
  "output_dir": {
    "desc": null,
    "value": "/tmp/reward_model_100_gpt2-medium_433860e1"
  },
  "past_index": {
    "desc": null,
    "value": -1
  },
  "save_steps": {
    "desc": null,
    "value": 1953
  },
  "vocab_size": {
    "desc": null,
    "value": 50257
  },
  "ddp_backend": {
    "desc": null,
    "value": null
  },
  "ddp_timeout": {
    "desc": null,
    "value": 1800
  },
  "fsdp_config": {
    "desc": null,
    "value": {
      "xla": false,
      "xla_fsdp_v2": false,
      "min_num_params": 0,
      "xla_fsdp_grad_ckpt": false
    }
  },
  "label_names": {
    "desc": null,
    "value": null
  },
  "logging_dir": {
    "desc": null,
    "value": "/tmp/reward_model_100_gpt2-medium_433860e1/runs/Jun03_04-34-17_training-bs-fix-master-0"
  },
  "n_positions": {
    "desc": null,
    "value": 1024
  },
  "push_to_hub": {
    "desc": null,
    "value": false
  },
  "resid_pdrop": {
    "desc": null,
    "value": 0.1
  },
  "return_dict": {
    "desc": null,
    "value": true
  },
  "temperature": {
    "desc": null,
    "value": 1
  },
  "torch_dtype": {
    "desc": null,
    "value": null
  },
  "torchdynamo": {
    "desc": null,
    "value": null
  },
  "torchscript": {
    "desc": null,
    "value": false
  },
  "adam_epsilon": {
    "desc": null,
    "value": 1e-8
  },
  "bos_token_id": {
    "desc": null,
    "value": 50256
  },
  "disable_tqdm": {
    "desc": null,
    "value": false
  },
  "eos_token_id": {
    "desc": null,
    "value": 50256
  },
  "fp16_backend": {
    "desc": null,
    "value": "auto"
  },
  "hub_model_id": {
    "desc": null,
    "value": null
  },
  "hub_strategy": {
    "desc": null,
    "value": "every_save"
  },
  "pad_token_id": {
    "desc": null,
    "value": 50256
  },
  "problem_type": {
    "desc": null,
    "value": null
  },
  "pruned_heads": {
    "desc": null,
    "value": {}
  },
  "sep_token_id": {
    "desc": null,
    "value": null
  },
  "summary_type": {
    "desc": null,
    "value": "cls_index"
  },
  "use_bfloat16": {
    "desc": null,
    "value": false
  },
  "warmup_ratio": {
    "desc": null,
    "value": 0.05
  },
  "warmup_steps": {
    "desc": null,
    "value": 0
  },
  "weight_decay": {
    "desc": null,
    "value": 0
  },
  "_name_or_path": {
    "desc": null,
    "value": "openai-community/gpt2-medium"
  },
  "architectures": {
    "desc": null,
    "value": [
      "GPT2LMHeadModel"
    ]
  },
  "bad_words_ids": {
    "desc": null,
    "value": null
  },
  "eval_strategy": {
    "desc": null,
    "value": "steps"
  },
  "jit_mode_eval": {
    "desc": null,
    "value": false
  },
  "learning_rate": {
    "desc": null,
    "value": 0.00004
  },
  "logging_steps": {
    "desc": null,
    "value": 1953
  },
  "max_grad_norm": {
    "desc": null,
    "value": 1
  },
  "mp_parameters": {
    "desc": null,
    "value": ""
  },
  "output_scores": {
    "desc": null,
    "value": false
  },
  "save_strategy": {
    "desc": null,
    "value": "steps"
  },
  "split_batches": {
    "desc": null,
    "value": null
  },
  "torch_compile": {
    "desc": null,
    "value": false
  },
  "tpu_num_cores": {
    "desc": null,
    "value": null
  },
  "bf16_full_eval": {
    "desc": null,
    "value": false
  },
  "early_stopping": {
    "desc": null,
    "value": false
  },
  "fp16_full_eval": {
    "desc": null,
    "value": false
  },
  "fp16_opt_level": {
    "desc": null,
    "value": "O1"
  },
  "length_penalty": {
    "desc": null,
    "value": 1
  },
  "tf_legacy_loss": {
    "desc": null,
    "value": false
  },
  "use_mps_device": {
    "desc": null,
    "value": false
  },
  "finetuning_task": {
    "desc": null,
    "value": null
  },
  "group_by_length": {
    "desc": null,
    "value": false
  },
  "hub_always_push": {
    "desc": null,
    "value": false
  },
  "num_beam_groups": {
    "desc": null,
    "value": 1
  },
  "save_only_model": {
    "desc": null,
    "value": false
  },
  "suppress_tokens": {
    "desc": null,
    "value": null
  },
  "tokenizer_class": {
    "desc": null,
    "value": null
  },
  "dispatch_batches": {
    "desc": null,
    "value": null
  },
  "full_determinism": {
    "desc": null,
    "value": false
  },
  "hub_private_repo": {
    "desc": null,
    "value": false
  },
  "ignore_data_skip": {
    "desc": null,
    "value": false
  },
  "log_on_each_node": {
    "desc": null,
    "value": true
  },
  "logging_strategy": {
    "desc": null,
    "value": "steps"
  },
  "num_train_epochs": {
    "desc": null,
    "value": 1
  },
  "save_safetensors": {
    "desc": null,
    "value": true
  },
  "save_total_limit": {
    "desc": null,
    "value": 4
  },
  "summary_use_proj": {
    "desc": null,
    "value": true
  },
  "ddp_bucket_cap_mb": {
    "desc": null,
    "value": 100
  },
  "diversity_penalty": {
    "desc": null,
    "value": 0
  },
  "greater_is_better": {
    "desc": null,
    "value": null
  },
  "initializer_range": {
    "desc": null,
    "value": 0.02
  },
  "log_level_replica": {
    "desc": null,
    "value": "warning"
  },
  "lr_scheduler_type": {
    "desc": null,
    "value": "cosine"
  },
  "output_attentions": {
    "desc": null,
    "value": false
  },
  "push_to_hub_token": {
    "desc": null,
    "value": "<PUSH_TO_HUB_TOKEN>"
  },
  "save_on_each_node": {
    "desc": null,
    "value": false
  },
  "tpu_metrics_debug": {
    "desc": null,
    "value": false
  },
  "accelerator_config": {
    "desc": null,
    "value": {
      "even_batches": true,
      "non_blocking": false,
      "split_batches": false,
      "dispatch_batches": null,
      "use_seedable_sampler": true,
      "gradient_accumulation_kwargs": null
    }
  },
  "batch_eval_metrics": {
    "desc": null,
    "value": false
  },
  "is_encoder_decoder": {
    "desc": null,
    "value": false
  },
  "layer_norm_epsilon": {
    "desc": null,
    "value": 0.00001
  },
  "length_column_name": {
    "desc": null,
    "value": "length"
  },
  "logging_first_step": {
    "desc": null,
    "value": true
  },
  "repetition_penalty": {
    "desc": null,
    "value": 1
  },
  "scale_attn_weights": {
    "desc": null,
    "value": true
  },
  "summary_activation": {
    "desc": null,
    "value": null
  },
  "torch_compile_mode": {
    "desc": null,
    "value": null
  },
  "activation_function": {
    "desc": null,
    "value": "gelu_new"
  },
  "add_cross_attention": {
    "desc": null,
    "value": false
  },
  "evaluation_strategy": {
    "desc": null,
    "value": "steps"
  },
  "forced_bos_token_id": {
    "desc": null,
    "value": null
  },
  "forced_eos_token_id": {
    "desc": null,
    "value": null
  },
  "fsdp_min_num_params": {
    "desc": null,
    "value": 0
  },
  "lr_scheduler_kwargs": {
    "desc": null,
    "value": {}
  },
  "neftune_noise_alpha": {
    "desc": null,
    "value": null
  },
  "skip_memory_metrics": {
    "desc": null,
    "value": true
  },
  "tie_encoder_decoder": {
    "desc": null,
    "value": false
  },
  "tie_word_embeddings": {
    "desc": null,
    "value": true
  },
  "auto_find_batch_size": {
    "desc": null,
    "value": false
  },
  "dataloader_drop_last": {
    "desc": null,
    "value": false
  },
  "no_repeat_ngram_size": {
    "desc": null,
    "value": 0
  },
  "num_return_sequences": {
    "desc": null,
    "value": 1
  },
  "optim_target_modules": {
    "desc": null,
    "value": null
  },
  "output_hidden_states": {
    "desc": null,
    "value": false
  },
  "overwrite_output_dir": {
    "desc": null,
    "value": false
  },
  "prediction_loss_only": {
    "desc": null,
    "value": false
  },
  "push_to_hub_model_id": {
    "desc": null,
    "value": null
  },
  "task_specific_params": {
    "desc": null,
    "value": {
      "text-generation": {
        "do_sample": true,
        "max_length": 50
      }
    }
  },
  "transformers_version": {
    "desc": null,
    "value": "4.41.2"
  },
  "begin_suppress_tokens": {
    "desc": null,
    "value": null
  },
  "dataloader_pin_memory": {
    "desc": null,
    "value": true
  },
  "ddp_broadcast_buffers": {
    "desc": null,
    "value": false
  },
  "metric_for_best_model": {
    "desc": null,
    "value": null
  },
  "remove_invalid_values": {
    "desc": null,
    "value": false
  },
  "remove_unused_columns": {
    "desc": null,
    "value": false
  },
  "summary_first_dropout": {
    "desc": null,
    "value": 0.1
  },
  "torch_compile_backend": {
    "desc": null,
    "value": null
  },
  "dataloader_num_workers": {
    "desc": null,
    "value": 12
  },
  "decoder_start_token_id": {
    "desc": null,
    "value": null
  },
  "eval_do_concat_batches": {
    "desc": null,
    "value": true
  },
  "gradient_checkpointing": {
    "desc": null,
    "value": false
  },
  "half_precision_backend": {
    "desc": null,
    "value": "auto"
  },
  "label_smoothing_factor": {
    "desc": null,
    "value": 0
  },
  "load_best_model_at_end": {
    "desc": null,
    "value": false
  },
  "logging_nan_inf_filter": {
    "desc": null,
    "value": true
  },
  "predict_special_tokens": {
    "desc": null,
    "value": true
  },
  "resume_from_checkpoint": {
    "desc": null,
    "value": null
  },
  "summary_proj_to_labels": {
    "desc": null,
    "value": true
  },
  "chunk_size_feed_forward": {
    "desc": null,
    "value": 0
  },
  "eval_accumulation_steps": {
    "desc": null,
    "value": null
  },
  "per_gpu_eval_batch_size": {
    "desc": null,
    "value": null
  },
  "reorder_and_upcast_attn": {
    "desc": null,
    "value": false
  },
  "return_dict_in_generate": {
    "desc": null,
    "value": false
  },
  "per_gpu_train_batch_size": {
    "desc": null,
    "value": null
  },
  "push_to_hub_organization": {
    "desc": null,
    "value": null
  },
  "include_tokens_per_second": {
    "desc": null,
    "value": false
  },
  "dataloader_prefetch_factor": {
    "desc": null,
    "value": null
  },
  "ddp_find_unused_parameters": {
    "desc": null,
    "value": null
  },
  "include_inputs_for_metrics": {
    "desc": null,
    "value": false
  },
  "per_device_eval_batch_size": {
    "desc": null,
    "value": 8
  },
  "use_legacy_prediction_loop": {
    "desc": null,
    "value": false
  },
  "cross_attention_hidden_size": {
    "desc": null,
    "value": null
  },
  "gradient_accumulation_steps": {
    "desc": null,
    "value": 4
  },
  "per_device_train_batch_size": {
    "desc": null,
    "value": 16
  },
  "encoder_no_repeat_ngram_size": {
    "desc": null,
    "value": 0
  },
  "dataloader_persistent_workers": {
    "desc": null,
    "value": false
  },
  "gradient_checkpointing_kwargs": {
    "desc": null,
    "value": null
  },
  "include_num_input_tokens_seen": {
    "desc": null,
    "value": false
  },
  "scale_attn_by_inverse_layer_idx": {
    "desc": null,
    "value": false
  },
  "exponential_decay_length_penalty": {
    "desc": null,
    "value": null
  },
  "fsdp_transformer_layer_cls_to_wrap": {
    "desc": null,
    "value": null
  },
  "restore_callback_states_from_checkpoint": {
    "desc": null,
    "value": false
  }
}

I get the same performance if I do

  1. 1 node 4 GPUs
  2. 2 nodes each with 2 GPUs

Therefore, it's not about inter-node communication.

There is a clear relationship where more GPUs = worse performance.

OK, I fixed this issue.

The random seed was set to the same value on each GPU.

This meant that all the dropout masks were the same on each device, which led to large and funky gradients.

I think this is a gotcha that other people will run into; maybe you should handle this internally?

This topic was automatically closed 12 hours after the last reply. New replies are no longer allowed.