I am trying to fit big MoE models in a multi-host, multi-GPU setup using deepspeed stage 2 or 3 for full finetuning. I tried a number of settings, with both smaller facebook/opt-350m
as well as deepseek-ai/DeepSeek-Coder-V2-Lite-Base
. The observations are that turning multinode on doesn’t decrease memory usage / doesn’t help fitting the big model into the GPU.
- Am I doing something obviously wrong with example settings below? In both of them I observed the ~same memory usage (on each gpu) when swapping from 1 machine to 2.
- Is my expectation for a ~2x decrease in memory usage of a particular GPU when scaling from 1 machine to 2 with zero stage 3 wrong?
- How do I debug this more? I saw this guide but it didn’t help me understand the issue.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
gradient_accumulation_steps: 1
offload_optimizer_device: cpu
offload_param_device: none
zero3_init_flag: false
zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
dynamo_backend: INDUCTOR
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
another:
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_hostfile: /home/ubuntu/code/hostfile
deepspeed_multinode_launcher: pdsh
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
dynamo_backend: INDUCTOR
enable_cpu_affinity: false
machine_rank: 0
main_process_ip: my_ip
main_process_port: 57313
main_training_function: main
num_machines: 2
num_processes: 16
rdzv_backend: static
same_network: false
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
{
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"overlap_comm": false,
"contiguous_gradients": false,
"offload_optimizer": {
"device": "cpu"
},
"offload_param": {
"device": "none"
}
},
"optimizer": {
"type": "Adam",
"params": {
"lr": "auto",
"weight_decay": "auto",
"bias_correction": true,
"betas": "auto",
"eps": 1e-8
}
},
"use_node_local_storage": true,
"zero_allow_untested_optimizer": true,
"train_micro_batch_size_per_gpu": "auto",
"mixed_precision": "bf16",
"round_robin_gradients": true
}