I am using Accelerate to train on multiple GPUs. I have set up my accelerate config as multi-gpu with num_processes = 8 (using a p4d.24xlarge machine on AWS). I am trying to train a “google/gemma-2-2B-it” model, so a pretty small model.
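For context, a minimal check like the one below (not part of my actual script, just a sketch using the standard Accelerator attributes) should show each of the 8 processes reporting its own rank and device:

from accelerate import Accelerator

accelerator = Accelerator()
# Each launched process should print its own rank and a distinct cuda device
print(
    f"rank {accelerator.process_index}/{accelerator.num_processes} "
    f"on {accelerator.device}, distributed_type={accelerator.distributed_type}"
)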
Below is a snippet of my code:
# Imports used by this snippet; StageOneDataset and load_config are defined elsewhere in my script.
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn.functional as F
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

def stage_one_initialization(model, tokenizer, data, epochs=2, lr=1e-5, beta_kl=0.1, accelerator=None):
    dataset = StageOneDataset(data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    optimizer = bnb.optim.Adam8bit(model.parameters(), lr=lr, betas=(0.9, 0.995))
    model, optimizer, data_loader = accelerator.prepare(model, optimizer, dataloader)

    for i in range(torch.cuda.device_count()):
        print(f"GPU {i} memory allocated (after prepare) in stage 1: {torch.cuda.memory_allocated(i)}")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in data_loader:
            inputs = {k: v for k, v in batch.items()}
            outputs = model(**inputs, labels=inputs['input_ids'])

            # Cross-entropy loss (first attempt)
            cross_entropy_loss = outputs.loss

            # Log probabilities and apply KL divergence loss
            logits = outputs.logits
            log_probs = F.log_softmax(logits, dim=-1)
            with torch.no_grad():
                target_probs = F.softmax(logits, dim=-1)
            kl_loss = F.kl_div(log_probs, target_probs, reduction='batchmean')

            # Total loss combines cross-entropy and scaled KL divergence
            total_loss_value = cross_entropy_loss + beta_kl * kl_loss

            optimizer.zero_grad()
            # total_loss_value.backward()
            accelerator.backward(total_loss_value)
            optimizer.step()

            total_loss += total_loss_value.item()

        print(f"Stage I - Epoch {epoch+1}, Loss: {total_loss:.4f}")
def main(config_file=None):
    config = load_config(config_file)
    accelerator = Accelerator()
    # device = accelerator.device

    # Load model and tokenizer
    model_name = config["model_name"]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float32, attn_implementation='eager'
    )

    # Load the dataset
    data_file_path = config["data_file"]
    df = pd.read_csv(data_file_path)

    # Prepare the data for Stage I and Stage II
    data_stage_one = df[["question", "original_answer"]].to_dict(orient="records")

    # Stage I training (Initialization)
    stage_one_initialization(
        model, tokenizer, data_stage_one,
        epochs=config["epochs_stage_1"],
        lr=config["learning_rate"],
        beta_kl=config["beta_kl"],
        accelerator=accelerator,
    )
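(Side note on the memory print: the loop over torch.cuda.device_count() runs in every process, so the output below interleaves all 8 ranks. A per-rank version of the same check, purely as a sketch and not in my current script, would be:)

# Hypothetical per-rank variant: each process reports only the device accelerate assigned to it.
allocated = torch.cuda.memory_allocated(accelerator.device)
print(
    f"[rank {accelerator.process_index}] {accelerator.device}: "
    f"{allocated / 1e9:.2f} GB allocated after prepare"
)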
I have a batch_size of just 1. I am printing the memory usage after accelerator.prepare, and I can see that only one GPU has memory allocated while all the others report 0. Here is the output, followed by the stack trace:
GPU 0 memory allocated (after prepare) in stage 1: 20996537344
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 20996537344
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 20996537344
GPU 3 memory allocated (after prepare) in stage 1: 20996537344
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 20996537344
GPU 6 memory allocated (after prepare) in stage 1: 20996537344
GPU 7 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 0 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 1 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 0
GPU 2 memory allocated (after prepare) in stage 1: 20996537344
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 3 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 4 memory allocated (after prepare) in stage 1: 0
GPU 5 memory allocated (after prepare) in stage 1: 20996537344
GPU 5 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 6 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 0
GPU 7 memory allocated (after prepare) in stage 1: 0
[rank1]: Traceback (most recent call last):
[rank1]: File "/root/Google_SCoRe/train.py", line 230, in <module>
[rank1]: main(args.config)
[rank1]: File "/root/Google_SCoRe/train.py", line 203, in main
[rank1]: stage_one_initialization(
[rank1]: File "/root/Google_SCoRe/train.py", line 78, in stage_one_initialization
[rank1]: outputs = model(**inputs, labels=inputs['input_ids'])
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1636, in forward
[rank1]: else self._run_ddp_forward(*inputs, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1454, in _run_ddp_forward
[rank1]: return self.module(*inputs, **kwargs) # type: ignore[index]
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py", line 820, in forward
[rank1]: return model_forward(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py", line 808, in __call__
[rank1]: return convert_to_fp32(self.model_forward(*args, **kwargs))
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 43, in decorate_autocast
[rank1]: return func(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 1047, in forward
[rank1]: outputs = self.model(
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 890, in forward
[rank1]: layer_outputs = decoder_layer(
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 604, in forward
[rank1]: hidden_states, self_attn_weights, present_key_value = self.self_attn(
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 266, in forward
[rank1]: attn_output = self.o_proj(attn_output)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 117, in forward
[rank1]: return F.linear(input, self.weight, self.bias)
[rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 1 has a total capacity of 39.39 GiB of which 18.38 MiB is free. Process 86423 has 39.37 GiB memory in use. Of the allocated memory 37.37 GiB is allocated by PyTorch, and 373.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
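As a side note, the error message suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. If it matters, this is how I would set it (just a sketch; it has to be in place before the first CUDA allocation in every rank, e.g. at the very top of train.py):

import os

# Allocator setting from the OOM message; must be set before CUDA is first used to take effect.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")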
Could someone please help me understand what I am doing wrong here, and why all of the GPUs aren't being used? The computation is not getting distributed.