CUDA OOM error when `ignore_mismatched_sizes` is enabled with DeepSpeed ZeRO-3

DeepSpeed works correctly when loading Llama 3 with Hugging Face Transformers. However, when I enable the `ignore_mismatched_sizes` parameter, I get a CUDA out-of-memory error. I am using Llama 3 with LoRA and DeepSpeed ZeRO stage 3 is enabled, so I assume I am overlooking something, but I don't see why this one parameter would cause the error.

This code works without error -

from transformers import AutoModelForCausalLM
from transformers.integrations.deepspeed import HfDeepSpeedConfig
from peft import get_peft_model, LoraConfig, TaskType
import torch
import deepspeed

# HfDeepSpeedConfig is created before from_pretrained (and kept alive) so that
# ZeRO-3 partitioning is applied while the weights are loaded
dschf = HfDeepSpeedConfig(deepspeed_config)

model = AutoModelForCausalLM.from_pretrained(
    "Meta-Llama-3-8B",
    torch_dtype=torch.bfloat16,
)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "query_key_value"],
    bias="none",
)
model = get_peft_model(model, lora_config)

model_engine, optimizer, train_dataloader, lr_scheduler = deepspeed.initialize(
    model=model,
    config=deepspeed_config,
)
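
As a sanity check (just a throwaway debugging snippet of mine, not part of the training script), the base weights should already be in partitioned form at this point: under ZeRO-3 the locally visible tensors are empty and carry DeepSpeed's ds_* attributes (the same ds_tensor that appears in the traceback below), and print_trainable_parameters (a PEFT helper) shows which parameters LoRA left trainable -

# assumes `model` from the block above, after get_peft_model
p = next(model.parameters())
print(p.shape, hasattr(p, "ds_tensor"), getattr(p, "ds_shape", None))  # torch.Size([0]) locally when partitioned
model.print_trainable_parameters()  # prints trainable vs. total parameter counts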

But when I change the `from_pretrained` call to -

model = AutoModelForCausalLM.from_pretrained(
    "Meta-Llama-3-8B",
    torch_dtype=torch.bfloat16,
    ignore_mismatched_sizes=True,
)

I encounter this error -

[rank2]: Traceback (most recent call last):
[rank2]:   File "/home/ubuntu/ai/models/transcript/train.py", line 158, in <module>
[rank2]:     model = AutoModelForCausalLM.from_pretrained(
[rank2]:             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 563, in from_pretrained
[rank2]:     return model_class.from_pretrained(
[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/transformers/modeling_utils.py", line 3754, in from_pretrained
[rank2]:     ) = cls._load_pretrained_model(
[rank2]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4038, in _load_pretrained_model
[rank2]:     with deepspeed.zero.GatheredParameters(not_initialized_parameters, modifier_rank=0):
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 2172, in __enter__
[rank2]:     self.params[0].all_gather(param_list=self.params)
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 1121, in all_gather
[rank2]:     return self._all_gather(param_list, async_op=async_op, hierarchy=hierarchy)
[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
[rank2]:     ret_val = func(*args, **kwargs)
[rank2]:               ^^^^^^^^^^^^^^^^^^^^^
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 1465, in _all_gather
[rank2]:     self._allgather_params_coalesced(all_gather_nonquantize_list, hierarchy, quantize=False)
[rank2]:   File "/home/ubuntu/miniconda3/envs/env/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 1748, in _allgather_params_coalesced
[rank2]:     flat_tensor = torch.empty(tensor_size, dtype=param_list[0].ds_tensor.dtype,
[rank2]:                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU  has a total capacity of 15.77 GiB of which 9.12 MiB is free. Including non-PyTorch memory, this process has 15.76 GiB memory in use. Of the allocated memory 14.71 GiB is allocated by PyTorch, and 125.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
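
From the traceback, the allocation fails inside deepspeed.zero.GatheredParameters(not_initialized_parameters, modifier_rank=0) in _load_pretrained_model, i.e. during from_pretrained itself, before deepspeed.initialize is ever reached. To rule out something else already occupying the GPUs at that point, I can add a quick per-rank memory check right before from_pretrained (log_gpu_memory is just a throwaway helper, not something already in train.py) -

import torch
import torch.distributed as dist

def log_gpu_memory(tag):
    # torch.cuda.mem_get_info returns (free_bytes, total_bytes) for the current device
    free, total = torch.cuda.mem_get_info()
    rank = dist.get_rank() if dist.is_initialized() else 0
    print(f"[rank{rank}] {tag}: {free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")

log_gpu_memory("before from_pretrained")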

This is my DeepSpeed config -

deepspeed_config = {
    "train_batch_size": 32,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 2e-4,
            "betas": [0.9, 0.95],
            "weight_decay": 0,
        },
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 2e-4,
            "warmup_num_steps": 1000,
        },
    },
    "bfloat16": {
        "enabled": True,
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": True,
    },
    "data_efficency": {
        "enabled": True,
    },
    "data_sampling": {
        "enabled": True,
        "num_workers": 8,
    },
}
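
For a rough sense of whether an 8B-parameter bf16 model should fit across the 8x 16GB V100s in my setup (listed below) under ZeRO-3, DeepSpeed ships a memory estimator; a minimal sketch of running it in a separate CPU-only process (i.e. without the HfDeepSpeedConfig context, so the full model is materialized and the parameter counts are real) -

import torch
from transformers import AutoModelForCausalLM
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

# Load on CPU, outside any ZeRO-3 context
model = AutoModelForCausalLM.from_pretrained("Meta-Llama-3-8B", torch_dtype=torch.bfloat16)

# Prints estimated per-GPU / per-node memory needs for ZeRO-3 model states
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=8, num_nodes=1)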

This is my environment -

  • Ubuntu 22.04
  • 1 node with 64 vCPUs, 488GB RAM (AWS EC2 p3.16xlarge)
  • 8x 16GB V100 GPUs (driver version 525.147.05)
  • Python 3.11.5, PyTorch 2.3.0 with CUDA 12.1, deepspeed 0.14.2