Hello everyone,
When I use the DeepSpeed integration with HuggingFace's Trainer on my server with two 32 GB V100 GPUs, I get the error "RuntimeError: Error building extension 'cpu_adam'". I have looked at the similar issues #889 and #2268, but their suggestions don't work for me.
Here’s the part of the code I’m running:
from transformers import TrainingArguments
import deepspeed

# Pre-build the cpu_adam extension so any compilation problem shows up early.
deepspeed.ops.op_builder.CPUAdamBuilder().load()

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    eval_steps=20,
    save_steps=20,
    warmup_steps=100,
    learning_rate=5e-6,
    fp16=True,
    do_train=True,
    do_eval=True,
    deepspeed='deepspeed_config.json',  # hand the DeepSpeed config to the Trainer
    save_strategy='steps',
    save_total_limit=3,
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model='rouge-l',
    logging_steps=50,
    logging_dir='./logs',
)

# MyTrainer is a custom Trainer subclass; model, tokenizer, the datasets
# and collate_fn are defined earlier in the script.
trainer = MyTrainer(
    model,
    training_args,
    train_dataset=train_dataloader,
    eval_dataset=dev_dataloader,
    data_collator=collate_fn,
    tokenizer=tokenizer,
)
trainer.train()
Here is my deepspeed_config.json:
{
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 1,
    "steps_per_print": 50,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 2,
        "contiguous_gradients": false,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e7,
        "allgather_bucket_size": 5e7,
        "cpu_offload": true
    },
    "zero_allow_untested_optimizer": true,
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "Adamw",
        "params": {
            "lr": 5e-6,
            "betas": [0.9, 0.999],
            "eps": 1e-8
        }
    },
    "activation_checkpointing": {
        "partition_activations": false,
        "contiguous_memory_optimization": false
    },
    "wall_clock_breakdown": false
}
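As far as I understand, the "cpu_offload": true setting under zero_optimization makes DeepSpeed use its CPU Adam optimizer, and constructing that optimizer is what triggers the JIT build of the cpu_adam extension. A minimal sketch of that same build path, independent of the Trainer (the parameters here are just placeholders):

# Minimal sketch: constructing DeepSpeedCPUAdam goes through the same
# CPUAdamBuilder().load() path that fails in my script.
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

params = [torch.nn.Parameter(torch.zeros(16))]  # placeholder parameters
optimizer = DeepSpeedCPUAdam(params, lr=5e-6)   # JIT-compiles cpu_adam at construction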
Error output:
(/home/whzhu_st/mypython) whzhu_st@s3:~/Pytorch/GPT/GLM$ python train_glm_finetune.py
Using /home/whzhu_st/.cache/torch_extensions as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/whzhu_st/.cache/torch_extensions/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/2] /usr/local/cuda-11.1/bin/nvcc -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/whzhu_st/mypython/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda-11.1/include -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include/TH -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda-11.1/include -isystem /home/whzhu_st/mypython/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/whzhu_st/mypython/lib/python3.8/site-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o
FAILED: custom_cuda_kernel.cuda.o
/usr/local/cuda-11.1/bin/nvcc -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/whzhu_st/mypython/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda-11.1/include -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include/TH -isystem /home/whzhu_st/mypython/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda-11.1/include -isystem /home/whzhu_st/mypython/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/whzhu_st/mypython/lib/python3.8/site-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o
gcc: fatal error: cannot execute ‘cc1plus’: execvp: No such file or directory
compilation terminated.
nvcc fatal : Failed to preprocess host compiler properties.
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1666, in _run_ninja_build
subprocess.run(
File "/home/whzhu_st/mypython/lib/python3.8/subprocess.py", line 516, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "train_glm_finetune.py", line 115, in <module>
deepspeed.ops.op_builder.CPUAdamBuilder().load()
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 485, in load
return self.jit_load(verbose)
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 520, in jit_load
op_module = load(
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1080, in load
return _jit_compile(
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1293, in _jit_compile
_write_ninja_file_and_build_library(
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1405, in _write_ninja_file_and_build_library
_run_ninja_build(
File "/home/whzhu_st/mypython/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1682, in _run_ninja_build
raise RuntimeError(message) from e
RuntimeError: Error building extension 'cpu_adam'
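The build seems to stop at the line "gcc: fatal error: cannot execute 'cc1plus'" rather than in the DeepSpeed code itself. For reference, here is a small sketch to check which build tools are visible from Python (my own diagnostic, not part of the training script):

# Diagnostic sketch: report which compiler/build tools Python can see.
import shutil
import subprocess

for tool in ("gcc", "g++", "nvcc", "ninja"):
    path = shutil.which(tool)
    print(f"{tool}: {path or 'not found'}")
    if path:
        # Print the version so mismatched toolchains are easy to spot.
        subprocess.run([tool, "--version"], check=False)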
Could anyone please tell me the reason for this problem? Thanks in advance for your help!