I tried to train with DeepSpeed ZeRO Stage 3 via huggingface/accelerate, but I ran into the following error:
File "/usr/lib/python3/dist-packages/deepspeed/runtime/zero/stage3.py", line 2117, in unscale_and_clip_grads
self.fp32_partitioned_groups_flat[sub_group_id].grad.mul_(1. / combined_scale)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
The same error occurs even when I run the script with only 1 process.
Has anyone else experienced this issue? What steps can I take to resolve it?
Command:
accelerate launch --config_file deepspeed.stage3.json training_sample.py
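The single-process case mentioned above can be reproduced by overriding the process count at launch time (shown here only as a sketch; it has the same effect as setting num_processes to 1 in the config file):
accelerate launch --num_processes 1 --config_file deepspeed.stage3.json training_sample.py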
Script: training_sample.py
# Fine-tune a pretrained model. (2024, May 17). Retrieved from https://huggingface.co/docs/transformers/ja/training
# ! pip install transformers datasets
# ! pip install torch accelerate evaluate scikit-learn
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# create datasets
dataset = load_dataset("yelp_review_full")
tokenized_datasets = dataset.map(
    tokenize_function, batched=True,
)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=5
)

# create training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    # fp16=True,
)

# create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# execute training
trainer.train()
Config file: deepspeed.stage3.json
{
  "compute_environment": "LOCAL_MACHINE",
  "debug": false,
  "deepspeed_config": {
    "gradient_accumulation_steps": 1,
    "offload_optimizer_device": "cpu",
    "offload_param_device": "cpu",
    "zero3_init_flag": true,
    "zero3_save_16bit_model": false,
    "zero_stage": 3
  },
  "distributed_type": "DEEPSPEED",
  "downcast_bf16": "no",
  "enable_cpu_affinity": false,
  "machine_rank": 0,
  "main_training_function": "main",
  "mixed_precision": "no",
  "num_machines": 1,
  "num_processes": 2,
  "rdzv_backend": "static",
  "same_network": true,
  "tpu_env": [],
  "tpu_use_cluster": false,
  "tpu_use_sudo": false,
  "use_cpu": false
}
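For context, my understanding is that the deepspeed_config block above maps to roughly the following native DeepSpeed ZeRO settings (a sketch based on the DeepSpeed config schema; the config that accelerate actually generates at runtime may differ):
{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": { "device": "cpu" },
    "offload_param": { "device": "cpu" },
    "stage3_gather_16bit_weights_on_model_save": false
  },
  "gradient_accumulation_steps": 1
}
In other words, both the optimizer states and the parameters are offloaded to CPU, and no mixed precision is enabled.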
Full Traceback:
Traceback (most recent call last):
  File "training_sample.py", line 54, in <module>
    trainer.train()
  File "/usr/lib/python3/dist-packages/transformers/trainer.py", line 1859, in train
    return inner_training_loop(
  File "/usr/lib/python3/dist-packages/transformers/trainer.py", line 2203, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/usr/lib/python3/dist-packages/transformers/trainer.py", line 3147, in training_step
    self.accelerator.backward(loss)
  File "/usr/lib/python3/dist-packages/accelerate/accelerator.py", line 2117, in backward
    self.deepspeed_engine_wrapped.backward(loss, **kwargs)
  File "/usr/lib/python3/dist-packages/accelerate/utils/deepspeed.py", line 175, in backward
    self.engine.step()
  File "/usr/lib/python3/dist-packages/deepspeed/runtime/engine.py", line 2169, in step
    self._take_model_step(lr_kwargs)
  File "/usr/lib/python3/dist-packages/deepspeed/runtime/engine.py", line 2075, in _take_model_step
    self.optimizer.step()
  File "/usr/lib/python3/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/usr/lib/python3/dist-packages/deepspeed/runtime/zero/stage3.py", line 2047, in step
    self.unscale_and_clip_grads(sub_group_id, scaled_global_grad_norm)
  File "/usr/lib/python3/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/usr/lib/python3/dist-packages/deepspeed/runtime/zero/stage3.py", line 2117, in unscale_and_clip_grads
    self.fp32_partitioned_groups_flat[sub_group_id].grad.mul_(1. / combined_scale)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!