### System Info
```Shell
- `Accelerate` version: 0.29.3
- Platform: Linux-5.15.0-104.119.4.2.el8uek.x86_64-x86_64-with-glibc2.31
- `accelerate` bash location: /tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/bin/accelerate
- Python version: 3.9.19
- Numpy version: 1.24.4
- PyTorch version (GPU?): 2.0.1+cu118 (True)
- PyTorch XPU available: False
- PyTorch NPU available: False
- PyTorch MLU available: False
- System RAM: 2015.21 GB
- GPU type: NVIDIA A100-SXM4-40GB
- `Accelerate` default config:
Not found
```
### Information
- [ ] The official example scripts
- [X] My own modified scripts
### Tasks
- [ ] One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)
- [X] My own task or dataset (give details below)
### Reproduction
I am training with Ray Train's `TorchTrainer` on a Ray cluster, combined with DeepSpeed and the HF transformers `Trainer`, which calls Accelerate implicitly.
`requirements.txt`:
```
transformers[deepspeed]==4.40.1
accelerate==0.29.3
deepspeed==0.14.2
datasets
flash-attn
ray[train]==2.11.0
```
`train.py`
```py
from dataclasses import dataclass, field
import os
from datetime import datetime

import torch
from transformers import HfArgumentParser, TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
from ray.train import ScalingConfig, RunConfig
from ray.train.torch import TorchTrainer
from ray.train.huggingface.transformers import RayTrainReportCallback, prepare_trainer


@dataclass
class ModelArguments:
    model: str = field(default="meta-llama/Llama-2-7b-chat-hf", metadata={"help": "Model path or HF hub ID."})
    use_flash_attn: bool = field(default=True, metadata={"help": "Enable Flash Attention in training."})


@dataclass
class DataArguments:
    dataset: str = field(metadata={"help": "Dataset path or HF hub ID."})


@dataclass
class RayArguments:
    num_workers: int = field(default=1, metadata={"help": "Number of workers to use in Ray cluster."})


def train(training_args: TrainingArguments, model_args: ModelArguments, data_args: DataArguments):
    # bug when initializing distributed state with ray train + transformers
    # https://github.com/ray-project/ray/issues/44204
    # need to manually call deepspeed.init_distributed() here to get the correct world_size
    import deepspeed
    deepspeed.init_distributed()

    # initialize model, tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model,
        trust_remote_code=True,
        token=True,
        attn_implementation="flash_attention_2" if model_args.use_flash_attn else "eager",
        torch_dtype=torch.bfloat16 if model_args.use_flash_attn else torch.float32,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_args.model, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # preprocess dataset
    dataset = load_dataset(data_args.dataset)
    dataset = dataset.map(
        lambda examples: tokenizer(examples["text"]),
        batched=True,
        remove_columns=dataset["train"].column_names
    )
    train_dataset = dataset["train"]
    val_dataset = dataset["validation"]
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # prepare trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)

    # run training
    trainer.train()


if __name__ == "__main__":
    parser = HfArgumentParser((TrainingArguments, ModelArguments, DataArguments, RayArguments))
    training_args, model_args, data_args, ray_args = parser.parse_args_into_dataclasses()

    scaling_config = ScalingConfig(
        num_workers=ray_args.num_workers,
        use_gpu=True,
    )

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    run_config = RunConfig(
        storage_path=training_args.output_dir,
        name=f"train-{timestamp}",
    )

    trainer = TorchTrainer(
        lambda: train(training_args, model_args, data_args),
        scaling_config=scaling_config,
        run_config=run_config,
    )
    result = trainer.fit()
    print(result)
    print(result.best_checkpoints)
```
`runtime-env.json`
```json
{
"pip": "requirements.txt"
}
```
`deepspeed-zero-3.json`
```json
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": 1,
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "none"
},
"offload_param": {
"device": "none"
},
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_clipping": 1.0,
"steps_per_print": "inf",
"bf16": {
"enabled": true
}
}
```
`run-train.sh`
```bash
ray job submit \
--working-dir . \
--runtime-env runtime-env.json \
-- \
python3 train.py \
--model "EleutherAI/gpt-neo-2.7B" \
--dataset "OpenAssistant/oasst2" \
--bf16 True \
--tf32 True \
--output_dir /mnt/output \
--num_train_epochs 2 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "steps" \
--eval_steps 20 \
--save_strategy "steps" \
--save_steps 20 \
--save_total_limit 3 \
--logging_steps 1 \
--learning_rate 2e-5 \
--warmup_ratio 0.03 \
--report_to tensorboard \
--deepspeed deepspeed-zero-3.json \
--use_flash_attn True \
--num_workers 2
```
I get the following error message:
```
ray.exceptions.RayTaskError(AssertionError): ray::_RayTrainWorker__execute.get_next() (pid=85669, ip=10.244.14.216, actor_id=74d3ccf4a9a71a9c05c6e3ed13000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7fee3805dd30>)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/utils.py", line 169, in discard_return_wrapper
train_func(*args, **kwargs)
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/working_dir_files/_ray_pkg_071f1221244a3868/soundhound/polarisnlu/llm/train.py", line 110, in <lambda>
lambda: train(training_args, model_args, data_args),
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/working_dir_files/_ray_pkg_071f1221244a3868/soundhound/polarisnlu/llm/train.py", line 91, in train
trainer.train()
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/transformers/trainer.py", line 1859, in train
return inner_training_loop(
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/transformers/trainer.py", line 2012, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/accelerate/accelerator.py", line 1266, in prepare
result = self._prepare_deepspeed(*args)
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/accelerate/accelerator.py", line 1652, in _prepare_deepspeed
engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/deepspeed/__init__.py", line 167, in initialize
config_class = DeepSpeedConfig(config, mpu)
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/deepspeed/runtime/config.py", line 795, in __init__
self._configure_train_batch_size()
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/deepspeed/runtime/config.py", line 978, in _configure_train_batch_size
self._batch_assertion()
File "/tmp/ray/session_2024-04-25_09-37-01_001890_8/runtime_resources/pip/9ef89d7a93ad0f6658e2ddb0a14f262115166bb8/virtualenv/lib/python3.9/site-packages/deepspeed/runtime/config.py", line 926, in _batch_assertion
assert train_batch == micro_batch * grad_acc * self.world_size, (
AssertionError: Check batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 2 != 2 * 1 * 2
```
The assertion says `train_batch_size` is inconsistent with the other batch parameters, even though I left it as `"auto"` in the DeepSpeed config and never set it manually.
### Expected behavior
`train_batch_size` should come out to 4 in this case: my per-GPU batch size is 2, gradient accumulation is 1, and I have 2 workers with one GPU each.
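For reference, this is just the invariant from DeepSpeed's assertion above, plugged in with the values from my run (illustrative arithmetic, not code from any of the libraries):
```py
# DeepSpeed asserts: train_batch_size == micro_batch_per_gpu * gradient_accumulation_steps * world_size
per_device_train_batch_size = 2   # --per_device_train_batch_size 2
gradient_accumulation_steps = 1   # --gradient_accumulation_steps 1
world_size = 2                    # --num_workers 2, one GPU per worker

expected_train_batch_size = per_device_train_batch_size * gradient_accumulation_steps * world_size
print(expected_train_batch_size)  # 4, but the config DeepSpeed receives says 2
```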
I did a bit of digging and I think `train_batch_size` is incorrectly set in `Accelerator._prepare_deepspeed`:
https://github.com/huggingface/accelerate/blob/6af157ea93dfbace1db88b0fdc7dfb568dfdd5a5/src/accelerate/accelerator.py#L1537-L1539
```py
config_kwargs = {
    "train_micro_batch_size_per_gpu": batch_size_per_device,
    "train_batch_size": batch_size_per_device
```
It shouldn't end up equal to `train_micro_batch_size_per_gpu`: with more than one GPU, the total `train_batch_size` has to be `micro_batch * gradient_accumulation_steps * world_size`, not the per-device value.
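For comparison, here is a rough sketch of the scaling I would expect those lines to apply. Everything below is a placeholder for illustration only; the variable names other than `batch_size_per_device` are not Accelerate's actual attributes:
```py
# Hypothetical sketch, not the real accelerate code: the total batch size
# should scale the per-device micro batch by gradient accumulation and the
# number of processes so that DeepSpeed's _batch_assertion holds.
batch_size_per_device = 2   # per-device micro batch from the Trainer
grad_accum_steps = 1        # placeholder for the configured gradient accumulation steps
num_processes = 2           # placeholder for the actual world size

config_kwargs = {
    "train_micro_batch_size_per_gpu": batch_size_per_device,
    "train_batch_size": batch_size_per_device * grad_accum_steps * num_processes,
}
```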
I was able to work around the problem by resetting `train_batch_size` to `None` with a very hacky monkey-patch of `deepspeed.initialize`:
```py
def train():
    # ...
    import functools

    def fix_train_batch_size(function):
        @functools.wraps(function)
        def wrapped_function(*args, config_params=None, **kwargs):
            if isinstance(config_params, dict):
                config_params['train_batch_size'] = None
                print('Reset train_batch_size in config_params')
            return function(*args, config_params=config_params, **kwargs)
        return wrapped_function

    deepspeed.initialize = fix_train_batch_size(deepspeed.initialize)
```