Hi everyone,
I’m training a model for MLM and I’m facing a strange issue. When I don’t use `accelerate launch` and simply run `python main.py`, training on 1K samples takes roughly 1 hour for two epochs (`python main.py` obviously uses only one GPU). However, when I use `accelerate launch` to run the script with 4 GPUs, the training time is about 2 hours.
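For clarity, the two runs are launched like this (single machine, 4 GPUs; the explicit flags are just one way to do it, equivalent to the default config shown below):

```bash
# single-GPU run
python main.py

# multi-GPU run via Accelerate (picks up the default config shown below)
accelerate launch main.py
# or with the process count spelled out explicitly:
accelerate launch --multi_gpu --num_processes 4 main.py
```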
I did some logging to see which instructions in the training loop take the longest; it seems `outputs = model(**batch)` and `accelerator.backward(loss)` take far longer to execute under `accelerate launch`. Any ideas or help would be greatly appreciated.
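For reference, this is roughly how I timed the two calls (a simplified sketch, not my exact logging code; the `timed` helper is illustrative, and the `torch.cuda.synchronize()` calls are there so the asynchronous CUDA kernels actually finish before the clock is read):

```python
import time
import torch

def timed(label, accelerator, fn):
    """Run fn() on this process and print the wall-clock time it took."""
    torch.cuda.synchronize()   # drain kernels that were already queued
    start = time.perf_counter()
    result = fn()
    torch.cuda.synchronize()   # wait for the work fn() launched to finish
    elapsed = time.perf_counter() - start
    print(f"[rank {accelerator.process_index}] {label}: {elapsed:.3f}s")
    return result

# inside the training loop:
outputs = timed("forward", accelerator, lambda: model(**batch))
loss = outputs.loss
timed("backward", accelerator, lambda: accelerator.backward(loss))
```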
Here is my `accelerate env` output:
- `Accelerate` version: 0.26.1
- Platform: Linux-5.14.0-284.40.1.el9_2.x86_64-x86_64-with-glibc2.34
- Python version: 3.10.10
- Numpy version: 1.26.3
- PyTorch version (GPU?): 2.1.2+cu121 (False)
- PyTorch XPU available: False
- PyTorch NPU available: False
- System RAM: 503.48 GB
- `Accelerate` default config:
- compute_environment: LOCAL_MACHINE
- distributed_type: MULTI_GPU
- mixed_precision: fp16
- use_cpu: False
- debug: False
- num_processes: 4
- machine_rank: 0
- num_machines: 1
- gpu_ids: all
- rdzv_backend: static
- same_network: True
- main_training_function: main
- downcast_bf16: no
- tpu_use_cluster: False
- tpu_use_sudo: False
- tpu_env: []
And here is my training code:
```python
import math

import torch
from accelerate import Accelerator
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    RobertaForMaskedLM,
    get_scheduler,
)

# MyCustomDataset, prepare_data and whole_word_masking_data_collator
# are defined elsewhere in main.py.


def train_mlm_using_accelerator(args):
    accelerator = Accelerator(mixed_precision=args.quant)

    # Data preparation; main_process_first() lets the main process build
    # (and cache) the dataset before the other processes read it.
    with accelerator.main_process_first():
        dataset = MyCustomDataset(args=args)
        output_dir = args.output_dir + '_mlm'
        global data_collator
        if args.whole_word_masking_mlm:
            data_collator = whole_word_masking_data_collator
        else:
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=dataset.tokenizer, mlm_probability=args.wwm_probability
            )
        # Getting the data loaders
        train_dataloader, eval_dataloader, eval_dataset = prepare_data(
            args=args, dataset=dataset, data_collator=data_collator
        )

    if args.train_from_scratch:
        print("Training from scratch")
        config = AutoConfig.from_pretrained(
            args.checkpoint,
            vocab_size=len(dataset.tokenizer),
            n_ctx=args.chunk_size,
            bos_token_id=dataset.tokenizer.bos_token_id,
            eos_token_id=dataset.tokenizer.eos_token_id,
        )
        model = RobertaForMaskedLM(config)
    else:
        model = AutoModelForMaskedLM.from_pretrained(args.checkpoint)
        model.resize_token_embeddings(len(dataset.tokenizer))

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = args.num_train_epochs * num_update_steps_per_epoch
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    lr_scheduler = accelerator.prepare(lr_scheduler)

    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(args.num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            # Gather the per-process losses so every process sees the full set
            losses.append(accelerator.gather(loss.repeat(args.batch_size)))
        losses = torch.cat(losses)
        losses = losses[: len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")
        print(f"Epoch {epoch}: Perplexity: {perplexity}")

    # Save the model
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        output_dir,
        save_function=accelerator.save,
        is_main_process=accelerator.is_main_process,
    )
    if accelerator.is_main_process:
        dataset.tokenizer.save_pretrained(output_dir)
```