Hello,
Is there a specific way we are supposed to push a model to the Huggingface hub while training with multiple GPUs?
I receive this error when trying to push to hub during distributed training:
Traceback (most recent call last):
File "/home/dl/demo/train.py", line 152, in <module>
model.push_to_hub("code-model")
File "/opt/conda/envs/demo/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1207, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'DistributedDataParallel' object has no attribute 'push_to_hub'
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3987 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3988 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3989 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3986) of binary: /opt/conda/envs/demo/bin/python
Example a basic training script with accelerate:
import transformers
import argparse
from tqdm import tqdm
import copy
from transformers import GPT2Tokenizer, OPTForCausalLM, get_scheduler, default_data_collator
from datasets import load_dataset
from itertools import chain
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
accelerator = Accelerator()
parser = argparse.ArgumentParser()
parser.add_argument('--seq_len', default = 2048, type = int)
parser.add_argument('--batch_size', default = 4, type = int)
parser.add_argument('--num_proc', default = 16, type = int)
parser.add_argument('--gradient_accumulation_steps', default = 1, type = int)
parser.add_argument('--epochs', default = 1, type = int)
args = parser.parse_args()
# Constants
EPOCHS = args.epochs
SEQ_LEN = args.seq_len
gradient_accumulation_steps = args.gradient_accumulation_steps
BATCH_SIZE = args.batch_size
NUM_PROC = args.num_proc
model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
optimizer = AdamW(model.parameters(), lr=3e-5)
# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
# Load dataset
load_train_dataset = load_dataset('conceptofmind/code-train-dedup')
# Tokenizer
def tokenize(examples):
seq_length = SEQ_LEN
examples = tokenizer(examples['content'])
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
if total_length >= seq_length:
total_length = (total_length // seq_length) * seq_length
result = {
k: [t[i : i + seq_length] for i in range(0, total_length, seq_length)]
for k, t in concatenated_examples.items()
}
result["labels"] = copy.deepcopy(result["input_ids"])
return result
with accelerator.main_process_first():
tokenized_train_dataset = load_train_dataset.map(
tokenize,
batched = True,
num_proc = NUM_PROC,
remove_columns = 'content'
)
pytorch_train_dataset = tokenized_train_dataset.with_format('torch')
# Create dataloader
train_dataloader = DataLoader(
pytorch_train_dataset['train'],
shuffle = True,
drop_last = True,
collate_fn = default_data_collator,
batch_size = BATCH_SIZE
)
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=1000,
num_training_steps = (len(train_dataloader) * EPOCHS) // gradient_accumulation_steps
)
model, optimizer, train_dataloader = accelerator.prepare(
model, optimizer, train_dataloader
)
progress_bar = tqdm(range(EPOCHS * len(train_dataloader)), disable=not accelerator.is_main_process)
model.train()
for epoch in range(EPOCHS):
for step, batch in enumerate(train_dataloader, start=1):
# Do training
loss = model(**batch).loss
loss = loss / gradient_accumulation_steps
accelerator.backward(loss)
accelerator.clip_grad_norm_(model.parameters(), 1.0)
if step % gradient_accumulation_steps == 0:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
if step % 5 == 0:
if accelerator.is_main_process:
model.push_to_hub("code-model")
if accelerator.is_main_process:
model.push_to_hub("code-model-final")
Does the model need to be unwrapped before pushing to hub?
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.push_to_hub("code-350-model")
I am unfamiliar with accelerate.
Thank you for any help!