Hello,
I have the following simple script to fine-tune a BERT model. It worked correctly with a single GPU, but once I added DistributedDataParallel it fails with the following error:
"You should supply an encoding or a list of encodings to this method "
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided
It seems the tokenizer is no longer producing the expected output once DDP is used. Does anyone know how to solve this?
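As a quick sanity check (just an illustrative standalone call, not part of the training script), this is the kind of output I expect the tokenizer to produce on its own:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
# A standalone call should return a BatchEncoding whose keys include
# 'input_ids' and 'attention_mask' (the example text is only for illustration).
enc = tok(["hello world"], truncation=True, padding='max_length', max_length=32,
          return_tensors='pt')
print(enc.keys())

Here is the full script: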
import os

import torch
from datasets import load_dataset
from transformers import (AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments)


def encode_batch(batch, tokenizer, max_length):
    # Tokenize a batch of raw text lines into fixed-length encodings.
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=max_length,
                     return_tensors='pt')

os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "12345"
torch.distributed.init_process_group(backend="nccl", rank=0, world_size=1)
local_rank = torch.distributed.get_rank()
model_checkpoint = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
print(model.num_parameters())
model = model.to("cuda")
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
train_data = './train_data/'
test_data = './eval.txt'
train_dataset = load_dataset('text', data_dir=train_data, streaming=True, split="train")
test_dataset = load_dataset('text', data_files=test_data, streaming=True, split="train")
max_length = 32
batch_size = 512
tokenized_train_dataset = train_dataset.map(encode_batch, batched=True, batch_size=batch_size,
                                            fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
tokenized_test_dataset = test_dataset.map(encode_batch, batched=True, batch_size=batch_size,
                                          fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.3
)
training_args = TrainingArguments(
    output_dir='./model_dir',
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=100,
    num_train_epochs=1,
    save_strategy='steps',
    save_steps=100,
    max_steps=300,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=3,
    local_rank=local_rank,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)
trainer.train()
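For completeness, here is a small debugging sketch (assuming the script above has run at least up to the map() calls) that I would use to see which columns the mapped streaming dataset actually yields before the data collator gets them:

# Pull one example from the mapped streaming dataset and list its columns,
# e.g. to check whether 'input_ids' is present alongside the original 'text'.
example = next(iter(tokenized_train_dataset))
print(example.keys())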