Tokenizer cannot produce correct output when using DistributedDataParallel

Hello,

I have the following simple code to fine-tune a BERT model. It worked correctly with just 1 GPU, but once I added DistributedDataParallel it started failing with the following error:
"You should supply an encoding or a list of encodings to this method "
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided

It seems the tokenizer is no longer producing the correct output once DDP is involved. Does anyone know how to solve this?
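As far as I can tell, the ValueError is raised by the padding step inside the data collator when the examples it receives contain no 'input_ids' key. For reference, here is a minimal, self-contained sketch (the two sample sentences are made up; the checkpoint and encode_batch mirror the script below) that runs only the tokenization step, without DDP or the Trainer, to check whether 'input_ids' actually shows up in the mapped examples:

# Minimal sketch, no DDP/Trainer involved: the sample texts are made up,
# the checkpoint and encode_batch mirror the script below.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def encode_batch(batch, tokenizer, max_length):
    return tokenizer(batch['text'], truncation=True, padding='max_length',
                     max_length=max_length, return_tensors='pt')

# Tiny in-memory stand-in for the streaming text dataset
raw = Dataset.from_dict({"text": ["hello world", "ein kleiner test"]})
tokenized = raw.map(encode_batch, batched=True, batch_size=2,
                    fn_kwargs={'tokenizer': tokenizer, 'max_length': 32})

# The collator needs 'input_ids' (and 'attention_mask') in every example
print(tokenized[0].keys())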

import os

import torch
from datasets import load_dataset
from transformers import (AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments)


# Tokenize a batch of raw text lines into fixed-length encodings
def encode_batch(batch, tokenizer, max_length):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=max_length,
                     return_tensors='pt')

os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "12345"

torch.distributed.init_process_group(backend="nccl", rank=0, world_size=1)
local_rank = torch.distributed.get_rank()

model_checkpoint = "bert-base-multilingual-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
print(model.num_parameters())

model = model.to("cuda")
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                  output_device=local_rank)

train_data= './train_data/'
test_data = './eval.txt'
train_dataset = load_dataset('text', data_dir=train_data, streaming=True, split="train")
test_dataset = load_dataset('text', data_files=test_data, streaming=True, split="train")

max_length = 32
batch_size = 512
tokenized_train_dataset = train_dataset.map(encode_batch, batched=True, batch_size=batch_size,
                                            fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
tokenized_test_dataset = test_dataset.map(encode_batch, batched=True, batch_size=batch_size,
                                          fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})


# Masked-language-modelling collator with 30% masking probability
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.3
)
training_args = TrainingArguments(
    output_dir='./model_dir',
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=100,
    num_train_epochs=1,
    save_strategy='steps',
    save_steps=100,
    max_steps=300,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=3,
    local_rank=local_rank,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()
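
In case the launch setup matters: my understanding is that with more than one process the script would normally be started with torchrun, which sets RANK, WORLD_SIZE, and LOCAL_RANK for each process, instead of hard-coding rank=0 / world_size=1 as I do above. A rough sketch of that pattern (the checkpoint is the same as above; the rest is an assumption about the usual setup, not something my script currently does):

# Assumed torchrun-style setup (launch: torchrun --nproc_per_node=<num_gpus> train.py)
import os
import torch
from transformers import AutoModelForMaskedLM

local_rank = int(os.environ["LOCAL_RANK"])            # set by torchrun for each process
torch.cuda.set_device(local_rank)                      # bind this process to one GPU
torch.distributed.init_process_group(backend="nccl")   # rank/world size read from the env

model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-uncased").to(local_rank)
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[local_rank], output_device=local_rank)

I don't know whether that is related to the tokenizer error; I only mention it for completeness.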