RoBERTa pre-trained from scratch does not fine-tune well (using PyTorch and DDP)

Hi, I’m a student studying pre-trained language models.

I pre-trained RoBERTa from scratch with the MLM objective.
Training took about two weeks on 4 x RTX 3090.
I used gradient accumulation so that every optimizer update sees an effective batch size of 256; each GPU uses a per-device batch size of 16 (the exact arithmetic is spelled out in the small sketch below).
However, the model we pre-trained from scratch does not fine-tune well.
The graph below shows the accuracy on MRPC and the train loss of our pre-trained model
(green is the pre-trained model loaded from the Hugging Face Hub (roberta-base),
the other is our model).


MRPC Accuracy

(I also wanted to upload the train-loss graph, but as a new user I can only attach one image. It looks similar to the train-loss curves in other papers and converges to around 2.0.)
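To make the batch-size setup concrete, here is the arithmetic (a small sketch that only restates the values from train_roberta.py below, nothing new):

world_size = 4        # number of GPUs (RTX 3090)
batch_size = 16       # per-GPU batch size
update_batch = 256    # effective batch size per optimizer update
accumulate_step = update_batch // (batch_size * world_size)
print(accumulate_step)  # 4 -> optimizer.step() every 4 micro-batches on each GPU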

I have searched a lot, but I couldn’t figure out what is wrong with my code.
Could you please check which part of my code is the problem?
Below are my dataset preprocessing and pre-training scripts.

preprocess_dataset.py

from datasets import load_dataset
from transformers import RobertaTokenizer
import multiprocessing

bookcorpus = load_dataset("bookcorpus", cache_dir="../../dataset/bookcorpus_dataset", split="train")
wiki = load_dataset("wikipedia","20220301.en", cache_dir="../../dataset/wiki_dataset", split="train")
wiki = wiki.remove_columns([col for col in wiki.column_names if col != "text"])

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
num_proc = multiprocessing.cpu_count()

def group_texts(examples):
    # tokenize each example, truncating to the model max length (512 for roberta-base)
    tokenized_inputs = tokenizer(
        examples["text"], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

b = bookcorpus.map(group_texts, remove_columns=["text"], num_proc=num_proc)
a = wiki.map(group_texts, remove_columns=["text"], num_proc=num_proc)

a.save_to_disk("processed_wiki_for_roberta")
b.save_to_disk("processed_bookcorpus_for_roberta")
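For reference, a quick way to sanity-check the saved datasets (a small inspection sketch, separate from the scripts above):

from datasets import load_from_disk

wiki = load_from_disk("processed_wiki_for_roberta")
print(wiki)                       # features: input_ids, attention_mask, special_tokens_mask
print(len(wiki[0]["input_ids"]))  # length of the first tokenized article (truncated at 512)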

train_roberta.py

from transformers import DataCollatorForLanguageModeling, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaForMaskedLM, RobertaConfig
from datasets import load_dataset, load_from_disk

import torch
from torch import nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler, ConcatDataset
import multiprocessing

import numpy as np
from tqdm import tqdm
import os

import wandb

# for distributed training on 4 GPUs (4 x RTX 3090)
dist.init_process_group("nccl")
rank = dist.get_rank()
world_size = dist.get_world_size() # 4
torch.cuda.set_device(rank)
device = torch.cuda.current_device()

EPOCH = 10
update_batch = 256   # effective batch size per optimizer update
batch_size = 16      # per-GPU batch size
accumulate_step = update_batch // (batch_size * world_size)  # = 4 with 4 GPUs

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    
class custom_collate_fn():
    def __init__(self) -> None:
        self.dcl = DataCollatorForLanguageModeling(tokenizer=tokenizer)
        
    def collate_fn(self, batches):
        #get max length in batch
        max_len = max([len(batch['input_ids']) for batch in batches])
        
        # pad each field to the max length in the batch
        input_ids = [batch['input_ids'] + [tokenizer.pad_token_id] * (max_len - len(batch['input_ids'])) for batch in batches]
        attention_mask = [batch['attention_mask'] + [0] * (max_len - len(batch['attention_mask'])) for batch in batches]
        special_tokens_mask = [batch['special_tokens_mask'] + [1] * (max_len - len(batch['special_tokens_mask']))  for batch in batches]
        
        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        special_tokens_mask = torch.LongTensor(special_tokens_mask)
        
        # mask 15% of tokens for MLM
        input_ids, label = self.dcl.torch_mask_tokens(input_ids, special_tokens_mask)
        
        return input_ids, attention_mask, label

# make a directory to save results
result_path = "roberta_base_256"
if rank == 0:
    result_path = os.path.join("results",result_path)
    os.makedirs(result_path, exist_ok=True)
    print("make_dir")

# load the datasets produced by preprocess_dataset.py above
wiki_dataset = load_from_disk("processed_wiki_for_roberta")
bk_dataset = load_from_disk("processed_bookcorpus_for_roberta")

# iterate over the two datasets separately (wiki/bookcorpus) for training speed
split_dataset = True
if split_dataset:
    wiki_collate_fn = custom_collate_fn()
    bk_collate_fn = custom_collate_fn()

    # randomly interleave batches from the two dataloaders; the selection order is
    # broadcast from rank 0 so every rank draws from the same loader at each step
    class CombineDataLoader:
        def __init__(self, dataloader1, dataloader2):
            self.dataloader1 = iter(dataloader1)
            self.dataloader2 = iter(dataloader2)
            
            self.lenDL1 = len(self.dataloader1)
            self.lenDL2 = len(self.dataloader2)
            
            self.select_loader = np.arange(self.lenDL1 + self.lenDL2) < self.lenDL1
            np.random.shuffle(self.select_loader)
            self.select_loader = torch.Tensor(self.select_loader).cuda()
            dist.broadcast(self.select_loader, src=0)
            
        def __iter__(self):
            self.index = 0
            return self
        
        def __len__(self):
            return self.lenDL1 + self.lenDL2
        
        def __next__(self):
            if self.index >= self.__len__():
                raise StopIteration
                
            if self.select_loader[self.index]:
                self.index += 1
                return next(self.dataloader1)
            else:
                self.index += 1
                return next(self.dataloader2)

    wiki_ddp_sampler = DistributedSampler(
        wiki_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )
    bk_ddp_sampler = DistributedSampler(
        bk_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )

    wiki_dataloader = DataLoader(wiki_dataset, batch_size=batch_size, collate_fn=wiki_collate_fn.collate_fn, num_workers=4, sampler=wiki_ddp_sampler)
    # print(len(wiki_dataloader))
    bk_dataloader = DataLoader(bk_dataset, batch_size=batch_size, collate_fn=bk_collate_fn.collate_fn, num_workers=4, sampler=bk_ddp_sampler)
    # print(len(bk_dataloader))

else:
    train_dataset = ConcatDataset([wiki_dataset, bk_dataset])
    
    collate_fn = custom_collate_fn()
    
    train_ddp_sampler = DistributedSampler(
        train_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, sampler=train_ddp_sampler, collate_fn=collate_fn.collate_fn)
    
# same configuration as roberta-base
config_tmp = RobertaConfig.from_pretrained("roberta-base")
model = RobertaForMaskedLM(config_tmp).cuda()

# wrap the model for distributed training
model = DistributedDataParallel(model, device_ids=[device], output_device=device,find_unused_parameters=True)

#optimizer
optimizer = AdamW(model.parameters(), lr=5e-05, betas=(0.9,0.98), eps=1e-6, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=10000000)

step = 1
logging_step = 100
save_step = 250000
loss_list = []
for E in range(EPOCH):
    model.train()
    
    if split_dataset:
        train_dataloader = CombineDataLoader(bk_dataloader, wiki_dataloader)
        
    for batches in tqdm(train_dataloader):
        batches = [batch.cuda() for batch in batches]
        
        out = model(input_ids=batches[0],attention_mask=batches[1],labels=batches[-1])
        
        # gradient accumulation: scale the loss by the number of accumulation steps
        out.loss = out.loss / accumulate_step
        out.loss.backward()
        if step % accumulate_step == 0: 
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        # average the loss across ranks for logging
        dist.reduce(out.loss, op=torch.distributed.ReduceOp.AVG, dst=0)
        
        #logging & save
        if rank == 0:
            loss_list.append(out.loss.item())
            
            if step % logging_step == 0:
                result_dict = {}
                result_dict["step"] = step
                result_dict["loss"] = sum(loss_list)/len(loss_list)
                # if not args.dev:
                #     wandb.log(result_dict)
                loss_list=[]
                
            if step % save_step == 0:
                os.makedirs(os.path.join(result_path,str(step)), exist_ok=True)
                model.module.save_pretrained(os.path.join(result_path,str(step),"model.pt"))
                torch.save(model.state_dict(), os.path.join(result_path,str(step),'model_state_dict.pt'))
                torch.save(optimizer.state_dict(), os.path.join(result_path,str(step),'optimizer.pt'))
        
        #update
        optimizer.zero_grad()
        
        step+=1
    

I launch training with the command below:

python -u -m torch.distributed.launch --nproc_per_node=4 train_roberta.py
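
For MRPC fine-tuning I load a saved checkpoint roughly like this (just a sketch; the step number in the path is illustrative and the classification head is newly initialized on top of the pre-trained encoder):

from transformers import RobertaTokenizer, RobertaForSequenceClassification

# save_pretrained() above writes the config and weights into this "model.pt" directory
ckpt_dir = "results/roberta_base_256/250000/model.pt"
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(ckpt_dir, num_labels=2)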

Thank you very much!

Excuse me, a question about adding a data collator like in the EsperBERTo example.

Do we have to add it in the same way as in that example?
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

Also, should I pass the mlm and mlm_probability options explicitly? (See the sketch below for what I have in mind.)
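
Something like this, reusing tokenizer, wiki_dataset, batch_size, and wiki_ddp_sampler from train_roberta.py above, and assuming the collator can be passed directly as the DataLoader's collate_fn:

from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15  # these two values are also the defaults
)

# the collator pads each batch and masks 15% of the tokens,
# returning a dict with input_ids, attention_mask, and labels
wiki_dataloader = DataLoader(
    wiki_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
    num_workers=4,
    sampler=wiki_ddp_sampler,
)

(The training loop would then index the returned batch by key instead of by position.)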