Hi, I’m a student studying pre-trained LLMs.
I pre-trained RoBERTa from scratch with masked language modeling (MLM).
Pre-training took about two weeks on 4× RTX 3090.
I used gradient accumulation so that every optimizer update sees an effective batch size of 256; each GPU processes a micro-batch of 16.
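For clarity, the effective batch size works out like this (a small sketch that just mirrors the constants in train_roberta.py below):

batch_size = 16                                              # micro-batch per GPU
world_size = 4                                               # number of GPUs
accumulate_step = 256 // (batch_size * world_size)           # = 4 backward passes per optimizer.step()
effective_batch = batch_size * world_size * accumulate_step  # = 256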
However, the model we pre-trained from scratch does not fine-tune well.
The graphs below are the MRPC accuracy and the train loss of our pre-trained model
(green is the pre-trained roberta-base loaded from the Hugging Face Hub;
the other curve is our model).
[figure: MRPC accuracy]
(I wanted to upload the train-loss graph as well, but as a new user I can only attach one image. It looks similar to the train-loss curves in other papers and converges around 2.0.)
I searched a lot, but I could not figure out what is wrong with my code.
Could you please check which part of my code is the problem?
Below are my dataset preprocessing and pre-training scripts.
preprocess_dataset.py
from datasets import load_dataset
from transformers import RobertaTokenizer
import multiprocessing
bookcorpus = load_dataset("bookcorpus", cache_dir="../../dataset/bookcorpus_dataset", split="train")
wiki = load_dataset("wikipedia","20220301.en", cache_dir="../../dataset/wiki_dataset", split="train")
wiki = wiki.remove_columns([col for col in wiki.column_names if col != "text"])
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
num_proc = multiprocessing.cpu_count()
def group_texts(examples):
    # tokenize each text, truncating to the model max length (512 for roberta-base)
    tokenized_inputs = tokenizer(
        examples["text"], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

b = bookcorpus.map(group_texts, remove_columns=["text"], num_proc=num_proc)
a = wiki.map(group_texts, remove_columns=["text"], num_proc=num_proc)

a.save_to_disk("processed_wiki_for_roberta")
b.save_to_disk("processed_bookcorpus_for_roberta")
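In case it helps, the saved datasets can be inspected like this (a small sketch, not part of my training run):

# quick sanity check of the saved datasets (illustrative only)
from datasets import load_from_disk

wiki_check = load_from_disk("processed_wiki_for_roberta")
print(wiki_check.column_names)           # expect input_ids, attention_mask, special_tokens_mask
print(len(wiki_check[0]["input_ids"]))   # per-example length, truncated to 512 for roberta-base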
train_roberta.py
from transformers import DataCollatorForLanguageModeling, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaForMaskedLM, RobertaConfig
from datasets import load_dataset, load_from_disk
import torch
from torch import nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler, ConcatDataset
import multiprocessing
import numpy as np
from tqdm import tqdm
import os
import wandb
# for distributed training on 4 GPUs (4 × RTX 3090)
dist.init_process_group("nccl")
rank = dist.get_rank()
world_size = dist.get_world_size() # 4
torch.cuda.set_device(rank)
device = torch.cuda.current_device()
EPOCH = 10
update_batch = 256
batch_size = 16
accumulate_step = update_batch // (batch_size * world_size)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
class custom_collate_fn():
    def __init__(self) -> None:
        self.dcl = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    def collate_fn(self, batches):
        # get the max sequence length in this batch
        max_len = max([len(batch['input_ids']) for batch in batches])
        # pad the tokenizer outputs up to max_len
        input_ids = [batch['input_ids'] + [tokenizer.pad_token_id] * (max_len - len(batch['input_ids'])) for batch in batches]
        attention_mask = [batch['attention_mask'] + [0] * (max_len - len(batch['attention_mask'])) for batch in batches]
        special_tokens_mask = [batch['special_tokens_mask'] + [1] * (max_len - len(batch['special_tokens_mask'])) for batch in batches]
        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        special_tokens_mask = torch.LongTensor(special_tokens_mask)
        # mask 15% of tokens for MLM
        input_ids, label = self.dcl.torch_mask_tokens(input_ids, special_tokens_mask)
        return input_ids, attention_mask, label
# make a directory for saving results
result_path = "roberta_base_256"
if rank == 0:
    result_path = os.path.join("results", result_path)
    os.makedirs(result_path, exist_ok=True)
    print("make_dir")
# load the datasets saved by preprocess_dataset.py
wiki_dataset = load_from_disk("processed_wiki_for_roberta")
bk_dataset = load_from_disk("processed_bookcorpus_for_roberta")
# iterate over the two datasets (wiki/bookcorpus) separately for training speed
split_dataset = True
if split_dataset:
    wiki_collate_fn = custom_collate_fn()
    bk_collate_fn = custom_collate_fn()

    class CombineDataLoader:
        # randomly interleaves batches from two dataloaders;
        # the interleaving order is broadcast from rank 0 so all ranks stay in sync
        def __init__(self, dataloader1, dataloader2):
            self.dataloader1 = iter(dataloader1)
            self.dataloader2 = iter(dataloader2)
            self.lenDL1 = len(self.dataloader1)
            self.lenDL2 = len(self.dataloader2)
            self.select_loader = np.arange(self.lenDL1 + self.lenDL2) < self.lenDL1
            np.random.shuffle(self.select_loader)
            self.select_loader = torch.Tensor(self.select_loader).cuda()
            dist.broadcast(self.select_loader, src=0)

        def __iter__(self):
            self.index = 0
            return self

        def __len__(self):
            return self.lenDL1 + self.lenDL2

        def __next__(self):
            if self.index >= self.__len__():
                raise StopIteration
            if self.select_loader[self.index]:
                self.index += 1
                return next(self.dataloader1)
            else:
                self.index += 1
                return next(self.dataloader2)

    wiki_ddp_sampler = DistributedSampler(
        wiki_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )
    bk_ddp_sampler = DistributedSampler(
        bk_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )
    wiki_dataloader = DataLoader(wiki_dataset, batch_size=batch_size, collate_fn=wiki_collate_fn.collate_fn, num_workers=4, sampler=wiki_ddp_sampler)
    # print(len(wiki_dataloader))
    bk_dataloader = DataLoader(bk_dataset, batch_size=batch_size, collate_fn=bk_collate_fn.collate_fn, num_workers=4, sampler=bk_ddp_sampler)
    # print(len(bk_dataloader))
else:
    train_dataset = ConcatDataset([wiki_dataset, bk_dataset])
    collate_fn = custom_collate_fn()
    train_ddp_sampler = DistributedSampler(
        train_dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, sampler=train_ddp_sampler, collate_fn=collate_fn.collate_fn)
# same configuration as roberta-base, with randomly initialized weights
config_tmp = RobertaConfig.from_pretrained("roberta-base")
model = RobertaForMaskedLM(config_tmp).cuda()
# wrap the model for distributed training
model = DistributedDataParallel(model, device_ids=[device], output_device=device, find_unused_parameters=True)
#optimizer
optimizer = AdamW(model.parameters(), lr=5e-05, betas=(0.9,0.98), eps=1e-6, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=10000000)
step = 1
logging_step = 100
save_step = 250000
loss_list = []
for E in range(EPOCH):
    model.train()
    if split_dataset:
        train_dataloader = CombineDataLoader(bk_dataloader, wiki_dataloader)
    for batches in tqdm(train_dataloader):
        batches = [batch.cuda() for batch in batches]
        out = model(input_ids=batches[0], attention_mask=batches[1], labels=batches[-1])
        # scale the loss for gradient accumulation
        out.loss = out.loss / accumulate_step
        out.loss.backward()
        if step % accumulate_step == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        # average the loss across GPUs for logging on rank 0
        dist.reduce(out.loss, op=torch.distributed.ReduceOp.AVG, dst=0)
        # logging & saving
        if rank == 0:
            loss_list.append(out.loss.item())
            if step % logging_step == 0:
                result_dict = {}
                result_dict["step"] = step
                result_dict["loss"] = sum(loss_list) / len(loss_list)
                # if not args.dev:
                #     wandb.log(result_dict)
                loss_list = []
            if step % save_step == 0:
                os.makedirs(os.path.join(result_path, str(step)), exist_ok=True)
                model.module.save_pretrained(os.path.join(result_path, str(step), "model.pt"))
                torch.save(model.state_dict(), os.path.join(result_path, str(step), 'model_state_dict.pt'))
                torch.save(optimizer.state_dict(), os.path.join(result_path, str(step), 'optimizer.pt'))
        # update step counter
        optimizer.zero_grad()
        step += 1
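For context, a saved checkpoint later gets loaded for MRPC fine-tuning roughly like this (a minimal sketch, not my actual fine-tuning script; the checkpoint path is just an example, here the first save at step 250000):

# sketch: loading a pre-training checkpoint for sequence classification (path is illustrative)
from transformers import RobertaForSequenceClassification

ckpt_dir = "results/roberta_base_256/250000/model.pt"  # directory written by save_pretrained above
model = RobertaForSequenceClassification.from_pretrained(ckpt_dir, num_labels=2)  # MRPC has 2 labels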
I launch pre-training with the following command:
python -u -m torch.distributed.launch --nproc_per_node=4 train_roberta.py
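(If I understand correctly, torchrun --nproc_per_node=4 train_roberta.py would be the equivalent command on newer PyTorch versions.)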
Thank you very much!