Hi all, I am trying to fine-tune GPT-2 with DDP in PyTorch. The training part works correctly; my problem is the validation part: how can I aggregate the validation loss across the different GPUs? Also, when I load the saved model (saved from rank 0 only), its size is different from GPT-2, which is very strange to me — why does the model size change? Is the validation part correct? Any ideas to improve it? Many thanks for any comments.
#!/usr/bin/env python
# coding: utf-8
import os
import sys
import gc
import math
import time
import datetime
import random

import numpy as np
import pandas as pd

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from transformers import TextDataset, DataCollatorForLanguageModeling
#from transformers import AutoModelWithLMHead
from transformers import AutoModelForCausalLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange

## the directory that contains the Apex package from NVIDIA
#sys.path.append('/home/momenisa/GPU_ZIP_Apex/apex-master/apex/')
#from apex import amp
#from apex.parallel import DistributedDataParallel as DDP
######################
weight_decay=0
learning_rate=5e-5
adam_epsilon=1e-8
warmup_steps = 1e2
lr=5e-5
Max_length=400
PathData='/home//NLP_Projects/CaseSummary_resolutionProject/Results_GPT_2/model_v4200_k_bs=16_lr=5e-05_epochs=20/'
pretrained_model='/home///GPT_2/'
########################################
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))
################################################
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type=pretrained_model, max_length=400):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
######################################################
def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
#########################################################
def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):

    gpu_id = rank

    Path='/home//NLP_Projects/CaseSummary_resolutionProject/Results_GPT_2/multipleGPU/model_v4'\
         'data_'+str(200)+'_k'+'_'+'bs='+str(batch_size)+'_lr='+str(learning_rate)+'_epochs='+str(total_epochs)
    print(Path)

    Results_Path = Path+'/Results/'
    if not os.path.isdir(Results_Path):
        os.makedirs(Results_Path)

    ### defined variables ###############
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    ###############################
    ddp_setup(rank, world_size)
    ###############################

    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')  # gpt2-small
    model = GPT2LMHeadModel.from_pretrained(pretrained_model)
    model.resize_token_embeddings(len(tokenizer))

    ## load the train and validation datasets
    print(PathData)
    trains_titles = pd.read_csv(PathData+'/'+'traindata.csv')
    valid_titles = pd.read_csv(PathData+'/'+'validdata.csv')

    trains_titles = trains_titles.drop(columns=['Unnamed: 0'])['0'].iloc[:200]
    valid_titles = valid_titles.drop(columns=['Unnamed: 0'])['0'].iloc[:30]

    print(trains_titles.head(2))

    train_dataset = GPT2Dataset(trains_titles, tokenizer, max_length=Max_length)
    Val_dataset = GPT2Dataset(valid_titles, tokenizer, max_length=Max_length)
    ############################################################################
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               pin_memory=True,
                                               shuffle=False,
                                               sampler=DistributedSampler(train_dataset))

    validation_loader = torch.utils.data.DataLoader(dataset=Val_dataset,
                                                    batch_size=batch_size,
                                                    pin_memory=True,
                                                    shuffle=False,
                                                    sampler=DistributedSampler(Val_dataset))

    total_steps = len(train_loader) * total_epochs

    ################# define optimizer and scheduler #########################
    # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

    # Create the learning rate scheduler.
    # This changes the learning rate as the training loop progresses
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)

    ############################## train_loader and validation_loader ##############
    training_steps_per_epoch = len(train_loader)
    total_num_training_steps = int(training_steps_per_epoch*total_epochs)

    ######################## applying DDP on the model for training ############################
    model = model.to(gpu_id)
    model = DDP(model, device_ids=[gpu_id])
    print("gpu_id", gpu_id)
    # ========================================
    #               Training
    # ========================================

    training_stats = []

    for epoch_i in range(0, total_epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, total_epochs))
        print('Training...')

        ##########################################
        train_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(train_loader))[0])
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(train_loader)}")
        ##########################################

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_loader):

            #################################
            b_input_ids = batch[0].to(gpu_id, non_blocking=True)
            b_labels = batch[0].to(gpu_id, non_blocking=True)
            b_masks = batch[1].to(gpu_id, non_blocking=True)
            #################################

            optimizer.zero_grad()

            outputs = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss
            # print("total_train_loss",total_train_loss)

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_loader)
        del total_train_loss
        del batch_loss

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        avg_val_loss_1 = []
        t0 = time.time()

        #################### is this section correct for validation ??????????? #############
        model.eval()
        model = DDP(model, device_ids=[gpu_id])
        ########################################

        total_eval_loss = 0
        nb_eval_steps = 0

        ########################################
        validation_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(validation_loader))[0])
        print("bz", b_sz)
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(validation_loader)}")
        ###########################################

        # Evaluate data for one epoch
        for batch in validation_loader:

            b_input_ids = batch[0].to(gpu_id, non_blocking=True)
            b_labels = batch[0].to(gpu_id, non_blocking=True)
            b_masks = batch[1].to(gpu_id, non_blocking=True)

            with torch.no_grad():
                outputs = model.module(b_input_ids, attention_mask=b_masks, labels=b_labels)
                loss = outputs[0]

            batch_loss = loss.item()
            # print("here batch loss",batch_loss)
            total_eval_loss += batch_loss

        avg_val_loss = total_eval_loss / len(validation_loader)
        # print("here total_eval_loss=",total_eval_loss)

        perplexity = math.exp(avg_val_loss)
        avg_val_loss_1.append(avg_val_loss)

        validation_time = format_time(time.time() - t0)
        del total_eval_loss

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time,
                'perplexity': perplexity
            }
        )
        gc.collect()

        ################### saving the model ########################
        if gpu_id == 0:

            Path2 = Results_Path+'/'+'savemodel_epoch=='+str(epoch_i)
            if not os.path.isdir(Path2):
                os.makedirs(Path2)

            ckp = model.module.state_dict()
            torch.save(ckp, Path2+"/checkpoint.pt")

            ############ save the results #####################
            pt_save_directory = Results_Path+'/'+'analyticsnumber'
            if not os.path.isdir(pt_save_directory):
                os.makedirs(pt_save_directory)

            print("here", training_stats)
            Path_3 = pt_save_directory+'/'+'training_stats='+str(42)+".csv"
            torch.save(training_stats, Path_3)

    destroy_process_group()
#############################
if __name__ == '__main__':

    total_epochs = int(sys.argv[1])
    save_every = int(sys.argv[2])
    batch_size = int(sys.argv[3])

    world_size = (torch.cuda.device_count()) - 1
    print(world_size)

    mp.spawn(main, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size, join=True)
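For the validation-loss question: since each rank only sees its own shard of the validation set (because of DistributedSampler), avg_val_loss above is only the per-rank average. Would something like the sketch below be a reasonable way to aggregate it across GPUs? It is untested, the names loss_sum and batch_count are just placeholders I made up, and it would replace the line avg_val_loss = total_eval_loss / len(validation_loader); the idea is to all_reduce the summed loss and the number of batches of each rank over the process group that ddp_setup already initialized:

# minimal sketch, not tested: aggregate the per-rank validation loss with all_reduce
loss_sum = torch.tensor(total_eval_loss, device=gpu_id)            # sum of the batch losses on this rank
batch_count = torch.tensor(len(validation_loader), device=gpu_id)  # number of validation batches on this rank

dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM)       # total loss over all ranks
dist.all_reduce(batch_count, op=dist.ReduceOp.SUM)    # total number of batches over all ranks

avg_val_loss = (loss_sum / batch_count).item()        # same global average on every rank
perplexity = math.exp(avg_val_loss)

(I realise this weights every batch equally, so counting samples instead of batches would be slightly more exact when the last batch on a rank is smaller.)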