GPT-2 fine-tuning for text generation is overfitting

Hi, I am trying to generate sentences from keywords. This is similar to the approach described in this Medium blog post here.

The input to the model is the keywords plus some auxiliary information (key labels), with each keyword,label pair separated by the "|" symbol, and the output is the actual sentence that contains those keywords. For example (hypothetical values), the keywords "cat,run" with the labels "animal action" would give the input string cat,animal|run,action, and the output would be the original sentence containing "cat" and "run".

import csv
import random

from tqdm import tqdm

keyword_keylabel_separator = "|"


def splitKeywords(keywords):
    # Commas inside a keyword phrase are escaped as ";" so the split on "," keeps phrases intact
    keywords = keywords.replace(" , ", " ; ")
    keywordsList = keywords.split(",")
    return keywordsList


def splitKeyLabels(keylabels):
    keylabelsList = keylabels.split(" ")
    return keylabelsList


class GPT2DataPreparer:
    def getInputString(self, row):
        # Build "keyword,keylabel|keyword,keylabel|..." from the raw CSV row
        keywords = splitKeywords(row["Keyword"])
        keylabels = splitKeyLabels(row["Keylabels"])

        _input = ''
        for keyword, keylabel in zip(keywords, keylabels):
            _input += keyword + "," + keylabel + keyword_keylabel_separator

        _input = _input[:-1]  # drop the trailing separator
        return _input.strip()

    def getOutputString(self, row):
        # The target is simply the original sentence that contains the keywords
        return row["Sentence"].strip()


processer = GPT2DataPreparer()


def processForT5(readFile, writeFile, shuffle=False):
    # Same preprocessing routine I reuse for GPT-2: it writes an Input/Output CSV
    # that the Dataset class below consumes.
    with open(readFile, 'r', encoding='utf-8') as csvfile:
        with open(writeFile, 'w', encoding='utf-8') as processedfile:
            reader = csv.DictReader(csvfile)
            writer = csv.DictWriter(processedfile, fieldnames=["Input", "Output"])
            writer.writeheader()
            rows = list(tqdm(reader))
            if shuffle:
                random.shuffle(rows)
            for row in tqdm(rows):
                _dict = {"Input": processer.getInputString(row),
                         "Output": processer.getOutputString(row)}
                writer.writerow(_dict)
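
# Usage (sketch): the file names below are placeholders, not my actual paths.
processForT5("train.csv", "train_processed.csv", shuffle=True)
processForT5("valid.csv", "valid_processed.csv")
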
from transformers import GPT2Config, GPT2Tokenizer

MODEL = "gpt2-medium"

cache_dir = '/mnt/store/transformer_model_download_directory/'
tokenizer = GPT2Tokenizer.from_pretrained(MODEL, cache_dir=cache_dir)
# GPT-2 has no pad token by default, so reuse the unk token for padding
tokenizer.pad_token = tokenizer.unk_token
config = GPT2Config.from_pretrained(MODEL,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False,
                                    cache_dir=cache_dir)
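
The model itself is loaded in the standard way (not shown in detail above; roughly like this, assuming the same config and cache directory):

from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained(MODEL, config=config, cache_dir=cache_dir)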

Then I take a transfer-learning approach and fine-tune only the last 12 transformer blocks of the model, plus the final layer norm and the LM head:

UNFREEZE_LAST_N = 12

# Freeze every parameter first
for parameter in model.parameters():
    parameter.requires_grad = False

# Then un-freeze only the last N transformer blocks
for i, m in enumerate(model.transformer.h):
    if i + 1 > len(model.transformer.h) - UNFREEZE_LAST_N:
        print("un-freeze block number {}".format(i + 1))
        for parameter in m.parameters():
            parameter.requires_grad = True

# The final layer norm and the LM head stay trainable as well
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True
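
As a quick sanity check (a small sketch, not part of my training script), the number of trainable parameters can be verified like this:

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,}")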

And this is my overridden Dataset class:

import random

from torch.utils.data import Dataset

MAXLEN = 128


class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, randomize=True,
                 input_column_name='Input', output_column_name='Output'):
        self.randomize = randomize
        self.tokenizer = tokenizer
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name
        self.input_data = dataframe[self.input_column_name]
        self.output_data = dataframe[self.output_column_name]
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_data = self.data.iloc[index, 0]    # "Input" column
        output_data = self.data.iloc[index, 1]   # "Output" column

        # Prefix the keyword string and terminate both sides with the EOS token
        input_data_processed = 'GPT: ' + input_data + self.tokenizer.eos_token
        output_data_processed = output_data + self.tokenizer.eos_token

        input_encoded = self.tokenizer(input_data_processed, truncation=True,
                                       max_length=MAXLEN, padding="max_length")
        label_encoded = self.tokenizer(output_data_processed, truncation=True,
                                       max_length=MAXLEN, padding="max_length")

        label = label_encoded['input_ids']
        input_ids = input_encoded['input_ids']
        attention_mask = label_encoded['attention_mask']

        return {'label': label, 'input_ids': input_ids, 'attention_mask': attention_mask}
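
The train and validation datasets are then built from the processed CSVs (sketch; the file names are the placeholder names from the preprocessing step above):

import pandas as pd

train_df = pd.read_csv("train_processed.csv")
val_df = pd.read_csv("valid_processed.csv")

train_dataset = CustomDataset(train_df, tokenizer)
val_dataset = CustomDataset(val_df, tokenizer, randomize=False)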

And here are my TrainingArguments:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output_gpt2_with_label/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
#     fp16=True,
#     fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,     
)
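
The Trainer itself is set up in the usual way (a sketch, assuming the model and datasets defined above):

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()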

In this example I have set the padding token to the unk token, but I have also tried replacing it with other custom tokens. In both cases the model starts overfitting after 3-4 epochs.
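
For reference, the custom-pad-token variant was along these lines (a sketch of the standard add-token-and-resize flow; the token string is just illustrative):

tokenizer.add_special_tokens({"pad_token": "<|pad|>"})   # illustrative token string
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id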

I have run the same experiment (i.e. with the same data format) with BART and T5, also using transfer learning to unfreeze the last 6 to 12 layers, and in both cases I got good performance.

Can anyone please help me? Is this happening because of the padding token?