GPT-2 fine-tuning for text generation is overfitting

Hi, I am trying to generate sentences from keywords. This is similar to the approach described in this Medium blog post here.

The input to the model is the keywords plus some auxiliary information (key labels), with each keyword,label pair separated by the "|" symbol, and the output is the actual sentence that contains those keywords. For example (hypothetical values), the keywords "cat,run" with the labels "animal action" would give the input string cat,animal|run,action, and the output would be the original sentence containing "cat" and "run".

import csv
import random

from tqdm import tqdm

keyword_keylabel_separator = "|"


def splitKeywords(keywords):
    # Commas inside a keyword phrase are escaped as ";" so the split on "," keeps phrases intact
    keywords = keywords.replace(" , ", " ; ")
    keywordsList = keywords.split(",")
    return keywordsList


def splitKeyLabels(keylabels):
    keylabelsList = keylabels.split(" ")
    return keylabelsList


class GPT2DataPreparer:
    def getInputString(self, row):
        # Build "keyword,keylabel|keyword,keylabel|..." from the raw CSV row
        keywords = splitKeywords(row["Keyword"])
        keylabels = splitKeyLabels(row["Keylabels"])

        _input = ''
        for keyword, keylabel in zip(keywords, keylabels):
            _input += keyword + "," + keylabel + keyword_keylabel_separator

        _input = _input[:-1]  # drop the trailing separator
        return _input.strip()

    def getOutputString(self, row):
        # The target is simply the original sentence that contains the keywords
        return row["Sentence"].strip()


processer = GPT2DataPreparer()


def processForT5(readFile, writeFile, shuffle=False):
    # Same preprocessing routine I reuse for GPT-2: it writes an Input/Output CSV
    # that the Dataset class below consumes.
    with open(readFile, 'r', encoding='utf-8') as csvfile:
        with open(writeFile, 'w', encoding='utf-8') as processedfile:
            reader = csv.DictReader(csvfile)
            writer = csv.DictWriter(processedfile, fieldnames=["Input", "Output"])
            writer.writeheader()
            rows = list(tqdm(reader))
            if shuffle:
                random.shuffle(rows)
            for row in tqdm(rows):
                _dict = {"Input": processer.getInputString(row),
                         "Output": processer.getOutputString(row)}
                writer.writerow(_dict)
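
# Usage (sketch): the file names below are placeholders, not my actual paths.
processForT5("train.csv", "train_processed.csv", shuffle=True)
processForT5("valid.csv", "valid_processed.csv")
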
from transformers import GPT2Config, GPT2Tokenizer

MODEL = "gpt2-medium"

cache_dir = '/mnt/store/transformer_model_download_directory/'
tokenizer = GPT2Tokenizer.from_pretrained(MODEL, cache_dir=cache_dir)
# GPT-2 has no pad token by default, so reuse the unk token for padding
tokenizer.pad_token = tokenizer.unk_token
config = GPT2Config.from_pretrained(MODEL,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False,
                                    cache_dir=cache_dir)
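
The model itself is loaded in the standard way (not shown in detail above; roughly like this, assuming the same config and cache directory):

from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained(MODEL, config=config, cache_dir=cache_dir)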

Then I take a transfer-learning approach and fine-tune only the last 12 transformer blocks of the model, plus the final layer norm and the LM head:

UNFREEZE_LAST_N = 12

# Freeze every parameter first
for parameter in model.parameters():
    parameter.requires_grad = False

# Then un-freeze only the last N transformer blocks
for i, m in enumerate(model.transformer.h):
    if i + 1 > len(model.transformer.h) - UNFREEZE_LAST_N:
        print("un-freeze block number {}".format(i + 1))
        for parameter in m.parameters():
            parameter.requires_grad = True

# The final layer norm and the LM head stay trainable as well
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True
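
As a quick sanity check (a small sketch, not part of my training script), the number of trainable parameters can be verified like this:

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,}")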

And this is my overridden Dataset class:

import random

from torch.utils.data import Dataset

MAXLEN = 128


class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, randomize=True,
                 input_column_name='Input', output_column_name='Output'):
        self.randomize = randomize
        self.tokenizer = tokenizer
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name
        self.input_data = dataframe[self.input_column_name]
        self.output_data = dataframe[self.output_column_name]
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_data = self.data.iloc[index, 0]    # "Input" column
        output_data = self.data.iloc[index, 1]   # "Output" column

        # Prefix the keyword string and terminate both sides with the EOS token
        input_data_processed = 'GPT: ' + input_data + self.tokenizer.eos_token
        output_data_processed = output_data + self.tokenizer.eos_token

        input_encoded = self.tokenizer(input_data_processed, truncation=True,
                                       max_length=MAXLEN, padding="max_length")
        label_encoded = self.tokenizer(output_data_processed, truncation=True,
                                       max_length=MAXLEN, padding="max_length")

        label = label_encoded['input_ids']
        input_ids = input_encoded['input_ids']
        attention_mask = label_encoded['attention_mask']

        return {'label': label, 'input_ids': input_ids, 'attention_mask': attention_mask}
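
The train and validation datasets are then built from the processed CSVs (sketch; the file names are the placeholder names from the preprocessing step above):

import pandas as pd

train_df = pd.read_csv("train_processed.csv")
val_df = pd.read_csv("valid_processed.csv")

train_dataset = CustomDataset(train_df, tokenizer)
val_dataset = CustomDataset(val_df, tokenizer, randomize=False)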

And here are my TrainingArguments:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output_gpt2_with_label/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
#     fp16=True,
#     fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,     
)
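
The Trainer itself is set up in the usual way (a sketch, assuming the model and datasets defined above):

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()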

In this example I have set the padding token to the unk token, but I have also tried replacing it with other custom tokens. In both cases the model starts overfitting after 3-4 epochs.
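
For reference, the custom-pad-token variant was along these lines (a sketch of the standard add-token-and-resize flow; the token string is just illustrative):

tokenizer.add_special_tokens({"pad_token": "<|pad|>"})   # illustrative token string
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id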

I have run the same experiment (i.e. with the same data format) with BART and T5, also using transfer learning to unfreeze the last 6 to 12 layers, and in both cases I got good performance.

Can anyone please help me? Is this happening because of the padding token?