Hi, I am trying to generate text sentences from keywords. This is similar to what is described in the Medium blog post here.
The input to the model is the keywords plus some auxiliary information separated by the “|” symbol, and the output is the actual sentence that contains those keywords.
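For example (the values below are made up purely for illustration), the model would be given an input such as
london,CITY|rain,WEATHER
and should generate the sentence containing those keywords, e.g. “It was raining when we landed in London.”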
keyword_keylabel_separator = "|"

def splitKeywords(keywords):
    # Keywords are comma-separated; a spaced " , " is turned into " ; " so it is not split on.
    keywords = keywords.replace(" , ", " ; ")
    keywordsList = keywords.split(",")
    return keywordsList

def splitKeyLabels(keylabels):
    # Key labels are space-separated.
    keylabelsList = keylabels.split(" ")
    return keylabelsList

class GPT2DataPreparer:
    def getInputString(self, row):
        keywords = splitKeywords(row["Keyword"])
        keylabels = splitKeyLabels(row["Keylabels"])
        _input = ''
        for keyword, keylabel in zip(keywords, keylabels):
            _input += keyword + "," + keylabel + keyword_keylabel_separator
        _input = _input[:-1]  # drop the trailing separator
        return _input.strip()

    def getOutputString(self, row):
        sentence = row["Sentence"]
        return sentence.strip()

processer = GPT2DataPreparer()
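As a quick sanity check (the row values below are made up), the preparer produces the “|”-separated format like this:

# Illustrative row only; my real data has the same three columns.
sample_row = {"Keyword": "london,rain",
              "Keylabels": "CITY WEATHER",
              "Sentence": "It was raining when we landed in London."}
print(processer.getInputString(sample_row))   # -> london,CITY|rain,WEATHER
print(processer.getOutputString(sample_row))  # -> It was raining when we landed in London.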
import csv
import random
from tqdm import tqdm

def processForT5(readFile, writeFile, shuffle=False):
    with open(readFile, 'r', encoding='utf-8') as csvfile:
        with open(writeFile, 'w', encoding='utf-8') as processedfile:
            reader = csv.DictReader(csvfile)
            writer = csv.DictWriter(processedfile, fieldnames=["Input", "Output"])
            writer.writeheader()
            rows = list(tqdm(reader))
            if shuffle:
                random.shuffle(rows)
            for row in tqdm(rows):
                _dict = {"Input": processer.getInputString(row),
                         "Output": processer.getOutputString(row)}
                writer.writerow(_dict)
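It is then run over the raw train/validation CSVs, roughly like this (the file names below are just placeholders for my actual files):

# Placeholder file names; only the training split is shuffled.
processForT5("train_raw.csv", "train_processed.csv", shuffle=True)
processForT5("valid_raw.csv", "valid_processed.csv")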
MODEL = "gpt2-medium"
cache_dir = '/mnt/store/transformer_model_download_directory/'
tokenizer = GPT2Tokenizer.from_pretrained(MODEL,cache_dir=cache_dir)
tokenizer.pad_token = tokenizer.unk_token
config = GPT2Config.from_pretrained(MODEL,
bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
sep_token_id=tokenizer.sep_token_id,
pad_token_id=tokenizer.pad_token_id,
output_hidden_states=False,cache_dir=cache_dir)
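The model itself is not shown above; it is loaded from the same checkpoint and config, roughly like this:

from transformers import GPT2LMHeadModel

# Roughly how the model is instantiated (the exact call is omitted above):
# same checkpoint, same config, same cache directory.
model = GPT2LMHeadModel.from_pretrained(MODEL, config=config, cache_dir=cache_dir)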
Then I use a transfer-learning approach and fine-tune only the last 12 transformer blocks of the model (plus the final layer norm and the LM head).
UNFREEZE_LAST_N = 12

# Freeze everything first.
for parameter in model.parameters():
    parameter.requires_grad = False

# Only un-freeze the last N transformer blocks.
for i, m in enumerate(model.transformer.h):
    if i + 1 > len(model.transformer.h) - UNFREEZE_LAST_N:
        print("un-freeze block number {}".format(i + 1))
        for parameter in m.parameters():
            parameter.requires_grad = True

# Keep the final layer norm and the LM head trainable as well.
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True
for parameter in model.lm_head.parameters():
    parameter.requires_grad = True
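A quick way to confirm the freezing worked as intended is to count the trainable parameters afterwards:

# Sanity check: only the last blocks, ln_f and lm_head should remain trainable.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print("Trainable parameters: {} / {}".format(trainable, total))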
And this is my overridden Dataset class:
import random
from torch.utils.data import Dataset

MAXLEN = 128

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, randomize=True,
                 input_column_name='Input', output_column_name='Output'):
        self.randomize = randomize
        self.tokenizer = tokenizer
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name
        self.input_data = dataframe[self.input_column_name]
        self.output_data = dataframe[self.output_column_name]
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_data = self.data.iloc[index, 0]
        output_data = self.data.iloc[index, 1]
        input_data_processed = 'GPT: ' + input_data + self.tokenizer.eos_token
        output_data_processed = output_data + self.tokenizer.eos_token
        input_encoded = self.tokenizer(input_data_processed, truncation=True,
                                       max_length=MAXLEN, padding="max_length")
        label_encoded = self.tokenizer(output_data_processed, truncation=True,
                                       max_length=MAXLEN, padding="max_length")
        label = label_encoded['input_ids']
        input_ids = input_encoded['input_ids']
        attention_mask = label_encoded['attention_mask']
        return {'label': label, 'input_ids': input_ids, 'attention_mask': attention_mask}
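The Dataset objects are then built from the processed CSVs, roughly like this (pandas is assumed, and the file names are placeholders):

import pandas as pd

# Placeholder file names for the processed train/validation CSVs produced above.
train_df = pd.read_csv("train_processed.csv")
valid_df = pd.read_csv("valid_processed.csv")

train_dataset = CustomDataset(train_df, tokenizer)
valid_dataset = CustomDataset(valid_df, tokenizer, randomize=False)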
And here are my TrainingArguments:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output_gpt2_with_label/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # fp16=True,
    # fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
)
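Training is then launched with the Trainer API, roughly like this (train_dataset and valid_dataset are the CustomDataset instances from above):

from transformers import Trainer

# Roughly how training is launched; the default data collator renames the
# 'label' key returned by CustomDataset to 'labels' for the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
trainer.train()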
In this example I have replaced the padding token with the unk token, but I have also tried other custom padding tokens; in both cases the model overfits after 3-4 epochs.
I have done the same experiment (i.e. with the same data format) with BART and T5 (there I also used transfer learning, unfreezing the last 6 to 12 layers), and with both of them I got good performance.
Can anyone please help me? Is this happening because of the padding token?