GPT2 for QA Pair Generation

If I only wanted to generate questions, would I set the attention_mask to 0 for the question tokens and use those tokens' ids as the labels? Something like:

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
def my_data_collator(text_str):
    """Collate a raw text string into a kwargs dict for GPT2LMHeadModel.

    Tokenizes ``text_str`` with the module-level GPT-2 tokenizer, overrides
    the attention mask via ``set_my_attention_mask`` (zeroing the mask over
    the ``question:`` span), and takes the question token ids as ``labels``.

    Returns a dict whose keys mirror GPT2LMHeadModel.forward's parameters.
    """
    encoded_results = tokenizer(text_str, padding=True, truncation=True, return_tensors='pt',
                                return_attention_mask=True)
    # BUGFIX: was misspelled `enncoded_results`, which created a stray variable
    # instead of updating the mask actually placed into `batch` below.
    encoded_results['attention_mask'] = set_my_attention_mask(encoded_results)  # function to set attention mask to 0 on tokens in the question:... part of text_str
    label_ids = get_my_label_str(encoded_results['input_ids'])  # function to return list of token ids for question:... part of text_str

    # NOTE(review): GPT2LMHeadModel ignores loss only at label positions set
    # to -100; zeroing attention_mask hides tokens from the model entirely
    # rather than masking the loss — confirm that is the intended behavior.
    batch = {}
    batch['input_ids'] = encoded_results['input_ids']
    batch['past'] = None
    batch['attention_mask'] = encoded_results['attention_mask']
    batch['position_ids'] = None
    batch['head_mask'] = None
    batch['inputs_embeds'] = None
    batch['labels'] = label_ids
    batch['use_cache'] = True
    return batch

# Example training string: context, question, and answer concatenated into one sequence.
text_str = 'context: 42 is the answer to life, the universe and everything. question: What is the answer to life, universe and everything ? answer: 42'

And batch would get passed to a GPT2LMHeadModel?

1 Like