If I only wanted to generate questions, would I set the attention_mask
for those tokens to 0 and use their text as the labels
? Something like:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
def my_data_collator(text_str):
    """Tokenize *text_str* and build a kwargs batch for GPT2LMHeadModel.

    Args:
        text_str: raw text (or list of texts) in the
            'context: ... question: ... answer: ...' format.

    Returns:
        dict of keyword arguments matching GPT2LMHeadModel.forward's
        signature (input_ids, attention_mask, labels, etc.).
    """
    encoded_results = tokenizer(text_str, padding=True, truncation=True, return_tensors='pt',
                                return_attention_mask=True)
    # BUG FIX: original wrote 'enncoded_results' here, which would raise
    # NameError before the batch was ever built.
    encoded_results['attention_mask'] = set_my_attention_mask(encoded_results) #function to set attention mask to 0 on tokens in the question:... part of text_str
    label_ids = get_my_label_str(encoded_results['input_ids']) #function to return list of token ids for question:... part of text_str
    # NOTE(review): in transformers, tokens excluded from the LM loss are
    # conventionally marked with label id -100 (ignored by CrossEntropyLoss),
    # not by zeroing attention_mask — attention_mask controls what the model
    # attends to, not what contributes to the loss. Confirm get_my_label_str
    # follows that convention.
    batch = {}
    batch['input_ids'] = encoded_results['input_ids']
    batch['past'] = None
    batch['attention_mask'] = encoded_results['attention_mask']
    batch['position_ids'] = None
    batch['head_mask'] = None
    batch['inputs_embeds'] = None
    batch['labels'] = label_ids
    batch['use_cache'] = True
    return batch
text_str = 'context: 42 is the answer to life, the universe and everything. question: What is the answer to life, universe and everything ? answer: 42'
And batch
would get passed to a GPT2LMHeadModel
?