OK, I admit defeat. The short version is that I start with a pre-trained MLM model, bert-large-uncased-whole-word-masking, fine-tune it on a bunch of documents, then save it. That all works. I then load the saved model with 'BertForQuestionAnswering.from_pretrained' and the matching 'BertTokenizer.from_pretrained', then go on to load and tokenize a bunch of questions/contexts/answers to fine-tune it for QA. At every point I check, the input_ids are included, yet when I finally try to start training, I get:

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['start_positions', 'end_positions']

Nothing I've tried fixes the error, so I'm assuming it's due to some fundamental ignorance on my part. I'll include the code that generates the examples, then the code that generates the dataset from the examples (happy to add anything else that is needed).
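First, for reference, the load step looks roughly like this (a minimal sketch; './mlm-finetuned' is a placeholder for my actual checkpoint directory):

from transformers import BertForQuestionAnswering, BertTokenizer

# Load the MLM-fine-tuned checkpoint as a QA model; the QA head is newly
# initialised at this point, so transformers warns about uninitialised weights
model = BertForQuestionAnswering.from_pretrained('./mlm-finetuned')
tokenizer = BertTokenizer.from_pretrained('./mlm-finetuned')

The code that generates the examples: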
examples = []
for question, context, answer in zip(questions, contexts, answers):
    # The question is segment A; the context plus answer is segment B
    text = question
    text_pair = f"{context} {answer}"
    encoded_example = tokenizer.encode_plus(
        text,
        text_pair,
        max_length=512,
        truncation='only_second',  # or 'only_first'
        padding='max_length',
        return_tensors='pt',
        return_overflowing_tokens=True,
        add_special_tokens=True,
    )
    example = {
        'question': question,
        'answer': answer,
        'context': context,
        'input_ids': encoded_example['input_ids'].tolist(),
        'attention_mask': encoded_example['attention_mask'].tolist(),
        'overflowing_tokens': encoded_example['overflowing_tokens'],
    }
    examples.append(example)
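At this point a quick sanity check (just illustrative) confirms the input_ids really are there:

assert all('input_ids' in ex for ex in examples)
print(examples[0]['input_ids'][0][:10])  # first ten token ids of the first example

Then the dataset class built from those examples: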
import torch
from torch.utils.data import Dataset

class MathQADataset(Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        question = example['question']
        context = example['context']
        answer = example['answer']
        # Tokenize the question, context and answer separately
        question_encodings = self.tokenizer(question, max_length=self.max_length, padding='max_length', truncation=True)
        context_encodings = self.tokenizer(context, max_length=self.max_length, padding='max_length', truncation=True)
        answer_encodings = self.tokenizer(answer, max_length=self.max_length, padding='max_length', truncation=True)
        # Concatenate the question and answer sequences, dropping the answer's
        # leading [CLS] token (note: context_encodings is currently unused here)
        input_ids = question_encodings['input_ids'] + answer_encodings['input_ids'][1:]
        attention_mask = question_encodings['attention_mask'] + answer_encodings['attention_mask'][1:]
        # Truncate the concatenated inputs to the maximum length of 512
        input_ids = input_ids[:512]
        attention_mask = attention_mask[:512]
        # Compute the start and end positions of the answer within the concatenated sequence
        answer_start_idx = len(question_encodings['input_ids']) - 1
        answer_end_idx = answer_start_idx + len(answer_encodings['input_ids'][1:]) - 1
        start_positions = torch.tensor([answer_start_idx])
        end_positions = torch.tensor([answer_end_idx])
        # Create a dictionary of encodings for this example
        encodings = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_positions': start_positions,
            'end_positions': end_positions,
        }
        # Add 'token_type_ids' if the tokenizer returns them
        if 'token_type_ids' in question_encodings:
            encodings['token_type_ids'] = question_encodings['token_type_ids']
        return encodings
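Finally, this is roughly how I kick off training when the error appears (trimmed down; the argument values are placeholders, not what I actually use):

from transformers import Trainer, TrainingArguments

train_dataset = MathQADataset(examples, tokenizer, max_length=512)

training_args = TrainingArguments(
    output_dir='./qa-finetuned',      # placeholder
    per_device_train_batch_size=8,    # placeholder
    num_train_epochs=3,               # placeholder
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()  # the ValueError above is raised here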
Any suggestions will be gratefully received.
Jeremy