Hello everyone. I have already posted a question about fine-tuning bert-base-italian-cased on the SQuAD-it dataset. While waiting for an answer, I tried another solution, following the Question Answering tutorial on SQuAD 2.0 in the transformers docs on HuggingFace.
My data come from SQuAD-it. I proceeded this way:
import json
from pathlib import Path

def read_dataset(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers
train_contexts, train_questions, train_answers = read_dataset('SQuAD_it-train.json')
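For reference, each record in the SQuAD-it file follows the original SQuAD 1.1 layout, which is what read_dataset walks through (the strings below are placeholders to show the shape, not real entries from the dataset):

{
    "data": [
        {
            "title": "...",
            "paragraphs": [
                {
                    "context": "Il testo del paragrafo ...",
                    "qas": [
                        {
                            "question": "Una domanda sul paragrafo?",
                            "id": "...",
                            "answers": [
                                {"text": "la risposta", "answer_start": 42}
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}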
val_contexts = []
val_questions = []
val_answers = []
while len(val_answers) != 5831:
    val_contexts.append(train_contexts.pop())
    val_questions.append(train_questions.pop())
    val_answers.append(train_answers.pop())
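To check that the split came out as intended (5831 is the number of examples I hold out for validation):

# quick sanity check: validation size and remaining training size
assert len(val_answers) == 5831
print(len(train_contexts), len(val_contexts))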
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        # if context[start_idx:end_idx] == gold_text:
        #     answer['answer_end'] = end_idx
        if context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1    # when the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2    # when the gold label is off by two characters
        elif context[start_idx-1:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 2
        elif context[start_idx-2:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 1
        elif context[start_idx-3:end_idx-3] == gold_text:
            answer['answer_start'] = start_idx - 3
            answer['answer_end'] = end_idx - 3
        elif context[start_idx-2:end_idx-3] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 3
        elif context[start_idx-3:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 3
            answer['answer_end'] = end_idx - 2
        else:
            answer['answer_end'] = end_idx
        if answer['answer_start'] < 0:
            # guard against a negative offset after the shifts above
            answer['answer_start'] += 1
            answer['answer_end'] += 1
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)
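To verify the corrected offsets actually line up with the contexts, I spot-check how many answers still don't match their span (my own check, not from the tutorial):

# count answers whose [answer_start:answer_end] span still differs from the gold text
mismatches = sum(
    1 for answer, context in zip(train_answers, train_contexts)
    if context[answer['answer_start']:answer['answer_end']] != answer['text']
)
print(f"{mismatches} of {len(train_answers)} answers still misaligned")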
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('dbmdz/bert-base-italian-cased')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
from transformers import AutoModel
model_name = "dbmdz/bert-base-italian-cased"
model = AutoModel.from_pretrained(model_name)
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
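As a sanity check on the token positions, decoding the span back to text for one example should roughly recover the gold answer (up to tokenization artifacts; truncated examples would decode to nothing):

# decode the tokens between start and end positions for the first example
i = 0
start = train_encodings['start_positions'][i]
end = train_encodings['end_positions'][i]
print(tokenizer.decode(train_encodings['input_ids'][i][start:end + 1]))
print(train_answers[i]['text'])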
Then I created the datasets:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
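Inspecting one item shows the keys and tensor shapes that will be fed to the Trainer:

# each item is a dict of tensors, with start/end positions as scalar labels
sample = train_dataset[0]
print({key: value.shape for key, value in sample.items()})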
And finally I tried to train:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    label_names=["start_positions", "end_positions"]
)

trainer = Trainer(
    model=model,                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset          # evaluation dataset
)

trainer.train()
But it raises this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-171-8794092ae722> in <module>()
20 )
21
---> 22 trainer.train()
3 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
TypeError: forward() got an unexpected keyword argument 'start_positions'
I’ve seen this issue reported before, but I haven’t found a solution in any of the topics. Thank you in advance.