I am fine-tuning a BERT model for a question-answering use case in a specific domain (medicine). My dataset has the following structure: question, answer. (It doesn't have any context like in the SQuAD dataset.)
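To make the structure concrete, here is a minimal sketch of what the data looks like (these rows are made up for illustration; the real content is medical):

import pandas as pd

# Hypothetical rows mirroring the two-column structure of dataset_QA.csv.
df = pd.DataFrame({
    "question": ["What is aspirin commonly used for?"],
    "answer": ["Aspirin is commonly used to relieve pain, fever, and inflammation."],
})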
How should I prepare the input for the tokenizer, since I don't have a context?
I tried to do it in the following way, and I am getting the error shown below:
from datasets import Dataset
import pandas as pd
import torch
from transformers import BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import BertTokenizer, DataCollatorWithPadding

# Prepare the dataset
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

path = "data/dataset_QA.csv"
dataset = pd.read_csv(path)

def prepare_dataset(dataset):
    # Convert the DataFrame to a Dataset
    dataset = Dataset.from_pandas(dataset)
    return dataset

# Tokenize the inputs and add padding
def tokenize_data(example):
    question = example["question"]
    context = example["answer"]  # using the answer in place of the missing context
    tokenized_input = tokenizer(question, context, padding="max_length", truncation=True, max_length=512)
    return tokenized_input

dataset = prepare_dataset(dataset)
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)
tokenized_dataset = dataset.map(tokenize_data, batched=True)

training_args = TrainingArguments(
    output_dir="model_bert",
    overwrite_output_dir=True,
    logging_dir="./logs/logs_bert",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()
The script produces the following output and then fails with this error:
DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 509
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 128
    })
})
Map: 100% 509/509 [00:01<00:00, 334.08 examples/s]
Map: 100% 128/128 [00:00<00:00, 337.44 examples/s]
raise ValueError(
ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.
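From the error message I understand that BertForQuestionAnswering only returns a loss when start_positions and end_positions are passed along with the inputs, which my tokenized dataset does not contain. A minimal sketch of what I think the model expects (the label positions here are made-up token indices, just for illustration):

import torch
from transformers import BertForQuestionAnswering, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

inputs = tokenizer(
    "What is aspirin used for?",                   # question
    "Aspirin is used to relieve pain and fever.",  # context
    return_tensors="pt",
)
# Made-up token indices marking where the answer span starts and ends in the context.
outputs = model(**inputs, start_positions=torch.tensor([8]), end_positions=torch.tensor([12]))
print(outputs.loss)  # a loss is returned only because start/end positions were supplied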
Is my approach correct? If yes, how can I fix this error? And if not, is there an alternative approach? Thanks in advance for your help!