I am trying to fine-tune Bert using the Huggingface library on next sentence prediction task. I looked at the tutorial and I am trying to use DataCollatorForNextSentencePrediction
and TextDatasetForNextSentencePrediction
. When I am using that I get the following error(use the pastebin link to see the error)https://pastebin.pl/view/bde2c3d4. I have provided my code bellow.
============Code================
def train(bert_model,bert_tokenizer,path,eval_path=None):
out_dir = “/content/drive/My Drive/next_sentence/”
training_args = TrainingArguments(
output_dir=out_dir,
overwrite_output_dir=True,
num_train_epochs=1,
per_device_train_batch_size=30,
save_steps=10000,
save_total_limit=2,
)
data_collator = DataCollatorForNextSentencePrediction(
tokenizer=bert_tokenizer,mlm=False,block_size=512,nsp_probability =0.5
)
dataset = TextDatasetForNextSentencePrediction(
tokenizer = bert_tokenizer,
file_path=path,
block_size=512,
)
trainer = Trainer(
model=bert_model,
args=training_args,
train_dataset=dataset,
data_collator=data_collator,
)
trainer.train()
trainer.save_model(out_dir)
def main():
print("Running main")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
bert_model = BertForNextSentencePrediction.from_pretrained("bert-base-cased")
train_data_set_path = "/content/drive/My Drive/next_sentence/line_data_set_file.txt"
train(bert_model,bert_tokenizer,train_data_set_path)
#prepare_data_set(bert_tokenizer)
main()