Fine-tuning an unsupervised model - BERT

Hello,

I am just getting started with text classification using transformers.
I used a pretrained BERT model to get sentence embeddings from two sets of texts and computed the cosine distance between them (a simplified sketch of that step is below).
I now want to fine-tune the model on the first set of texts and then produce embeddings from the fine-tuned model.
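For context, this is roughly how I'm getting the embeddings at the moment (simplified; texts_a and texts_b are placeholder names for my two sets of texts, and I'm just mean-pooling the last hidden state, which may not be the best choice):

import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

def embed(texts):
    # Tokenize a list of strings into padded tensors
    enc = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        out = model(**enc)
    # Mean-pool the token embeddings, ignoring padding positions
    mask = enc['attention_mask'].unsqueeze(-1).float()
    return (out.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)

emb_a = embed(texts_a)  # first set of texts (placeholder)
emb_b = embed(texts_b)  # second set of texts (placeholder)
# Pairwise cosine similarity between the two sets (1 - similarity gives the distance)
cos_sim = torch.nn.functional.cosine_similarity(emb_a, emb_b)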

Could you please give me some guidance on using BERT for this use case, and help me figure out the error below?

Thanks!

from transformers import BertTokenizer, BertModel
from transformers import Trainer
from transformers import TrainingArguments
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
training_args = TrainingArguments(output_dir="test_trainer")

encoded_input = tokenizer(df['text'].values.tolist(), return_tensors='pt', padding=True, truncation=True, add_special_tokens=True)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_input,
    #eval_dataset=small_eval_dataset,
    #compute_metrics=compute_metrics,
)

trainer.train()

I get the following error:

KeyError                                  Traceback (most recent call last)
<ipython-input-52-0ed2acad17c8> in <module>
     15 )
     16 
---> 17 trainer.train()

d:\data\tracking_inequality\envs\ineq\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1288             self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
   1289 
-> 1290             for step, inputs in enumerate(epoch_iterator):
   1291 
   1292                 # Skip past any already trained steps if resuming training

d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
    559     def _next_data(self):
    560         index = self._next_index()  # may raise StopIteration
--> 561         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    562         if self._pin_memory:
    563             data = _utils.pin_memory.pin_memory(data)

d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
     47     def fetch(self, possibly_batched_index):
     48         if self.auto_collation:
---> 49             data = [self.dataset[idx] for idx in possibly_batched_index]
     50         else:
     51             data = self.dataset[possibly_batched_index]

d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
     47     def fetch(self, possibly_batched_index):
     48         if self.auto_collation:
---> 49             data = [self.dataset[idx] for idx in possibly_batched_index]
     50         else:
     51             data = self.dataset[possibly_batched_index]

d:\data\tracking_inequality\envs\ineq\lib\site-packages\transformers\tokenization_utils_base.py in __getitem__(self, item)
    240         else:
    241             raise KeyError(
--> 242                 "Indexing with integers (to access backend Encoding for a given batch index) "
    243                 "is not available when using Python based tokenizers"
    244             )

KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
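From what I can tell, the KeyError happens because I'm passing the tokenizer's output (a BatchEncoding) straight to Trainer as train_dataset. My guess is that I need to wrap the encodings in a torch Dataset and, since I have no labels, train with a masked-language-modeling head plus a data collator, roughly like this, but I'm not sure this is the right way to do unsupervised fine-tuning:

import torch
from transformers import BertForMaskedLM, DataCollatorForLanguageModeling

class TextDataset(torch.utils.data.Dataset):
    """Wraps the tokenizer output so it can be indexed sample by sample."""
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings['input_ids'])

# Masked-LM head so the Trainer has a loss to optimize (labels are created by the collator)
mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

trainer = Trainer(
    model=mlm_model,
    args=training_args,
    train_dataset=TextDataset(encoded_input),
    data_collator=data_collator,
)
trainer.train()

# Afterwards, save the fine-tuned encoder and reuse it to produce embeddings
mlm_model.bert.save_pretrained('finetuned-bert')

Is this the right direction, or is there a better way to fine-tune BERT on unlabeled text for this kind of similarity task?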