Hello,
I am just getting started with text classification using transformers.
I have used a pretrained BERT model to get sentence embeddings for two sets of texts and computed the cosine distance between them.
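For context, this is roughly how I get the embeddings and the distance (a minimal sketch; the mean pooling over the last hidden state and the names texts_a / texts_b are just illustrative, not my exact code):

import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

def embed(texts):
    # Tokenize a list of strings and mean-pool the token embeddings,
    # ignoring padding positions via the attention mask.
    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        out = model(**enc)
    mask = enc["attention_mask"].unsqueeze(-1).float()
    summed = (out.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

texts_a = ["first example sentence"]
texts_b = ["second example sentence"]
emb_a, emb_b = embed(texts_a), embed(texts_b)
cosine_distance = 1 - F.cosine_similarity(emb_a, emb_b)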
I now want to fine-tune the model on the first set of texts and then produce embeddings from the fine-tuned model.
Could you please provide some guidance on using BERT for this use case and on fixing the error below?
Thanks!
from transformers import BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(output_dir="test_trainer")

# df is my pandas DataFrame with a 'text' column
encoded_input = tokenizer(
    df["text"].values.tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_input,
    # eval_dataset=small_eval_dataset,
    # compute_metrics=compute_metrics,
)

trainer.train()
I get the following error:
KeyError Traceback (most recent call last)
<ipython-input-52-0ed2acad17c8> in <module>
15 )
16
---> 17 trainer.train()
d:\data\tracking_inequality\envs\ineq\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1288 self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
1289
-> 1290 for step, inputs in enumerate(epoch_iterator):
1291
1292 # Skip past any already trained steps if resuming training
d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
d:\data\tracking_inequality\envs\ineq\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
d:\data\tracking_inequality\envs\ineq\lib\site-packages\transformers\tokenization_utils_base.py in __getitem__(self, item)
240 else:
241 raise KeyError(
--> 242 "Indexing with integers (to access backend Encoding for a given batch index) "
243 "is not available when using Python based tokenizers"
244 )
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'