Hello, I am new to transformers and want to fine-tune facebook/blenderbot_small-90M on my own chit-chat dataset. Here is my test code:
from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration
mname = 'facebook/blenderbot_small-90M'
model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
train_encoded = tokenizer(train[0], train[1])  # train[0] is a list of questions, train[1] the corresponding answers
test_encoded = tokenizer(test[0], test[1])  # same structure for the test split
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir='./blender_tuned/')
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=test_encoded,
)
trainer.train()
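One thing I already suspect is wrong: from the tokenizer docs, the second positional argument is text_pair, so my answers are probably being concatenated onto the questions rather than used as targets. My guess (untested) is that the answers should be encoded separately and attached as labels later, since BlenderBot seems to use the same vocabulary for inputs and targets:

# my guess at encoding inputs and labels separately, with padding/truncation
train_inputs = tokenizer(train[0], truncation=True, padding=True)
train_labels = tokenizer(train[1], truncation=True, padding=True)
test_inputs = tokenizer(test[0], truncation=True, padding=True)
test_labels = tokenizer(test[1], truncation=True, padding=True)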
In any case, when I run the original code above, I get the error below:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/tmp/ipykernel_47679/4032920361.py in <module>
----> 1 trainer.train()
~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1288 self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
1289
-> 1290 for step, inputs in enumerate(epoch_iterator):
1291
1292 # Skip past any already trained steps if resuming training
~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/transformers/tokenization_utils_base.py in __getitem__(self, item)
239 return self._encodings[item]
240 else:
--> 241 raise KeyError(
242 "Indexing with integers (to access backend Encoding for a given batch index) "
243 "is not available when using Python based tokenizers"
KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'
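From the message, it sounds like the BatchEncoding of a slow (Python-based) tokenizer cannot be indexed with an integer to pull out a single example, so I am guessing that Trainer needs a torch Dataset whose __getitem__ returns one dict per example. Here is my untested sketch (ChitchatDataset is just a name I made up), built on the train_inputs/train_labels guess above:

import torch

class ChitchatDataset(torch.utils.data.Dataset):
    # wraps the tokenized inputs and labels so that __getitem__
    # returns one dict per example, which the default collator expects
    def __init__(self, encodings, labels):
        self.encodings = encodings  # BatchEncoding for the questions
        self.labels = labels        # BatchEncoding for the answers

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = ChitchatDataset(train_inputs, train_labels)
test_dataset = ChitchatDataset(test_inputs, test_labels)
# then pass train_dataset / eval_dataset=test_dataset to Trainer instead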
Is a wrapper like the sketch above the right way to fix this, and is this the right way to fine-tune BlenderBot overall? Thank you!