How to finefune blenderbot model?

Hello, I am new to transformers and want to finetune facebook/blenderbot_small-90M on my own chitchat dataset. Here is my test code:

from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration
mname = 'facebook/blenderbot_small-90M'
model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)

train_encoded = tokenizer(train[0], train[1])  # list, [0] is a list of questions and [1] is corresponding answers
test_encoded = tokenizer(test[0], test[1])

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments("test_trainer", output_dir='./blender_tuned/')
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_encoded, eval_dataset=test_encoded
)

trainer.train()

Then I got the error as below:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/tmp/ipykernel_47679/4032920361.py in <module>
----> 1 trainer.train()

~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1288             self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
   1289 
-> 1290             for step, inputs in enumerate(epoch_iterator):
   1291 
   1292                 # Skip past any already trained steps if resuming training

~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/dataloader.py in _next_data(self)
    559     def _next_data(self):
    560         index = self._next_index()  # may raise StopIteration
--> 561         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    562         if self._pin_memory:
    563             data = _utils.pin_memory.pin_memory(data)

~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

~/software/anaconda3/envs/transformers/lib/python3.9/site-packages/transformers/tokenization_utils_base.py in __getitem__(self, item)
    239             return self._encodings[item]
    240         else:
--> 241             raise KeyError(
    242                 "Indexing with integers (to access backend Encoding for a given batch index) "
    243                 "is not available when using Python based tokenizers"

KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'

How should I fix this and is this the right way to finetune blenderbot? Thank you!

1 Like

As someone tried further ? It seems also to be available through autotrain is that really the case?