KeyError: 0 issue with trainer

Hello, i have this error, and i can’t solve the issue even if i saw all the forums. The purpose is to finetune the french model “etalab-ia/camembert-base-squadFR-fquad-piaf” for question answering. The training data is from two pdf files that i converted to texts and i made some examples of questions/answers. If someone can help…

C:\..path..\Python\Python310\lib\site-packages\transformers\ FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  0%|          | 0/3 [00:00<?, ?it/s]Traceback (most recent call last):
  File "C:\..path..\PycharmProjects\pythonProject\src\nlp\hugging face finetune\", line 97, in <module>
  File "C:\..path..\Programs\Python\Python310\lib\site-packages\transformers\", line 1633, in train
    return inner_training_loop(
  File "C:\..path..\Programs\Python\Python310\lib\site-packages\transformers\", line 1872, in _inner_training_loop
    for step, inputs in enumerate(epoch_iterator):
  File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\", line 628, in __next__
    data = self._next_data()
  File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\", line 671, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
KeyError: 0
  0%|          | 0/3 [00:00<?, ?it/s]

here’s my code :

from transformers import AutoTokenizer, Trainer, \
    AutoModelForQuestionAnswering, TrainingArguments
from extraction_types import *

file = 'path_to_a_pdf'
filetest= 'path_to_a_second_pdf'

model_checkpoint = "etalab-ia/camembert-base-squadFR-fquad-piaf"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

extractor = DataExtractor()

# Define your own training data
train_questions = [
    "Quel est le numéro de vérification ?",
    "Quel est la décision ?",
    "Quel est le nom de l'emmeteur du rapport ?",
    "Quel est le numéro de vérification ?",
    "Quel est la décision ?",
    "Quel est le nom de l'emmeteur du rapport?",
train_contexts = [
train_answers = [
    "Elvis mattéo",
    "Conforme avec réserve",
    "Frédéric MARGOT",

# Tokenize the training data
train_encodings = tokenizer(train_questions, train_contexts,max_length=524, truncation=True, padding=True)

train_start_positions, train_end_positions = [], []

for i in range(len(train_contexts)):
    start_pos, end_pos = extractor.extract_answer_position(train_contexts[i], train_answers[i])

if 'input_ids' in train_encodings:
    input_ids = train_encodings['input_ids']
    input_ids = None

if 'attention_mask' in train_encodings:
    input_ids = train_encodings['attention_mask']
    input_ids = None

train_data = {
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'start_positions': train_start_positions,
    'end_positions': train_end_positions,

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs

# Define the trainer
trainer = Trainer(
    model=model,                         # the instantiated Hugging Face model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_data,       # training dataset


Here’s the code of the two called functions from a class DataExtractor :

    def extract_text_from_pdf(self, file):
        # Set the path to the output image file
        image_path = 'file.png'

        # Convert the PDF to an image because it gave a better result 
        pages = convert_from_path(file, 500)  # 500 is the DPI of the output image

        text = ''

        # Save the image to disk
        for i, page in enumerate(pages):
            if i == 0:
      , 'PNG')
                image_path = os.path.splitext(image_path)[0] + '_' + str(i) + '.png'
      , 'PNG')

            # Load the image and extract text from it
            image =
            page_text = pytesseract.image_to_string(image)

            # Append the page text to the full text
            text += page_text

        return text.encode('utf-8').decode('utf-8')

    def extract_answer_position(self, text, answer):
        # Find the start and end positions of the answer within the text

        match =, text.lower())
        if match:
            start_position = match.start()
            end_position = start_position + len(answer)
            start_position = -1
            end_position = -1

        return start_position, end_position