Hello, i have this error, and i can’t solve the issue even if i saw all the forums. The purpose is to finetune the french model “etalab-ia/camembert-base-squadFR-fquad-piaf” for question answering. The training data is from two pdf files that i converted to texts and i made some examples of questions/answers. If someone can help…
C:\..path..\Python\Python310\lib\site-packages\transformers\optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
0%| | 0/3 [00:00<?, ?it/s]Traceback (most recent call last):
File "C:\..path..\PycharmProjects\pythonProject\src\nlp\hugging face finetune\model_finetuned.py", line 97, in <module>
trainer.train()
File "C:\..path..\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1633, in train
return inner_training_loop(
File "C:\..path..\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1872, in _inner_training_loop
for step, inputs in enumerate(epoch_iterator):
File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 628, in __next__
data = self._next_data()
File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 671, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 58, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\..path..\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 58, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
KeyError: 0
0%| | 0/3 [00:00<?, ?it/s]
here’s my code :
from transformers import AutoTokenizer, Trainer, \
AutoModelForQuestionAnswering, TrainingArguments
from extraction_types import *
file = 'path_to_a_pdf'
filetest= 'path_to_a_second_pdf'
model_checkpoint = "etalab-ia/camembert-base-squadFR-fquad-piaf"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
extractor = DataExtractor()
# Define your own training data
train_questions = [
"Quel est le numéro de vérification ?",
"Quel est la décision ?",
"Quel est le nom de l'emmeteur du rapport ?",
"Quel est le numéro de vérification ?",
"Quel est la décision ?",
"Quel est le nom de l'emmeteur du rapport?",
]
train_contexts = [
extractor.extract_text_from_pdf(file),
extractor.extract_text_from_pdf(file),
extractor.extract_text_from_pdf(file),
extractor.extract_text_from_pdf(filetest),
extractor.extract_text_from_pdf(filetest),
extractor.extract_text_from_pdf(filetest),
]
train_answers = [
"5628930",
"conforme",
"Elvis mattéo",
"3456866",
"Conforme avec réserve",
"Frédéric MARGOT",
]
# Tokenize the training data
train_encodings = tokenizer(train_questions, train_contexts,max_length=524, truncation=True, padding=True)
train_start_positions, train_end_positions = [], []
for i in range(len(train_contexts)):
start_pos, end_pos = extractor.extract_answer_position(train_contexts[i], train_answers[i])
train_start_positions.append(start_pos)
train_end_positions.append(end_pos)
if 'input_ids' in train_encodings:
input_ids = train_encodings['input_ids']
else:
input_ids = None
if 'attention_mask' in train_encodings:
input_ids = train_encodings['attention_mask']
else:
input_ids = None
train_data = {
'input_ids': train_encodings['input_ids'],
'attention_mask': train_encodings['attention_mask'],
'start_positions': train_start_positions,
'end_positions': train_end_positions,
}
# Define the training arguments
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total number of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
)
# Define the trainer
trainer = Trainer(
model=model, # the instantiated Hugging Face model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_data, # training dataset
)
trainer.train()
Here’s the code of the two called functions from a class DataExtractor :
def extract_text_from_pdf(self, file):
# Set the path to the output image file
image_path = 'file.png'
# Convert the PDF to an image because it gave a better result
pages = convert_from_path(file, 500) # 500 is the DPI of the output image
text = ''
# Save the image to disk
for i, page in enumerate(pages):
if i == 0:
page.save(image_path, 'PNG')
else:
image_path = os.path.splitext(image_path)[0] + '_' + str(i) + '.png'
page.save(image_path, 'PNG')
# Load the image and extract text from it
image = Image.open(image_path)
page_text = pytesseract.image_to_string(image)
# Append the page text to the full text
text += page_text
return text.encode('utf-8').decode('utf-8')
def extract_answer_position(self, text, answer):
# Find the start and end positions of the answer within the text
match = re.search(answer.lower(), text.lower())
if match:
start_position = match.start()
end_position = start_position + len(answer)
else:
start_position = -1
end_position = -1
return start_position, end_position