Hello,
I am trying to fine-tune the LiLT model to extract information from invoices. I annotated my documents with UBIAI and then ran the following script to convert the exported annotations into the input format the model expects:
import pandas as pd
import numpy as np
import os
import argparse
from datasets.features import ClassLabel
from transformers import AutoProcessor
from sklearn.model_selection import train_test_split
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D, Dataset
from datasets import Image as Img
from PIL import Image
import warnings
warnings.filterwarnings(‘ignore’)
def read_text_file(file_path):
    """Return every line of the text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line. Each string keeps its trailing
        newline, exactly as ``file.readlines()`` yields it.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so non-ASCII invoice text is read the same
    # way on every platform instead of depending on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()
def prepare_examples(examples):
    """Encode one batch of raw examples with the module-level ``processor``.

    The batch columns are looked up through the module-level names
    ``image_column_name``, ``text_column_name``, ``boxes_column_name`` and
    ``label_column_name``, which must be assigned before this is called.

    Args:
        examples: Mapping from column name to the batch of values for that
            column (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever ``processor`` returns for the batch, truncated and padded
        to a fixed length of 512 tokens.
    """
    images = examples[image_column_name]
    words = examples[text_column_name]
    boxes = examples[boxes_column_name]
    word_labels = examples[label_column_name]
    encoding = processor(
        images,
        words,
        boxes=boxes,
        word_labels=word_labels,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoding
def get_zip_dir_name():
    """Return the name prefix shared by every entry of ``/content/data``.

    The candidate prefix is taken from the first directory entry, cut just
    before its first ``'/'``. It is returned only when every entry starts
    with it.

    Side effect: the working directory is changed to ``/content/data`` for
    the listing and then to its parent (``./../``) before returning, even
    when an exception is raised.

    Returns:
        The shared prefix string, or ``False`` when the directory is empty
        or the entries do not all share the prefix.

    Raises:
        OSError: If ``/content/data`` does not exist or cannot be entered.
    """
    try:
        os.chdir('/content/data')
        dir_list = os.listdir()
        if not dir_list:
            # Nothing to derive a prefix from; avoids IndexError on [0].
            return False
        any_file_name = dir_list[0]
        # NOTE(review): os.listdir() names normally contain no '/', so
        # find() returns -1 and this slice drops only the last character.
        # Confirm which separator the export tool really uses.
        zip_dir_name = any_file_name[:any_file_name.find('/')]
        if all(name.startswith(zip_dir_name) for name in dir_list):
            return zip_dir_name
        return False
    finally:
        # Always step back up to the parent directory.
        os.chdir('./../')
def filter_out_unannotated(example):
    """Tell whether an example carries at least one real annotation.

    Relies on the module-level ``label2id`` mapping being defined.

    Args:
        example: Mapping with an ``'ner_tags'`` sequence of label ids.

    Returns:
        ``True`` when some tag differs from the id of the ``'O'`` label;
        ``False`` when every tag is ``'O'`` or there are no tags at all.
    """
    tags = example['ner_tags']
    return not all(tag == label2id['O'] for tag in tags)
def normalize_box(box, width, height):
    """Rescale one pixel-space bounding box to the 0-1000 grid.

    Args:
        box: Four whitespace-separated numbers ``"x0 y0 x1 y1"``.
        width: Width of the page image the box was measured on.
        height: Height of that page image.

    Returns:
        The box as ``"x0 y0 x1 y1"`` with integer coordinates, each clamped
        to the inclusive range 0..1000.

    Raises:
        ValueError: If ``box`` does not hold exactly four numbers.
    """
    x0, y0, x1, y1 = (float(value) for value in box.split())
    scaled = (
        1000 * (x0 / width),
        1000 * (y0 / height),
        1000 * (x1 / width),
        1000 * (y1 / height),
    )
    # Clamp so a box that slightly overshoots the page can never produce a
    # coordinate outside the range the layout embeddings are indexed with.
    return ' '.join(str(min(1000, max(0, round(value)))) for value in scaled)


if __name__ == '__main__':
    # Command line: validation split fraction and output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--valid_size')
    parser.add_argument('--output_path')
    args = parser.parse_args()
    TEST_SIZE = float(args.valid_size)
    OUTPUT_PATH = args.output_path
    os.makedirs(args.output_path, exist_ok=True)

    # Load the three line-aligned annotation files (words+tags, boxes,
    # image names) into ``files``.
    files = {}
    zip_dir_name = "/content/data"
    zip_id = "34c8d6d4-5a0f-48be-bc8d-5085ae70ff89"
    if zip_dir_name and zip_id:
        # zip_dir_name is absolute, so os.path.join discards the leading
        # os.curdir / 'data' components and these resolve under
        # /content/data.
        files['train_box'] = read_text_file(os.path.join(
            os.curdir, 'data', f'{zip_dir_name}/{zip_id}_box.txt'))
        files['train_image'] = read_text_file(os.path.join(
            os.curdir, 'data', f'{zip_dir_name}/{zip_id}_image.txt'))
        files['train'] = read_text_file(os.path.join(
            os.curdir, 'data', f'{zip_dir_name}/{zip_id}.txt'))
    else:
        # Fallback: pick the files out of the working directory by name.
        for f in os.listdir():
            if f.endswith('.txt') and f.find('box') != -1:
                files['train_box'] = read_text_file(os.path.join(os.curdir, f))
            elif f.endswith('.txt') and f.find('image') != -1:
                files['train_image'] = read_text_file(
                    os.path.join(os.curdir, f))
            elif f.endswith('.txt') and f.find('labels') == -1:
                files['train'] = read_text_file(os.path.join(os.curdir, f))

    # The three files must have the same number of lines, because rows are
    # matched purely by line index below. Raise instead of assert so the
    # check survives ``python -O``.
    line_counts = {key: len(files[key])
                   for key in ('train', 'train_box', 'train_image')}
    if len(set(line_counts.values())) != 1:
        raise ValueError(
            f'Annotation files are not line-aligned: {line_counts}')

    # Map each image name to the line indices that belong to it.
    images = {}
    for i, row in enumerate(files['train_image']):
        if row != '\n':
            image_name = row.split('\t')[-1]
            images.setdefault(image_name.replace('\n', ''), []).append(i)

    # Build one entry per document (image) in each of the parallel lists.
    words, bboxes, ner_tags, image_path = [], [], [], []
    for image, rows in images.items():
        first, stop = rows[0], rows[-1] + 1
        words.append([row.split('\t')[0].replace('\n', '')
                      for row in files['train'][first:stop] if len(row) > 1])
        ner_tags.append([row.split('\t')[1].replace('\n', '')
                         for row in files['train'][first:stop] if len(row) > 1])

        path = f"/content/data/{image}"
        # Measure the same file that is loaded into the dataset later.
        with Image.open(path) as im:
            width, height = im.size

        # Normalise every box of THIS document with THIS image's size.
        # (Indexing bboxes[0][i] over range(len(bboxes)) only touched a few
        # boxes of the first document and left the rest in pixel units.)
        bboxes.append([
            normalize_box(box.split('\t')[1].replace('\n', ''), width, height)
            for box in files['train_box'][first:stop] if len(box) > 1])
        image_path.append(path)

    # Sort so the label <-> id mapping is identical on every run; iterating
    # a set of strings directly gives a run-dependent order.
    labels = sorted({tag for doc_tag in ner_tags for tag in doc_tag})
    id2label = dict(enumerate(labels))
    label2id = {label: idx for idx, label in enumerate(labels)}

    dataset_dict = {
        'id': range(len(words)),
        'tokens': words,
        'bboxes': [[list(map(int, bbox.split())) for bbox in doc] for doc in bboxes],
        'ner_tags': [[label2id[tag] for tag in ner_tag] for ner_tag in ner_tags],
        'image': [Image.open(path).convert("RGB") for path in image_path]
    }

    # Raw (un-encoded) features.
    features = Features({
        'id': Value(dtype='string', id=None),
        'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
        'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
        'ner_tags': Sequence(feature=ClassLabel(num_classes=len(labels), names=labels, names_file=None, id=None), length=-1, id=None),
        'image': Img(decode=True, id=None)
    })
    full_data_set = Dataset.from_dict(dataset_dict, features=features)
    dataset = full_data_set.train_test_split(test_size=TEST_SIZE)
    # Only the training split drops documents without any annotation.
    dataset["train"] = dataset["train"].filter(filter_out_unannotated)

    # NOTE(review): this is the LayoutLMv3 processor while the model being
    # fine-tuned is reportedly LiLT -- confirm that the tokenizer vocabulary
    # matches the checkpoint that will be trained.
    processor = AutoProcessor.from_pretrained(
        "microsoft/layoutlmv3-base", apply_ocr=False)

    column_names = dataset["train"].column_names
    # Column names read by prepare_examples().
    image_column_name = "image"
    text_column_name = "tokens"
    boxes_column_name = "bboxes"
    label_column_name = "ner_tags"

    # Features of the encoded datasets.
    features = Features({
        'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
        'input_ids': Sequence(feature=Value(dtype='int64')),
        'attention_mask': Sequence(Value(dtype='int64')),
        'bbox': Array2D(dtype="int64", shape=(512, 4)),
        'labels': Sequence(ClassLabel(names=labels)),
    })
    train_dataset = dataset["train"].map(
        prepare_examples,
        batched=True,
        remove_columns=column_names,
        features=features,
    )
    eval_dataset = dataset["test"].map(
        prepare_examples,
        batched=True,
        remove_columns=column_names,
        features=features,
    )
    train_dataset.set_format("torch")
    print(bboxes)

    # Persist the encoded splits and the raw dataset.
    if not OUTPUT_PATH.endswith('/'):
        OUTPUT_PATH += '/'
    train_dataset.save_to_disk(f'{OUTPUT_PATH}train_split')
    eval_dataset.save_to_disk(f'{OUTPUT_PATH}eval_split')
    dataset.save_to_disk(f'{OUTPUT_PATH}raw_data')
Then I used Trainer.train() to train the model:
from transformers import Trainer, TrainingArguments, default_data_collator

# Training hyper-parameters.
NUM_TRAIN_EPOCHS = 50
PER_DEVICE_TRAIN_BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH_SIZE = 8
LEARNING_RATE = 4e-5

# NOTE(review): with evaluation and checkpoint saving both set to "no" there
# appears to be no checkpoint to reload, so load_best_model_at_end and
# metric_for_best_model would have no effect. Switch both strategies to
# "epoch" if best-model selection is wanted -- confirm against the
# TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir="LiLT_INVOICE",
    # max_steps=1500,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    logging_strategy="epoch",
    save_total_limit=1,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    evaluation_strategy="no",
    save_strategy="no",
    # eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Initialize our Trainer
# ``model``, ``train_dataset``, ``eval_dataset`` and ``compute_metrics`` are
# not defined in this snippet; they are expected to exist already.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
However, the call to trainer.train() fails with the following error:
IndexError                                Traceback (most recent call last)
Cell In[12], line 1
----> 1 trainer.train()

File c:\Users\heha\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py:1624, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1622         hf_hub_utils.enable_progress_bars()
   1623 else:
-> 1624     return inner_training_loop(
   1625         args=args,
   1626         resume_from_checkpoint=resume_from_checkpoint,
   1627         trial=trial,
   1628         ignore_keys_for_eval=ignore_keys_for_eval,
   1629     )

File c:\Users\heha\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py:1961, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1958     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   1960 with self.accelerator.accumulate(model):
-> 1961     tr_loss_step = self.training_step(model, inputs)
   1963 if (
   1964     args.logging_nan_inf_filter
   1965     and not is_torch_tpu_available()
   1966     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1967 ):
   1968     # if loss is nan or inf simply add the average of previous logged losses
…
File c:\Users\heha\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\functional.py:2237
   2235     # remove once script supports set_grad_enabled
   2236     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2237 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)