I want to be able to feed a model with raw text to give me a JSON output with the keys I have asked it to fill in. For example, let’s say I have an invoice and I want the model to retrieve in JSON format the following:
{
  "invoice_item": "item that has been invoiced",
  "amount": "how much the invoice item cost in total",
  "company_name": "company that issued the invoice",
  "invoice_date": "when the invoice was issued"
}
I saw many tutorials that do this, but they all use the OpenAI API. Instead I would like to use one of the open source hugging face models, but I have had no luck because all the models I have tried (falcon-7b-instruct, vicuna, distilgpt2) do not work. Is there any specific model for this kind of task? And is there any method that can help me with this like fine tuning?
Thanks.
there is this library. I have not used it though.
Thank you! I did kind of a mix with that library and this tutorial https://www.youtube.com/watch?v=v_cfORExneQ and it seems to be working quite well. I still have to make some modifications but this is a good kick start!
Would you like to share your solution?
Yeah sure, this example is for extracting information from invoices that are pdfs, hope it helps!
from dotenv import load_dotenv
from pytesseract import image_to_string
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from tempfile import NamedTemporaryFile
from jsonformer.format import highlight_values
from jsonformer.main import Jsonformer
import time
load_dotenv()
# 1. Convert PDF file into images via pypdfium2
def convert_pdf_to_images(file_path, scale=300/72):
pdf_file = pdfium.PdfDocument(file_path)
page_indices = [i for i in range(len(pdf_file))]
renderer = pdf_file.render(
pdfium.PdfBitmap.to_pil,
page_indices=page_indices,
scale=scale,
)
final_images = []
for i, image in zip(page_indices, renderer):
image_byte_array = BytesIO()
image.save(image_byte_array, format='jpeg', optimize=True)
image_byte_array = image_byte_array.getvalue()
final_images.append(dict({i: image_byte_array}))
return final_images
# 2. Extract text from images via pytesseract
def extract_text_from_img(list_dict_final_images):
image_list = [list(data.values())[0] for data in list_dict_final_images]
image_content = []
for index, image_bytes in enumerate(image_list):
image = Image.open(BytesIO(image_bytes))
raw_text = str(image_to_string(image))
image_content.append(raw_text)
return "\n".join(image_content)
def extract_content_from_url(url: str):
images_list = convert_pdf_to_images(url)
text_with_pytesseract = extract_text_from_img(images_list)
return text_with_pytesseract
# 3. Extract structured info from text via LLM
class HuggingFaceLLM:
def __init__(self, temperature=0, top_k=50, model_name="databricks/dolly-v2-12b"):
self.model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=True, device_map="auto")
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_cache=True)
self.top_k = top_k
def generate(self, prompt, max_length=1024):
json = {
"type": "object",
"properties": {
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"description": {"type": "string"},
"price": {"type": "number"}
}
}
},
"Company_name": {"type": "string"},
"invoice_date": {"type": "string"},
}
}
builder = Jsonformer(
model=self.model,
tokenizer=self.tokenizer,
json_schema=json,
prompt= prompt,
max_string_token_length=20
)
print("Generating...")
output = builder()
#highlight_values(output)
#print(output)
return output
def extract_structured_data(content: str, data_points):
llm = HuggingFaceLLM(temperature=0) # Choose the desired Hugging Face model
template = """
You are an expert admin people who will extract core information from documents
{content}
Above is the content; please try to extract all data points from the content above:
{data_points}
"""
# Fill in the placeholders in the template
formatted_template = template.format(content=content, data_points=data_points)
#print(formatted_template)
# Generate text using the formatted template
results = llm.generate(formatted_template)
return results
def main():
default_data_points = """{
"item": [{
"description": "description or name of the item that has been bougth",
"price": "how much does the item cost"
}],
"Company_name": "company that issued the invoice",
"invoice_date": "when was the invoice issued",
}"""
st.set_page_config(page_title="Doc extraction", page_icon=":bird:")
st.header("Doc extraction :bird:")
data_points = st.text_area(
"Data points", value=default_data_points, height=170)
folder_path = './pdfs' # Replace this with your folder path containing PDFs
pdf_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
results = []
if pdf_paths:
total_start_time = time.time()
with open("output_results.txt", "w") as output_file:
for pdf_path in pdf_paths:
with NamedTemporaryFile(dir='.', suffix='.csv') as f:
output_file.write(f"PDF Path: {pdf_path}\n")
start_time = time.time() # Record the start time
content = extract_content_from_url(pdf_path)
data = extract_structured_data(content, default_data_points)
json_data = json.dumps(data)
if isinstance(json_data, list):
results.extend(json_data)
else:
results.append(json_data)
end_time = time.time() # Record the end time
elapsed_time = end_time - start_time
output_file.write(f"Execution time: {elapsed_time:.2f} seconds\n")
output_file.write(f"Results: {json_data}\n")
output_file.write("\n")
total_end_time = time.time()
total_elapsed_time = total_end_time - total_start_time
output_file.write(f"Total execution time: {total_elapsed_time:.2f} seconds\n")
if __name__ == '__main__':
multiprocessing.freeze_support()
main()
i want to do it for csv’s
Hi bro, nice work. Can you tell how well this has performed on your invoices?
also i would really appreciate if you shared any sample image of the invoice for which you constructed json and default_data_points in the code.
I'm trying to create structured JSON from multiple PDF files that have tabular data, using deep learning and OCR.
We have manually created json from these pdfs using rule based approach, so i do have the training and label dataset if I have to go for finetuning.
Any insight would be appreciated.
Regards.