You first need to run OCR on the PDF to convert it to plain text. In addition, since very long inputs can be challenging for LLMs, the text in the following example is shortened before summarization. The example uses docTR, but Tesseract is also a good option; there are many OCR models available, so pick the one that best fits your use case.
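If you would rather use Tesseract, a minimal sketch could look like the following (this assumes the pytesseract and pdf2image packages plus the system tesseract and poppler binaries are available; "scanned.pdf" is a placeholder path, and the snippet is separate from the docTR example below).
# Hypothetical Tesseract-based alternative (not part of the main example)
# pip install pytesseract pdf2image  (tesseract and poppler must also be installed on the system)
import pytesseract
from pdf2image import convert_from_path
pages = convert_from_path("scanned.pdf", dpi=300)  # render each PDF page as a PIL image
full_text = "\n".join(pytesseract.image_to_string(p) for p in pages)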
# pip install -U "python-doctr[torch]" sentence-transformers "transformers>=4.50" accelerate "huggingface_hub[hf_xet]" requests "numpy<2"
import io, re, requests, numpy as np
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PDF_URL = "https://nlsblog.org/wp-content/uploads/2020/06/image-based-pdf-sample.pdf"
EMB_ID = "sentence-transformers/all-MiniLM-L6-v2"
LLM_ID = "unsloth/gemma-3-270m-it-qat-bnb-4bit" # or any LLM like "Qwen/Qwen2.5-0.5B-Instruct", etc.
# 1) OCR the PDF (docTR)
pdf = requests.get(PDF_URL, timeout=60); pdf.raise_for_status()
doc = DocumentFile.from_pdf(pdf.content)  # from_pdf accepts raw bytes
ocr = ocr_predictor(pretrained=True).to(DEVICE)
res = ocr(doc).export()
text_lines = []
for p in res["pages"]:
    for b in p.get("blocks", []):
        for ln in b.get("lines", []):
            w = [x["value"] for x in ln.get("words", [])]
            if w: text_lines.append(" ".join(w))
full_text = "\n".join(text_lines).strip()
if not full_text:
    raise RuntimeError("OCR produced empty text")
# 2) Embedding-based pre-shrink (top sentences by cosine to document centroid)
sents = [s for s in re.split(r"(?<=[.!?])\s+", full_text) if s.strip()]
emb = SentenceTransformer(EMB_ID).to(DEVICE)
E = emb.encode(sents, convert_to_tensor=True, normalize_embeddings=True)
centroid = E.mean(dim=0, keepdim=True)
scores = util.cos_sim(centroid, E).cpu().numpy().ravel()
order = np.argsort(-scores)
tok = AutoTokenizer.from_pretrained(LLM_ID, use_fast=True)
max_ctx = getattr(tok, "model_max_length", 128_000)
if max_ctx > 1_000_000:  # some tokenizers report a huge sentinel value instead of the real context size
    max_ctx = 32_768  # Gemma 3 270M supports a 32K context window
reserve = 512
selected, tok_count = [], 0
for i in order:
    ids = tok(sents[i], add_special_tokens=False).input_ids
    if tok_count + len(ids) > max_ctx - reserve:
        break
    selected.append(sents[i]); tok_count += len(ids)
reduced_text = " ".join(selected)
# 3) Summarize once with Gemma-3 270M IT
model = AutoModelForCausalLM.from_pretrained(LLM_ID, device_map="auto")
gen = pipeline("text-generation", model=model, tokenizer=tok)  # model is already placed via device_map
msgs = [
    {"role": "system", "content": "Summarize long documents accurately and concisely."},
    {"role": "user", "content": f"Summarize the following text:\n\n{reduced_text}"},
]
prompt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
out = gen(prompt, max_new_tokens=512, do_sample=False, return_full_text=False)[0]["generated_text"]
#print("Full text:", full_text)
#print("Reduced text:", reduced_text)
print("Summary:", out.strip())
# Example output (wording will vary by model and run):
# When using image-based PDFs, such as those created by scanning or photographing paper, it's important to determine their format so you can understand how to interpret the content. If the file appears in an image-based format, it might contain searchable text. However, without this information, it's challenging to fully understand the document. For instance, if someone provides an image-based PDF with no searchable text, you should inquire about the format of the original file and whether it has been converted into a digital version. This approach helps ensure accurate interpretation and understanding of the document contents.
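To see the embedding-based pre-shrink from step 2 in isolation, here is a minimal sketch that ranks a few made-up sentences by cosine similarity to their centroid using the same MiniLM model; the sentences are purely illustrative.
# Standalone sketch of the centroid-ranking idea from step 2, on made-up sentences
from sentence_transformers import SentenceTransformer, util
demo = [
    "The report describes quarterly revenue growth.",
    "Lunch was served at noon.",
    "Revenue increased thanks to higher subscription renewals.",
]
m = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
D = m.encode(demo, convert_to_tensor=True, normalize_embeddings=True)
sims = util.cos_sim(D.mean(dim=0, keepdim=True), D).ravel()
for s, sc in sorted(zip(demo, sims.tolist()), key=lambda t: -t[1]):
    print(f"{sc:.3f}  {s}")  # the two revenue sentences should rank above the off-topic one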