On a 32 GB CPU-only box, run 3–8B models quantized (GGUF) via llama.cpp or Ollama. Don't use a VLM for OCR; instead do OCR → layout → RAG → small instruct LLM. Fine-tune with LoRA on a rented GPU, then merge + quantize and run on CPU.
What fits in 32 GB (CPU, GGUF)
- 3–4B (Q5/Q6) ≈ 2–3 GB model; fast on CPU.
- 7–8B (Q4_K_M) ≈ 5–6 GB model; best quality/latency trade-off.
- 13B (Q4_K_M) ≈ 9–11 GB; slower but still fits.
- Keep context to 2–4k; large contexts inflate the KV cache (rough sizing sketch below).
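To see why long contexts eat RAM, here is a back-of-the-envelope KV-cache estimate. It is a sketch only: the layer/head counts below are approximate figures for a Llama-3.1-8B-class model with grouped-query attention, and it assumes an fp16 cache (runtimes may store it in lower precision).

# Rough KV-cache size: 2 (K and V) * layers * kv_heads * head_dim * ctx * bytes/elem.
def kv_cache_gib(n_layers=32, n_kv_heads=8, head_dim=128, ctx_len=4096, bytes_per_elem=2):
    return 2 * n_layers * n_kv_heads * head_dim * ctx_len * bytes_per_elem / 2**30

print(f"{kv_cache_gib(ctx_len=4096):.2f} GiB at 4k ctx")    # ~0.5 GiB
print(f"{kv_cache_gib(ctx_len=32768):.2f} GiB at 32k ctx")  # ~4 GiB

So a 4k context adds roughly half a GiB on top of the model weights, while very long contexts can rival the model itself.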
Good open models (CPU-friendly)
For document processing (forms, policies, claims)
- OCR & layout (CPU): Tesseract or PaddleOCR; pdfplumber/PyMuPDF for digital PDFs; layoutparser/docTR if you need zones.
- RAG embeddings (CPU): bge-small-en-v1.5 / e5-small-v2 / all-MiniLM-L6-v2 + FAISS.
- Pattern tasks: add light rules/regex after the LLM to extract fields reliably (see the sketch after this list).
Fine-tuning
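No full recipe here; as a hedged sketch of the LoRA → merge → quantize path from the summary above, something like the following runs on a rented GPU. The base model name, LoRA hyperparameters, and the llama.cpp conversion commands are assumptions and vary by version.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = "meta-llama/Llama-3.1-8B-Instruct"  # assumption: any 7-8B instruct base works
tok = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype=torch.bfloat16, device_map="auto")

lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                  target_modules=["q_proj", "k_proj", "v_proj", "o_proj"])
model = get_peft_model(model, lora)
# ... train with the transformers Trainer or trl SFTTrainer on your instruction data ...

merged = model.merge_and_unload()   # fold LoRA weights back into the base model
merged.save_pretrained("merged-model")
tok.save_pretrained("merged-model")
# Then convert + quantize with llama.cpp (script/binary names vary by version):
#   python convert_hf_to_gguf.py merged-model --outfile model-f16.gguf
#   ./llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M

The resulting Q4_K_M GGUF runs on the same CPU box as the RAG pipeline below.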
Minimal CPU RAG pipeline (paste-ready)
Plan
- Extract text (PDF→text or OCR).
- Chunk → embed (CPU) → FAISS index.
- Retrieve top-k, build prompt, call a local CPU LLM (Ollama).
- Evaluate with Recall@k/MRR if desired (see the sketch after the run commands below).
file: cpu_rag_min.py
"""
CPU-only doc QA with Ollama + Sentence-Transformers + FAISS.
Why: Fits 32 GB RAM, no GPU required.
"""
import argparse, glob, json, re
from pathlib import Path
from typing import List, Tuple

import numpy as np
import requests


def load_texts(glob_pat: str) -> List[Tuple[str, str]]:
    try:
        import fitz  # PyMuPDF
        has_pdf = True
    except Exception:
        fitz, has_pdf = None, False
    out = []
    for p in glob.glob(glob_pat):
        path = Path(p)
        if path.suffix.lower() == ".pdf" and has_pdf:
            doc = fitz.open(str(path))
            out.append((path.name, "\n".join(page.get_text() for page in doc)))
        elif path.suffix.lower() in {".txt", ".md"}:
            out.append((path.name, path.read_text(encoding="utf-8", errors="ignore")))
        # For image-only PDFs, run OCR separately to .txt and include here.
    return out


def chunk(text: str, max_words: int = 350) -> List[str]:
    """Greedy sentence packing into ~max_words chunks."""
    sents = re.split(r"(?<=[.!?])\s+", text)
    chunks, cur, n = [], [], 0
    for s in sents:
        w = len(s.split())
        if n + w > max_words and cur:
            chunks.append(" ".join(cur)); cur, n = [s], w
        else:
            cur.append(s); n += w
    if cur:
        chunks.append(" ".join(cur))
    return chunks


def build_corpus(files: List[Tuple[str, str]]) -> Tuple[List[str], List[str]]:
    ids, texts = [], []
    for fname, txt in files:
        for i, c in enumerate(chunk(txt)):
            ids.append(f"{fname}#chunk{i}")
            texts.append(c)
    return ids, texts


def embed(texts: List[str], model_name="sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    from sentence_transformers import SentenceTransformer
    m = SentenceTransformer(model_name)  # CPU
    e = m.encode(texts, batch_size=256, normalize_embeddings=True,
                 convert_to_numpy=True, show_progress_bar=True)
    return e.astype(np.float32)


def build_faiss(embs: np.ndarray):
    import faiss
    idx = faiss.IndexFlatIP(embs.shape[1])  # inner product == cosine (normalized vectors)
    idx.add(embs)
    return idx


PROMPT = """You are a careful analyst. Answer using ONLY the context.
If unsure, say you don't know.

Question:
{q}

Context:
{ctx}

Answer:"""


def ask_ollama(model: str, prompt: str, num_ctx: int = 4096) -> str:
    r = requests.post("http://localhost:11434/api/generate",
                      json={"model": model, "prompt": prompt,
                            "options": {"num_ctx": num_ctx, "temperature": 0.2}},
                      stream=True, timeout=600)
    out = []
    for line in r.iter_lines():
        if line:
            obj = json.loads(line)
            if "response" in obj:
                out.append(obj["response"])
    return "".join(out)


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--docs", required=True, help="Glob for PDFs/TXT, e.g., data/*.pdf")
    ap.add_argument("--model", default="llama3.1:8b-instruct-q4_K_M", help="Ollama model tag")
    ap.add_argument("--k", type=int, default=5)
    args = ap.parse_args()

    files = load_texts(args.docs)
    if not files:
        raise SystemExit("No docs found. Provide PDFs or TXT.")
    ids, texts = build_corpus(files)
    print(f"Embedding {len(texts)} chunks on CPU...")
    embs = embed(texts)
    index = build_faiss(embs)

    from sentence_transformers import SentenceTransformer
    q_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    print("Ready. Ask a question (Ctrl+C to exit).")
    while True:
        try:
            q = input("> ").strip()
        except KeyboardInterrupt:
            break
        q_emb = q_model.encode([q], normalize_embeddings=True, convert_to_numpy=True)[0].astype(np.float32)
        D, I = index.search(q_emb[None, :], args.k)
        ctx = "\n\n---\n\n".join(texts[i] for i in I[0])
        ans = ask_ollama(args.model, PROMPT.format(q=q, ctx=ctx[:12000]))
        print("\n" + ans.strip() + "\n")


if __name__ == "__main__":
    main()
CPU-only dependencies
pip install sentence-transformers faiss-cpu pymupdf requests
Start a local CPU LLM
ollama serve   # in another terminal (skip if the Ollama service is already running)
ollama pull llama3.1:8b-instruct-q4_K_M
Build a tiny knowledge base and chat
python cpu_rag_min.py --docs "./docs/*" --model llama3.1:8b-instruct-q4_K_M   # non-PDF/TXT/MD files are skipped by load_texts
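For the optional evaluation step from the plan, a minimal sketch, assuming you hand-label which chunk id should be retrieved for each question; the dicts at the bottom are illustrative placeholders.

def recall_at_k(results, relevant, k=5):
    """Fraction of questions whose labelled chunk appears in the top-k retrieved ids."""
    hits = sum(1 for q in relevant if relevant[q] in results[q][:k])
    return hits / len(relevant)

def mrr(results, relevant):
    """Mean reciprocal rank of the labelled chunk; 0 if it was never retrieved."""
    total = 0.0
    for q, rel in relevant.items():
        ranked = results[q]
        total += 1.0 / (ranked.index(rel) + 1) if rel in ranked else 0.0
    return total / len(relevant)

# results  = {"What is the deductible?": ["policy.pdf#chunk3", "policy.pdf#chunk7", ...]}
# relevant = {"What is the deductible?": "policy.pdf#chunk3"}
# print(recall_at_k(results, relevant, k=5), mrr(results, relevant))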