Files
hermes-agent/skills/ocr-and-documents/scripts/extract_pymupdf.py
teknium1 19abbfff96 feat(ocr-and-documents): add OCR and document extraction skills
- Introduced new skills for extracting text from PDFs, scanned documents, and images using OCR and document parsing tools.
- Added detailed documentation for usage and installation of `pymupdf` and `marker-pdf` for local extraction.
- Implemented scripts for text extraction with both lightweight and high-quality options, including support for various document formats.
- Updated web extraction functionality to handle PDF URLs directly, enhancing usability for academic papers and documents.
2026-02-26 23:06:08 -08:00

99 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""Extract text from documents using pymupdf. Lightweight (~25MB), no models.
Usage:
python extract_pymupdf.py document.pdf
python extract_pymupdf.py document.pdf --markdown
python extract_pymupdf.py document.pdf --pages 0-4
python extract_pymupdf.py document.pdf --images output_dir/
python extract_pymupdf.py document.pdf --tables
python extract_pymupdf.py document.pdf --metadata
"""
import sys
import json
def extract_text(path, pages=None):
    """Print the plain text of the requested pages to stdout.

    Each page is preceded by a ``--- Page N/TOTAL ---`` header, where N is
    the 1-based page number.

    Args:
        path: Path of the document to open with pymupdf.
        pages: Iterable of 0-based page indexes, or None to print every
            page. Indexes outside ``0 .. page_count - 1`` are skipped.
    """
    import pymupdf

    # The context manager closes the document even if extraction raises.
    with pymupdf.open(path) as doc:
        total = len(doc)
        page_range = range(total) if pages is None else pages
        for i in page_range:
            # Negative values are not valid 0-based page numbers and would be
            # labelled "Page 0" or lower, so they are skipped like any other
            # out-of-range index.
            if 0 <= i < total:
                print(f"\n--- Page {i+1}/{total} ---\n")
                print(doc[i].get_text())
def extract_markdown(path, pages=None):
    """Convert the document to Markdown with pymupdf4llm and print it.

    Args:
        path: Path of the document, handed to ``pymupdf4llm.to_markdown``.
        pages: Forwarded unchanged as the ``pages`` keyword argument;
            defaults to None.
    """
    from pymupdf4llm import to_markdown

    print(to_markdown(path, pages=pages))
def extract_tables(path):
    """Print every table found in the document as a Markdown table.

    Each table is preceded by a ``--- Page N, Table M ---`` header with
    1-based page and table numbers. Tables are converted through
    ``Table.to_pandas()`` and rendered with ``DataFrame.to_markdown``.

    Args:
        path: Path of the document to open with pymupdf.
    """
    import pymupdf

    # The context manager closes the document even if table detection or
    # rendering raises part-way through.
    with pymupdf.open(path) as doc:
        for i, page in enumerate(doc):
            tables = page.find_tables()
            for j, table in enumerate(tables.tables):
                print(f"\n--- Page {i+1}, Table {j+1} ---\n")
                df = table.to_pandas()
                print(df.to_markdown(index=False))
def extract_images(path, output_dir):
    """Save every image referenced by the document's pages as a PNG file.

    Files are named ``page{P}_img{K}.png`` (both 1-based) inside
    ``output_dir``, which is created if it does not exist. A summary line
    with the number of saved images is printed at the end.

    Args:
        path: Path of the document to open with pymupdf.
        output_dir: Directory that receives the PNG files.
    """
    import pymupdf
    from pathlib import Path

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    count = 0
    # The context manager closes the document even if saving an image fails.
    with pymupdf.open(path) as doc:
        for i, page in enumerate(doc):
            for img_idx, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                pix = pymupdf.Pixmap(doc, xref)
                # More than three colour components (alpha excluded) means
                # CMYK, which PNG cannot store: convert to RGB first. The
                # alpha channel must be subtracted, otherwise CMYK without
                # alpha (n == 4) is missed.
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                out_path = out_dir / f"page{i+1}_img{img_idx+1}.png"
                pix.save(str(out_path))
                count += 1
    print(f"Extracted {count} images to {output_dir}/")
def show_metadata(path):
    """Print the page count and basic document metadata as indented JSON.

    The output object has the keys ``pages``, ``title``, ``author``,
    ``subject``, ``creator``, ``producer`` and ``format``. Metadata entries
    that are missing or None are reported as empty strings.

    Args:
        path: Path of the document to open with pymupdf.
    """
    import pymupdf

    # Read what is needed and close the document before printing.
    with pymupdf.open(path) as doc:
        page_count = len(doc)
        # metadata itself may be None; fall back to an empty mapping.
        meta = doc.metadata or {}

    info = {"pages": page_count}
    for key in ("title", "author", "subject", "creator", "producer", "format"):
        # `or ""` also covers keys that are present with a None value, which
        # a plain .get(key, "") default would let through as null.
        info[key] = meta.get(key) or ""
    print(json.dumps(info, indent=2))
def parse_page_spec(spec):
    """Turn a ``--pages`` value into a list of 0-based page indexes.

    Args:
        spec: Either a single number (``"3"``) or an inclusive range
            (``"0-4"``).

    Returns:
        A list of ints: one element for a single number, or every index
        from start to end inclusive for a range.

    Raises:
        ValueError: If a part is not an integer or the range end is
            smaller than its start.
    """
    start_text, dash, end_text = spec.partition("-")
    start = int(start_text)
    if not dash:
        return [start]
    # A second dash (e.g. "1-2-3") stays in end_text and fails int() here.
    end = int(end_text)
    if end < start:
        raise ValueError(f"range end {end} is before start {start}")
    return list(range(start, end + 1))


def main(argv=None):
    """Run the command-line interface and return a process exit status.

    The first argument is the document path. The first matching mode flag
    in the order ``--metadata``, ``--tables``, ``--images``, ``--markdown``
    selects the action; with none of them, plain text is printed.
    ``--pages`` applies to the Markdown and plain-text modes only.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 on success or after printing help, 2 for an invalid ``--pages``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        return 0

    path = args[0]
    pages = None
    if "--pages" in args:
        idx = args.index("--pages")
        # Report a missing or malformed value instead of an IndexError /
        # ValueError traceback.
        if idx + 1 >= len(args):
            print("error: --pages requires a value such as 3 or 0-4",
                  file=sys.stderr)
            return 2
        try:
            pages = parse_page_spec(args[idx + 1])
        except ValueError:
            print(f"error: invalid --pages value {args[idx + 1]!r}; "
                  "expected N or START-END", file=sys.stderr)
            return 2

    if "--metadata" in args:
        show_metadata(path)
    elif "--tables" in args:
        extract_tables(path)
    elif "--images" in args:
        idx = args.index("--images")
        output_dir = "./images"
        # Only treat the next argument as the directory when it is not
        # itself another option.
        if idx + 1 < len(args) and not args[idx + 1].startswith("--"):
            output_dir = args[idx + 1]
        extract_images(path, output_dir)
    elif "--markdown" in args:
        extract_markdown(path, pages=pages)
    else:
        extract_text(path, pages=pages)
    return 0


if __name__ == "__main__":
    sys.exit(main())