- Introduced new skills for extracting text from PDFs, scanned documents, and images using OCR and document parsing tools. - Added detailed documentation for usage and installation of `pymupdf` and `marker-pdf` for local extraction. - Implemented scripts for text extraction with both lightweight and high-quality options, including support for various document formats. - Updated web extraction functionality to handle PDF URLs directly, enhancing usability for academic papers and documents.
99 lines
3.1 KiB
Python
99 lines
3.1 KiB
Python
#!/usr/bin/env python3
"""Extract text from documents using pymupdf. Lightweight (~25MB), no models.

Usage:
    python extract_pymupdf.py document.pdf
    python extract_pymupdf.py document.pdf --markdown
    python extract_pymupdf.py document.pdf --pages 0-4
    python extract_pymupdf.py document.pdf --images output_dir/
    python extract_pymupdf.py document.pdf --tables
    python extract_pymupdf.py document.pdf --metadata
"""
|
|
import sys
|
|
import json
|
|
|
|
def extract_text(path, pages=None):
    """Print the plain text of a document, one page at a time.

    Each page is preceded by a ``--- Page N/TOTAL ---`` banner, where N is
    the 1-based page number and TOTAL is the document's page count.

    Args:
        path: Location of the document, passed straight to ``pymupdf.open``.
        pages: Iterable of 0-based page indices to print. ``None`` (the
            default) prints every page. Indices outside ``0 .. TOTAL - 1``
            are skipped silently.
    """
    import pymupdf

    # The context manager closes the document even if printing fails midway.
    with pymupdf.open(path) as doc:
        total = len(doc)
        wanted = range(total) if pages is None else pages
        for index in wanted:
            # A negative index would still load a page (counted from the end)
            # but yield a misleading banner such as "Page 0/N", so it is
            # treated as out of range, exactly like an index past the end.
            if not 0 <= index < total:
                continue
            print(f"\n--- Page {index + 1}/{total} ---\n")
            print(doc[index].get_text())
|
|
|
|
def extract_markdown(path, pages=None):
    """Print the document at *path* converted to Markdown.

    The conversion is delegated to ``pymupdf4llm.to_markdown``; *pages* is
    forwarded to it unchanged (``None`` when the caller gives no selection).
    """
    from pymupdf4llm import to_markdown

    print(to_markdown(path, pages=pages))
|
|
|
|
def extract_tables(path):
    """Print every table detected in the document as a Markdown table.

    Tables are located per page with ``page.find_tables()``, converted to a
    pandas DataFrame and rendered with ``DataFrame.to_markdown``. Each table
    is preceded by a ``--- Page N, Table M ---`` banner (both 1-based).

    Args:
        path: Location of the document, passed straight to ``pymupdf.open``.
    """
    import pymupdf

    # The context manager closes the document once all pages are processed,
    # or earlier if table conversion raises.
    with pymupdf.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            found = page.find_tables()
            for table_no, table in enumerate(found.tables, start=1):
                print(f"\n--- Page {page_no}, Table {table_no} ---\n")
                frame = table.to_pandas()
                print(frame.to_markdown(index=False))
|
|
|
|
def extract_images(path, output_dir):
    """Save every image referenced by the document's pages as a PNG file.

    Files are written as ``page<P>_img<I>.png`` (both numbers 1-based) inside
    *output_dir*, which is created if it does not exist. A summary line with
    the number of saved images is printed at the end.

    Args:
        path: Location of the document, passed straight to ``pymupdf.open``.
        output_dir: Directory that receives the PNG files.
    """
    import pymupdf
    from pathlib import Path

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    count = 0
    # The context manager closes the document even if saving an image fails.
    with pymupdf.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for img_no, img in enumerate(images, start=1):
                xref = img[0]  # first item of each entry is the image xref
                pix = pymupdf.Pixmap(doc, xref)
                # PNG output needs a grayscale or RGB pixmap. Subtracting the
                # alpha channel leaves the number of colour components, so
                # anything above 3 (e.g. CMYK, with or without alpha) must be
                # converted first. The previous ``pix.n >= 5`` test missed
                # CMYK images that have no alpha channel (n == 4).
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                out_path = target_dir / f"page{page_no}_img{img_no}.png"
                pix.save(str(out_path))
                count += 1

    print(f"Extracted {count} images to {output_dir}/")
|
|
|
|
def show_metadata(path):
    """Print the page count and selected metadata fields as indented JSON.

    The output object contains ``pages`` followed by ``title``, ``author``,
    ``subject``, ``creator``, ``producer`` and ``format``; a field missing
    from the document's metadata dictionary is reported as an empty string.

    Args:
        path: Location of the document, passed straight to ``pymupdf.open``.
    """
    import pymupdf

    fields = ("title", "author", "subject", "creator", "producer", "format")

    # Collect everything while the document is open; the context manager
    # closes it before the summary is printed.
    with pymupdf.open(path) as doc:
        summary = {"pages": len(doc)}
        meta = doc.metadata  # read once instead of once per field
        for name in fields:
            summary[name] = meta.get(name, "")

    print(json.dumps(summary, indent=2))
|
|
|
|
def parse_pages(spec):
    """Turn a ``--pages`` value into a list of 0-based page indices.

    Args:
        spec: Either a single index (``"3"``) or an inclusive range
            (``"0-4"``).

    Returns:
        The list of selected indices, e.g. ``[0, 1, 2, 3, 4]`` for ``"0-4"``.

    Raises:
        ValueError: If *spec* is not ``N`` or ``START-END`` with non-negative
            integers, or if END is smaller than START.
    """
    # partition() splits on the first dash only, so "1-2-3" and "-1" fall
    # through to int() and are rejected there instead of failing on unpacking.
    start_text, dash, end_text = spec.partition("-")
    try:
        start = int(start_text)
        end = int(end_text) if dash else start
    except ValueError:
        raise ValueError(
            f"invalid --pages value {spec!r}; expected N or START-END"
        ) from None
    if end < start:
        raise ValueError(
            f"invalid --pages value {spec!r}; END must not be less than START"
        )
    return list(range(start, end + 1))


def main(argv=None):
    """Run the command-line interface.

    The first argument is the document path. ``--pages`` restricts text and
    Markdown extraction; the mode is chosen by the first match among
    ``--metadata``, ``--tables``, ``--images`` and ``--markdown``, falling
    back to plain-text extraction.

    Args:
        argv: Argument list without the program name. ``None`` (the default)
            uses ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    path = args[0]
    pages = None

    if "--pages" in args:
        idx = args.index("--pages")
        # Report a missing or malformed value as a one-line error instead of
        # an IndexError/ValueError traceback.
        if idx + 1 >= len(args):
            sys.exit("error: --pages requires a value such as 3 or 0-4")
        try:
            pages = parse_pages(args[idx + 1])
        except ValueError as exc:
            sys.exit(f"error: {exc}")

    if "--metadata" in args:
        show_metadata(path)
    elif "--tables" in args:
        extract_tables(path)
    elif "--images" in args:
        idx = args.index("--images")
        # Only accept the next token as the directory when it is not another
        # option; otherwise fall back to the default location.
        has_dir = idx + 1 < len(args) and not args[idx + 1].startswith("--")
        output_dir = args[idx + 1] if has_dir else "./images"
        extract_images(path, output_dir)
    elif "--markdown" in args:
        extract_markdown(path, pages=pages)
    else:
        extract_text(path, pages=pages)


if __name__ == "__main__":
    main()
|