Sync all skills and memories 2026-04-14 07:27
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract text from documents using marker-pdf. High-quality OCR + layout analysis.
|
||||
|
||||
Requires ~3-5GB disk (PyTorch + models downloaded on first use).
|
||||
Supports: PDF, DOCX, PPTX, XLSX, HTML, EPUB, images.
|
||||
|
||||
Usage:
|
||||
python extract_marker.py document.pdf
|
||||
python extract_marker.py document.pdf --output_dir ./output
|
||||
python extract_marker.py presentation.pptx
|
||||
python extract_marker.py spreadsheet.xlsx
|
||||
python extract_marker.py scanned_doc.pdf # OCR works here
|
||||
python extract_marker.py document.pdf --json # Structured output
|
||||
python extract_marker.py document.pdf --use_llm # LLM-boosted accuracy
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
|
||||
def convert(path, output_dir=None, output_format="markdown", use_llm=False):
    """Convert a document with marker-pdf and print the result to stdout.

    Args:
        path: Path of the document to convert.
        output_dir: Directory that receives the extracted images. When it is
            ``None`` (the default) no images are written.
        output_format: ``"json"`` prints a JSON object with ``markdown`` and
            ``metadata`` keys; any other value prints the Markdown text only.
        use_llm: When true, ``use_llm`` is enabled in the marker configuration.
    """
    # Imported lazily so that --help and --check work without marker installed.
    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict

    config_dict = {"use_llm": True} if use_llm else {}
    config_parser = ConfigParser(config_dict)
    models = create_model_dict()
    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=models,
    )
    rendered = converter(path)

    if output_format == "json":
        import json

        payload = {
            "markdown": rendered.markdown,
            "metadata": getattr(rendered, "metadata", {}),
        }
        print(json.dumps(payload, indent=2, ensure_ascii=False))
    else:
        print(rendered.markdown)

    # Save images only when an output directory was requested.
    images = getattr(rendered, "images", None)
    if output_dir and images:
        from pathlib import Path

        Path(output_dir).mkdir(parents=True, exist_ok=True)
        for name, img_data in images.items():
            img_path = os.path.join(output_dir, name)
            if hasattr(img_data, "save"):
                # marker hands back PIL image objects; they must be encoded
                # through save() and cannot be written as raw bytes.
                img_data.save(img_path)
            else:
                # Fallback for already-encoded bytes.
                with open(img_path, "wb") as f:
                    f.write(img_data)
        # Status goes to stderr so stdout carries only the document text.
        print(f"\nSaved {len(images)} image(s) to {output_dir}/", file=sys.stderr)
|
||||
|
||||
|
||||
def check_requirements():
    """Report free space on "/" and exit with status 1 when it is below 5 GB."""
    import shutil

    bytes_per_gb = 1024**3
    free_gb = shutil.disk_usage("/").free / bytes_per_gb

    if free_gb >= 5:
        print(f"✓ {free_gb:.1f}GB free — sufficient for marker-pdf")
        return

    # Not enough room: point at the lightweight alternative and stop.
    print(f"⚠️ Only {free_gb:.1f}GB free. marker-pdf needs ~5GB for PyTorch + models.")
    print("Use pymupdf instead (scripts/extract_pymupdf.py) or free up disk space.")
    sys.exit(1)
|
||||
|
||||
|
||||
def main(argv=None):
    """Run the marker-pdf command line and return the process exit status.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        0 after a successful conversion, after printing the help text, or
        after a passing ``--check``; 2 when ``--output_dir`` has no value.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        return 0

    if args[0] == "--check":
        # check_requirements() exits with status 1 itself when space is short.
        check_requirements()
        return 0

    path = args[0]
    output_dir = None
    if "--output_dir" in args:
        value_pos = args.index("--output_dir") + 1
        if value_pos >= len(args):
            # The flag was the last argument: report it instead of IndexError.
            print("error: --output_dir requires a directory argument", file=sys.stderr)
            return 2
        output_dir = args[value_pos]

    output_format = "json" if "--json" in args else "markdown"
    use_llm = "--use_llm" in args

    convert(path, output_dir=output_dir, output_format=output_format, use_llm=use_llm)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract text from documents using pymupdf. Lightweight (~25MB), no models.
|
||||
|
||||
Usage:
|
||||
python extract_pymupdf.py document.pdf
|
||||
python extract_pymupdf.py document.pdf --markdown
|
||||
python extract_pymupdf.py document.pdf --pages 0-4
|
||||
python extract_pymupdf.py document.pdf --images output_dir/
|
||||
python extract_pymupdf.py document.pdf --tables
|
||||
python extract_pymupdf.py document.pdf --metadata
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
|
||||
def extract_text(path, pages=None):
    """Print the plain text of each requested page, preceded by a page banner.

    Args:
        path: Document to open with pymupdf.
        pages: Iterable of 0-based page indices; ``None`` selects every page.
            Indices at or beyond the page count are skipped silently.
    """
    import pymupdf

    # The context manager closes the document once printing is finished.
    with pymupdf.open(path) as doc:
        total = len(doc)
        page_range = range(total) if pages is None else pages
        for i in page_range:
            if i < total:
                print(f"\n--- Page {i+1}/{total} ---\n")
                print(doc[i].get_text())
|
||||
|
||||
def extract_markdown(path, pages=None):
    """Print the document as Markdown produced by ``pymupdf4llm.to_markdown``.

    Args:
        path: Document path handed to pymupdf4llm.
        pages: Page selection forwarded unchanged as the ``pages`` keyword
            (``None`` by default).
    """
    from pymupdf4llm import to_markdown

    print(to_markdown(path, pages=pages))
|
||||
|
||||
def extract_tables(path):
    """Print every table pymupdf finds in the document as a Markdown table.

    Each table is preceded by a banner with its 1-based page and table number.

    Args:
        path: Document to open with pymupdf.
    """
    import pymupdf

    # The context manager closes the document after the last page.
    with pymupdf.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            found = page.find_tables()
            for table_no, table in enumerate(found.tables, start=1):
                print(f"\n--- Page {page_no}, Table {table_no} ---\n")
                # NOTE: goes through pandas; DataFrame.to_markdown additionally
                # needs the optional `tabulate` package to be installed.
                df = table.to_pandas()
                print(df.to_markdown(index=False))
|
||||
|
||||
def extract_images(path, output_dir):
    """Save every image referenced by the document's pages as a PNG file.

    Files are named ``page<P>_img<I>.png`` (both 1-based) inside *output_dir*,
    which is created if it does not exist. A summary line with the number of
    written images is printed at the end.

    Args:
        path: Document to open with pymupdf.
        output_dir: Directory that receives the PNG files.
    """
    import pymupdf
    from pathlib import Path

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    count = 0
    # The context manager closes the document once all images are written.
    with pymupdf.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            for img_no, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                pix = pymupdf.Pixmap(doc, xref)
                # More than three colour components (alpha excluded) means
                # CMYK, which PNG cannot store: convert to RGB first. Checking
                # `pix.n >= 5` alone would miss CMYK without an alpha channel.
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                out_path = f"{output_dir}/page{page_no}_img{img_no}.png"
                pix.save(out_path)
                count += 1
    print(f"Extracted {count} images to {output_dir}/")
|
||||
|
||||
def show_metadata(path):
    """Print the page count and selected metadata fields as indented JSON.

    The output object has the keys ``pages``, ``title``, ``author``,
    ``subject``, ``creator``, ``producer`` and ``format``; a field missing
    from the document metadata is reported as an empty string.

    Args:
        path: Document to open with pymupdf.
    """
    import pymupdf

    fields = ("title", "author", "subject", "creator", "producer", "format")
    # The context manager closes the document after the values are read.
    with pymupdf.open(path) as doc:
        # PyMuPDF documents that `metadata` can be None (e.g. a document that
        # still needs a password); fall back to an empty mapping in that case.
        meta = doc.metadata or {}
        info = {"pages": len(doc)}
        info.update((key, meta.get(key, "")) for key in fields)
    print(json.dumps(info, indent=2))
|
||||
|
||||
def _parse_pages(spec):
    """Turn a ``--pages`` value ("N" or "START-END", inclusive) into a list.

    Raises:
        ValueError: If any part of *spec* is not an integer.
    """
    start, dash, end = spec.partition("-")
    if not dash:
        return [int(start)]
    return list(range(int(start), int(end) + 1))


def main(argv=None):
    """Run the pymupdf command line and return the process exit status.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        0 after running the selected extraction or printing the help text;
        2 when ``--pages`` has a missing or malformed value.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        return 0

    path = args[0]
    pages = None

    if "--pages" in args:
        value_pos = args.index("--pages") + 1
        if value_pos >= len(args):
            # The flag was the last argument: report it instead of IndexError.
            print("error: --pages requires a value such as 3 or 0-4", file=sys.stderr)
            return 2
        try:
            pages = _parse_pages(args[value_pos])
        except ValueError:
            print(
                f"error: invalid --pages value {args[value_pos]!r}; expected N or START-END",
                file=sys.stderr,
            )
            return 2

    # Exactly one mode runs; the order below is the precedence between flags.
    if "--metadata" in args:
        show_metadata(path)
    elif "--tables" in args:
        extract_tables(path)
    elif "--images" in args:
        value_pos = args.index("--images") + 1
        output_dir = args[value_pos] if value_pos < len(args) else "./images"
        extract_images(path, output_dir)
    elif "--markdown" in args:
        extract_markdown(path, pages=pages)
    else:
        extract_text(path, pages=pages)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user