Sync all skills and memories 2026-04-14 07:27

2026-04-14 07:27:20 +09:00
parent 516bb44fe6
commit 1eba2bca95
386 changed files with 167655 additions and 0 deletions
--- a/skills/productivity/ocr-and-documents/scripts/extract_marker.py
+++ b/skills/productivity/ocr-and-documents/scripts/extract_marker.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Extract text from documents using marker-pdf. High-quality OCR + layout analysis.
+
+Requires ~3-5GB disk (PyTorch + models downloaded on first use).
+Supports: PDF, DOCX, PPTX, XLSX, HTML, EPUB, images.
+
+Usage:
+    python extract_marker.py document.pdf
+    python extract_marker.py document.pdf --output_dir ./output
+    python extract_marker.py presentation.pptx
+    python extract_marker.py spreadsheet.xlsx
+    python extract_marker.py scanned_doc.pdf           # OCR works here
+    python extract_marker.py document.pdf --json        # Structured output
+    python extract_marker.py document.pdf --use_llm     # LLM-boosted accuracy
+"""
+import sys
+import os
+
+def convert(path, output_dir=None, output_format="markdown", use_llm=False):
+    """Convert a document with marker-pdf and print the result to stdout.
+
+    Args:
+        path: Path to the input document.
+        output_dir: Directory for extracted images. Images are written only
+            when this is set and the rendered output contains any.
+        output_format: "json" prints a JSON object with "markdown" and
+            "metadata" keys; any other value prints the markdown text.
+        use_llm: When true, sets ``use_llm`` in the marker configuration.
+
+    Raises:
+        FileNotFoundError: If ``path`` is not an existing file.
+    """
+    # Fail fast: importing marker and loading its models is slow, so reject a
+    # bad path before paying that cost.
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f"Input file not found: {path}")
+
+    from marker.converters.pdf import PdfConverter
+    from marker.models import create_model_dict
+    from marker.config.parser import ConfigParser
+
+    config_dict = {"use_llm": True} if use_llm else {}
+    config_parser = ConfigParser(config_dict)
+    converter = PdfConverter(
+        config=config_parser.generate_config_dict(),
+        artifact_dict=create_model_dict(),
+    )
+    rendered = converter(path)
+
+    if output_format == "json":
+        import json
+        payload = {
+            "markdown": rendered.markdown,
+            "metadata": getattr(rendered, "metadata", {}),
+        }
+        print(json.dumps(payload, indent=2, ensure_ascii=False))
+    else:
+        print(rendered.markdown)
+
+    # Save images if output_dir specified
+    images = getattr(rendered, "images", None)
+    if output_dir and images:
+        os.makedirs(output_dir, exist_ok=True)
+        for name, img_data in images.items():
+            img_path = os.path.join(output_dir, name)
+            if hasattr(img_data, "save"):
+                # marker hands back image objects (PIL), not encoded bytes, so
+                # they must be encoded with save(); the format is taken from
+                # the file extension in ``name``.
+                if getattr(img_data, "mode", "RGB") != "RGB":
+                    # JPEG cannot store alpha or palette modes.
+                    img_data = img_data.convert("RGB")
+                img_data.save(img_path)
+            else:
+                # Already-encoded bytes: write them through unchanged.
+                with open(img_path, "wb") as f:
+                    f.write(img_data)
+        print(f"\nSaved {len(images)} image(s) to {output_dir}/", file=sys.stderr)
+
+
+def check_requirements():
+    """Report free space on "/" and exit with status 1 if it is below 5GB."""
+    from shutil import disk_usage
+    bytes_per_gb = 1024**3
+    free_gb = disk_usage("/").free / bytes_per_gb
+    if free_gb >= 5:
+        print(f"✓ {free_gb:.1f}GB free — sufficient for marker-pdf")
+        return
+    # Not enough room: point the user at the lightweight alternative and stop.
+    print(f"⚠️  Only {free_gb:.1f}GB free. marker-pdf needs ~5GB for PyTorch + models.")
+    print("Use pymupdf instead (scripts/extract_pymupdf.py) or free up disk space.")
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    # Minimal hand-rolled CLI: the first argument is the input path (or
+    # --help / --check); the remaining flags may follow in any order.
+    args = sys.argv[1:]
+    if not args or args[0] in ("-h", "--help"):
+        print(__doc__)
+        sys.exit(0)
+
+    if args[0] == "--check":
+        check_requirements()
+        sys.exit(0)
+
+    path = args[0]
+    if path.startswith("--"):
+        # A flag in first position would otherwise be treated as the file name.
+        print(f"error: expected an input file as the first argument, got {path!r}", file=sys.stderr)
+        sys.exit(2)
+
+    output_dir = None
+    if "--output_dir" in args:
+        idx = args.index("--output_dir")
+        # A trailing "--output_dir" (or one followed by another flag) has no
+        # value; report that instead of crashing with IndexError.
+        if idx + 1 >= len(args) or args[idx + 1].startswith("--"):
+            print("error: --output_dir requires a directory argument", file=sys.stderr)
+            sys.exit(2)
+        output_dir = args[idx + 1]
+
+    convert(
+        path,
+        output_dir=output_dir,
+        output_format="json" if "--json" in args else "markdown",
+        use_llm="--use_llm" in args,
+    )
--- a/skills/productivity/ocr-and-documents/scripts/extract_pymupdf.py
+++ b/skills/productivity/ocr-and-documents/scripts/extract_pymupdf.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""Extract text from documents using pymupdf. Lightweight (~25MB), no models.
+
+Usage:
+    python extract_pymupdf.py document.pdf
+    python extract_pymupdf.py document.pdf --markdown
+    python extract_pymupdf.py document.pdf --pages 0-4
+    python extract_pymupdf.py document.pdf --images output_dir/
+    python extract_pymupdf.py document.pdf --tables
+    python extract_pymupdf.py document.pdf --metadata
+"""
+import sys
+import json
+
+def extract_text(path, pages=None):
+    """Print the plain text of each requested page, preceded by a page banner.
+
+    Args:
+        path: Path of the document to open with pymupdf.
+        pages: Iterable of 0-based page indices, or None for every page.
+            Indices outside the document are skipped with a warning on stderr.
+    """
+    import pymupdf
+    # The context manager closes the document even if extraction fails.
+    with pymupdf.open(path) as doc:
+        total = len(doc)
+        page_range = range(total) if pages is None else pages
+        for i in page_range:
+            if not 0 <= i < total:
+                print(f"Skipping page index {i}: document has {total} page(s)", file=sys.stderr)
+                continue
+            print(f"\n--- Page {i+1}/{total} ---\n")
+            print(doc[i].get_text())
+
+def extract_markdown(path, pages=None):
+    """Print the document converted to Markdown by pymupdf4llm.
+
+    ``pages`` is forwarded unchanged to ``pymupdf4llm.to_markdown``.
+    """
+    from pymupdf4llm import to_markdown
+    print(to_markdown(path, pages=pages))
+
+def extract_tables(path):
+    """Print every table pymupdf detects in the document as a Markdown table.
+
+    Each table is preceded by a banner with its 1-based page and table number.
+    Rendering goes through pandas when it (and its Markdown backend) can be
+    imported; otherwise pymupdf's own Markdown export is used.
+
+    Args:
+        path: Path of the document to open with pymupdf.
+    """
+    import pymupdf
+    # The context manager closes the document even if table detection fails.
+    with pymupdf.open(path) as doc:
+        for i, page in enumerate(doc):
+            for j, table in enumerate(page.find_tables().tables):
+                print(f"\n--- Page {i+1}, Table {j+1} ---\n")
+                try:
+                    rendered = table.to_pandas().to_markdown(index=False)
+                except ImportError:
+                    # pandas / tabulate are optional extras; fall back to the
+                    # built-in exporter rather than failing on a lean install.
+                    rendered = table.to_markdown()
+                print(rendered)
+
+def extract_images(path, output_dir):
+    """Save every image referenced by the document's pages as a PNG file.
+
+    Files are named ``page<P>_img<K>.png`` using 1-based page and per-page
+    image numbers, and the number of files written is printed at the end.
+
+    Args:
+        path: Path of the document to open with pymupdf.
+        output_dir: Destination directory; created if it does not exist.
+    """
+    import pymupdf
+    from pathlib import Path
+    out_dir = Path(output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    count = 0
+    # The context manager closes the document even if a pixmap fails to save.
+    with pymupdf.open(path) as doc:
+        for i, page in enumerate(doc):
+            for img_idx, img in enumerate(page.get_images(full=True)):
+                xref = img[0]
+                pix = pymupdf.Pixmap(doc, xref)
+                # PNG holds only gray or RGB data. More than three colour
+                # components (alpha excluded) means CMYK, which must be
+                # converted first; a plain "n >= 5" test misses CMYK images
+                # that have no alpha channel (n == 4).
+                if pix.n - pix.alpha > 3:
+                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+                pix.save(str(out_dir / f"page{i+1}_img{img_idx+1}.png"))
+                count += 1
+    print(f"Extracted {count} images to {output_dir}/")
+
+def show_metadata(path):
+    """Print the page count and basic document metadata as indented JSON.
+
+    The object has a "pages" key followed by "title", "author", "subject",
+    "creator", "producer" and "format"; fields that are missing or empty are
+    reported as "".
+
+    Args:
+        path: Path of the document to open with pymupdf.
+    """
+    import pymupdf
+    # The context manager closes the document once the values are read.
+    with pymupdf.open(path) as doc:
+        page_count = len(doc)
+        # NOTE(review): PyMuPDF documents metadata as possibly None (e.g. for
+        # a document that still needs a password), so guard before .get().
+        meta = doc.metadata or {}
+    fields = ("title", "author", "subject", "creator", "producer", "format")
+    info = {"pages": page_count}
+    info.update((key, meta.get(key) or "") for key in fields)
+    # ensure_ascii=False keeps non-ASCII titles/authors readable, matching
+    # the JSON output of the marker extraction script.
+    print(json.dumps(info, indent=2, ensure_ascii=False))
+
+if __name__ == "__main__":
+    # Minimal hand-rolled CLI: the first argument is the input path; mode
+    # flags are checked in priority order metadata > tables > images >
+    # markdown > plain text.
+    args = sys.argv[1:]
+    if not args or args[0] in ("-h", "--help"):
+        print(__doc__)
+        sys.exit(0)
+
+    path = args[0]
+    pages = None
+
+    if "--pages" in args:
+        idx = args.index("--pages")
+        # An empty spec (flag given last) falls into the ValueError path below.
+        p = args[idx + 1] if idx + 1 < len(args) else ""
+        try:
+            if "-" in p:
+                # "START-END" is an inclusive range of 0-based page indices.
+                start, end = p.split("-")
+                pages = list(range(int(start), int(end) + 1))
+            else:
+                pages = [int(p)]
+        except ValueError:
+            print(f"error: --pages expects N or START-END (0-based), got {p!r}", file=sys.stderr)
+            sys.exit(2)
+
+    if "--metadata" in args:
+        show_metadata(path)
+    elif "--tables" in args:
+        extract_tables(path)
+    elif "--images" in args:
+        idx = args.index("--images")
+        # Use the default directory when no value follows, or when the next
+        # token is another flag rather than a directory name.
+        has_value = idx + 1 < len(args) and not args[idx + 1].startswith("--")
+        output_dir = args[idx + 1] if has_value else "./images"
+        extract_images(path, output_dir)
+    elif "--markdown" in args:
+        extract_markdown(path, pages=pages)
+    else:
+        extract_text(path, pages=pages)