OCR/scripts/run_preprocessing_tests.py
egg 86a6633000 feat: consolidate env config and add deployment files
- Add debug_font_path, demo_docs_dir, e2e_api_base_url to config.py
- Fix hardcoded paths in pp_structure_debug.py, create_demo_images.py
- Fix hardcoded paths in test files
- Update .env.example with new configuration options
- Update .gitignore to exclude AI development files (.claude/, openspec/, AGENTS.md, CLAUDE.md)
- Add production startup script (start-prod.sh)
- Add README.md with project documentation
- Add 1panel Docker deployment files (docker-compose.yml, Dockerfiles, nginx.conf)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 15:02:16 +08:00
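
The bullets above mention consolidating debug_font_path, demo_docs_dir, and e2e_api_base_url into config.py and mirroring them in .env.example, but that file is not shown here. The snippet below is only a minimal sketch of what such environment-backed settings might look like: the field names come from the commit message, while the class layout, env-var names, and defaults are assumptions for illustration.

```python
# Hypothetical sketch of the consolidated settings (not the actual config.py).
# Field names follow the commit message; everything else is assumed.
import os
from pathlib import Path
from typing import Optional


class Settings:
    """Environment-backed configuration sketch."""

    def __init__(self) -> None:
        # Font used by debug rendering scripts such as pp_structure_debug.py
        font = os.getenv("DEBUG_FONT_PATH")
        self.debug_font_path: Optional[Path] = Path(font) if font else None
        # Directory holding the demo PDFs used by scripts like create_demo_images.py
        self.demo_docs_dir: Path = Path(os.getenv("DEMO_DOCS_DIR", "demo_docs"))
        # Base URL the end-to-end tests call instead of a hardcoded host
        self.e2e_api_base_url: str = os.getenv("E2E_API_BASE_URL", "http://localhost:8000")


settings = Settings()
```

With matching entries in .env.example (for example DEBUG_FONT_PATH, DEMO_DOCS_DIR, E2E_API_BASE_URL), scripts and tests can read these values from the settings object instead of hardcoding paths.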


#!/usr/bin/env python3
"""
Test script for PDF Preprocessing Pipeline.

Usage:
    PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py
"""
import sys
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from app.services.direct_extraction_engine import DirectExtractionEngine
def test_preprocessing_pipeline():
    """Test the preprocessing pipeline with demo PDFs."""
    # Initialize engine with preprocessing options
    engine = DirectExtractionEngine(
        enable_content_sanitization=True,
        enable_hidden_layer_removal=True,
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8,
        enable_page_number_filter=True,
        enable_garble_detection=True,
        garble_ocr_fallback_threshold=0.1,
    )

    # Test files
    demo_docs = Path(__file__).parent.parent / "demo_docs"
    test_files = [
        demo_docs / "edit.pdf",
        demo_docs / "edit2.pdf",
        demo_docs / "edit3.pdf",
    ]

    print("=" * 60)
    print("PDF Preprocessing Pipeline Test")
    print("=" * 60)

    # Check GS availability
    gs_available = engine.is_ghostscript_available()
    print(f"\nGhostscript available: {gs_available}")

    for pdf_path in test_files:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue

        print(f"\n{'=' * 60}")
        print(f"Testing: {pdf_path.name}")
        print("=" * 60)

        try:
            # Extract with preprocessing
            doc = engine.extract(pdf_path)

            # Get quality report
            report = engine.get_extraction_quality_report(doc)

            print("\n[Quality Report]")
            print(f" Total pages: {report['total_pages']}")
            print(f" Pages sanitized: {report['preprocessing_stats']['pages_sanitized']}")
            print(f" Whiteout regions (vector): {report['preprocessing_stats']['total_whiteout_regions']}")
            print(f" Covering images (black/white): {report['preprocessing_stats'].get('total_covering_images', 0)}")
            print(f" Average garble rate: {report['average_garble_rate']:.2%}")
            print(f" Needs OCR fallback: {report['needs_ocr_fallback']}")

            # Show per-page garble rates
            if report['garble_rates']:
                print("\n[Per-page Garble Rates]")
                for page_num, rate in report['garble_rates'].items():
                    status = "⚠️ HIGH" if rate > 0.1 else "✓ OK"
                    print(f" Page {page_num}: {rate:.2%} {status}")

            # Pages needing OCR
            ocr_pages = engine.get_pages_needing_ocr(doc)
            if ocr_pages:
                print(f"\n[Pages needing OCR]: {ocr_pages}")

            # Show extraction summary
            print("\n[Extraction Summary]")
            for page in doc.pages:
                text_count = len([e for e in page.elements if hasattr(e, 'content')])
                table_count = len([e for e in page.elements if hasattr(e, 'rows')])
                image_count = len([e for e in page.elements if hasattr(e, 'image_path')])
                print(f" Page {page.page_number}: {text_count} text blocks, {table_count} tables, {image_count} images")

            print(f"\n[SUCCESS] {pdf_path.name} processed successfully")

        except Exception as e:
            print(f"\n[ERROR] {pdf_path.name}: {e}")
            import traceback
            traceback.print_exc()

    print("\n" + "=" * 60)
    print("Test Complete")
    print("=" * 60)
def test_garble_detection():
    """Test garble rate calculation with sample text."""
    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Garble Detection Unit Test")
    print("=" * 60)

    test_cases = [
        ("Normal text without issues", 0.0),
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]

    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < 0.15 else "✗"
        print(f" {status} '{text[:40]}...' -> {rate:.2%}")
def test_page_number_detection():
    """Test page number pattern detection."""
    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Page Number Detection Unit Test")
    print("=" * 60)

    test_cases = [
        ("1", True),
        ("42", True),
        ("Page 1", True),
        ("page 123", True),
        ("- 5 -", True),
        ("1/10", True),
        ("第 3 頁", True),
        ("第5页", True),
        ("Hello World", False),
        ("Chapter 1", False),
        ("2023-01-01", False),
    ]

    for text, expected in test_cases:
        result = engine._is_page_number(text)
        status = "✓" if result == expected else "✗"
        print(f" {status} '{text}' -> {result} (expected {expected})")
if __name__ == "__main__":
    test_garble_detection()
    test_page_number_detection()
    test_preprocessing_pipeline()