test: add preprocessing pipeline test script

Adds test script for validating PDF preprocessing pipeline: - Garble rate detection unit tests - Page number pattern detection unit tests - Integration tests with demo_docs/edit*.pdf files - Quality report generation verification Usage: PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 16:51:12 +08:00
parent 6a65c7617d
commit 63b474f93a
1 changed files with 159 additions and 0 deletions
--- a/scripts/run_preprocessing_tests.py
+++ b/scripts/run_preprocessing_tests.py
@@ -0,0 +1,159 @@
 #!/usr/bin/env python3
 """
 Test script for PDF Preprocessing Pipeline.
 Usage:
    cd /home/egg/project/Tool_OCR
    PYTHONPATH=backend python3 scripts/test_preprocessing_pipeline.py
 """
 import sys
 from pathlib import Path
 # Add backend to path
 sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
 from app.services.direct_extraction_engine import DirectExtractionEngine
 def test_preprocessing_pipeline():
    """Test the preprocessing pipeline with demo PDFs."""
    # Initialize engine with preprocessing options
    engine = DirectExtractionEngine(
        enable_content_sanitization=True,
        enable_hidden_layer_removal=True,
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8,
        enable_page_number_filter=True,
        enable_garble_detection=True,
        garble_ocr_fallback_threshold=0.1
    )
    # Test files
    demo_docs = Path(__file__).parent.parent / "demo_docs"
    test_files = [
        demo_docs / "edit.pdf",
        demo_docs / "edit2.pdf",
        demo_docs / "edit3.pdf",
    ]
    print("=" * 60)
    print("PDF Preprocessing Pipeline Test")
    print("=" * 60)
    # Check GS availability
    gs_available = engine.is_ghostscript_available()
    print(f"\nGhostscript available: {gs_available}")
    for pdf_path in test_files:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue
        print(f"\n{'=' * 60}")
        print(f"Testing: {pdf_path.name}")
        print("=" * 60)
        try:
            # Extract with preprocessing
            doc = engine.extract(pdf_path)
            # Get quality report
            report = engine.get_extraction_quality_report(doc)
            print(f"\n[Quality Report]")
            print(f"  Total pages: {report['total_pages']}")
            print(f"  Pages sanitized: {report['preprocessing_stats']['pages_sanitized']}")
            print(f"  Whiteout regions detected: {report['preprocessing_stats']['total_whiteout_regions']}")
            print(f"  Average garble rate: {report['average_garble_rate']:.2%}")
            print(f"  Needs OCR fallback: {report['needs_ocr_fallback']}")
            # Show per-page garble rates
            if report['garble_rates']:
                print(f"\n[Per-page Garble Rates]")
                for page_num, rate in report['garble_rates'].items():
                    status = "⚠️ HIGH" if rate > 0.1 else "✓ OK"
                    print(f"  Page {page_num}: {rate:.2%} {status}")
            # Pages needing OCR
            ocr_pages = engine.get_pages_needing_ocr(doc)
            if ocr_pages:
                print(f"\n[Pages needing OCR]: {ocr_pages}")
            # Show extraction summary
            print(f"\n[Extraction Summary]")
            for page in doc.pages:
                text_count = len([e for e in page.elements if hasattr(e, 'content')])
                table_count = len([e for e in page.elements if hasattr(e, 'rows')])
                image_count = len([e for e in page.elements if hasattr(e, 'image_path')])
                print(f"  Page {page.page_number}: {text_count} text blocks, {table_count} tables, {image_count} images")
            print(f"\n[SUCCESS] {pdf_path.name} processed successfully")
        except Exception as e:
            print(f"\n[ERROR] {pdf_path.name}: {e}")
            import traceback
            traceback.print_exc()
    print("\n" + "=" * 60)
    print("Test Complete")
    print("=" * 60)
 def test_garble_detection():
    """Test garble rate calculation with sample text."""
    engine = DirectExtractionEngine()
    print("\n" + "=" * 60)
    print("Garble Detection Unit Test")
    print("=" * 60)
    test_cases = [
        ("Normal text without issues", 0.0),
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]
    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < 0.15 else "✗"
        print(f"  {status} '{text[:40]}...' -> {rate:.2%}")
 def test_page_number_detection():
    """Test page number pattern detection."""
    engine = DirectExtractionEngine()
    print("\n" + "=" * 60)
    print("Page Number Detection Unit Test")
    print("=" * 60)
    test_cases = [
        ("1", True),
        ("42", True),
        ("Page 1", True),
        ("page 123", True),
        ("- 5 -", True),
        ("1/10", True),
        ("第 3 頁", True),
        ("第5页", True),
        ("Hello World", False),
        ("Chapter 1", False),
        ("2023-01-01", False),
    ]
    for text, expected in test_cases:
        result = engine._is_page_number(text)
        status = "✓" if result == expected else "✗"
        print(f"  {status} '{text}' -> {result} (expected {expected})")
 if __name__ == "__main__":
    test_garble_detection()
    test_page_number_detection()
    test_preprocessing_pipeline()