#!/usr/bin/env python3
"""
Test script for PDF Preprocessing Pipeline.

Covers:
- Garble rate detection unit tests
- Page number pattern detection unit tests
- Integration tests with demo_docs/edit*.pdf files
- Quality report generation verification

Usage:
    cd /home/egg/project/Tool_OCR
    PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py
"""

import sys
from pathlib import Path

# Add backend to sys.path so `app.*` imports resolve even without PYTHONPATH.
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from app.services.direct_extraction_engine import DirectExtractionEngine


def test_preprocessing_pipeline():
    """Run the full preprocessing pipeline against the demo PDFs.

    Prints a per-file quality report (sanitization stats, whiteout regions,
    garble rates, OCR-fallback flags) and an extraction summary; missing
    demo files are skipped rather than treated as failures.
    """

    # Initialize engine with every preprocessing feature enabled so the
    # integration run exercises the whole pipeline.
    engine = DirectExtractionEngine(
        enable_content_sanitization=True,
        enable_hidden_layer_removal=True,
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8,
        enable_page_number_filter=True,
        enable_garble_detection=True,
        garble_ocr_fallback_threshold=0.1
    )

    # Demo fixtures live next to the scripts/ directory in the repo root.
    demo_docs = Path(__file__).parent.parent / "demo_docs"
    test_files = [
        demo_docs / "edit.pdf",
        demo_docs / "edit2.pdf",
        demo_docs / "edit3.pdf",
    ]

    print("=" * 60)
    print("PDF Preprocessing Pipeline Test")
    print("=" * 60)

    # Ghostscript availability changes which preprocessing paths run.
    gs_available = engine.is_ghostscript_available()
    print(f"\nGhostscript available: {gs_available}")

    for pdf_path in test_files:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue

        print(f"\n{'=' * 60}")
        print(f"Testing: {pdf_path.name}")
        print("=" * 60)

        try:
            # Extract with preprocessing applied.
            doc = engine.extract(pdf_path)

            # Aggregate quality metrics for the extracted document.
            report = engine.get_extraction_quality_report(doc)

            print("\n[Quality Report]")
            print(f"  Total pages: {report['total_pages']}")
            print(f"  Pages sanitized: {report['preprocessing_stats']['pages_sanitized']}")
            print(f"  Whiteout regions detected: {report['preprocessing_stats']['total_whiteout_regions']}")
            print(f"  Average garble rate: {report['average_garble_rate']:.2%}")
            print(f"  Needs OCR fallback: {report['needs_ocr_fallback']}")

            # Per-page garble rates; 10% is the OCR-fallback threshold above.
            if report['garble_rates']:
                print("\n[Per-page Garble Rates]")
                for page_num, rate in report['garble_rates'].items():
                    status = "⚠️ HIGH" if rate > 0.1 else "✓ OK"
                    print(f"  Page {page_num}: {rate:.2%} {status}")

            # Pages whose text quality is too poor for direct extraction.
            ocr_pages = engine.get_pages_needing_ocr(doc)
            if ocr_pages:
                print(f"\n[Pages needing OCR]: {ocr_pages}")

            # Element counts per page, classified duck-typing-style by the
            # attribute each element kind exposes (content/rows/image_path).
            print("\n[Extraction Summary]")
            for page in doc.pages:
                text_count = len([e for e in page.elements if hasattr(e, 'content')])
                table_count = len([e for e in page.elements if hasattr(e, 'rows')])
                image_count = len([e for e in page.elements if hasattr(e, 'image_path')])
                print(f"  Page {page.page_number}: {text_count} text blocks, {table_count} tables, {image_count} images")

            print(f"\n[SUCCESS] {pdf_path.name} processed successfully")

        except Exception as e:
            # Best-effort runner: report the failure and continue with the
            # remaining demo files instead of aborting the whole run.
            print(f"\n[ERROR] {pdf_path.name}: {e}")
            import traceback
            traceback.print_exc()

    print("\n" + "=" * 60)
    print("Test Complete")
    print("=" * 60)


def test_garble_detection():
    """Exercise garble-rate calculation on known sample strings.

    Expected rates are approximate; a ±15% tolerance is applied because the
    engine's exact counting of garbled characters may differ slightly.
    """

    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Garble Detection Unit Test")
    print("=" * 60)

    # (sample text, approximate expected garble rate)
    test_cases = [
        ("Normal text without issues", 0.0),
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]

    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < 0.15 else "✗"
        print(f"  {status} '{text[:40]}...' -> {rate:.2%}")


def test_page_number_detection():
    """Exercise page-number pattern detection on positive and negative samples."""

    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Page Number Detection Unit Test")
    print("=" * 60)

    # (candidate text, whether it should be classified as a page number)
    test_cases = [
        ("1", True),
        ("42", True),
        ("Page 1", True),
        ("page 123", True),
        ("- 5 -", True),
        ("1/10", True),
        ("第 3 頁", True),
        ("第5页", True),
        ("Hello World", False),
        ("Chapter 1", False),
        ("2023-01-01", False),
    ]

    for text, expected in test_cases:
        result = engine._is_page_number(text)
        status = "✓" if result == expected else "✗"
        print(f"  {status} '{text}' -> {result} (expected {expected})")


if __name__ == "__main__":
    test_garble_detection()
    test_page_number_detection()
    test_preprocessing_pipeline()