#!/usr/bin/env python3
"""
Test script for PDF Preprocessing Pipeline.

Runs two print-only unit checks against DirectExtractionEngine helpers
(garble-rate calculation and page-number detection) and then an end-to-end
extraction over the demo PDFs, printing a quality report for each file.

Usage:
    PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py
"""

import sys
import traceback
from pathlib import Path

# Add backend to path (must happen before the `app` import below).
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from app.services.direct_extraction_engine import DirectExtractionEngine  # noqa: E402

SEPARATOR = "=" * 60

# Shared by the engine configuration and the per-page HIGH/OK status so the
# two can never drift apart.
GARBLE_THRESHOLD = 0.1

# Maximum absolute difference tolerated between measured and expected
# garble rates in the unit check.
GARBLE_TOLERANCE = 0.15

# Longest sample text echoed verbatim in the garble check output.
PREVIEW_LENGTH = 40


def _print_banner(title: str, *, leading_blank: bool = True) -> None:
    """Print *title* framed by two separator lines."""
    prefix = "\n" if leading_blank else ""
    print(f"{prefix}{SEPARATOR}")
    print(title)
    print(SEPARATOR)


def _preview(text: str, limit: int = PREVIEW_LENGTH) -> str:
    """Return *text*, cut to *limit* characters plus '...' only if longer."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _print_quality_report(report) -> None:
    """Print the summary figures and per-page garble rates from *report*."""
    stats = report['preprocessing_stats']

    print("\n[Quality Report]")
    print(f" Total pages: {report['total_pages']}")
    print(f" Pages sanitized: {stats['pages_sanitized']}")
    print(f" Whiteout regions (vector): {stats['total_whiteout_regions']}")
    print(f" Covering images (black/white): {stats.get('total_covering_images', 0)}")
    print(f" Average garble rate: {report['average_garble_rate']:.2%}")
    print(f" Needs OCR fallback: {report['needs_ocr_fallback']}")

    # Show per-page garble rates
    if report['garble_rates']:
        print("\n[Per-page Garble Rates]")
        for page_num, rate in report['garble_rates'].items():
            status = "⚠️ HIGH" if rate > GARBLE_THRESHOLD else "✓ OK"
            print(f" Page {page_num}: {rate:.2%} {status}")


def _print_extraction_summary(doc) -> None:
    """Print, per page, how many elements expose each marker attribute."""
    print("\n[Extraction Summary]")
    for page in doc.pages:
        # Elements are classified by the attribute they carry.
        text_count = sum(1 for e in page.elements if hasattr(e, 'content'))
        table_count = sum(1 for e in page.elements if hasattr(e, 'rows'))
        image_count = sum(1 for e in page.elements if hasattr(e, 'image_path'))
        print(
            f" Page {page.page_number}: {text_count} text blocks, "
            f"{table_count} tables, {image_count} images"
        )


def _process_pdf(engine, pdf_path: Path) -> None:
    """Extract *pdf_path* with *engine* and print its report and summary."""
    # Extract with preprocessing
    doc = engine.extract(pdf_path)

    # Get quality report
    _print_quality_report(engine.get_extraction_quality_report(doc))

    # Pages needing OCR
    ocr_pages = engine.get_pages_needing_ocr(doc)
    if ocr_pages:
        print(f"\n[Pages needing OCR]: {ocr_pages}")

    # Show extraction summary
    _print_extraction_summary(doc)

    print(f"\n[SUCCESS] {pdf_path.name} processed successfully")


def test_preprocessing_pipeline() -> None:
    """Test the preprocessing pipeline with demo PDFs.

    Missing files are skipped. Any exception raised while processing one
    file is printed together with its traceback and the loop moves on to the
    next file, so a single bad PDF does not abort the run.
    """
    # Initialize engine with preprocessing options
    engine = DirectExtractionEngine(
        enable_content_sanitization=True,
        enable_hidden_layer_removal=True,
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8,
        enable_page_number_filter=True,
        enable_garble_detection=True,
        garble_ocr_fallback_threshold=GARBLE_THRESHOLD,
    )

    # Test files
    demo_docs = Path(__file__).parent.parent / "demo_docs"
    test_files = [
        demo_docs / "edit.pdf",
        demo_docs / "edit2.pdf",
        demo_docs / "edit3.pdf",
    ]

    _print_banner("PDF Preprocessing Pipeline Test", leading_blank=False)

    # Check GS availability
    gs_available = engine.is_ghostscript_available()
    print(f"\nGhostscript available: {gs_available}")

    for pdf_path in test_files:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue

        _print_banner(f"Testing: {pdf_path.name}")

        try:
            _process_pdf(engine, pdf_path)
        except Exception as e:
            # Deliberately broad: this is a best-effort smoke run over
            # several files, so report the failure and keep going.
            print(f"\n[ERROR] {pdf_path.name}: {e}")
            traceback.print_exc()

    _print_banner("Test Complete")


def test_garble_detection() -> None:
    """Test garble rate calculation with sample text.

    Prints one ✓/✗ line per sample, comparing the engine's
    ``_calculate_garble_rate`` result with an approximate expected rate
    (within ``GARBLE_TOLERANCE``). Mismatches are only reported in the
    output; nothing is raised.
    """
    engine = DirectExtractionEngine()

    _print_banner("Garble Detection Unit Test")

    test_cases = [
        ("Normal text without issues", 0.0),
        # NOTE(review): the comment below suggests ~10/35 (about 0.29) while
        # the expected value is 0.1 - confirm how the engine counts garbled
        # characters.
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]

    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < GARBLE_TOLERANCE else "✗"
        print(
            f" {status} '{_preview(text)}' -> {rate:.2%} "
            f"(expected ~{expected_approx:.0%})"
        )


def test_page_number_detection() -> None:
    """Test page number pattern detection.

    Prints one ✓/✗ line per sample, comparing the engine's
    ``_is_page_number`` result with the expected boolean. Mismatches are
    only reported in the output; nothing is raised.
    """
    engine = DirectExtractionEngine()

    _print_banner("Page Number Detection Unit Test")

    test_cases = [
        ("1", True),
        ("42", True),
        ("Page 1", True),
        ("page 123", True),
        ("- 5 -", True),
        ("1/10", True),
        ("第 3 頁", True),
        ("第5页", True),
        ("Hello World", False),
        ("Chapter 1", False),
        ("2023-01-01", False),
    ]

    for text, expected in test_cases:
        result = engine._is_page_number(text)
        status = "✓" if result == expected else "✗"
        print(f" {status} '{text}' -> {result} (expected {expected})")


if __name__ == "__main__":
    test_garble_detection()
    test_page_number_detection()
    test_preprocessing_pipeline()