#!/usr/bin/env python3
"""
Test script for PDF Preprocessing Pipeline.

Covers:
- Garble rate detection unit tests
- Page number pattern detection unit tests
- Integration tests with demo_docs/edit*.pdf files
- Quality report generation verification

Usage:
    cd /home/egg/project/Tool_OCR
    PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py
"""

import sys
from pathlib import Path

# Add backend to sys.path so `app.*` imports resolve even without PYTHONPATH.
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from app.services.direct_extraction_engine import DirectExtractionEngine


def test_preprocessing_pipeline():
    """Run the full preprocessing pipeline against the demo PDFs.

    Prints a per-file quality report (sanitization stats, whiteout regions,
    garble rates, OCR-fallback flags) and an extraction summary; missing
    demo files are skipped rather than treated as failures.
    """

    # Initialize engine with every preprocessing feature enabled so the
    # integration run exercises the whole pipeline.
    engine = DirectExtractionEngine(
        enable_content_sanitization=True,
        enable_hidden_layer_removal=True,
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8,
        enable_page_number_filter=True,
        enable_garble_detection=True,
        garble_ocr_fallback_threshold=0.1
    )

    # Demo fixtures live next to the scripts/ directory in the repo root.
    demo_docs = Path(__file__).parent.parent / "demo_docs"
    test_files = [
        demo_docs / "edit.pdf",
        demo_docs / "edit2.pdf",
        demo_docs / "edit3.pdf",
    ]

    print("=" * 60)
    print("PDF Preprocessing Pipeline Test")
    print("=" * 60)

    # Ghostscript availability changes which preprocessing paths run.
    gs_available = engine.is_ghostscript_available()
    print(f"\nGhostscript available: {gs_available}")

    for pdf_path in test_files:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue

        print(f"\n{'=' * 60}")
        print(f"Testing: {pdf_path.name}")
        print("=" * 60)

        try:
            # Extract with preprocessing applied.
            doc = engine.extract(pdf_path)

            # Aggregate quality metrics for the extracted document.
            report = engine.get_extraction_quality_report(doc)

            print("\n[Quality Report]")
            print(f"  Total pages: {report['total_pages']}")
            print(f"  Pages sanitized: {report['preprocessing_stats']['pages_sanitized']}")
            print(f"  Whiteout regions detected: {report['preprocessing_stats']['total_whiteout_regions']}")
            print(f"  Average garble rate: {report['average_garble_rate']:.2%}")
            print(f"  Needs OCR fallback: {report['needs_ocr_fallback']}")

            # Per-page garble rates; 10% is the OCR-fallback threshold above.
            if report['garble_rates']:
                print("\n[Per-page Garble Rates]")
                for page_num, rate in report['garble_rates'].items():
                    status = "⚠️ HIGH" if rate > 0.1 else "✓ OK"
                    print(f"  Page {page_num}: {rate:.2%} {status}")

            # Pages whose text quality is too poor for direct extraction.
            ocr_pages = engine.get_pages_needing_ocr(doc)
            if ocr_pages:
                print(f"\n[Pages needing OCR]: {ocr_pages}")

            # Element counts per page, classified duck-typing-style by the
            # attribute each element kind exposes (content/rows/image_path).
            print("\n[Extraction Summary]")
            for page in doc.pages:
                text_count = len([e for e in page.elements if hasattr(e, 'content')])
                table_count = len([e for e in page.elements if hasattr(e, 'rows')])
                image_count = len([e for e in page.elements if hasattr(e, 'image_path')])
                print(f"  Page {page.page_number}: {text_count} text blocks, {table_count} tables, {image_count} images")

            print(f"\n[SUCCESS] {pdf_path.name} processed successfully")

        except Exception as e:
            # Best-effort runner: report the failure and continue with the
            # remaining demo files instead of aborting the whole run.
            print(f"\n[ERROR] {pdf_path.name}: {e}")
            import traceback
            traceback.print_exc()

    print("\n" + "=" * 60)
    print("Test Complete")
    print("=" * 60)


def test_garble_detection():
    """Exercise garble-rate calculation on known sample strings.

    Expected rates are approximate; a ±15% tolerance is applied because the
    engine's exact counting of garbled characters may differ slightly.
    """

    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Garble Detection Unit Test")
    print("=" * 60)

    # (sample text, approximate expected garble rate)
    test_cases = [
        ("Normal text without issues", 0.0),
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]

    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < 0.15 else "✗"
        print(f"  {status} '{text[:40]}...' -> {rate:.2%}")


def test_page_number_detection():
    """Exercise page-number pattern detection on positive and negative samples."""

    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Page Number Detection Unit Test")
    print("=" * 60)

    # (candidate text, whether it should be classified as a page number)
    test_cases = [
        ("1", True),
        ("42", True),
        ("Page 1", True),
        ("page 123", True),
        ("- 5 -", True),
        ("1/10", True),
        ("第 3 頁", True),
        ("第5页", True),
        ("Hello World", False),
        ("Chapter 1", False),
        ("2023-01-01", False),
    ]

    for text, expected in test_cases:
        result = engine._is_page_number(text)
        status = "✓" if result == expected else "✗"
        print(f"  {status} '{text}' -> {result} (expected {expected})")


if __name__ == "__main__":
    test_garble_detection()
    test_page_number_detection()
    test_preprocessing_pipeline()