test: add preprocessing pipeline test script
Adds test script for validating PDF preprocessing pipeline: - Garble rate detection unit tests - Page number pattern detection unit tests - Integration tests with demo_docs/edit*.pdf files - Quality report generation verification Usage: PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
159
scripts/run_preprocessing_tests.py
Normal file
159
scripts/run_preprocessing_tests.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for PDF Preprocessing Pipeline.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
cd /home/egg/project/Tool_OCR
|
||||||
|
PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add backend to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
|
||||||
|
|
||||||
|
from app.services.direct_extraction_engine import DirectExtractionEngine
|
||||||
|
|
||||||
|
|
||||||
|
def test_preprocessing_pipeline():
    """Run the extraction engine over the demo PDFs and print the results.

    For each of ``demo_docs/edit.pdf``, ``edit2.pdf`` and ``edit3.pdf`` the
    function extracts the document, prints the engine's quality report,
    the per-page garble rates, the pages flagged for OCR and a per-page
    element summary. Missing files are reported as ``[SKIP]``; any exception
    raised while handling a file is printed with its traceback and the loop
    moves on to the next file. Nothing is asserted or returned.
    """
    banner = "=" * 60
    # The same value configures the engine and flags pages in the printout.
    garble_threshold = 0.1

    # Engine with every preprocessing stage switched on.
    engine_options = {
        "enable_content_sanitization": True,
        "enable_hidden_layer_removal": True,
        "enable_whiteout_detection": True,
        "whiteout_iou_threshold": 0.8,
        "enable_page_number_filter": True,
        "enable_garble_detection": True,
        "garble_ocr_fallback_threshold": garble_threshold,
    }
    engine = DirectExtractionEngine(**engine_options)

    # Demo documents live next to the scripts directory.
    demo_dir = Path(__file__).parent.parent / "demo_docs"
    candidates = [
        demo_dir / file_name
        for file_name in ("edit.pdf", "edit2.pdf", "edit3.pdf")
    ]

    print(banner)
    print("PDF Preprocessing Pipeline Test")
    print(banner)

    # Report whether Ghostscript can be used by the engine.
    print(f"\nGhostscript available: {engine.is_ghostscript_available()}")

    for pdf_path in candidates:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue

        print("\n" + banner)
        print(f"Testing: {pdf_path.name}")
        print(banner)

        try:
            # Extraction runs the preprocessing stages configured above.
            document = engine.extract(pdf_path)
            report = engine.get_extraction_quality_report(document)

            print("\n[Quality Report]")
            print(f" Total pages: {report['total_pages']}")
            stats = report['preprocessing_stats']
            print(f" Pages sanitized: {stats['pages_sanitized']}")
            print(f" Whiteout regions detected: {stats['total_whiteout_regions']}")
            print(f" Average garble rate: {report['average_garble_rate']:.2%}")
            print(f" Needs OCR fallback: {report['needs_ocr_fallback']}")

            # Per-page garble rates, only when the report has any.
            page_rates = report['garble_rates']
            if page_rates:
                print("\n[Per-page Garble Rates]")
                for page_no, garble_rate in page_rates.items():
                    if garble_rate > garble_threshold:
                        flag = "⚠️ HIGH"
                    else:
                        flag = "✓ OK"
                    print(f" Page {page_no}: {garble_rate:.2%} {flag}")

            # Pages the engine flags for OCR.
            pages_for_ocr = engine.get_pages_needing_ocr(document)
            if pages_for_ocr:
                print(f"\n[Pages needing OCR]: {pages_for_ocr}")

            # Element kinds are told apart by the attributes they expose.
            print("\n[Extraction Summary]")
            for page in document.pages:
                n_text = sum(1 for el in page.elements if hasattr(el, 'content'))
                n_tables = sum(1 for el in page.elements if hasattr(el, 'rows'))
                n_images = sum(1 for el in page.elements if hasattr(el, 'image_path'))
                print(f" Page {page.page_number}: {n_text} text blocks, {n_tables} tables, {n_images} images")

            print(f"\n[SUCCESS] {pdf_path.name} processed successfully")

        except Exception as exc:
            # Keep going so one bad document does not hide the others.
            print(f"\n[ERROR] {pdf_path.name}: {exc}")
            import traceback
            traceback.print_exc()

    print("\n" + banner)
    print("Test Complete")
    print(banner)
|
||||||
|
|
||||||
|
|
||||||
|
def test_garble_detection():
    """Print a pass/fail line for each garble-rate sample text.

    Each sample is passed to ``DirectExtractionEngine._calculate_garble_rate``
    and the returned rate is compared with an approximate expected value.
    A sample passes when the rate is within ``tolerance`` of that value.
    Results are only printed; nothing is asserted or returned.
    """
    # Expected values are rough estimates, hence the generous tolerance.
    tolerance = 0.15
    # Longest prefix of a sample that is echoed in the output.
    preview_len = 40

    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Garble Detection Unit Test")
    print("=" * 60)

    test_cases = [
        ("Normal text without issues", 0.0),
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]

    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < tolerance else "✗"
        # Add the ellipsis only when the sample was actually cut short;
        # previously every line ended in "..." even for untruncated text.
        if len(text) > preview_len:
            preview = text[:preview_len] + "..."
        else:
            preview = text
        # Show the expected value as well, matching the page-number test output.
        print(f" {status} '{preview}' -> {rate:.2%} (expected ~{expected_approx:.0%})")
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_number_detection():
    """Print a pass/fail line for each page-number sample string.

    Every sample is passed to ``DirectExtractionEngine._is_page_number`` and
    the result is compared with the expected classification. Results are
    only printed; nothing is asserted or returned.
    """
    detector = DirectExtractionEngine()

    rule = "=" * 60
    print("\n" + rule)
    print("Page Number Detection Unit Test")
    print(rule)

    # Strings that should be recognised as page numbers.
    page_number_like = (
        "1",
        "42",
        "Page 1",
        "page 123",
        "- 5 -",
        "1/10",
        "第 3 頁",
        "第5页",
    )
    # Strings that should be left alone.
    ordinary_text = (
        "Hello World",
        "Chapter 1",
        "2023-01-01",
    )
    samples = [(sample, True) for sample in page_number_like]
    samples += [(sample, False) for sample in ordinary_text]

    for sample, should_match in samples:
        detected = detector._is_page_number(sample)
        if detected == should_match:
            mark = "✓"
        else:
            mark = "✗"
        print(f" {mark} '{sample}' -> {detected} (expected {should_match})")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the two method-level checks first, then the pass over the demo PDFs.
    for run_check in (
        test_garble_detection,
        test_page_number_detection,
        test_preprocessing_pipeline,
    ):
        run_check()
|
||||||
Reference in New Issue
Block a user