OCR/scripts/run_preprocessing_tests.py
egg 86a6633000 feat: consolidate env config and add deployment files
- Add debug_font_path, demo_docs_dir, e2e_api_base_url to config.py
- Fix hardcoded paths in pp_structure_debug.py, create_demo_images.py
- Fix hardcoded paths in test files
- Update .env.example with new configuration options
- Update .gitignore to exclude AI development files (.claude/, openspec/, AGENTS.md, CLAUDE.md)
- Add production startup script (start-prod.sh)
- Add README.md with project documentation
- Add 1panel Docker deployment files (docker-compose.yml, Dockerfiles, nginx.conf)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 15:02:16 +08:00
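
The bullets above mention consolidating debug_font_path, demo_docs_dir, and e2e_api_base_url into config.py and mirroring them in .env.example, but that file is not shown here. The snippet below is only a minimal sketch of what such environment-backed settings might look like: the field names come from the commit message, while the class layout, env-var names, and defaults are assumptions for illustration.

```python
# Hypothetical sketch of the consolidated settings (not the actual config.py).
# Field names follow the commit message; everything else is assumed.
import os
from pathlib import Path
from typing import Optional


class Settings:
    """Environment-backed configuration sketch."""

    def __init__(self) -> None:
        # Font used by debug rendering scripts such as pp_structure_debug.py
        font = os.getenv("DEBUG_FONT_PATH")
        self.debug_font_path: Optional[Path] = Path(font) if font else None
        # Directory holding the demo PDFs used by scripts like create_demo_images.py
        self.demo_docs_dir: Path = Path(os.getenv("DEMO_DOCS_DIR", "demo_docs"))
        # Base URL the end-to-end tests call instead of a hardcoded host
        self.e2e_api_base_url: str = os.getenv("E2E_API_BASE_URL", "http://localhost:8000")


settings = Settings()
```

With matching entries in .env.example (for example DEBUG_FONT_PATH, DEMO_DOCS_DIR, E2E_API_BASE_URL), scripts and tests can read these values from the settings object instead of hardcoding paths.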


#!/usr/bin/env python3
"""
Test script for PDF Preprocessing Pipeline.

Usage:
    PYTHONPATH=backend python3 scripts/run_preprocessing_tests.py
"""
import sys
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from app.services.direct_extraction_engine import DirectExtractionEngine
def test_preprocessing_pipeline():
    """Test the preprocessing pipeline with demo PDFs."""
    # Initialize engine with preprocessing options
    engine = DirectExtractionEngine(
        enable_content_sanitization=True,
        enable_hidden_layer_removal=True,
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8,
        enable_page_number_filter=True,
        enable_garble_detection=True,
        garble_ocr_fallback_threshold=0.1,
    )

    # Test files
    demo_docs = Path(__file__).parent.parent / "demo_docs"
    test_files = [
        demo_docs / "edit.pdf",
        demo_docs / "edit2.pdf",
        demo_docs / "edit3.pdf",
    ]

    print("=" * 60)
    print("PDF Preprocessing Pipeline Test")
    print("=" * 60)

    # Check GS availability
    gs_available = engine.is_ghostscript_available()
    print(f"\nGhostscript available: {gs_available}")

    for pdf_path in test_files:
        if not pdf_path.exists():
            print(f"\n[SKIP] {pdf_path.name} - file not found")
            continue

        print(f"\n{'=' * 60}")
        print(f"Testing: {pdf_path.name}")
        print("=" * 60)

        try:
            # Extract with preprocessing
            doc = engine.extract(pdf_path)

            # Get quality report
            report = engine.get_extraction_quality_report(doc)

            print("\n[Quality Report]")
            print(f" Total pages: {report['total_pages']}")
            print(f" Pages sanitized: {report['preprocessing_stats']['pages_sanitized']}")
            print(f" Whiteout regions (vector): {report['preprocessing_stats']['total_whiteout_regions']}")
            print(f" Covering images (black/white): {report['preprocessing_stats'].get('total_covering_images', 0)}")
            print(f" Average garble rate: {report['average_garble_rate']:.2%}")
            print(f" Needs OCR fallback: {report['needs_ocr_fallback']}")

            # Show per-page garble rates
            if report['garble_rates']:
                print("\n[Per-page Garble Rates]")
                for page_num, rate in report['garble_rates'].items():
                    status = "⚠️ HIGH" if rate > 0.1 else "✓ OK"
                    print(f" Page {page_num}: {rate:.2%} {status}")

            # Pages needing OCR
            ocr_pages = engine.get_pages_needing_ocr(doc)
            if ocr_pages:
                print(f"\n[Pages needing OCR]: {ocr_pages}")

            # Show extraction summary
            print("\n[Extraction Summary]")
            for page in doc.pages:
                text_count = len([e for e in page.elements if hasattr(e, 'content')])
                table_count = len([e for e in page.elements if hasattr(e, 'rows')])
                image_count = len([e for e in page.elements if hasattr(e, 'image_path')])
                print(f" Page {page.page_number}: {text_count} text blocks, {table_count} tables, {image_count} images")

            print(f"\n[SUCCESS] {pdf_path.name} processed successfully")

        except Exception as e:
            print(f"\n[ERROR] {pdf_path.name}: {e}")
            import traceback
            traceback.print_exc()

    print("\n" + "=" * 60)
    print("Test Complete")
    print("=" * 60)
def test_garble_detection():
    """Test garble rate calculation with sample text."""
    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Garble Detection Unit Test")
    print("=" * 60)

    test_cases = [
        ("Normal text without issues", 0.0),
        ("Text with (cid:123) garbled chars", 0.1),  # ~10 chars out of ~35
        ("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)", 1.0),  # 100% garbled
        ("Hello \ufffd world", 0.08),  # replacement char
        ("Mixed (cid:99) and \ufffd issues", 0.25),
    ]

    for text, expected_approx in test_cases:
        rate = engine._calculate_garble_rate(text)
        status = "✓" if abs(rate - expected_approx) < 0.15 else "✗"
        print(f" {status} '{text[:40]}...' -> {rate:.2%}")
def test_page_number_detection():
    """Test page number pattern detection."""
    engine = DirectExtractionEngine()

    print("\n" + "=" * 60)
    print("Page Number Detection Unit Test")
    print("=" * 60)

    test_cases = [
        ("1", True),
        ("42", True),
        ("Page 1", True),
        ("page 123", True),
        ("- 5 -", True),
        ("1/10", True),
        ("第 3 頁", True),
        ("第5页", True),
        ("Hello World", False),
        ("Chapter 1", False),
        ("2023-01-01", False),
    ]

    for text, expected in test_cases:
        result = engine._is_page_number(text)
        status = "✓" if result == expected else "✗"
        print(f" {status} '{text}' -> {result} (expected {expected})")
if __name__ == "__main__":
    test_garble_detection()
    test_page_number_detection()
    test_preprocessing_pipeline()