# Python source: 112 lines, 3.5 KiB
"""
|
|
Generate test PDF to verify Phase 1 fixes
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Add backend to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from app.services.direct_extraction_engine import DirectExtractionEngine
|
|
from app.services.pdf_generator_service import PDFGeneratorService
|
|
from app.services.unified_document_exporter import UnifiedDocumentExporter
|
|
|
|
|
|
def _count_tables_and_images(unified_doc):
    """Tally tables, table cells, merged cells and images across all pages.

    Returns:
        A ``(table_count, total_cells, merged_cells, image_count)`` tuple.
        A cell counts as merged when its ``row_span`` or ``col_span`` exceeds 1.
    """
    table_count = 0
    total_cells = 0
    merged_cells = 0
    image_count = 0

    for page in unified_doc.pages:
        for elem in page.elements:
            kind = elem.type.value
            if kind == 'table':
                table_count += 1
                # Content may be empty, may lack a ``cells`` attribute, or may
                # carry ``cells = None``; all three contribute no cells.
                cells = getattr(elem.content, 'cells', None) if elem.content else None
                if cells:
                    total_cells += len(cells)
                    merged_cells += sum(
                        1 for cell in cells
                        if cell.row_span > 1 or cell.col_span > 1
                    )
            elif kind == 'image':
                image_count += 1

    return table_count, total_cells, merged_cells, image_count


def generate_test_pdf(input_pdf: str, output_dir: Path):
    """Generate test PDF using Direct Track extraction.

    Runs three steps and prints progress along the way:

    1. Extract ``input_pdf`` with ``DirectExtractionEngine`` and print the
       document id, page count and table/cell/image statistics.
    2. Export the extracted document to ``<stem>_result.json`` in
       ``output_dir``.
    3. Render ``<stem>_layout.pdf`` in ``output_dir``.

    Args:
        input_pdf: Path of the source PDF.
        output_dir: Directory receiving all outputs; created (including
            parents) when it does not exist.

    Returns:
        The path of the generated layout PDF, or ``None`` when PDF generation
        raised (the error and its traceback are printed). Errors raised by
        extraction or JSON export are not caught here and propagate.
    """
    input_path = Path(input_pdf)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing: {input_path.name}")
    print(f"Output dir: {output_dir}")

    # Step 1: Extract with Direct Track
    engine = DirectExtractionEngine(
        enable_table_detection=True,
        enable_image_extraction=True,
        min_image_area=200.0,  # Filter tiny images
        enable_whiteout_detection=True,
        enable_content_sanitization=True,
    )
    unified_doc = engine.extract(input_path, output_dir=output_dir)

    # Print extraction stats
    print("\n=== Extraction Results ===")
    print(f"Document ID: {unified_doc.document_id}")
    print(f"Pages: {len(unified_doc.pages)}")

    table_count, total_cells, merged_cells, image_count = (
        _count_tables_and_images(unified_doc)
    )
    print(f"Tables: {table_count}")
    print(f" - Total cells: {total_cells}")
    print(f" - Merged cells: {merged_cells}")
    print(f"Images: {image_count}")

    # Step 2: Export to JSON
    exporter = UnifiedDocumentExporter()
    json_path = output_dir / f"{input_path.stem}_result.json"
    exporter.export_to_json(unified_doc, json_path)
    print(f"\nJSON saved: {json_path}")

    # Step 3: Generate layout PDF
    pdf_generator = PDFGeneratorService()
    pdf_path = output_dir / f"{input_path.stem}_layout.pdf"

    try:
        pdf_generator.generate_from_unified_document(
            unified_doc=unified_doc,
            output_path=pdf_path,
            source_file_path=input_path,
        )
    except Exception as e:
        # Deliberate best-effort boundary for this diagnostic script: report
        # the failure in full and let the caller continue with other inputs.
        print(f"PDF generation error: {e}")
        import traceback
        traceback.print_exc()
        return None
    else:
        print(f"PDF saved: {pdf_path}")
        return pdf_path
|
|
|
|
|
|
if __name__ == "__main__":
    # Input and output locations are resolved relative to this script's path.
    script_path = Path(__file__)
    demo_docs = script_path.parent.parent.parent / "demo_docs"
    output_base = script_path.parent.parent / "storage" / "test_phase1"

    # edit3.pdf has complex tables with merging; edit.pdf is processed
    # afterwards for comparison. Missing inputs are skipped silently.
    for stem in ("edit3", "edit"):
        source_pdf = demo_docs / f"{stem}.pdf"
        if not source_pdf.exists():
            continue

        generated = generate_test_pdf(str(source_pdf), output_base / stem)
        if generated:
            print(f"\n✓ Test PDF generated: {generated}")

    print(f"\n=== Output Location ===")
    print(f"{output_base}")
|