""" Generate test PDF to verify Phase 1 fixes """ import sys import os from pathlib import Path # Add backend to path sys.path.insert(0, str(Path(__file__).parent.parent)) from app.services.direct_extraction_engine import DirectExtractionEngine from app.services.pdf_generator_service import PDFGeneratorService from app.services.unified_document_exporter import UnifiedDocumentExporter def generate_test_pdf(input_pdf: str, output_dir: Path): """Generate test PDF using Direct Track extraction""" input_path = Path(input_pdf) output_dir.mkdir(parents=True, exist_ok=True) print(f"Processing: {input_path.name}") print(f"Output dir: {output_dir}") # Step 1: Extract with Direct Track engine = DirectExtractionEngine( enable_table_detection=True, enable_image_extraction=True, min_image_area=200.0, # Filter tiny images enable_whiteout_detection=True, enable_content_sanitization=True ) unified_doc = engine.extract(input_path, output_dir=output_dir) # Print extraction stats print(f"\n=== Extraction Results ===") print(f"Document ID: {unified_doc.document_id}") print(f"Pages: {len(unified_doc.pages)}") table_count = 0 image_count = 0 merged_cells = 0 total_cells = 0 for page in unified_doc.pages: for elem in page.elements: if elem.type.value == 'table': table_count += 1 if elem.content and hasattr(elem.content, 'cells'): total_cells += len(elem.content.cells) for cell in elem.content.cells: if cell.row_span > 1 or cell.col_span > 1: merged_cells += 1 elif elem.type.value == 'image': image_count += 1 print(f"Tables: {table_count}") print(f" - Total cells: {total_cells}") print(f" - Merged cells: {merged_cells}") print(f"Images: {image_count}") # Step 2: Export to JSON exporter = UnifiedDocumentExporter() json_path = output_dir / f"{input_path.stem}_result.json" exporter.export_to_json(unified_doc, json_path) print(f"\nJSON saved: {json_path}") # Step 3: Generate layout PDF pdf_generator = PDFGeneratorService() pdf_path = output_dir / f"{input_path.stem}_layout.pdf" try: pdf_generator.generate_from_unified_document( unified_doc=unified_doc, output_path=pdf_path, source_file_path=input_path ) print(f"PDF saved: {pdf_path}") return pdf_path except Exception as e: print(f"PDF generation error: {e}") import traceback traceback.print_exc() return None if __name__ == "__main__": # Test with edit3.pdf (has complex tables with merging) demo_docs = Path(__file__).parent.parent.parent / "demo_docs" output_base = Path(__file__).parent.parent / "storage" / "test_phase1" # Process edit3.pdf edit3_pdf = demo_docs / "edit3.pdf" if edit3_pdf.exists(): output_dir = output_base / "edit3" result = generate_test_pdf(str(edit3_pdf), output_dir) if result: print(f"\n✓ Test PDF generated: {result}") # Also process edit.pdf for comparison edit_pdf = demo_docs / "edit.pdf" if edit_pdf.exists(): output_dir = output_base / "edit" result = generate_test_pdf(str(edit_pdf), output_dir) if result: print(f"\n✓ Test PDF generated: {result}") print(f"\n=== Output Location ===") print(f"{output_base}")