"""
Test script for layered rendering approach.

Tests that table borders are drawn from cell_boxes while text is rendered
at raw OCR positions.

Usage: run the file directly; an optional first command-line argument
selects the task id whose stored results are used.
"""
import json
import sys
import traceback
from pathlib import Path

from app.services.pdf_generator_service import PDFGeneratorService
from app.services.gap_filling_service import GapFillingService

# Task whose stored results are used when no other task id is supplied.
DEFAULT_TASK_ID = "84899366-f361-44f1-b989-5aba72419ca5"

# Element type values that this script counts as "text" elements.
TEXT_ELEMENT_TYPES = ('text', 'paragraph', 'title')


def _count_text_elements(document) -> int:
    """Return how many elements across all pages have a text-like type."""
    return sum(
        1
        for page in document.pages
        for elem in page.elements
        if elem.type.value in TEXT_ELEMENT_TYPES
    )


def _load_raw_ocr_regions(raw_ocr_path: Path) -> list:
    """Load the raw OCR regions stored at *raw_ocr_path*.

    Returns an empty list when the file does not exist. The JSON root may
    be either a list of regions or a dict holding them under the
    'text_regions' key; any other root type yields an empty list instead
    of raising.
    """
    if not raw_ocr_path.exists():
        return []

    print(f"[INFO] Loading raw OCR regions from {raw_ocr_path}")
    with open(raw_ocr_path, 'r', encoding='utf-8') as f:
        raw_ocr_data = json.load(f)

    # Could be a list or dict with 'text_regions' key
    if isinstance(raw_ocr_data, list):
        regions = raw_ocr_data
    elif isinstance(raw_ocr_data, dict):
        regions = raw_ocr_data.get('text_regions', [])
    else:
        regions = []
    print(f"[INFO] Raw OCR regions: {len(regions)}")
    return regions


def test_layered_rendering(task_id: str = DEFAULT_TASK_ID) -> bool:
    """Test the layered rendering approach.

    Loads the stored scan result of *task_id*, applies gap filling page by
    page using the raw OCR regions, and generates
    ``test_layered_rendering.pdf`` inside the task's result directory.

    Args:
        task_id: Id of the task whose result directory is used. Defaults
            to the existing test task.

    Returns:
        True when the PDF was generated, False on any failure (missing
        input files, unparsable document, or PDF generation error).
    """
    result_dir = Path(__file__).resolve().parents[2] / "storage" / "results" / task_id

    if not result_dir.exists():
        print(f"[ERROR] Result directory not found: {result_dir}")
        return False

    # Load scan_result.json
    scan_result_path = result_dir / "scan_result.json"
    if not scan_result_path.exists():
        print("[ERROR] scan_result.json not found")
        return False

    print(f"[INFO] Loading scan_result.json from {scan_result_path}")
    with open(scan_result_path, 'r', encoding='utf-8') as f:
        scan_result = json.load(f)

    # Parse as UnifiedDocument using PDFGeneratorService's method
    # scan_result IS the unified document (not nested under 'unified_document')
    pdf_service = PDFGeneratorService()
    unified_doc = pdf_service._json_to_unified_document(scan_result, result_dir)

    if not unified_doc:
        print("[ERROR] Failed to parse UnifiedDocument")
        return False

    print(f"[INFO] UnifiedDocument: {unified_doc.page_count} pages")

    # Count tables and report what each one carries in its metadata
    table_count = 0
    for page in unified_doc.pages:
        for elem in page.elements:
            if elem.type.value != 'table':
                continue
            table_count += 1
            # cell_boxes / embedded_images live in metadata, not content
            metadata = elem.metadata or {}
            cell_boxes = metadata.get('cell_boxes', [])
            embedded_images = metadata.get('embedded_images', [])
            print(f"[INFO] Table {elem.element_id}: {len(cell_boxes)} cell_boxes, {len(embedded_images)} embedded_images")

    text_count = _count_text_elements(unified_doc)
    print(f"[INFO] Tables: {table_count}, Text elements: {text_count}")

    # Apply gap filling for each page
    print("[INFO] Applying GapFillingService...")
    gap_service = GapFillingService()

    for position, page in enumerate(unified_doc.pages, start=1):
        page_num = page.page_number

        # Each page gets its own raw OCR regions; feeding page 1's regions
        # to every page would offer unrelated text as gap-fill candidates
        # on later pages. The first page resolves to the same
        # "..._scan_page_1_..." file as before.
        # NOTE(review): assumes later pages follow the same file naming
        # pattern with a 1-based page index - confirm against the OCR
        # service. Pages without a file are processed with no raw regions.
        raw_ocr_path = result_dir / f"{task_id}_scan_page_{position}_raw_ocr_regions.json"
        raw_ocr_regions = _load_raw_ocr_regions(raw_ocr_path)

        filled_elements, stats = gap_service.fill_gaps(
            raw_ocr_regions=raw_ocr_regions,
            pp_structure_elements=page.elements,
            page_number=page_num,
            pp_dimensions=page.dimensions
        )

        # Update the page's elements in place; the document object itself
        # is reused for PDF generation below.
        page.elements = filled_elements
        print(f"[INFO] Page {page_num}: Added {stats.get('gaps_filled', 0)} gap-filled regions")

    # Count elements after gap filling
    final_text_count = _count_text_elements(unified_doc)
    print(f"[INFO] After gap filling: {final_text_count} text elements (was {text_count})")

    # Generate PDF
    print("[INFO] Generating PDF with layered rendering...")
    output_pdf = result_dir / "test_layered_rendering.pdf"

    # Broad catch is deliberate: this is the script's top-level boundary,
    # and any failure is reported with a traceback and turned into False.
    try:
        success = pdf_service.generate_from_unified_document(
            unified_doc=unified_doc,
            output_path=output_pdf
        )
    except Exception as e:
        print(f"[ERROR] PDF generation failed: {e}")
        traceback.print_exc()
        return False

    if not success:
        print("[ERROR] PDF generation returned False")
        return False

    print(f"[SUCCESS] PDF generated: {output_pdf}")
    print(f"[INFO] PDF size: {output_pdf.stat().st_size} bytes")
    return True


if __name__ == "__main__":
    # Optional first argument overrides the default task id.
    cli_task_id = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_TASK_ID
    success = test_layered_rendering(cli_task_id)
    sys.exit(0 if success else 1)