# OCR/backend/tests/archived/manual_layered_rendering.py

"""
Test script for layered rendering approach.
Tests that table borders are drawn from cell_boxes
while text is rendered at raw OCR positions.
"""
import sys

import json
from pathlib import Path
from app.services.pdf_generator_service import PDFGeneratorService
from app.services.gap_filling_service import GapFillingService


# Element type names that are tallied as "text" when counting elements.
_TEXT_ELEMENT_TYPES = ('text', 'paragraph', 'title')


def _count_text_elements(document) -> int:
    """Return how many elements of *document* have a text-like type."""
    return sum(
        1
        for page in document.pages
        for elem in page.elements
        if elem.type.value in _TEXT_ELEMENT_TYPES
    )


def _load_raw_ocr_regions(raw_ocr_path: Path) -> list:
    """Load the raw OCR regions stored at *raw_ocr_path*.

    The JSON payload may be either a bare list of regions or a dict holding
    them under the 'text_regions' key.

    Returns:
        The list of regions, or an empty list when the file does not exist
        or its top-level JSON value is neither a list nor a dict.
    """
    if not raw_ocr_path.exists():
        return []

    print(f"[INFO] Loading raw OCR regions from {raw_ocr_path}")
    with open(raw_ocr_path, 'r', encoding='utf-8') as f:
        raw_ocr_data = json.load(f)

    if isinstance(raw_ocr_data, list):
        regions = raw_ocr_data
    elif isinstance(raw_ocr_data, dict):
        regions = raw_ocr_data.get('text_regions', [])
    else:
        # Any other JSON value (string, number, null) has no regions to offer;
        # report it instead of failing with AttributeError on .get().
        print(f"[WARN] Unexpected raw OCR payload type: {type(raw_ocr_data).__name__}")
        regions = []

    print(f"[INFO] Raw OCR regions: {len(regions)}")
    return regions


def test_layered_rendering():
    """Run the layered rendering pipeline against a stored task result.

    Steps: load scan_result.json for a fixed task id, parse it into a
    UnifiedDocument, report table/text element counts, run GapFillingService
    on every page using that page's raw OCR regions, and finally render
    "test_layered_rendering.pdf" into the task's result directory.

    Returns:
        True when the PDF was generated, False on any failure (missing input
        files, unparsable document, or PDF generation error).
    """
    # Use existing test task
    task_id = "84899366-f361-44f1-b989-5aba72419ca5"
    result_dir = Path(__file__).resolve().parents[2] / "storage" / "results" / task_id

    if not result_dir.exists():
        print(f"[ERROR] Result directory not found: {result_dir}")
        return False

    scan_result_path = result_dir / "scan_result.json"
    if not scan_result_path.exists():
        print("[ERROR] scan_result.json not found")
        return False

    print(f"[INFO] Loading scan_result.json from {scan_result_path}")
    with open(scan_result_path, 'r', encoding='utf-8') as f:
        scan_result = json.load(f)

    # scan_result IS the unified document (not nested under 'unified_document'),
    # so it is handed directly to PDFGeneratorService's parser.
    pdf_service = PDFGeneratorService()
    unified_doc = pdf_service._json_to_unified_document(scan_result, result_dir)

    if not unified_doc:
        print("[ERROR] Failed to parse UnifiedDocument")
        return False

    print(f"[INFO] UnifiedDocument: {unified_doc.page_count} pages")

    # Report every table together with the cell_boxes / embedded_images found
    # in its metadata (not in its content).
    table_count = 0
    for page in unified_doc.pages:
        for elem in page.elements:
            if elem.type.value != 'table':
                continue
            table_count += 1
            metadata = elem.metadata if elem.metadata else {}
            cell_boxes = metadata.get('cell_boxes', [])
            embedded_images = metadata.get('embedded_images', [])
            print(f"[INFO] Table {elem.element_id}: {len(cell_boxes)} cell_boxes, {len(embedded_images)} embedded_images")

    text_count = _count_text_elements(unified_doc)
    print(f"[INFO] Tables: {table_count}, Text elements: {text_count}")

    # Apply gap filling for each page
    print("[INFO] Applying GapFillingService...")
    gap_service = GapFillingService()
    gap_filled_doc = unified_doc  # Same object: pages are updated in place

    for page_index, page in enumerate(unified_doc.pages, start=1):
        page_num = page.page_number

        # Each page gets its own raw OCR regions. Reusing the page-1 file for
        # every page would gap-fill page-1 text into the later pages.
        # NOTE(review): the file name is derived from the page's 1-based
        # position, generalizing the known "..._scan_page_1_..." name; confirm
        # that later pages are stored with the same pattern.
        raw_ocr_path = result_dir / f"{task_id}_scan_page_{page_index}_raw_ocr_regions.json"
        raw_ocr_regions = _load_raw_ocr_regions(raw_ocr_path)

        filled_elements, stats = gap_service.fill_gaps(
            raw_ocr_regions=raw_ocr_regions,
            pp_structure_elements=page.elements,
            page_number=page_num,
            pp_dimensions=page.dimensions,
        )

        # Update the page's elements
        page.elements = filled_elements
        print(f"[INFO] Page {page_num}: Added {stats.get('gaps_filled', 0)} gap-filled regions")

    final_text_count = _count_text_elements(gap_filled_doc)
    print(f"[INFO] After gap filling: {final_text_count} text elements (was {text_count})")

    # Generate PDF
    print("[INFO] Generating PDF with layered rendering...")
    output_pdf = result_dir / "test_layered_rendering.pdf"

    try:
        success = pdf_service.generate_from_unified_document(
            unified_doc=gap_filled_doc,
            output_path=output_pdf
        )
    except Exception as e:
        # Top-level boundary of a manual script: report the failure with a
        # full traceback and signal it through the return value.
        print(f"[ERROR] PDF generation failed: {e}")
        import traceback
        traceback.print_exc()
        return False

    if not success:
        print("[ERROR] PDF generation returned False")
        return False

    print(f"[SUCCESS] PDF generated: {output_pdf}")
    print(f"[INFO] PDF size: {output_pdf.stat().st_size} bytes")
    return True


if __name__ == "__main__":
    # Translate the boolean outcome into a process exit status:
    # 0 when test_layered_rendering() returns a truthy value, 1 otherwise.
    exit_code = 0 if test_layered_rendering() else 1
    sys.exit(exit_code)