# OCR/backend/test_bbox_scaling.py

#!/usr/bin/env python
"""
Test script for PDF generation with proper bbox-based dimension calculation
Simulates the real scenario where OCR processes on high-res images (e.g., 2189x3500)
but we want to generate PDFs at original size (e.g., A4: 595x842)
"""

import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
import logging

# Set up logging to see dimension calculations
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

def test_high_res_ocr_to_a4_pdf():
    """
    Generate an A4 PDF from mock OCR data whose bboxes are in high-res space.

    Scenario being exercised:
    - PaddleOCR processes a PDF at high resolution (e.g. 2189x3500)
    - OCR bbox coordinates are expressed in that high-res space
    - The generated PDF should be A4 (595x842 points)
    - For a full 2189x3500 page the scale factors would be ~0.27 and ~0.24;
      for the bbox extents of the mock data below (2185x3280) they are
      0.272 and 0.257, which is what this function prints as the expectation.

    Side effects:
        Creates ``test_output_bbox/`` relative to the current working
        directory and writes ``high_res_ocr_result.json`` and
        ``source_a4.pdf`` into it, then asks ``pdf_generator_service`` to
        write ``scaled_output.pdf`` there. Progress is printed to stdout.

    Returns:
        bool: True if ``generate_layout_pdf`` returned a truthy value,
        False otherwise.

    NOTE(review): the outcome is returned rather than asserted, so a pytest
    run will not fail on a False result; only the ``__main__`` guard turns
    it into an exit status.
    """
    # reportlab is only needed by this function, so it is imported locally.
    from reportlab.lib.pagesizes import A4
    from reportlab.pdfgen import canvas

    # Create test directory
    test_dir = Path("test_output_bbox")
    test_dir.mkdir(exist_ok=True)

    print("\n" + "="*70)
    print("測試場景：高解析度 OCR → A4 PDF 縮放")
    print("="*70)

    # Create mock OCR data with high-res bbox coordinates
    # Simulating text at various positions in the 2189x3500 coordinate space
    mock_ocr_data = {
        "status": "success",
        "file_name": "test_document.pdf",
        "language": "ch",
        "text_regions": [
            {
                "text": "標題文字在頂部",
                "bbox": [[230, 195], [1189, 182], [1189, 350], [230, 363]],  # Top of page
                "confidence": 0.95
            },
            {
                "text": "中間的文字內容",
                "bbox": [[1521, 1750], [2185, 1750], [2185, 1820], [1521, 1820]],  # Middle
                "confidence": 0.92
            },
            {
                "text": "底部的文字",
                "bbox": [[400, 3200], [1200, 3200], [1200, 3280], [400, 3280]],  # Bottom
                "confidence": 0.93
            }
        ],
        "total_text_regions": 3,
        "average_confidence": 0.933,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "# Test Document\n\n標題文字在頂部\n中間的文字內容\n底部的文字",
        "processing_time": 2.5,
        "timestamp": "2025-11-17T00:00:00"
    }

    # Derive the coordinate extents from the mock data itself so the printed
    # expectations cannot drift away from the bboxes above.
    regions = mock_ocr_data["text_regions"]
    xs = [x for region in regions for x, _ in region["bbox"]]
    ys = [y for region in regions for _, y in region["bbox"]]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)

    # A4 rounded to whole points, used only for the printed expectations
    # (the source PDF itself is created with reportlab's A4 page size).
    target_w, target_h = 595, 842

    # Save mock JSON
    json_path = test_dir / "high_res_ocr_result.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 創建測試 JSON: {json_path}")
    print(f"  - OCR 座標範圍: X=[{min_x}..{max_x}], Y=[{min_y}..{max_y}]")
    print(f"  - 預期 OCR 尺寸: ~{max_x} x ~{max_y}")

    # Create a simple one-page A4 source PDF that supplies the target
    # dimensions for the generator.
    source_pdf = test_dir / "source_a4.pdf"
    c = canvas.Canvas(str(source_pdf), pagesize=A4)
    c.drawString(100, 800, "Original A4 Document")
    c.save()

    print(f"✓ 創建 A4 源文件: {source_pdf}")
    print(f"  - A4 尺寸: {target_w} x {target_h} 點")

    # Test PDF generation
    pdf_path = test_dir / "scaled_output.pdf"

    print("\n開始生成 PDF...")
    print("-" * 70)

    success = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_pdf
    )

    print("-" * 70)

    if success:
        print(f"\n✓ PDF 生成成功: {pdf_path}")
        print("\n預期結果:")
        print(f"  - OCR 尺寸: ~{max_x} x ~{max_y}")
        print(f"  - 目標 PDF 尺寸: {target_w} x {target_h}")
        print(f"  - 預期縮放因子: X={target_w / max_x:.3f}, Y={target_h / max_y:.3f}")
        print("\n實際結果應該與預期一致（見上方日誌）")
        return True
    else:
        print("\n✗ PDF 生成失敗")
        return False

if __name__ == "__main__":
    import sys

    # Put this script's directory at the front of the import search path.
    # NOTE(review): this runs after the module-level `app.services` import
    # has already been resolved, so it only affects imports made later.
    sys.path.insert(0, str(Path(__file__).parent))

    passed = test_high_res_ocr_to_a4_pdf()
    banner = "="*70

    # Framed verdict, then an exit status the shell can act on
    # (0 = generation reported success, 1 = it did not).
    print("\n" + banner)
    print("✓ 測試通過！縮放邏輯正確" if passed else "✗ 測試失敗")
    print(banner)
    sys.exit(0 if passed else 1)