# OCR/backend/test_all_regions.py

#!/usr/bin/env python
"""
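Check that calculate_page_dimensions takes every possible region source into account,
including: text_regions, image_regions, tables, layout, layout_data.elements.

The check is driven through pdf_generator_service.generate_layout_pdf using mock OCR JSON.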
"""

import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
import logging

# Configure the root logger: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

def _max_extent(regions):
    """Return ``(max_x, max_y)`` over every bbox corner in *regions*.

    Each region must carry a ``"bbox"`` list of ``[x, y]`` corner points, as in
    the mock data built by ``test_all_region_types``. An empty *regions* list
    yields ``(0, 0)``.
    """
    corners = [corner for region in regions for corner in region["bbox"]]
    max_x = max((x for x, _ in corners), default=0)
    max_y = max((y for _, y in corners), default=0)
    return max_x, max_y


def test_all_region_types():
    """Generate a layout PDF from mock OCR JSON that contains every region type.

    Scenario:
    - layout: [] (empty list)
    - text_regions: text boxes
    - image_regions: image boxes (they hold the largest X coordinate)
    - tables: a table box (it holds the largest Y coordinate)

    The mock JSON and a one-page A4 source PDF are written under
    ``test_output_all_regions`` (relative to the current working directory),
    then ``pdf_generator_service.generate_layout_pdf`` is called on them.

    Returns:
        bool: True when the service reports success and the output PDF exists
        on disk, False otherwise.

    NOTE(review): the expected OCR dimensions and scale factors are only
    printed for manual comparison with the service's log output; nothing here
    asserts on them. The outcome is also returned rather than asserted, so a
    runner that ignores return values will not see a failure -- confirm how
    this file is meant to be executed.
    """

    test_dir = Path("test_output_all_regions")
    test_dir.mkdir(exist_ok=True)

    print("\n" + "="*70)
    print("測試場景：檢查所有區域類型 (text, image, table)")
    print("="*70)

    # Mock OCR JSON containing every region type.
    mock_ocr_data = {
        "status": "success",
        "file_name": "complete_document.pdf",
        "language": "ch",
        "layout": [],  # empty list
        "text_regions": [
            {
                "text": "標題文字",
                "bbox": [[461, 270], [819, 270], [819, 408], [461, 408]],
                "confidence": 0.95
            },
            {
                "text": "內容文字",
                "bbox": [[1521, 936], [1850, 936], [1850, 1020], [1521, 1020]],
                "confidence": 0.93
            }
        ],
        "image_regions": [
            {
                "type": "figure",
                "bbox": [[1434, 1500], [2204, 1500], [2204, 2100], [1434, 2100]],  # figure in the lower-right corner
                "image_path": "imgs/figure_1.jpg"
            },
            {
                "type": "chart",
                "bbox": [[200, 2200], [800, 2200], [800, 2800], [200, 2800]],
                "image_path": "imgs/chart_1.jpg"
            }
        ],
        "tables": [
            {
                "type": "table",
                "bbox": [[300, 3000], [1900, 3000], [1900, 3500], [300, 3500]],  # table at the bottom
                "html": "<table>...</table>"
            }
        ],
        "total_text_regions": 2,
        "average_confidence": 0.94,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "標題文字\n內容文字",
        "processing_time": 4.5,
        "timestamp": "2025-11-17T00:00:00"
    }

    # Derive the reported maxima from the mock data itself so the printed
    # expectations cannot drift away from the bboxes above.
    text_regions = mock_ocr_data["text_regions"]
    image_regions = mock_ocr_data["image_regions"]
    tables = mock_ocr_data["tables"]
    text_max_x, _ = _max_extent(text_regions)
    image_max_x, _ = _max_extent(image_regions)
    _, table_max_y = _max_extent(tables)
    ocr_width, ocr_height = _max_extent(text_regions + image_regions + tables)

    # Save mock JSON
    json_path = test_dir / "all_regions_test.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 創建測試 JSON: {json_path}")
    print("  - layout: [] (空列表)")
    print(f"  - text_regions: {len(text_regions)} 個區域 (max X={text_max_x})")
    print(f"  - image_regions: {len(image_regions)} 個區域 (max X={image_max_x}) *** 關鍵！")
    print(f"  - tables: {len(tables)} 個區域 (max Y={table_max_y}) *** 關鍵！")
    print(f"  - 預期 OCR 尺寸: ~{ocr_width} x ~{ocr_height} (取自所有區域的最大值)")

    # Create A4 source PDF (reportlab is imported lazily, only once it is needed)
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4

    # A4 is given in fractional points; report it in whole points.
    page_width, page_height = (round(side) for side in A4)

    source_pdf = test_dir / "source_a4.pdf"
    c = canvas.Canvas(str(source_pdf), pagesize=A4)
    c.drawString(100, 800, "Original A4 Document")
    c.save()

    print(f"✓ 創建 A4 源文件: {source_pdf}")
    print(f"  - A4 尺寸: {page_width} x {page_height} 點")

    # Test PDF generation
    pdf_path = test_dir / "output_all_regions.pdf"
    # Drop any PDF left by an earlier run so the existence check below
    # reflects this run only.
    if pdf_path.exists():
        pdf_path.unlink()

    print("\n開始生成 PDF...")
    print("-" * 70)

    success = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_pdf
    )

    print("-" * 70)

    # Trust the service's result only if the output file is really there.
    if not (success and pdf_path.is_file()):
        print("\n✗ PDF 生成失敗")
        return False

    print(f"\n✓ PDF 生成成功: {pdf_path}")
    print("\n預期結果:")
    print(f"  - OCR 尺寸（從所有區域推斷）: ~{ocr_width} x ~{ocr_height}")
    print(f"  - 目標 PDF 尺寸: {page_width} x {page_height}")
    print(f"  - 預期縮放因子: X={page_width/ocr_width:.3f}, Y={page_height/ocr_height:.3f}")
    print("\n關鍵驗證:")
    print(f"  - 如果只檢查 text_regions，max_x 只有 {text_max_x} (錯誤！)")
    print(f"  - 必須檢查 image_regions 才能得到正確的 max_x={image_max_x}")
    print(f"  - 必須檢查 tables 才能得到正確的 max_y={table_max_y}")
    return True

if __name__ == "__main__":
    import sys

    # NOTE(review): this runs after the module-level `app` import at the top
    # of the file has already executed, so it cannot help that import resolve.
    sys.path.insert(0, str(Path(__file__).parent))

    passed = test_all_region_types()
    banner = "=" * 70

    # Print the verdict between two banner lines, then mirror it in the
    # process exit status (0 = pass, 1 = fail).
    print("\n" + banner)
    print("✓ 測試通過！所有區域類型都被正確檢查" if passed else "✗ 測試失敗")
    print(banner)
    sys.exit(0 if passed else 1)