#!/usr/bin/env python
"""
Exercise PDF generation with OCR data whose regions are spread over several keys.

The script is meant to show whether calculate_page_dimensions looks at every
possible region source: text_regions, image_regions, tables, layout and
layout_data.elements.
"""

import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')


def test_all_region_types():
    """
    Generate a layout PDF from a mock OCR result that uses every region type.

    Scenario encoded in the mock data:
    - layout: [] (empty list)
    - text_regions: two text regions (largest X is 1850)
    - image_regions: two image regions (largest X is 2204) -- the key case
    - tables: one table region (largest Y is 3500) -- the key case

    The mock result is written to a JSON file, a one-page A4 PDF is created
    with reportlab as the "source" document, and both are passed to
    pdf_generator_service.generate_layout_pdf.

    Returns:
        True when generate_layout_pdf returns a truthy value, False otherwise.

    NOTE(review): the expected dimensions and scale factors are only printed;
    nothing here asserts them, and failure is reported through the return
    value rather than an exception -- confirm how this script is meant to be
    run (a test runner that ignores return values would not see a failure).
    """
    out_dir = Path("test_output_all_regions")
    out_dir.mkdir(exist_ok=True)

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print("\n" + heavy_rule)
    print("測試場景:檢查所有區域類型 (text, image, table)")
    print(heavy_rule)

    # Each bbox is a list of four [x, y] corner points.
    text_regions = [
        {
            "text": "標題文字",
            "bbox": [[461, 270], [819, 270], [819, 408], [461, 408]],
            "confidence": 0.95,
        },
        {
            "text": "內容文字",
            "bbox": [[1521, 936], [1850, 936], [1850, 1020], [1521, 1020]],
            "confidence": 0.93,
        },
    ]

    image_regions = [
        {
            "type": "figure",
            # Figure sits in the bottom-right corner; it carries the largest X.
            "bbox": [[1434, 1500], [2204, 1500], [2204, 2100], [1434, 2100]],
            "image_path": "imgs/figure_1.jpg",
        },
        {
            "type": "chart",
            "bbox": [[200, 2200], [800, 2200], [800, 2800], [200, 2800]],
            "image_path": "imgs/chart_1.jpg",
        },
    ]

    tables = [
        {
            "type": "table",
            # Table sits at the bottom; it carries the largest Y.
            "bbox": [[300, 3000], [1900, 3000], [1900, 3500], [300, 3500]],
            # NOTE(review): the HTML payload looks truncated in this fixture
            # -- confirm the intended markup.
            "html": "...\n",
        },
    ]

    # Mock OCR result containing every region type; key order is kept because
    # it determines the layout of the JSON file written below.
    ocr_payload = {
        "status": "success",
        "file_name": "complete_document.pdf",
        "language": "ch",
        "layout": [],  # empty list
        "text_regions": text_regions,
        "image_regions": image_regions,
        "tables": tables,
        "total_text_regions": 2,
        "average_confidence": 0.94,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "標題文字\n內容文字",
        "processing_time": 4.5,
        "timestamp": "2025-11-17T00:00:00",
    }

    # Save mock JSON
    payload_path = out_dir / "all_regions_test.json"
    payload_path.write_text(
        json.dumps(ocr_payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(f"\n✓ 創建測試 JSON: {payload_path}")
    for summary_line in (
        " - layout: [] (空列表)",
        " - text_regions: 2 個區域 (max X=1850)",
        " - image_regions: 2 個區域 (max X=2204) *** 關鍵!",
        " - tables: 1 個區域 (max Y=3500) *** 關鍵!",
        " - 預期 OCR 尺寸: ~2204 x ~3500 (取自所有區域的最大值)",
    ):
        print(summary_line)

    # Create A4 source PDF (reportlab is imported only at this point, after
    # the JSON fixture has been written).
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4

    a4_pdf = out_dir / "source_a4.pdf"
    pdf_canvas = canvas.Canvas(str(a4_pdf), pagesize=A4)
    pdf_canvas.drawString(100, 800, "Original A4 Document")
    pdf_canvas.save()

    print(f"✓ 創建 A4 源文件: {a4_pdf}")
    print(" - A4 尺寸: 595 x 842 點")

    # Test PDF generation
    result_pdf = out_dir / "output_all_regions.pdf"

    print("\n開始生成 PDF...")
    print(light_rule)

    generated = pdf_generator_service.generate_layout_pdf(
        json_path=payload_path,
        output_path=result_pdf,
        source_file_path=a4_pdf,
    )

    print(light_rule)

    if not generated:
        print("\n✗ PDF 生成失敗")
        return False

    # Scale factors from the expected OCR extent down to the A4 page.
    scale_x = 595 / 2204
    scale_y = 842 / 3500

    print(f"\n✓ PDF 生成成功: {result_pdf}")
    print("\n預期結果:")
    print(" - OCR 尺寸(從所有區域推斷): ~2204 x ~3500")
    print(" - 目標 PDF 尺寸: 595 x 842")
    print(f" - 預期縮放因子: X={scale_x:.3f}, Y={scale_y:.3f}")
    print("\n關鍵驗證:")
    print(" - 如果只檢查 text_regions,max_x 只有 1850 (錯誤!)")
    print(" - 必須檢查 image_regions 才能得到正確的 max_x=2204")
    print(" - 必須檢查 tables 才能得到正確的 max_y=3500")
    return True


if __name__ == "__main__":
    import sys

    # NOTE(review): this runs after the module-level `app` import above has
    # already been resolved, so it cannot influence that import.
    sys.path.insert(0, str(Path(__file__).parent))

    passed = test_all_region_types()

    banner = "=" * 70
    print("\n" + banner)
    print("✓ 測試通過!所有區域類型都被正確檢查" if passed else "✗ 測試失敗")
    print(banner)
    sys.exit(0 if passed else 1)