#!/usr/bin/env python
"""
Test script for PDF generation with proper bbox-based dimension calculation.

Simulates the real scenario where OCR processes high-resolution images
(e.g., 2189x3500) but we want to generate PDFs at the original size
(e.g., A4: 595x842).
"""

import json
import logging
import sys
from pathlib import Path

from app.services.pdf_generator_service import pdf_generator_service

# Set up logging to see dimension calculations
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')


def test_high_res_ocr_to_a4_pdf() -> bool:
    """
    Exercise the high-res OCR -> A4 PDF scaling scenario end to end.

    Scenario being simulated:
    - PaddleOCR processes a PDF at high resolution (2189x3500)
    - OCR bbox coordinates are in this high-res space
    - We want to generate an A4 PDF (595x842)
    - Scale factors should be ~0.27 and ~0.24

    The function writes a mock OCR result JSON and a one-page A4 source PDF
    into ``test_output_bbox/`` (relative to the current working directory),
    then calls ``pdf_generator_service.generate_layout_pdf`` with them.

    Returns:
        True when ``generate_layout_pdf`` returns a truthy value, otherwise
        False. The scale factors themselves are not asserted here; they are
        printed for comparison against the service's log output.
    """
    # Imported locally, as in the original script, so that merely importing
    # this module does not require reportlab.
    from reportlab.lib.pagesizes import A4
    from reportlab.pdfgen import canvas

    # Create test directory
    test_dir = Path("test_output_bbox")
    test_dir.mkdir(exist_ok=True)

    print("\n" + "="*70)
    print("測試場景:高解析度 OCR → A4 PDF 縮放")
    print("="*70)

    # Create mock OCR data with high-res bbox coordinates.
    # Simulating text at various positions in the 2189x3500 coordinate space.
    mock_ocr_data = {
        "status": "success",
        "file_name": "test_document.pdf",
        "language": "ch",
        "text_regions": [
            {
                "text": "標題文字在頂部",
                # Top of page
                "bbox": [[230, 195], [1189, 182], [1189, 350], [230, 363]],
                "confidence": 0.95,
            },
            {
                "text": "中間的文字內容",
                # Middle
                "bbox": [[1521, 1750], [2185, 1750], [2185, 1820], [1521, 1820]],
                "confidence": 0.92,
            },
            {
                "text": "底部的文字",
                # Bottom
                "bbox": [[400, 3200], [1200, 3200], [1200, 3280], [400, 3280]],
                "confidence": 0.93,
            },
        ],
        "total_text_regions": 3,
        "average_confidence": 0.933,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "# Test Document\n\n標題文字在頂部\n中間的文字內容\n底部的文字",
        "processing_time": 2.5,
        "timestamp": "2025-11-17T00:00:00",
    }

    # Save mock JSON (ensure_ascii=False keeps the CJK text readable on disk)
    json_path = test_dir / "high_res_ocr_result.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 創建測試 JSON: {json_path}")
    # The extents below are the min/max of the bbox corners in mock_ocr_data.
    print(" - OCR 座標範圍: X=[230..2185], Y=[182..3280]")
    print(" - 預期 OCR 尺寸: ~2185 x ~3280")

    # Create a mock A4 source PDF that is passed to the generator as
    # source_file_path (A4 is roughly 595x842 points).
    source_pdf = test_dir / "source_a4.pdf"
    c = canvas.Canvas(str(source_pdf), pagesize=A4)
    c.drawString(100, 800, "Original A4 Document")
    c.save()

    print(f"✓ 創建 A4 源文件: {source_pdf}")
    print(" - A4 尺寸: 595 x 842 點")

    # Test PDF generation
    pdf_path = test_dir / "scaled_output.pdf"

    print("\n開始生成 PDF...")
    print("-" * 70)

    success = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_pdf,
    )

    print("-" * 70)

    if not success:
        print("\n✗ PDF 生成失敗")
        return False

    print(f"\n✓ PDF 生成成功: {pdf_path}")
    print("\n預期結果:")
    print(" - OCR 尺寸: ~2185 x ~3280")
    print(" - 目標 PDF 尺寸: 595 x 842")
    print(f" - 預期縮放因子: X={595/2185:.3f}, Y={842/3280:.3f}")
    print("\n實際結果應該與預期一致(見上方日誌)")
    return True


if __name__ == "__main__":
    # NOTE(review): this runs after the module-level `app.services` import
    # above has already been resolved, so it cannot influence that import.
    # Kept for backward compatibility - confirm whether it is still needed.
    sys.path.insert(0, str(Path(__file__).parent))

    success = test_high_res_ocr_to_a4_pdf()

    print("\n" + "="*70)
    if success:
        print("✓ 測試通過!縮放邏輯正確")
        print("="*70)
        sys.exit(0)
    else:
        print("✗ 測試失敗")
        print("="*70)
        sys.exit(1)