From ecdce961cad1d1b16bff0f4df038f71097c5ff4b Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Wed, 19 Nov 2025 08:48:25 +0800
Subject: [PATCH] feat: update PDF generator to support UnifiedDocument
 directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add generate_from_unified_document() method for direct UnifiedDocument processing
- Create convert_unified_document_to_ocr_data() for format conversion
- Extract _generate_pdf_from_data() as reusable core logic
- Support both OCR and DIRECT processing tracks in PDF generation
- Handle coordinate transformations (BoundingBox to polygon format)
- Update OCR service to use appropriate PDF generation method

Completes Section 4 (Unified Processing Pipeline) of dual-track proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 backend/app/services/ocr_service.py           |  20 +-
 backend/app/services/pdf_generator_service.py | 451 +++++++++++++-----
 .../dual-track-document-processing/tasks.md   |   8 +-
 3 files changed, 341 insertions(+), 138 deletions(-)

diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py
index 7a92752..b861a57 100644
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -1223,11 +1223,21 @@ class OCRService:
 
                 logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
 
-                success = pdf_generator_service.generate_layout_pdf(
-                    json_path=json_path,
-                    output_path=pdf_path,
-                    source_file_path=source_file_path
-                )
+                # Use appropriate method based on result type
+                if isinstance(result, UnifiedDocument):
+                    # Use direct UnifiedDocument generation for better accuracy
+                    success = pdf_generator_service.generate_from_unified_document(
+                        unified_doc=result,
+                        output_path=pdf_path,
+                        source_file_path=source_file_path
+                    )
+                else:
+                    # Legacy path: use JSON file
+                    success = pdf_generator_service.generate_layout_pdf(
+                        json_path=json_path,
+                        output_path=pdf_path,
+                        source_file_path=source_file_path
+                    )
 
                 if success:
                     logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index b76bf1f..a029023 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -24,6 +24,17 @@ from html.parser import HTMLParser
 
 from app.core.config import settings
 
+# Import UnifiedDocument for dual-track support
+try:
+    from app.models.unified_document import (
+        UnifiedDocument, DocumentElement, ElementType,
+        BoundingBox, TableData, ProcessingTrack
+    )
+    UNIFIED_DOCUMENT_AVAILABLE = True
+except ImportError:
+    UNIFIED_DOCUMENT_AVAILABLE = False
+    UnifiedDocument = None
+
 logger = logging.getLogger(__name__)
 
 
@@ -138,6 +149,310 @@ class PDFGeneratorService:
             logger.error(f"Failed to load JSON {json_path}: {e}")
             return None
 
+    def convert_unified_document_to_ocr_data(self, unified_doc: 'UnifiedDocument') -> Dict:
+        """
+        Convert UnifiedDocument to OCR data format for PDF generation.
+
+        Text regions keep the 1-based page number, while layout elements and
+        images_metadata use 0-based page indices. Tables are emitted twice:
+        as HTML layout elements and as images_metadata entries for filtering.
+
+        Args:
+            unified_doc: UnifiedDocument object from either processing track
+
+        Returns:
+            Dictionary in OCR data format with text_regions, images_metadata, layout_data
+        """
+        text_regions = []
+        images_metadata = []
+        layout_elements = []
+
+        for page in unified_doc.pages:
+            page_num = page.page_number  # 1-based
+
+            for element in page.elements:
+                # Convert BoundingBox to polygon format [[x,y], [x,y], [x,y], [x,y]]
+                bbox_polygon = [
+                    [element.bbox.x0, element.bbox.y0],  # top-left
+                    [element.bbox.x1, element.bbox.y0],  # top-right
+                    [element.bbox.x1, element.bbox.y1],  # bottom-right
+                    [element.bbox.x0, element.bbox.y1],  # bottom-left
+                ]
+
+                # Handle text elements
+                if element.is_text or element.type in [
+                    ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
+                    ElementType.FOOTER, ElementType.PARAGRAPH, ElementType.CAPTION,
+                    ElementType.LIST_ITEM, ElementType.FOOTNOTE, ElementType.REFERENCE
+                ]:
+                    text_content = element.get_text()
+                    if text_content:
+                        text_regions.append({
+                            'text': text_content,
+                            'bbox': bbox_polygon,
+                            # Only a missing confidence defaults to 1.0; a real 0.0 is kept
+                            'confidence': 1.0 if element.confidence is None else element.confidence,
+                            'page': page_num
+                        })
+
+                # Handle table elements
+                elif element.type == ElementType.TABLE:
+                    # Convert TableData to HTML for layout_data
+                    if isinstance(element.content, TableData):
+                        html_content = element.content.to_html()
+                    elif isinstance(element.content, dict):
+                        html_content = element.content.get('html', str(element.content))
+                    else:
+                        html_content = str(element.content)
+
+                    layout_elements.append({
+                        'type': 'table',
+                        'content': html_content,
+                        'bbox': [element.bbox.x0, element.bbox.y0,
+                                element.bbox.x1, element.bbox.y1],
+                        'page': page_num - 1  # layout uses 0-based
+                    })
+
+                    # Also add to images_metadata for overlap filtering; tables are often images
+                    table_id = element.element_id or f"table_{page_num}_{len(images_metadata)}"
+                    images_metadata.append({
+                        'image_path': f"table_{table_id}.png",
+                        'bbox': bbox_polygon,
+                        'page': page_num - 1,  # 0-based for images_metadata
+                        'type': 'table'
+                    })
+
+                # Handle image/visual elements
+                elif element.is_visual or element.type in [
+                    ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
+                    ElementType.DIAGRAM, ElementType.LOGO
+                ]:
+                    # Get image path from content or metadata
+                    if isinstance(element.content, dict):
+                        image_path = element.content.get('path', '')
+                    else:
+                        image_path = element.metadata.get('path', f"image_{element.element_id}.png")
+
+                    images_metadata.append({
+                        'image_path': image_path,
+                        'bbox': bbox_polygon,
+                        'page': page_num - 1,  # 0-based
+                        'type': element.type.value
+                    })
+
+        # Build OCR data structure
+        ocr_data = {
+            'text_regions': text_regions,
+            'images_metadata': images_metadata,
+            'layout_data': {
+                'elements': layout_elements,
+                'total_elements': len(layout_elements)
+            },
+            'total_pages': unified_doc.page_count,
+            'ocr_dimensions': {
+                'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0,
+                'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0
+            },
+            # Metadata for tracking
+            '_from_unified_document': True,
+            '_processing_track': unified_doc.metadata.processing_track.value
+        }
+
+        logger.info(f"Converted UnifiedDocument to OCR data: "
+                   f"{len(text_regions)} text regions, "
+                   f"{len(images_metadata)} images, "
+                   f"{len(layout_elements)} layout elements, "
+                   f"track={unified_doc.metadata.processing_track.value}")
+
+        return ocr_data
+
+    def generate_from_unified_document(
+        self,
+        unified_doc: 'UnifiedDocument',
+        output_path: Path,
+        source_file_path: Optional[Path] = None
+    ) -> bool:
+        """
+        Generate layout-preserving PDF directly from UnifiedDocument.
+
+        Converts the document to the legacy OCR data dictionary and renders it
+        through the shared generation path. Failures are logged, never raised.
+
+        Args:
+            unified_doc: UnifiedDocument object
+            output_path: Path to save generated PDF
+            source_file_path: Optional path to original source file
+
+        Returns:
+            True if successful, False otherwise
+        """
+        if not UNIFIED_DOCUMENT_AVAILABLE:
+            logger.error("UnifiedDocument support not available")
+            return False
+
+        try:
+            # Convert UnifiedDocument to OCR data format
+            ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
+
+            # Use internal generation with pre-loaded data
+            return self._generate_pdf_from_data(
+                ocr_data=ocr_data,
+                output_path=output_path,
+                source_file_path=source_file_path
+            )
+
+        except Exception as e:
+            # logger.exception sends the traceback through the logging handlers
+            # instead of printing it to stderr where log collection would miss it.
+            logger.exception(f"Failed to generate PDF from UnifiedDocument: {e}")
+            return False
+
+    def _generate_pdf_from_data(
+        self,
+        ocr_data: Dict,
+        output_path: Path,
+        source_file_path: Optional[Path] = None,
+        json_parent_dir: Optional[Path] = None
+    ) -> bool:
+        """
+        Internal method to generate PDF from OCR data dictionary.
+
+        Shared core for the JSON-based and UnifiedDocument-based paths. An
+        existing output file is treated as a cache hit and reported as success.
+
+        Args:
+            ocr_data: OCR data dictionary
+            output_path: Path to save generated PDF
+            source_file_path: Optional path to original source file
+            json_parent_dir: Directory containing images; defaults to output_path.parent
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            # Check if PDF already exists (caching)
+            if output_path.exists():
+                logger.info(f"PDF already exists: {output_path.name}")
+                return True
+
+            # Get text regions
+            text_regions = ocr_data.get('text_regions', [])
+            if not text_regions:
+                logger.warning("No text regions found in data")
+                # Don't fail - might have only tables/images
+
+            # Get images metadata
+            images_metadata = ocr_data.get('images_metadata', [])
+
+            # Get layout data
+            layout_data = ocr_data.get('layout_data', {})
+
+            # Step 1: Get OCR processing dimensions
+            ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
+            logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
+
+            # Step 2: Get target PDF dimensions
+            if source_file_path:
+                target_dims = self.get_original_page_size(source_file_path)
+                if target_dims:
+                    target_width, target_height = target_dims
+                    logger.info(f"目標 PDF 尺寸（來自原始文件）: {target_width:.1f} x {target_height:.1f}")
+                else:
+                    target_width, target_height = ocr_width, ocr_height
+                    logger.warning("無法獲取原始文件尺寸，使用 OCR 尺寸作為目標")
+            else:
+                target_width, target_height = ocr_width, ocr_height
+                logger.info(f"無原始文件，使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
+
+            # Step 3: Calculate scale factors
+            scale_w = target_width / ocr_width if ocr_width > 0 else 1.0
+            scale_h = target_height / ocr_height if ocr_height > 0 else 1.0
+            logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}")
+
+            # Create PDF canvas
+            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
+
+            # Filter text regions to avoid overlap with tables/images
+            # (images_metadata lists table images as well as ordinary images)
+            regions_to_avoid = images_metadata
+
+            logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
+
+            filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
+
+            # Group regions by page
+            pages_data = {}
+            for region in filtered_text_regions:
+                page_num = region.get('page', 1)
+                if page_num not in pages_data:
+                    pages_data[page_num] = []
+                pages_data[page_num].append(region)
+
+            # Get table elements from layout_data
+            table_elements = []
+            if layout_data and layout_data.get('elements'):
+                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
+
+            # Process each page
+            total_pages = ocr_data.get('total_pages', 1)
+            logger.info(f"開始處理 {total_pages} 頁 PDF")
+
+            # Determine image directory
+            if json_parent_dir is None:
+                json_parent_dir = output_path.parent
+
+            for page_num in range(1, total_pages + 1):
+                logger.info(f">>> 處理第 {page_num}/{total_pages} 頁")
+                if page_num > 1:
+                    pdf_canvas.showPage()
+
+                # Get regions for this page
+                page_text_regions = pages_data.get(page_num, [])
+                page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
+                page_image_regions = [
+                    img for img in images_metadata
+                    if img.get('page') == page_num - 1
+                    and 'table' not in img.get('image_path', '').lower()
+                ]
+
+                # Draw in layers: images → tables → text
+
+                # 1. Draw images (bottom layer)
+                for img_meta in page_image_regions:
+                    self.draw_image_region(
+                        pdf_canvas, img_meta, target_height,
+                        json_parent_dir, scale_w, scale_h
+                    )
+
+                # 2. Draw tables (middle layer)
+                for table_elem in page_table_regions:
+                    self.draw_table_region(
+                        pdf_canvas, table_elem, images_metadata,
+                        target_height, scale_w, scale_h
+                    )
+
+                # 3. Draw text (top layer)
+                for region in page_text_regions:
+                    self.draw_text_region(
+                        pdf_canvas, region, target_height,
+                        scale_w, scale_h
+                    )
+
+                logger.info(f"<<< 第 {page_num} 頁完成")
+
+            # Save PDF
+            pdf_canvas.save()
+
+            file_size = output_path.stat().st_size
+            logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
+            return True
+
+        except Exception as e:
+            # logger.exception sends the traceback through the logging handlers
+            # instead of printing it to stderr where log collection would miss it.
+            logger.exception(f"Failed to generate PDF: {e}")
+            return False
+
     def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
         """
         從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。
@@ -717,140 +1032,18 @@ class PDFGeneratorService:
             True if successful, False otherwise
         """
         try:
-            # Check if PDF already exists (caching)
-            if output_path.exists():
-                logger.info(f"PDF already exists: {output_path.name}")
-                return True
-
             # Load JSON data
             ocr_data = self.load_ocr_json(json_path)
             if not ocr_data:
                 return False
 
-            # Get text regions
-            text_regions = ocr_data.get('text_regions', [])
-            if not text_regions:
-                logger.warning("No text regions found in JSON")
-                return False
-
-            # Get images metadata
-            images_metadata = ocr_data.get('images_metadata', [])
-
-            # Get layout data
-            layout_data = ocr_data.get('layout_data', {})
-
-            # Step 1: Get OCR processing dimensions (the large image OCR actually used)
-            # This comes from analyzing all bbox coordinates in the OCR data
-            ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
-            logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
-
-            # Step 2: Get target PDF dimensions (usually the original file size)
-            # This is what we want the final PDF size to be
-            if source_file_path:
-                target_dims = self.get_original_page_size(source_file_path)
-                if target_dims:
-                    target_width, target_height = target_dims
-                    logger.info(f"目標 PDF 尺寸（來自原始文件）: {target_width:.1f} x {target_height:.1f}")
-                else:
-                    # If we can't get original size, use OCR dimensions as target
-                    target_width, target_height = ocr_width, ocr_height
-                    logger.warning(f"無法獲取原始文件尺寸，使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
-            else:
-                # No source file, use OCR dimensions as target (1:1 mapping)
-                target_width, target_height = ocr_width, ocr_height
-                logger.info(f"無原始文件，使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
-
-            # Step 3: Calculate scale factors to convert OCR coordinates to PDF coordinates
-            scale_w = target_width / ocr_width
-            scale_h = target_height / ocr_height
-            logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f} (OCR座標 → PDF座標)")
-
-            # Create PDF canvas with target dimensions
-            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
-
-            # *** 關鍵修復：收集所有需要避免的區域（表格 + 圖片）***
-            # 注意：OCR JSON 中沒有 'tables' 和 'image_regions' 頂層欄位
-            # 重要發現：
-            #   - layout_data.elements 中的表格元素沒有 bbox（都是空列表）
-            #   - images_metadata 包含所有表格和圖片，並且有正確的 bbox
-            #   - 因此，只需使用 images_metadata 來過濾文字即可
-
-            # 使用 images_metadata 作為要避免的區域（包含表格圖片和其他圖片）
-            regions_to_avoid = images_metadata
-
-            table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()])
-            other_count = len(images_metadata) - table_count
-
-            logger.info(f"使用 images_metadata 過濾文字區域:")
-            logger.info(f"  - 表格圖片: {table_count}")
-            logger.info(f"  - 其他圖片: {other_count}")
-            logger.info(f"  - 總計需要避免的區域: {len(regions_to_avoid)}")
-
-            # 使用新的過濾函式過濾文字區域
-            filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
-
-            # Group regions by page
-            pages_data = {}
-            for region in filtered_text_regions:
-                page_num = region.get('page', 1)
-                if page_num not in pages_data:
-                    pages_data[page_num] = []
-                pages_data[page_num].append(region)
-
-            # Get table elements from layout_data
-            table_elements = []
-            if layout_data and layout_data.get('elements'):
-                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
-
-            # Process each page
-            total_pages = ocr_data.get('total_pages', 1)
-            logger.info(f"=" * 70)
-            logger.info(f"開始處理 {total_pages} 頁 PDF")
-            logger.info(f"=" * 70)
-
-            for page_num in range(1, total_pages + 1):
-                logger.info(f"\n>>> 處理第 {page_num}/{total_pages} 頁")
-                if page_num > 1:
-                    pdf_canvas.showPage()  # Start new page
-
-                # Get filtered regions for this page
-                page_text_regions = pages_data.get(page_num, [])
-                page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
-                page_image_regions = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()]
-
-                # 繪製順序：圖片(底層) → 表格(中間層) → 文字(最上層)
-
-                # 1. Draw images first (bottom layer)
-                logger.info(f"第 {page_num} 頁: 繪製 {len(page_image_regions)} 個圖片")
-                for img_meta in page_image_regions:
-                    self.draw_image_region(
-                        pdf_canvas,
-                        img_meta,
-                        target_height,
-                        json_path.parent,
-                        scale_w,
-                        scale_h
-                    )
-
-                # 2. Draw tables (middle layer)
-                logger.info(f"第 {page_num} 頁: 繪製 {len(page_table_regions)} 個表格")
-                for table_elem in page_table_regions:
-                    self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
-
-                # 3. Draw text regions last (top layer) - excluding table text
-                logger.info(f"第 {page_num} 頁: 繪製 {len(page_text_regions)} 個文字區域")
-                for i, region in enumerate(page_text_regions, 1):
-                    logger.debug(f"  文字 {i}/{len(page_text_regions)}")
-                    self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
-
-                logger.info(f"<<< 第 {page_num} 頁完成")
-
-            # Save PDF
-            pdf_canvas.save()
-
-            file_size = output_path.stat().st_size
-            logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
-            return True
+            # Use internal generation with pre-loaded data
+            return self._generate_pdf_from_data(
+                ocr_data=ocr_data,
+                output_path=output_path,
+                source_file_path=source_file_path,
+                json_parent_dir=json_path.parent
+            )
 
         except Exception as e:
             logger.error(f"Failed to generate PDF: {e}")
diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md
index 338437f..e45cd02 100644
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -63,10 +63,10 @@
   - [x] 4.2.1 Define standardized JSON schema
   - [x] 4.2.2 Include processing metadata
   - [x] 4.2.3 Support both track outputs
-- [ ] 4.3 Update PDF generator for UnifiedDocument
-  - [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument
-  - [ ] 4.3.2 Preserve layout from both tracks
-  - [ ] 4.3.3 Handle coordinate transformations
+- [x] 4.3 Update PDF generator for UnifiedDocument
+  - [x] 4.3.1 Adapt PDF generation to use UnifiedDocument
+  - [x] 4.3.2 Preserve layout from both tracks
+  - [x] 4.3.3 Handle coordinate transformations
 
 ## 5. Translation System Foundation
 - [ ] 5.1 Create TranslationEngine interface