fix: improve multi-page PDF dimension handling and coordinate transformation

Resolve issues where multi-page PDFs with varying page sizes had incorrect element positioning and scaling. Each page now maintains its own dimensions and scale factors throughout the generation process.

Key improvements:

Direct Track Processing:
- Store per-page dimensions in page_dimensions mapping (0-based index)
- Set correct page size for each page using setPageSize()
- Pass current page height to all drawing methods for accurate Y-axis conversion
- Each page uses its own dimensions instead of first page dimensions

OCR Track Processing:
- Calculate per-page scale factors with 3-tier priority:
  1. Original file dimensions (highest priority)
  2. OCR/UnifiedDocument dimensions
  3. Fallback to first page dimensions
- Apply correct scaling factors for each page's coordinate transformation
- Handle mixed-size pages correctly (e.g., A4 + A3 in same document)

Dimension Extraction:
- Add get_all_page_sizes() method to extract dimensions for all PDF pages
- Return Dict[int, Tuple[float, float]] mapping page index to (width, height)
- Maintain backward compatibility with get_original_page_size() for first page
- Support both images (single page) and multi-page PDFs

Coordinate System:
- Add ocr_dimensions priority check in calculate_page_dimensions()
- Priority order: ocr_dimensions > dimensions > bbox inference
- Ensure consistent coordinate space across processing tracks

Benefits:
- Correct rendering for documents with mixed page sizes
- Accurate element positioning on all pages
- Proper scaling for scanned documents with varying DPI per page
- Better handling of landscape/portrait mixed documents

Related to archived proposal: fix-pdf-coordinate-system

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 15:09:39 +08:00
parent 2312b4cd66
commit 0999898358
1 changed files with 191 additions and 66 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -503,6 +503,14 @@ class PDFGeneratorService:
                    else:
                        logger.warning(f"No image path found for visual element {element.element_id}")

+        # Build page dimensions mapping for multi-page support
+        page_dimensions = {}
+        for page in unified_doc.pages:
+            page_dimensions[page.page_number - 1] = {  # 0-based index
+                'width': page.dimensions.width,
+                'height': page.dimensions.height
+            }
+
        # Build OCR data structure
        ocr_data = {
            'text_regions': text_regions,
@@ -516,6 +524,7 @@ class PDFGeneratorService:
                'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0,
                'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0
            },
+            'page_dimensions': page_dimensions,  # Per-page dimensions for multi-page support
            # Metadata for tracking
            '_from_unified_document': True,
            '_processing_track': unified_doc.metadata.processing_track.value
@@ -669,7 +678,7 @@ class PDFGeneratorService:
            # Set current track for helper methods
            self.current_processing_track = 'direct'

-            # Get page dimensions from first page
+            # Get page dimensions from first page (for canvas initialization)
            if not unified_doc.pages:
                logger.error("No pages in document")
                return False
@@ -678,9 +687,9 @@ class PDFGeneratorService:
            page_width = first_page.dimensions.width
            page_height = first_page.dimensions.height

-            logger.info(f"Page dimensions: {page_width} x {page_height}")
+            logger.info(f"First page dimensions: {page_width} x {page_height}")

-            # Create PDF canvas with source dimensions
+            # Create PDF canvas with first page dimensions (will be updated per page)
            from reportlab.pdfgen import canvas
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

@@ -688,9 +697,17 @@ class PDFGeneratorService:
            for page_idx, page in enumerate(unified_doc.pages):
                logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}")

+                # Get current page dimensions
+                current_page_width = page.dimensions.width
+                current_page_height = page.dimensions.height
+                logger.info(f"Page {page_idx + 1} dimensions: {current_page_width} x {current_page_height}")
+
                if page_idx > 0:
                    pdf_canvas.showPage()

+                # Set page size for current page
+                pdf_canvas.setPageSize((current_page_width, current_page_height))
+
                # Separate elements by type
                text_elements = []
                table_elements = []
@@ -757,19 +774,19 @@ class PDFGeneratorService:
                # Draw elements in document order
                for elem_type, elem in all_elements:
                    if elem_type == 'image':
-                        self._draw_image_element_direct(pdf_canvas, elem, page_height, output_path.parent)
+                        self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent)
                    elif elem_type == 'table':
-                        self._draw_table_element_direct(pdf_canvas, elem, page_height)
+                        self._draw_table_element_direct(pdf_canvas, elem, current_page_height)
                    elif elem_type == 'list':
                        # FIX: Check if list item overlaps with table/image
                        if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
-                            self._draw_text_element_direct(pdf_canvas, elem, page_height)
+                            self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
                        else:
                            logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
                    elif elem_type == 'text':
                        # FIX: Check if text overlaps with table/image before drawing
                        if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
-                            self._draw_text_element_direct(pdf_canvas, elem, page_height)
+                            self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
                        else:
                            logger.debug(f"Skipping text element {elem.element_id} inside table/image region")

@@ -875,29 +892,38 @@ class PDFGeneratorService:
            # Get layout data
            layout_data = ocr_data.get('layout_data', {})

-            # Step 1: Get OCR processing dimensions
+            # Step 1: Get OCR processing dimensions (for first page / default)
            ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
-            logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
+            logger.info(f"OCR 處理時使用的座標系尺寸 (第一頁): {ocr_width:.1f} x {ocr_height:.1f}")

-            # Step 2: Get target PDF dimensions
+            # Step 2: Get page dimensions mapping for multi-page support
+            page_dimensions = ocr_data.get('page_dimensions', {})
+            if not page_dimensions:
+                # Fallback: use first page dimensions for all pages
+                page_dimensions = {0: {'width': ocr_width, 'height': ocr_height}}
+                logger.info("No page_dimensions found, using first page size for all pages")
+
+            # Step 3: Get original file dimensions for all pages
+            original_page_sizes = {}
            if source_file_path:
-                target_dims = self.get_original_page_size(source_file_path)
-                if target_dims:
-                    target_width, target_height = target_dims
-                    logger.info(f"目標 PDF 尺寸（來自原始文件）: {target_width:.1f} x {target_height:.1f}")
+                original_page_sizes = self.get_all_page_sizes(source_file_path)
+                if original_page_sizes:
+                    logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
                else:
-                    target_width, target_height = ocr_width, ocr_height
-                    logger.warning(f"無法獲取原始文件尺寸，使用 OCR 尺寸作為目標")
+                    logger.warning(f"無法獲取原始文件尺寸，將使用 OCR/UnifiedDocument 尺寸")
+            else:
+                logger.info(f"無原始文件，將使用 OCR/UnifiedDocument 尺寸")
+
+            # Determine initial canvas size (will be updated per page)
+            # Priority: original file first page > OCR/UnifiedDocument first page
+            if 0 in original_page_sizes:
+                target_width, target_height = original_page_sizes[0]
+                logger.info(f"初始 PDF 尺寸（來自原始文件首頁）: {target_width:.1f} x {target_height:.1f}")
            else:
                target_width, target_height = ocr_width, ocr_height
-                logger.info(f"無原始文件，使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
+                logger.info(f"初始 PDF 尺寸（來自 OCR/UnifiedDocument）: {target_width:.1f} x {target_height:.1f}")

-            # Step 3: Calculate scale factors
-            scale_w = target_width / ocr_width if ocr_width > 0 else 1.0
-            scale_h = target_height / ocr_height if ocr_height > 0 else 1.0
-            logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}")
-
-            # Create PDF canvas
+            # Create PDF canvas with initial page size (will be updated per page)
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

            # Filter text regions to avoid overlap with tables/images
@@ -931,9 +957,60 @@ class PDFGeneratorService:

            for page_num in range(1, total_pages + 1):
                logger.info(f">>> 處理第 {page_num}/{total_pages} 頁")
+
+                # Get current page dimensions with priority order:
+                # 1. Original file dimensions (highest priority)
+                # 2. OCR/UnifiedDocument dimensions
+                # 3. Fallback to first page dimensions
+                page_idx = page_num - 1
+                dimension_source = "unknown"
+
+                # Priority 1: Original file dimensions
+                if page_idx in original_page_sizes:
+                    current_target_w, current_target_h = original_page_sizes[page_idx]
+                    dimension_source = "original_file"
+
+                # Priority 2: OCR/UnifiedDocument dimensions
+                elif page_idx in page_dimensions:
+                    current_page_dims = page_dimensions[page_idx]
+                    current_target_w = float(current_page_dims['width'])
+                    current_target_h = float(current_page_dims['height'])
+                    dimension_source = "ocr_unified_doc"
+
+                # Priority 3: Fallback to first page
+                else:
+                    current_target_w = ocr_width
+                    current_target_h = ocr_height
+                    dimension_source = "fallback_first_page"
+                    logger.warning(f"No dimensions for page {page_num}, using first page size")
+
+                # Calculate scale factors for coordinate transformation
+                # OCR coordinates need to be scaled if original file dimensions differ
+                if dimension_source == "original_file":
+                    # Get OCR dimensions for this page to calculate scale
+                    if page_idx in page_dimensions:
+                        ocr_page_w = float(page_dimensions[page_idx]['width'])
+                        ocr_page_h = float(page_dimensions[page_idx]['height'])
+                    else:
+                        ocr_page_w = ocr_width
+                        ocr_page_h = ocr_height
+
+                    current_scale_w = current_target_w / ocr_page_w if ocr_page_w > 0 else 1.0
+                    current_scale_h = current_target_h / ocr_page_h if ocr_page_h > 0 else 1.0
+                else:
+                    # Using OCR/UnifiedDocument dimensions directly, no scaling needed
+                    current_scale_w = 1.0
+                    current_scale_h = 1.0
+
+                logger.info(f"第 {page_num} 頁尺寸: {current_target_w:.1f} x {current_target_h:.1f} "
+                           f"(來源: {dimension_source}, 縮放: {current_scale_w:.3f}x{current_scale_h:.3f})")
+
                if page_num > 1:
                    pdf_canvas.showPage()

+                # Set page size for current page
+                pdf_canvas.setPageSize((current_target_w, current_target_h))
+
                # Get regions for this page
                page_text_regions = pages_data.get(page_num, [])
                page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
@@ -949,22 +1026,22 @@ class PDFGeneratorService:
                # 1. Draw images (bottom layer)
                for img_meta in page_image_regions:
                    self.draw_image_region(
-                        pdf_canvas, img_meta, target_height,
-                        json_parent_dir, scale_w, scale_h
+                        pdf_canvas, img_meta, current_target_h,
+                        json_parent_dir, current_scale_w, current_scale_h
                    )

                # 2. Draw tables (middle layer)
                for table_elem in page_table_regions:
                    self.draw_table_region(
                        pdf_canvas, table_elem, images_metadata,
-                        target_height, scale_w, scale_h
+                        current_target_h, current_scale_w, current_scale_h
                    )

                # 3. Draw text (top layer)
                for region in page_text_regions:
                    self.draw_text_region(
-                        pdf_canvas, region, target_height,
-                        scale_w, scale_h
+                        pdf_canvas, region, current_target_h,
+                        current_scale_w, current_scale_h
                    )

                logger.info(f"<<< 第 {page_num} 頁完成")
@@ -984,8 +1061,8 @@ class PDFGeneratorService:

    def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
        """
-        從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。
-        這非常重要，因為 OCR 可能在高解析度影像上運行。
+        從 OCR JSON 數據中取得頁面尺寸。
+        優先使用明確的 dimensions 欄位，失敗時才回退到 bbox 推斷。

        Args:
            ocr_data: Complete OCR data dictionary with text_regions and layout
@@ -994,6 +1071,26 @@ class PDFGeneratorService:
        Returns:
            Tuple of (width, height) in points
        """
+        # *** 優先級 1: 檢查 ocr_dimensions (UnifiedDocument 轉換來的) ***
+        if 'ocr_dimensions' in ocr_data:
+            dims = ocr_data['ocr_dimensions']
+            w = float(dims.get('width', 0))
+            h = float(dims.get('height', 0))
+            if w > 0 and h > 0:
+                logger.info(f"使用 ocr_dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}")
+                return (w, h)
+
+        # *** 優先級 2: 檢查原始 JSON 的 dimensions ***
+        if 'dimensions' in ocr_data:
+            dims = ocr_data['dimensions']
+            w = float(dims.get('width', 0))
+            h = float(dims.get('height', 0))
+            if w > 0 and h > 0:
+                logger.info(f"使用 dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}")
+                return (w, h)
+
+        # *** 優先級 3: Fallback - 從 bbox 推斷 (僅當上述皆缺失時使用) ***
+        logger.info("dimensions 欄位不可用，回退到 bbox 推斷")
        max_x = 0
        max_y = 0

@@ -1069,9 +1166,69 @@ class PDFGeneratorService:
                    return dims
            return A4

+    def get_all_page_sizes(self, file_path: Path) -> Dict[int, Tuple[float, float]]:
+        """
+        Extract dimensions for all pages from original source file
+
+        Args:
+            file_path: Path to original file (image or PDF)
+
+        Returns:
+            Dict mapping page index (0-based) to (width, height) in points
+            Empty dict if extraction fails
+        """
+        page_sizes = {}
+
+        try:
+            if not file_path.exists():
+                logger.warning(f"File not found: {file_path}")
+                return page_sizes
+
+            # For images, single page with dimensions from PIL
+            if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
+                img = Image.open(file_path)
+                # Use pixel dimensions directly as points (1:1 mapping)
+                # This matches how PaddleOCR reports coordinates
+                width_pt = float(img.width)
+                height_pt = float(img.height)
+                page_sizes[0] = (width_pt, height_pt)
+                logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
+                return page_sizes
+
+            # For PDFs, extract dimensions for all pages using PyPDF2
+            if file_path.suffix.lower() == '.pdf':
+                try:
+                    from PyPDF2 import PdfReader
+                    reader = PdfReader(file_path)
+                    total_pages = len(reader.pages)
+
+                    for page_idx in range(total_pages):
+                        page = reader.pages[page_idx]
+                        # MediaBox gives [x1, y1, x2, y2] in points
+                        mediabox = page.mediabox
+                        width_pt = float(mediabox.width)
+                        height_pt = float(mediabox.height)
+                        page_sizes[page_idx] = (width_pt, height_pt)
+
+                    logger.info(f"Extracted dimensions from PDF: {total_pages} pages")
+                    for idx, (w, h) in page_sizes.items():
+                        logger.debug(f"  Page {idx}: {w:.1f} x {h:.1f} points")
+
+                    return page_sizes
+
+                except ImportError:
+                    logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
+                except Exception as e:
+                    logger.warning(f"Failed to extract PDF dimensions: {e}")
+
+        except Exception as e:
+            logger.warning(f"Failed to get page sizes from {file_path}: {e}")
+
+        return page_sizes
+
    def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
        """
-        Extract page dimensions from original source file
+        Extract first page dimensions from original source file (backward compatibility)

        Args:
            file_path: Path to original file (image or PDF)
@@ -1079,41 +1236,9 @@ class PDFGeneratorService:
        Returns:
            Tuple of (width, height) in points or None
        """
-        try:
-            if not file_path.exists():
-                return None
-
-            # For images, get dimensions from PIL
-            if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
-                img = Image.open(file_path)
-                # Use pixel dimensions directly as points (1:1 mapping)
-                # This matches how PaddleOCR reports coordinates
-                width_pt = float(img.width)
-                height_pt = float(img.height)
-                logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
-                return (width_pt, height_pt)
-
-            # For PDFs, extract dimensions using PyPDF2
-            if file_path.suffix.lower() == '.pdf':
-                try:
-                    from PyPDF2 import PdfReader
-                    reader = PdfReader(file_path)
-                    if len(reader.pages) > 0:
-                        page = reader.pages[0]
-                        # MediaBox gives [x1, y1, x2, y2] in points
-                        mediabox = page.mediabox
-                        width_pt = float(mediabox.width)
-                        height_pt = float(mediabox.height)
-                        logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points")
-                        return (width_pt, height_pt)
-                except ImportError:
-                    logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
-                except Exception as e:
-                    logger.warning(f"Failed to extract PDF dimensions: {e}")
-
-        except Exception as e:
-            logger.warning(f"Failed to get page size from {file_path}: {e}")
-
+        page_sizes = self.get_all_page_sizes(file_path)
+        if 0 in page_sizes:
+            return page_sizes[0]
        return None

    def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: