fix: detect and handle rotated document content in PDF generation

Add orientation detection to handle cases where scanned documents have content in a different orientation than the image dimensions suggest. When PP-StructureV3 processes rotated documents, it may return bounding boxes in the "corrected" orientation while the image remains in its scanned orientation. This causes content to extend beyond page boundaries.

The fix:
- Add _detect_content_orientation() method to detect when content bbox exceeds page dimensions significantly
- Automatically swap page dimensions when landscape content is detected in portrait-oriented images (and vice versa)
- Apply orientation detection for both single-page and multi-page documents

Fixes issue where horizontal delivery slips scanned vertically were generating PDFs with content cut off or incorrectly positioned.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 13:27:01 +08:00
parent 95ae1f1bdb
commit c65e4f98d4
1 changed files with 155 additions and 1 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -161,6 +161,125 @@ class PDFGeneratorService:
            logger.error(f"Failed to register Chinese font: {e}")
            self.font_registered = False

+    def _detect_content_orientation(
+        self,
+        page_width: float,
+        page_height: float,
+        ocr_data: Dict
+    ) -> Tuple[bool, float, float]:
+        """
+        Detect if content orientation differs from page dimensions.
+
+        This handles cases where a document is scanned in portrait orientation
+        but the actual content is landscape (or vice versa). PP-StructureV3
+        may return bounding boxes in the "corrected" orientation while the
+        image remains in its scanned orientation.
+
+        The method scans every region's ``bbox`` for the largest x and y
+        coordinate and compares them with the declared page size:
+
+        - If the content overflows one axis by more than 15% while fitting
+          the other (within 5%), and the content fits the *swapped* page on
+          BOTH axes (within 5%), the page dimensions are swapped.
+        - If it overflows one axis but a swap would not contain it, the
+          overflowing dimension is enlarged to the content bound plus 2%.
+        - Otherwise the dimensions are returned unchanged.
+
+        Args:
+            page_width: Declared page width from image dimensions
+            page_height: Declared page height from image dimensions
+            ocr_data: OCR data dictionary containing bounding boxes. Regions
+                are read from ``text_regions`` (list),
+                ``layout_data['elements']`` and ``images_metadata`` (list).
+
+        Returns:
+            Tuple of (needs_rotation, adjusted_width, adjusted_height)
+            - needs_rotation: True if page orientation should be swapped
+            - adjusted_width: Width to use for PDF page
+            - adjusted_height: Height to use for PDF page
+        """
+        # Find max content bounds from all regions
+        max_x = 0
+        max_y = 0
+
+        all_regions = []
+
+        # Collect regions from various sources
+        if 'text_regions' in ocr_data and isinstance(ocr_data['text_regions'], list):
+            all_regions.extend(ocr_data['text_regions'])
+
+        if 'layout_data' in ocr_data and isinstance(ocr_data['layout_data'], dict):
+            elements = ocr_data['layout_data'].get('elements', [])
+            if elements:
+                all_regions.extend(elements)
+
+        if 'images_metadata' in ocr_data and isinstance(ocr_data['images_metadata'], list):
+            all_regions.extend(ocr_data['images_metadata'])
+
+        for region in all_regions:
+            try:
+                bbox = region.get('bbox')
+                if not bbox:
+                    continue
+
+                # Handle different bbox formats
+                if isinstance(bbox, dict):
+                    # BoundingBox object format: prefer x1/y1, else x0+width / y0+height
+                    max_x = max(max_x, float(bbox.get('x1', bbox.get('x0', 0) + bbox.get('width', 0))))
+                    max_y = max(max_y, float(bbox.get('y1', bbox.get('y0', 0) + bbox.get('height', 0))))
+                elif isinstance(bbox, (list, tuple)):
+                    if len(bbox) >= 4 and isinstance(bbox[0], (int, float)):
+                        # [x1, y1, x2, y2] format
+                        max_x = max(max_x, float(bbox[2]))
+                        max_y = max(max_y, float(bbox[3]))
+                    elif isinstance(bbox[0], (list, tuple)):
+                        # Polygon format [[x, y], ...]
+                        x_coords = [p[0] for p in bbox if len(p) >= 2]
+                        y_coords = [p[1] for p in bbox if len(p) >= 2]
+                        if x_coords and y_coords:
+                            max_x = max(max_x, max(x_coords))
+                            max_y = max(max_y, max(y_coords))
+            except Exception as e:
+                # Deliberately best-effort: one malformed region must not
+                # prevent PDF generation, so it is logged and skipped.
+                logger.debug(f"Error processing bbox for orientation detection: {e}")
+                continue
+
+        if max_x == 0 or max_y == 0:
+            # No valid bboxes found, use original dimensions
+            return (False, page_width, page_height)
+
+        logger.info(f"內容邊界偵測: max_x={max_x:.1f}, max_y={max_y:.1f}, "
+                   f"page_dims={page_width:.1f}x{page_height:.1f}")
+
+        # Ratio of the content bound to the page size on each axis
+        # (a non-positive page size is treated as "no overflow")
+        x_overflow = max_x / page_width if page_width > 0 else 1
+        y_overflow = max_y / page_height if page_height > 0 else 1
+
+        # Check if content significantly exceeds page dimensions in one direction
+        # This suggests the content is in a different orientation
+        OVERFLOW_THRESHOLD = 1.15  # Content extends >15% beyond declared dimensions
+        FIT_TOLERANCE = 1.05       # Content within 5% of a dimension counts as fitting
+        GROW_MARGIN = 1.02         # Margin applied when enlarging the page to fit content
+
+        # A swap is only useful when the content fits the swapped page on BOTH
+        # axes; checking just the overflowing axis could swap to a page that
+        # clips the content along the other axis.
+        fits_swapped_page = (
+            max_x <= page_height * FIT_TOLERANCE
+            and max_y <= page_width * FIT_TOLERANCE
+        )
+
+        if x_overflow > OVERFLOW_THRESHOLD and y_overflow <= FIT_TOLERANCE:
+            # Content is wider than page but fits in height
+            # This suggests portrait image with landscape content
+            logger.warning(f"偵測到內容方向可能與頁面不符: "
+                         f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}")
+
+            if fits_swapped_page:
+                logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> "
+                           f"{page_height:.1f}x{page_width:.1f}")
+                return (True, page_height, page_width)
+            else:
+                # Swapping would not contain the content; widen the page instead
+                logger.info("內容超出頁面邊界，調整頁面大小以容納內容")
+                return (False, max_x * GROW_MARGIN, page_height)
+
+        elif y_overflow > OVERFLOW_THRESHOLD and x_overflow <= FIT_TOLERANCE:
+            # Content is taller than page but fits in width
+            # Less common - landscape image with portrait content
+            logger.warning(f"偵測到內容方向可能與頁面不符 (高度溢出): "
+                         f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}")
+
+            if fits_swapped_page:
+                logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> "
+                           f"{page_height:.1f}x{page_width:.1f}")
+                return (True, page_height, page_width)
+            else:
+                # Swapping would not contain the content; lengthen the page instead
+                logger.info("內容超出頁面邊界，調整頁面大小以容納內容")
+                return (False, page_width, max_y * GROW_MARGIN)
+
+        # No orientation issue detected
+        return (False, page_width, page_height)
+
+
    def _parse_color(self, color_value) -> Tuple[float, float, float]:
        """
        Parse color value to RGB tuple.
@@ -943,6 +1062,20 @@ class PDFGeneratorService:
                target_width, target_height = ocr_width, ocr_height
                logger.info(f"初始 PDF 尺寸（來自 OCR/UnifiedDocument）: {target_width:.1f} x {target_height:.1f}")

+            # Step 4: Detect content orientation mismatch
+            # This handles rotated scans where content bbox exceeds page dimensions
+            needs_rotation, adjusted_width, adjusted_height = self._detect_content_orientation(
+                target_width, target_height, ocr_data
+            )
+
+            if needs_rotation or (adjusted_width != target_width or adjusted_height != target_height):
+                logger.info(f"頁面尺寸調整: {target_width:.1f}x{target_height:.1f} -> "
+                           f"{adjusted_width:.1f}x{adjusted_height:.1f} (旋轉={needs_rotation})")
+                target_width, target_height = adjusted_width, adjusted_height
+                # Also update page_dimensions for consistency in per-page processing
+                if 0 in page_dimensions:
+                    page_dimensions[0] = {'width': target_width, 'height': target_height}
+
            # Create PDF canvas with initial page size (will be updated per page)
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

@@ -993,7 +1126,7 @@ class PDFGeneratorService:
                    current_target_w, current_target_h = original_page_sizes[page_idx]
                    dimension_source = "original_file"

-                # Priority 2: OCR/UnifiedDocument dimensions
+                # Priority 2: OCR/UnifiedDocument dimensions (which may have been adjusted for orientation)
                elif page_idx in page_dimensions:
                    current_page_dims = page_dimensions[page_idx]
                    current_target_w = float(current_page_dims['width'])
@@ -1007,6 +1140,27 @@ class PDFGeneratorService:
                    dimension_source = "fallback_first_page"
                    logger.warning(f"No dimensions for page {page_num}, using first page size")

+                # For pages after the first, check if orientation adjustment is needed
+                # (First page was already handled above)
+                if page_num > 1 and dimension_source == "original_file":
+                    # Build per-page data for orientation detection
+                    page_ocr_data = {
+                        'text_regions': [r for r in text_regions if r.get('page', 1) == page_num],
+                        'layout_data': {
+                            'elements': [e for e in layout_data.get('elements', [])
+                                        if e.get('page', 0) == page_idx]
+                        },
+                        'images_metadata': [i for i in images_metadata if i.get('page', 0) == page_idx]
+                    }
+                    needs_page_rotation, adj_w, adj_h = self._detect_content_orientation(
+                        current_target_w, current_target_h, page_ocr_data
+                    )
+                    if needs_page_rotation or (adj_w != current_target_w or adj_h != current_target_h):
+                        logger.info(f"第 {page_num} 頁尺寸調整: "
+                                   f"{current_target_w:.1f}x{current_target_h:.1f} -> "
+                                   f"{adj_w:.1f}x{adj_h:.1f}")
+                        current_target_w, current_target_h = adj_w, adj_h
+
                # Calculate scale factors for coordinate transformation
                # OCR coordinates need to be scaled if original file dimensions differ
                if dimension_source == "original_file":