def _detect_content_orientation(
    self,
    page_width: float,
    page_height: float,
    ocr_data: Dict
) -> Tuple[bool, float, float]:
    """
    Detect if content orientation differs from page dimensions.

    This handles cases where a document is scanned in portrait orientation
    but the actual content is landscape (or vice versa). PP-StructureV3
    may return bounding boxes in the "corrected" orientation while the
    image remains in its scanned orientation.

    The largest right/bottom coordinate over all regions found under
    ``text_regions``, ``layout_data['elements']`` and ``images_metadata``
    is compared with the declared page size. When the content overflows in
    exactly one direction the page is either swapped (if the content fits
    the swapped page in BOTH directions) or enlarged along the overflowing
    axis so that the content is contained.

    Args:
        page_width: Declared page width from image dimensions
        page_height: Declared page height from image dimensions
        ocr_data: OCR data dictionary containing bounding boxes

    Returns:
        Tuple of (needs_rotation, adjusted_width, adjusted_height)
        - needs_rotation: True if page orientation should be swapped
        - adjusted_width: Width to use for PDF page
        - adjusted_height: Height to use for PDF page
    """
    # Content must exceed the declared size by more than 15% to count as overflow.
    OVERFLOW_THRESHOLD = 1.15
    # Up to 5% excess is still treated as "fits" (tolerates OCR jitter).
    FIT_TOLERANCE = 1.05
    # Margin added when the page is enlarged to contain the content.
    CONTENT_MARGIN = 1.02

    def _bbox_extent(bbox):
        """Return (right, bottom) of a bbox, or None for an unknown format."""
        if isinstance(bbox, dict):
            # Only compute the x0 + width fallback when x1 is really absent;
            # otherwise a bad 'width' would discard a perfectly valid 'x1'.
            right = bbox.get('x1')
            if right is None:
                right = bbox.get('x0', 0) + bbox.get('width', 0)
            bottom = bbox.get('y1')
            if bottom is None:
                bottom = bbox.get('y0', 0) + bbox.get('height', 0)
            return float(right), float(bottom)

        if isinstance(bbox, (list, tuple)):
            first = bbox[0]
            if len(bbox) >= 4 and isinstance(first, (int, float)):
                # [x1, y1, x2, y2] format
                return float(bbox[2]), float(bbox[3])
            if isinstance(first, (list, tuple)):
                # Polygon format [[x, y], ...]
                points = [p for p in bbox if len(p) >= 2]
                if points:
                    return (
                        max(float(p[0]) for p in points),
                        max(float(p[1]) for p in points),
                    )
        return None

    if not isinstance(ocr_data, dict):
        # Nothing to inspect, keep the declared dimensions.
        return (False, page_width, page_height)

    # Collect regions from various sources
    all_regions = []
    text_regions = ocr_data.get('text_regions')
    if isinstance(text_regions, list):
        all_regions.extend(text_regions)

    layout_data = ocr_data.get('layout_data')
    if isinstance(layout_data, dict):
        all_regions.extend(layout_data.get('elements') or [])

    images_metadata = ocr_data.get('images_metadata')
    if isinstance(images_metadata, list):
        all_regions.extend(images_metadata)

    # Find max content bounds from all regions
    max_x = 0
    max_y = 0
    for region in all_regions:
        try:
            bbox = region.get('bbox')
            if not bbox:
                continue
            extent = _bbox_extent(bbox)
        except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
            # Best effort: a malformed region must not abort detection.
            logger.debug(f"Error processing bbox for orientation detection: {e}")
            continue
        if extent is None:
            continue
        # Both axes are updated together so a half-parsed bbox never leaks in.
        max_x = max(max_x, extent[0])
        max_y = max(max_y, extent[1])

    if max_x == 0 or max_y == 0:
        # No valid bboxes found, use original dimensions
        return (False, page_width, page_height)

    logger.info(f"內容邊界偵測: max_x={max_x:.1f}, max_y={max_y:.1f}, "
                f"page_dims={page_width:.1f}x{page_height:.1f}")

    # Ratio of content extent to declared page size; a non-positive page
    # dimension is treated as "exactly fits" to avoid dividing by zero.
    x_overflow = max_x / page_width if page_width > 0 else 1
    y_overflow = max_y / page_height if page_height > 0 else 1

    # Swapping only helps when the content fits the swapped page in both
    # directions; otherwise it just moves the overflow to the other axis.
    fits_swapped = (
        max_x <= page_height * FIT_TOLERANCE
        and max_y <= page_width * FIT_TOLERANCE
    )

    if x_overflow > OVERFLOW_THRESHOLD and y_overflow <= FIT_TOLERANCE:
        # Content is wider than page but fits in height
        # This suggests portrait image with landscape content
        logger.warning(f"偵測到內容方向可能與頁面不符: "
                       f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}")
        if fits_swapped:
            logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> "
                        f"{page_height:.1f}x{page_width:.1f}")
            return (True, page_height, page_width)
        # Swapping would not contain the content: widen the page instead.
        logger.info("內容超出頁面邊界,調整頁面大小以容納內容")
        return (False, max_x * CONTENT_MARGIN, page_height)

    if y_overflow > OVERFLOW_THRESHOLD and x_overflow <= FIT_TOLERANCE:
        # Content is taller than page but fits in width
        # Less common - landscape image with portrait content
        logger.warning(f"偵測到內容方向可能與頁面不符 (高度溢出): "
                       f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}")
        if fits_swapped:
            logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> "
                        f"{page_height:.1f}x{page_width:.1f}")
            return (True, page_height, page_width)
        # Swapping would not contain the content: make the page taller instead.
        logger.info("內容超出頁面邊界,調整頁面大小以容納內容")
        return (False, page_width, max_y * CONTENT_MARGIN)

    # No orientation issue detected
    return (False, page_width, page_height)
= original_page_sizes[page_idx] dimension_source = "original_file" - # Priority 2: OCR/UnifiedDocument dimensions + # Priority 2: OCR/UnifiedDocument dimensions (which may have been adjusted for orientation) elif page_idx in page_dimensions: current_page_dims = page_dimensions[page_idx] current_target_w = float(current_page_dims['width']) @@ -1007,6 +1140,27 @@ class PDFGeneratorService: dimension_source = "fallback_first_page" logger.warning(f"No dimensions for page {page_num}, using first page size") + # For pages after the first, check if orientation adjustment is needed + # (First page was already handled above) + if page_num > 1 and dimension_source == "original_file": + # Build per-page data for orientation detection + page_ocr_data = { + 'text_regions': [r for r in text_regions if r.get('page', 1) == page_num], + 'layout_data': { + 'elements': [e for e in layout_data.get('elements', []) + if e.get('page', 0) == page_idx] + }, + 'images_metadata': [i for i in images_metadata if i.get('page', 0) == page_idx] + } + needs_page_rotation, adj_w, adj_h = self._detect_content_orientation( + current_target_w, current_target_h, page_ocr_data + ) + if needs_page_rotation or (adj_w != current_target_w or adj_h != current_target_h): + logger.info(f"第 {page_num} 頁尺寸調整: " + f"{current_target_w:.1f}x{current_target_h:.1f} -> " + f"{adj_w:.1f}x{adj_h:.1f}") + current_target_w, current_target_h = adj_w, adj_h + # Calculate scale factors for coordinate transformation # OCR coordinates need to be scaled if original file dimensions differ if dimension_source == "original_file":