fix: detect and handle rotated document content in PDF generation

Add orientation detection to handle cases where scanned documents have
content in a different orientation than the image dimensions suggest.

When PP-StructureV3 processes rotated documents, it may return bounding
boxes in the "corrected" orientation while the image remains in its
scanned orientation. This causes content to extend beyond page boundaries.

The fix:
- Add _detect_content_orientation() method to detect when content bbox
  exceeds page dimensions significantly
- Automatically swap page dimensions when landscape content is detected
  in portrait-oriented images (and vice versa)
- Apply orientation detection for both single-page and multi-page documents

Fixes an issue where landscape delivery slips scanned in portrait orientation
produced PDFs with content cut off or incorrectly positioned.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-30 13:27:01 +08:00
parent 95ae1f1bdb
commit c65e4f98d4

View File

@@ -161,6 +161,125 @@ class PDFGeneratorService:
logger.error(f"Failed to register Chinese font: {e}")
self.font_registered = False
def _detect_content_orientation(
    self,
    page_width: float,
    page_height: float,
    ocr_data: Dict
) -> Tuple[bool, float, float]:
    """
    Detect whether the OCR content orientation differs from the page dimensions.

    The largest x and y extents are collected from every bounding box found
    under ``ocr_data['text_regions']``, ``ocr_data['layout_data']['elements']``
    and ``ocr_data['images_metadata']``. If the content overflows the page
    along exactly one axis, the page is either rotated (width and height
    swapped) or enlarged along that axis so the content fits.

    Supported bbox layouts:
      - dict with ``x1``/``y1``, or ``x0``/``y0`` plus ``width``/``height``
      - flat sequence ``[x1, y1, x2, y2]``
      - polygon ``[[x, y], ...]``

    Regions whose bbox is missing, empty, or malformed are skipped.

    Args:
        page_width: Declared page width from image dimensions
        page_height: Declared page height from image dimensions
        ocr_data: OCR data dictionary containing bounding boxes

    Returns:
        Tuple of (needs_rotation, adjusted_width, adjusted_height)
        - needs_rotation: True if page orientation should be swapped
        - adjusted_width: Width to use for PDF page
        - adjusted_height: Height to use for PDF page

        The original dimensions are returned unchanged when no usable bbox
        is found, when the content fits, or when it overflows on both axes.
    """
    # Content must exceed a page side by more than 15% to count as overflow.
    OVERFLOW_THRESHOLD = 1.15
    # Up to 5% excess over a page side is still treated as "fits".
    FIT_TOLERANCE = 1.05
    # Margin applied when the page is enlarged to hold the content.
    GROW_MARGIN = 1.02

    def _bbox_extent(bbox):
        """Return (max_x, max_y) for one bbox, or None for an unknown layout."""
        if isinstance(bbox, dict):
            # Only fall back to origin + size when the far corner is absent,
            # so a bad x0/width cannot invalidate a bbox that has x1.
            if 'x1' in bbox:
                right = bbox['x1']
            else:
                right = bbox.get('x0', 0) + bbox.get('width', 0)
            if 'y1' in bbox:
                bottom = bbox['y1']
            else:
                bottom = bbox.get('y0', 0) + bbox.get('height', 0)
            return float(right), float(bottom)
        if isinstance(bbox, (list, tuple)):
            if len(bbox) >= 4 and isinstance(bbox[0], (int, float)):
                # [x1, y1, x2, y2] format
                return float(bbox[2]), float(bbox[3])
            if isinstance(bbox[0], (list, tuple)):
                # Polygon format [[x, y], ...]
                points = [p for p in bbox if len(p) >= 2]
                if points:
                    return (
                        max(float(p[0]) for p in points),
                        max(float(p[1]) for p in points),
                    )
        return None

    # Collect regions from various sources
    all_regions = []
    text_regions = ocr_data.get('text_regions')
    if isinstance(text_regions, list):
        all_regions.extend(text_regions)
    layout_data = ocr_data.get('layout_data')
    if isinstance(layout_data, dict):
        elements = layout_data.get('elements', [])
        if elements:
            all_regions.extend(elements)
    images_metadata = ocr_data.get('images_metadata')
    if isinstance(images_metadata, list):
        all_regions.extend(images_metadata)

    # Find max content bounds from all regions
    max_x = 0.0
    max_y = 0.0
    for region in all_regions:
        try:
            bbox = region.get('bbox')
            if not bbox:
                continue
            extent = _bbox_extent(bbox)
        except (AttributeError, IndexError, KeyError, TypeError,
                ValueError, OverflowError) as e:
            logger.debug(f"Error processing bbox for orientation detection: {e}")
            continue
        if extent is None:
            continue
        # Both coordinates parsed successfully; apply them together so a
        # malformed bbox can never contribute only one of its extents.
        max_x = max(max_x, extent[0])
        max_y = max(max_y, extent[1])

    if max_x == 0 or max_y == 0:
        # No valid bboxes found, use original dimensions
        return (False, page_width, page_height)

    logger.info(f"內容邊界偵測: max_x={max_x:.1f}, max_y={max_y:.1f}, "
                f"page_dims={page_width:.1f}x{page_height:.1f}")

    # Ratio of content extent to page size; 1 (no overflow) for a
    # non-positive page side to avoid dividing by zero.
    x_overflow = max_x / page_width if page_width > 0 else 1
    y_overflow = max_y / page_height if page_height > 0 else 1

    # Rotating the page only helps if the content fits the swapped page on
    # BOTH axes; checking a single axis could trade a horizontal overflow
    # for a vertical one (or vice versa).
    fits_when_swapped = (
        max_x <= page_height * FIT_TOLERANCE
        and max_y <= page_width * FIT_TOLERANCE
    )

    if x_overflow > OVERFLOW_THRESHOLD and y_overflow <= FIT_TOLERANCE:
        # Content is wider than page but fits in height
        # This suggests portrait image with landscape content
        logger.warning(f"偵測到內容方向可能與頁面不符: "
                       f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}")
        if fits_when_swapped:
            logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> "
                        f"{page_height:.1f}x{page_width:.1f}")
            return (True, page_height, page_width)
        # Swapping would not hold the content; widen the page instead.
        logger.info("內容超出頁面邊界,調整頁面大小以容納內容")
        return (False, max_x * GROW_MARGIN, page_height)

    if y_overflow > OVERFLOW_THRESHOLD and x_overflow <= FIT_TOLERANCE:
        # Content is taller than page but fits in width
        # Less common - landscape image with portrait content
        logger.warning(f"偵測到內容方向可能與頁面不符 (高度溢出): "
                       f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}")
        if fits_when_swapped:
            logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> "
                        f"{page_height:.1f}x{page_width:.1f}")
            return (True, page_height, page_width)
        # Swapping would not hold the content; lengthen the page instead.
        logger.info("內容超出頁面邊界,調整頁面大小以容納內容")
        return (False, page_width, max_y * GROW_MARGIN)

    # No orientation issue detected
    return (False, page_width, page_height)
def _parse_color(self, color_value) -> Tuple[float, float, float]:
"""
Parse color value to RGB tuple.
@@ -943,6 +1062,20 @@ class PDFGeneratorService:
target_width, target_height = ocr_width, ocr_height
logger.info(f"初始 PDF 尺寸(來自 OCR/UnifiedDocument: {target_width:.1f} x {target_height:.1f}")
# Step 4: Detect content orientation mismatch
# This handles rotated scans where content bbox exceeds page dimensions
needs_rotation, adjusted_width, adjusted_height = self._detect_content_orientation(
target_width, target_height, ocr_data
)
if needs_rotation or (adjusted_width != target_width or adjusted_height != target_height):
logger.info(f"頁面尺寸調整: {target_width:.1f}x{target_height:.1f} -> "
f"{adjusted_width:.1f}x{adjusted_height:.1f} (旋轉={needs_rotation})")
target_width, target_height = adjusted_width, adjusted_height
# Also update page_dimensions for consistency in per-page processing
if 0 in page_dimensions:
page_dimensions[0] = {'width': target_width, 'height': target_height}
# Create PDF canvas with initial page size (will be updated per page)
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
@@ -993,7 +1126,7 @@ class PDFGeneratorService:
current_target_w, current_target_h = original_page_sizes[page_idx]
dimension_source = "original_file"
# Priority 2: OCR/UnifiedDocument dimensions
# Priority 2: OCR/UnifiedDocument dimensions (which may have been adjusted for orientation)
elif page_idx in page_dimensions:
current_page_dims = page_dimensions[page_idx]
current_target_w = float(current_page_dims['width'])
@@ -1007,6 +1140,27 @@ class PDFGeneratorService:
dimension_source = "fallback_first_page"
logger.warning(f"No dimensions for page {page_num}, using first page size")
# For pages after the first, check if orientation adjustment is needed
# (First page was already handled above)
if page_num > 1 and dimension_source == "original_file":
# Build per-page data for orientation detection
page_ocr_data = {
'text_regions': [r for r in text_regions if r.get('page', 1) == page_num],
'layout_data': {
'elements': [e for e in layout_data.get('elements', [])
if e.get('page', 0) == page_idx]
},
'images_metadata': [i for i in images_metadata if i.get('page', 0) == page_idx]
}
needs_page_rotation, adj_w, adj_h = self._detect_content_orientation(
current_target_w, current_target_h, page_ocr_data
)
if needs_page_rotation or (adj_w != current_target_w or adj_h != current_target_h):
logger.info(f"{page_num} 頁尺寸調整: "
f"{current_target_w:.1f}x{current_target_h:.1f} -> "
f"{adj_w:.1f}x{adj_h:.1f}")
current_target_w, current_target_h = adj_w, adj_h
# Calculate scale factors for coordinate transformation
# OCR coordinates need to be scaled if original file dimensions differ
if dimension_source == "original_file":