feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -25,6 +25,7 @@ from PIL import Image
 from html.parser import HTMLParser

 from app.core.config import settings
+from app.utils.bbox_utils import normalize_bbox

 # Import table column corrector for column alignment fix
 try:
@@ -1258,8 +1259,44 @@ class PDFGeneratorService:
                                else:
                                    logger.warning(f"Image file not found: {saved_path}")

+                        # Also check for embedded images in table elements
+                        # These are images detected inside table regions by PP-Structure
+                        elif elem_type == 'table':
+                            metadata = elem.metadata if hasattr(elem, 'metadata') else elem.get('metadata', {})
+                            embedded_images = metadata.get('embedded_images', []) if metadata else []
+                            for emb_img in embedded_images:
+                                emb_bbox = emb_img.get('bbox', [])
+                                if emb_bbox and len(emb_bbox) >= 4:
+                                    ex0, ey0, ex1, ey1 = emb_bbox[0], emb_bbox[1], emb_bbox[2], emb_bbox[3]
+                                    exclusion_zones.append((ex0, ey0, ex1, ey1))
+
+                                    # Also render the embedded image
+                                    saved_path = emb_img.get('saved_path', '')
+                                    if saved_path:
+                                        image_path = result_dir / saved_path
+                                        if not image_path.exists():
+                                            image_path = result_dir / Path(saved_path).name
+                                        if image_path.exists():
+                                            try:
+                                                pdf_x = ex0
+                                                pdf_y = current_height - ey1
+                                                img_width = ex1 - ex0
+                                                img_height = ey1 - ey0
+                                                pdf_canvas.drawImage(
+                                                    str(image_path),
+                                                    pdf_x, pdf_y,
+                                                    width=img_width,
+                                                    height=img_height,
+                                                    preserveAspectRatio=True,
+                                                    mask='auto'
+                                                )
+                                                image_elements_rendered += 1
+                                                logger.debug(f"Rendered embedded image: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
+                                            except Exception as e:
+                                                logger.warning(f"Failed to render embedded image {saved_path}: {e}")
+
                    if image_elements_rendered > 0:
-                        logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")
+                        logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas/embedded)")

                    if exclusion_zones:
                        logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")
@@ -1857,38 +1894,8 @@ class PDFGeneratorService:
        return None

    def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
-        """將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
-        try:
-            if bbox is None:
-                return None
-
-            # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
-            if isinstance(bbox, dict):
-                if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
-                    return float(bbox['x0']), float(bbox['y0']), float(bbox['x1']), float(bbox['y1'])
-                else:
-                    logger.warning(f"Dict bbox 缺少必要欄位: {bbox}")
-                    return None
-
-            if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
-                return None
-
-            if isinstance(bbox[0], (list, tuple)):
-                # 處理多邊形 [[x, y], ...]
-                x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
-                y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
-                if not x_coords or not y_coords:
-                    return None
-                return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
-            elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
-                # 處理 [x1, y1, x2, y2]
-                return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
-            else:
-                logger.warning(f"未知的 bbox 格式: {bbox}")
-                return None
-        except Exception as e:
-            logger.error(f"解析 bbox {bbox} 時出錯: {e}")
-            return None
+        """將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]. Uses shared bbox utility."""
+        return normalize_bbox(bbox)

    def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
        """
@@ -2463,29 +2470,7 @@ class PDFGeneratorService:
                            else:
                                logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
                        else:
-                            # Grid mismatch: try cellboxes-first rendering if enabled
-                            if settings.table_rendering_prefer_cellboxes:
-                                logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
-                                from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
-                                renderer = TableRenderer(TableRenderConfig())
-                                success = renderer.render_from_cellboxes_grid(
-                                    pdf_canvas,
-                                    cell_boxes,
-                                    html_content,
-                                    tuple(raw_bbox),
-                                    page_height,
-                                    scale_w,
-                                    scale_h,
-                                    row_threshold=settings.table_cellboxes_row_threshold,
-                                    col_threshold=settings.table_cellboxes_col_threshold
-                                )
-                                if success:
-                                    logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
-                                    return  # Table fully rendered, exit early
-                                else:
-                                    logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
-                            else:
-                                logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
+                            logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
                    else:
                        logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")

@@ -2942,47 +2927,16 @@ class PDFGeneratorService:
        """
        Check the quality of cell_boxes to determine rendering strategy.

+        Always returns 'good' to use pure PP-Structure output (quality check removed).
+
        Args:
            cell_boxes: List of cell bounding boxes
            element_id: Optional element ID for logging

        Returns:
-            'good' if cell_boxes form a proper grid, 'bad' otherwise
+            'good' - always use cell_boxes rendering
        """
-        # If quality check is disabled, always return 'good' to use pure PP-Structure output
-        if not settings.table_quality_check_enabled:
-            logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
-            return 'good'
-
-        if not cell_boxes or len(cell_boxes) < 2:
-            logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
-            return 'bad'  # No cell_boxes or too few
-
-        # Count overlapping cell pairs
-        overlap_count = 0
-        for i, box1 in enumerate(cell_boxes):
-            for j, box2 in enumerate(cell_boxes):
-                if i >= j:
-                    continue
-                if not isinstance(box1, (list, tuple)) or len(box1) < 4:
-                    continue
-                if not isinstance(box2, (list, tuple)) or len(box2) < 4:
-                    continue
-                x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
-                y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
-                if x_overlap and y_overlap:
-                    overlap_count += 1
-
-        total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
-        overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
-
-        # Relaxed threshold: 20% overlap instead of 10% to allow more tables through
-        # This is because PP-StructureV3's cell detection sometimes has slight overlaps
-        if overlap_ratio > 0.20:
-            logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
-            return 'bad'
-
-        logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
+        logger.debug(f"[TABLE QUALITY] {element_id}: good - pure PP-Structure mode")
        return 'good'

    def _draw_table_with_cell_boxes(