test

2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions
--- a/backend/app/services/ocr_to_unified_converter.py
+++ b/backend/app/services/ocr_to_unified_converter.py
@@ -178,6 +178,114 @@ def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
    return result


+def validate_cell_boxes(
+    cell_boxes: List[List[float]],
+    table_bbox: List[float],
+    page_width: float,
+    page_height: float,
+    tolerance: float = 5.0
+) -> Dict[str, Any]:
+    """
+    Validate cell_boxes coordinates against page boundaries.
+
+    PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
+    page boundaries. This function reports such boxes and returns a copy of
+    the list clamped to the page.
+
+    A box is invalid when it has fewer than 4 values, lies more than
+    ``tolerance`` pixels outside the page, or has inverted corners
+    (x0 > x1 or y0 > y1). Clamping always uses the exact page rectangle;
+    tolerance only affects whether a box is reported.
+
+    Args:
+        cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
+        table_bbox: Table bounding box [x0, y0, x1, y1]; only used in the
+            warning log message, cells are not checked against it
+        page_width: Page width in pixels
+        page_height: Page height in pixels
+        tolerance: Allowed tolerance for boundary checks (pixels)
+
+    Returns:
+        Dict with (same keys for every input, including empty cell_boxes):
+            - valid: bool - whether all cell_boxes are valid
+            - invalid_count: int - number of invalid cell_boxes
+            - clamped_boxes: List - cell_boxes clamped to valid boundaries,
+              one entry per input box ([0, 0, 0, 0] for malformed boxes)
+            - issues: List[str] - description of issues found
+            - needs_fallback: bool - True when more than 50% are invalid
+    """
+    if not cell_boxes:
+        # Keep the result shape identical to the non-empty case so callers
+        # can read 'needs_fallback' unconditionally.
+        return {
+            'valid': True,
+            'invalid_count': 0,
+            'clamped_boxes': [],
+            'issues': [],
+            'needs_fallback': False
+        }
+
+    issues = []
+    invalid_count = 0
+    clamped_boxes = []
+
+    # Page boundaries with tolerance
+    min_x = -tolerance
+    min_y = -tolerance
+    max_x = page_width + tolerance
+    max_y = page_height + tolerance
+
+    for idx, box in enumerate(cell_boxes):
+        if not box or len(box) < 4:
+            issues.append(f"Cell {idx}: Invalid box format")
+            invalid_count += 1
+            clamped_boxes.append([0, 0, 0, 0])
+            continue
+
+        x0, y0, x1, y1 = box[:4]
+        cell_issues = []
+
+        # Check if coordinates exceed page boundaries (beyond tolerance)
+        if x0 < min_x:
+            cell_issues.append(f"x0={x0:.1f} < 0")
+        if y0 < min_y:
+            cell_issues.append(f"y0={y0:.1f} < 0")
+        if x1 > max_x:
+            cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
+        if y1 > max_y:
+            cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
+
+        # Check for inverted coordinates
+        if x0 > x1:
+            cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
+        if y0 > y1:
+            cell_issues.append(f"y0={y0:.1f} > y1={y1:.1f}")
+
+        if cell_issues:
+            invalid_count += 1
+            issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
+
+        # Clamp to the exact page rectangle, then order each axis so that
+        # x0 <= x1 and y0 <= y1 even when the input corners were inverted
+        cx0, cx1 = sorted((max(0, min(x0, page_width)), max(0, min(x1, page_width))))
+        cy0, cy1 = sorted((max(0, min(y0, page_height)), max(0, min(y1, page_height))))
+        clamped_boxes.append([cx0, cy0, cx1, cy1])
+
+    if invalid_count > 0:
+        logger.warning(
+            f"Cell boxes validation: {invalid_count}/{len(cell_boxes)} invalid. "
+            f"Page: {page_width:.0f}x{page_height:.0f}, Table bbox: {table_bbox}"
+        )
+
+    return {
+        'valid': invalid_count == 0,
+        'invalid_count': invalid_count,
+        'clamped_boxes': clamped_boxes,
+        'issues': issues,
+        'needs_fallback': invalid_count > len(cell_boxes) * 0.5  # >50% invalid = needs fallback
+    }
+
+
 class OCRToUnifiedConverter:
    """
    Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -337,19 +445,22 @@ class OCRToUnifiedConverter:
        for page_idx, page_result in enumerate(enhanced_results):
            elements = []

+            # Get page dimensions first (needed for element conversion)
+            page_width = page_result.get('width', 0)
+            page_height = page_result.get('height', 0)
+            pp_dimensions = Dimensions(width=page_width, height=page_height)
+
            # Process elements from parsing_res_list
            if 'elements' in page_result:
                for elem_data in page_result['elements']:
-                    element = self._convert_pp3_element(elem_data, page_idx)
+                    element = self._convert_pp3_element(
+                        elem_data, page_idx,
+                        page_width=page_width,
+                        page_height=page_height
+                    )
                    if element:
                        elements.append(element)

-            # Get page dimensions
-            pp_dimensions = Dimensions(
-                width=page_result.get('width', 0),
-                height=page_result.get('height', 0)
-            )
-
            # Apply gap filling if enabled and raw regions available
            if self.gap_filling_service and raw_text_regions:
                # Filter raw regions for current page
@@ -556,9 +667,19 @@ class OCRToUnifiedConverter:
    def _convert_pp3_element(
        self,
        elem_data: Dict[str, Any],
-        page_idx: int
+        page_idx: int,
+        page_width: float = 0,
+        page_height: float = 0
    ) -> Optional[DocumentElement]:
-        """Convert PP-StructureV3 element to DocumentElement."""
+        """
+        Convert PP-StructureV3 element to DocumentElement.
+
+        Args:
+            elem_data: Element data from PP-StructureV3
+            page_idx: Page index (0-based)
+            page_width: Page width for coordinate validation
+            page_height: Page height for coordinate validation
+        """
        try:
            # Extract bbox
            bbox_data = elem_data.get('bbox', [0, 0, 0, 0])
@@ -597,18 +718,67 @@ class OCRToUnifiedConverter:
                # Preserve cell_boxes and embedded_images in metadata for PDF generation
                # These are extracted by PP-StructureV3 and provide accurate cell positioning
                if 'cell_boxes' in elem_data:
-                    elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
-                    elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
+                    cell_boxes = elem_data['cell_boxes']
+                    elem_data.setdefault('metadata', {})['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
+
+                    # Validate cell_boxes coordinates if page dimensions are available
+                    if page_width > 0 and page_height > 0:
+                        validation = validate_cell_boxes(
+                            cell_boxes=cell_boxes,
+                            table_bbox=bbox_data,
+                            page_width=page_width,
+                            page_height=page_height
+                        )
+
+                        if not validation['valid']:
+                            elem_data['metadata']['cell_boxes_validation'] = {
+                                'valid': False,
+                                'invalid_count': validation['invalid_count'],
+                                'total_count': len(cell_boxes),
+                                'needs_fallback': validation['needs_fallback']
+                            }
+                            # Use clamped boxes instead of invalid ones
+                            elem_data['metadata']['cell_boxes'] = validation['clamped_boxes']
+                            elem_data['metadata']['cell_boxes_original'] = cell_boxes
+
+                            if validation['needs_fallback']:
+                                logger.warning(
+                                    f"Table {elem_data.get('element_id')}: "
+                                    f"{validation['invalid_count']}/{len(cell_boxes)} cell_boxes invalid, "
+                                    f"fallback recommended"
+                                )
+                        else:
+                            elem_data['metadata']['cell_boxes'] = cell_boxes
+                            elem_data['metadata']['cell_boxes_validation'] = {'valid': True}
+                    else:
+                        # No page dimensions available, store as-is
+                        elem_data['metadata']['cell_boxes'] = cell_boxes
+
                if 'embedded_images' in elem_data:
                    elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
-            elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
-                # For images, use metadata dict as content
+            elif element_type in [
+                ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
+                ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
+            ]:
+                # For all visual elements, use metadata dict as content
+                # Priority: saved_path > img_path (PP-StructureV3 uses saved_path)
+                image_path = (
+                    elem_data.get('saved_path') or
+                    elem_data.get('img_path') or
+                    ''
+                )
                content = {
-                    'path': elem_data.get('img_path', ''),
+                    'saved_path': image_path,  # Preserve original path key
+                    'path': image_path,        # For backward compatibility
                    'width': elem_data.get('width', 0),
                    'height': elem_data.get('height', 0),
                    'format': elem_data.get('format', 'unknown')
                }
+                if not image_path:
+                    logger.warning(
+                        f"Visual element {element_type.value} missing image path: "
+                        f"saved_path={elem_data.get('saved_path')}, img_path={elem_data.get('img_path')}"
+                    )
            else:
                content = elem_data.get('content', '')

@@ -1139,10 +1309,18 @@ class OCRToUnifiedConverter:
        for page_idx, page_data in enumerate(pages_data):
            elements = []

+            # Get page dimensions first
+            page_width = page_data.get('width', 0)
+            page_height = page_data.get('height', 0)
+
            # Process each element in the page
            if 'elements' in page_data:
                for elem_data in page_data['elements']:
-                    element = self._convert_pp3_element(elem_data, page_idx)
+                    element = self._convert_pp3_element(
+                        elem_data, page_idx,
+                        page_width=page_width,
+                        page_height=page_height
+                    )
                    if element:
                        elements.append(element)

@@ -1150,8 +1328,8 @@ class OCRToUnifiedConverter:
            page = Page(
                page_number=page_idx + 1,
                dimensions=Dimensions(
-                    width=page_data.get('width', 0),
-                    height=page_data.get('height', 0)
+                    width=page_width,
+                    height=page_height
                ),
                elements=elements,
                metadata={'reading_order': self._calculate_reading_order(elements)}