chore: backup before code cleanup

Backup commit before executing the remove-unused-code proposal. This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions
--- a/backend/app/services/ocr_to_unified_converter.py
+++ b/backend/app/services/ocr_to_unified_converter.py
@@ -189,7 +189,7 @@ def validate_cell_boxes(
    Validate cell_boxes coordinates against page boundaries and table bbox.

    PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
-    page boundaries. This function validates and reports issues.
+    page boundaries or table bbox. This function validates and clamps to valid boundaries.

    Args:
        cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
@@ -213,10 +213,22 @@ def validate_cell_boxes(
    clamped_boxes = []

    # Page boundaries with tolerance
-    min_x = -tolerance
-    min_y = -tolerance
-    max_x = page_width + tolerance
-    max_y = page_height + tolerance
+    page_min_x = -tolerance
+    page_min_y = -tolerance
+    page_max_x = page_width + tolerance
+    page_max_y = page_height + tolerance
+
+    # Table boundaries with tolerance (prefer clamping to table bbox)
+    table_min_x = table_bbox[0] - tolerance if len(table_bbox) >= 4 else page_min_x
+    table_min_y = table_bbox[1] - tolerance if len(table_bbox) >= 4 else page_min_y
+    table_max_x = table_bbox[2] + tolerance if len(table_bbox) >= 4 else page_max_x
+    table_max_y = table_bbox[3] + tolerance if len(table_bbox) >= 4 else page_max_y
+
+    # For clamping, use the intersection of page and expanded table bbox
+    clamp_min_x = max(0, table_bbox[0] - tolerance) if len(table_bbox) >= 4 else 0
+    clamp_min_y = max(0, table_bbox[1] - tolerance) if len(table_bbox) >= 4 else 0
+    clamp_max_x = min(page_width, table_bbox[2] + tolerance) if len(table_bbox) >= 4 else page_width
+    clamp_max_y = min(page_height, table_bbox[3] + tolerance) if len(table_bbox) >= 4 else page_height

    for idx, box in enumerate(cell_boxes):
        if not box or len(box) < 4:
@@ -230,19 +242,38 @@ def validate_cell_boxes(
        cell_issues = []

        # Check if coordinates exceed page boundaries
-        if x0 < min_x:
+        if x0 < page_min_x:
            cell_issues.append(f"x0={x0:.1f} < 0")
            is_valid = False
-        if y0 < min_y:
+        if y0 < page_min_y:
            cell_issues.append(f"y0={y0:.1f} < 0")
            is_valid = False
-        if x1 > max_x:
+        if x1 > page_max_x:
            cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
            is_valid = False
-        if y1 > max_y:
+        if y1 > page_max_y:
            cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
            is_valid = False

+        # Check if coordinates significantly exceed table bbox on any side (by more than max(tolerance, 20% of table height))
+        if len(table_bbox) >= 4:
+            table_w = table_bbox[2] - table_bbox[0]
+            table_h = table_bbox[3] - table_bbox[1]
+            expand_tolerance = max(tolerance, table_h * 0.2)  # 20% of table height
+
+            if y0 < table_bbox[1] - expand_tolerance:
+                cell_issues.append(f"y0={y0:.1f} above table (table_y0={table_bbox[1]:.1f})")
+                is_valid = False
+            if y1 > table_bbox[3] + expand_tolerance:
+                cell_issues.append(f"y1={y1:.1f} below table (table_y1={table_bbox[3]:.1f})")
+                is_valid = False
+            if x0 < table_bbox[0] - expand_tolerance:
+                cell_issues.append(f"x0={x0:.1f} left of table (table_x0={table_bbox[0]:.1f})")
+                is_valid = False
+            if x1 > table_bbox[2] + expand_tolerance:
+                cell_issues.append(f"x1={x1:.1f} right of table (table_x1={table_bbox[2]:.1f})")
+                is_valid = False
+
        # Check for inverted coordinates
        if x0 > x1:
            cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
@@ -255,12 +286,12 @@ def validate_cell_boxes(
            invalid_count += 1
            issues.append(f"Cell {idx}: {', '.join(cell_issues)}")

-        # Clamp to valid boundaries
+        # Clamp to valid boundaries (table bbox with some tolerance)
        clamped_box = [
-            max(0, min(x0, page_width)),
-            max(0, min(y0, page_height)),
-            max(0, min(x1, page_width)),
-            max(0, min(y1, page_height))
+            max(clamp_min_x, min(x0, clamp_max_x)),
+            max(clamp_min_y, min(y0, clamp_max_y)),
+            max(clamp_min_x, min(x1, clamp_max_x)),
+            max(clamp_min_y, min(y1, clamp_max_y))
        ]

        # Ensure proper ordering after clamping
@@ -395,10 +426,15 @@ class OCRToUnifiedConverter:

        Handles both enhanced PP-StructureV3 results (with parsing_res_list)
        and traditional markdown results. Applies gap filling when enabled.
+
+        Gap filling can use either:
+        1. overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
+        2. Separate raw OCR text_regions (fallback)
        """
        pages = []

        # Extract raw OCR text regions for gap filling
+        # Prefer overall_ocr_res from PP-StructureV3 when available
        raw_text_regions = ocr_results.get('text_regions', [])
        ocr_dimensions = ocr_results.get('ocr_dimensions', {})

@@ -461,13 +497,22 @@ class OCRToUnifiedConverter:
                    if element:
                        elements.append(element)

-            # Apply gap filling if enabled and raw regions available
-            if self.gap_filling_service and raw_text_regions:
-                # Filter raw regions for current page
-                page_raw_regions = [
-                    r for r in raw_text_regions
-                    if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
-                ]
+            # Apply gap filling if enabled
+            # Priority: 1) overall_ocr_res from page_result, 2) raw_text_regions from separate OCR
+            if self.gap_filling_service:
+                # Check for overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
+                page_raw_regions = page_result.get('overall_ocr_res', [])
+
+                if page_raw_regions:
+                    logger.debug(f"Page {page_idx + 1}: Using overall_ocr_res ({len(page_raw_regions)} regions)")
+                elif raw_text_regions:
+                    # Fallback to separate raw OCR regions
+                    page_raw_regions = [
+                        r for r in raw_text_regions
+                        if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
+                    ]
+                    if page_raw_regions:
+                        logger.debug(f"Page {page_idx + 1}: Using separate raw OCR ({len(page_raw_regions)} regions)")

                if page_raw_regions:
                    supplemented, stats = self.gap_filling_service.fill_gaps(
@@ -711,8 +756,33 @@ class OCRToUnifiedConverter:
            # Prepare content based on element type
            if element_type == ElementType.TABLE:
                # For tables, use TableData as content
-                # Pass cell_boxes for accurate cell positioning
-                table_data = self._extract_table_data(elem_data)
+                # Priority: rebuilt_table > HTML parsing
+                # rebuilt_table contains clean cells without empty padding
+                if 'rebuilt_table' in elem_data:
+                    rebuilt = elem_data['rebuilt_table']
+                    # Use rebuilt cells directly - they don't include empty cells
+                    rebuilt_cells = rebuilt.get('cells', [])
+                    from app.models.unified_document import TableCell
+                    table_cells = [
+                        TableCell(
+                            row=c.get('row', 0),
+                            col=c.get('col', 0),
+                            row_span=c.get('row_span', 1),
+                            col_span=c.get('col_span', 1),
+                            content=c.get('content', '')
+                        )
+                        for c in rebuilt_cells
+                    ]
+                    table_data = TableData(
+                        rows=rebuilt.get('rows', 0),
+                        cols=rebuilt.get('cols', 0),
+                        cells=table_cells,
+                        caption=elem_data.get('extracted_text')
+                    )
+                    logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: Using rebuilt_table directly ({len(rebuilt_cells)} cells)")
+                else:
+                    # Fallback to HTML parsing for non-rebuilt tables
+                    table_data = self._extract_table_data(elem_data)
                content = table_data if table_data else elem_data.get('content', '')

                # Preserve cell_boxes and embedded_images in metadata for PDF generation
@@ -756,6 +826,18 @@ class OCRToUnifiedConverter:

                if 'embedded_images' in elem_data:
                    elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
+
+                # Pass through rebuild information for tables that were rebuilt
+                # This tells the PDF renderer to use HTML content instead of cell_boxes
+                logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: checking for rebuild_stats, keys={list(elem_data.keys())}")
+                if 'rebuild_stats' in elem_data:
+                    elem_data.setdefault('metadata', {})['rebuild_stats'] = elem_data['rebuild_stats']
+                    elem_data['metadata']['was_rebuilt'] = True
+                    logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: FOUND rebuild_stats, setting was_rebuilt=True")
+
+                if 'rebuilt_table' in elem_data:
+                    elem_data.setdefault('metadata', {})['rebuilt_table'] = elem_data['rebuilt_table']
+
            elif element_type in [
                ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
                ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP