chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions
--- a/backend/app/services/gap_filling_service.py
+++ b/backend/app/services/gap_filling_service.py
@@ -83,12 +83,34 @@ class TextRegion:
        return ((x0 + x1) / 2, (y0 + y1) / 2)


+# Element type -> IoA threshold *category* name ('table' or 'figure').
+# The values are category keys, not numeric thresholds; the numeric threshold is
+# resolved per service instance in
+# GapFillingService._get_ioa_threshold_for_element().
+# TABLE needs strict filtering (low threshold) to prevent duplicate content
+# FIGURE allows more text through (high threshold) to preserve axis labels, legends
+# TEXT/TITLE uses moderate threshold to tolerate boundary detection errors;
+# they are not listed here because any element type missing from this mapping
+# falls back to the 'text' category in that lookup.
+ELEMENT_TYPE_IOA_THRESHOLDS = {
+    ElementType.TABLE: 'table',
+    ElementType.FIGURE: 'figure',
+    ElementType.IMAGE: 'figure',
+    ElementType.CHART: 'figure',
+    ElementType.DIAGRAM: 'figure',
+}
+
+
 class GapFillingService:
    """
    Service for detecting and filling gaps in PP-StructureV3 output.

+    This service uses the IoA (Intersection over Area) algorithm for coverage detection,
+    which correctly measures the "small box contained in large box" relationship.
+
+    Key improvements over IoU:
+    - IoA = intersection_area / ocr_box_area (non-symmetric)
+    - Better for detecting if OCR text is covered by larger layout regions
+    - Different thresholds per element type (TEXT, TABLE, FIGURE)
+    - Optional boundary shrinking to reduce edge duplicates
+
    This service:
-    1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
+    1. Calculates coverage of PP-StructureV3 elements over raw OCR regions using IoA
    2. Identifies uncovered raw OCR regions
    3. Supplements uncovered regions as TEXT elements
    4. Deduplicates against existing PP-StructureV3 TEXT elements
@@ -98,9 +120,12 @@ class GapFillingService:
    def __init__(
        self,
        coverage_threshold: float = None,
-        iou_threshold: float = None,
        confidence_threshold: float = None,
-        dedup_iou_threshold: float = None,
+        ioa_threshold_text: float = None,
+        ioa_threshold_table: float = None,
+        ioa_threshold_figure: float = None,
+        dedup_ioa_threshold: float = None,
+        shrink_pixels: int = None,
        enabled: bool = None
    ):
        """
@@ -108,27 +133,48 @@ class GapFillingService:

        Args:
            coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
-            iou_threshold: IoU threshold for coverage detection (default: 0.15)
            confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
-            dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
+            ioa_threshold_text: IoA threshold for TEXT/TITLE elements (default: 0.6)
+            ioa_threshold_table: IoA threshold for TABLE elements (default: 0.1)
+            ioa_threshold_figure: IoA threshold for FIGURE/IMAGE elements (default: 0.8)
+            dedup_ioa_threshold: IoA threshold for deduplication (default: 0.5)
+            shrink_pixels: Shrink OCR bbox inward by this many pixels (default: 1)
            enabled: Whether gap filling is enabled (default: True)
        """
        self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
            settings, 'gap_filling_coverage_threshold', 0.7
        )
-        self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
-            settings, 'gap_filling_iou_threshold', 0.15
-        )
        self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
            settings, 'gap_filling_confidence_threshold', 0.3
        )
-        self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
-            settings, 'gap_filling_dedup_iou_threshold', 0.5
+
+        # IoA thresholds per element type
+        self.ioa_threshold_text = ioa_threshold_text if ioa_threshold_text is not None else getattr(
+            settings, 'gap_filling_ioa_threshold_text', 0.6
        )
+        self.ioa_threshold_table = ioa_threshold_table if ioa_threshold_table is not None else getattr(
+            settings, 'gap_filling_ioa_threshold_table', 0.1
+        )
+        self.ioa_threshold_figure = ioa_threshold_figure if ioa_threshold_figure is not None else getattr(
+            settings, 'gap_filling_ioa_threshold_figure', 0.8
+        )
+        self.dedup_ioa_threshold = dedup_ioa_threshold if dedup_ioa_threshold is not None else getattr(
+            settings, 'gap_filling_dedup_ioa_threshold', 0.5
+        )
+
+        # Boundary shrinking
+        self.shrink_pixels = shrink_pixels if shrink_pixels is not None else getattr(
+            settings, 'gap_filling_shrink_pixels', 1
+        )
+
        self.enabled = enabled if enabled is not None else getattr(
            settings, 'gap_filling_enabled', True
        )

+        # Legacy compatibility
+        self.iou_threshold = getattr(settings, 'gap_filling_iou_threshold', 0.15)
+        self.dedup_iou_threshold = getattr(settings, 'gap_filling_dedup_iou_threshold', 0.5)
+
    def should_activate(
        self,
        raw_ocr_regions: List[TextRegion],
@@ -209,21 +255,83 @@ class GapFillingService:
        logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
        return uncovered

+    def _get_ioa_threshold_for_element(self, element_type: ElementType) -> float:
+        """
+        Resolve the IoA coverage threshold that applies to an element type.
+
+        The element type is first mapped to a threshold category through
+        ELEMENT_TYPE_IOA_THRESHOLDS; types missing from that mapping are
+        treated as 'text'. The category then selects the instance attribute:
+        - 'table': self.ioa_threshold_table
+        - 'figure': self.ioa_threshold_figure
+        - anything else: self.ioa_threshold_text
+
+        Args:
+            element_type: The element type to get threshold for
+
+        Returns:
+            IoA threshold value configured on this service instance
+        """
+        category = ELEMENT_TYPE_IOA_THRESHOLDS.get(element_type, 'text')
+
+        # Dispatch by attribute name so only the selected threshold is read.
+        attribute_by_category = {
+            'table': 'ioa_threshold_table',
+            'figure': 'ioa_threshold_figure',
+        }
+        attribute_name = attribute_by_category.get(category, 'ioa_threshold_text')
+        return getattr(self, attribute_name)
+
+    def _shrink_bbox(
+        self,
+        bbox: Tuple[float, float, float, float],
+        pixels: int
+    ) -> Tuple[float, float, float, float]:
+        """
+        Shrink a bounding box inward by the specified number of pixels.
+
+        This reduces false "uncovered" detection at region boundaries.
+
+        The box is returned unchanged when shrinking cannot be applied safely:
+        - ``pixels`` is zero or negative (a negative margin would grow the box)
+        - the box is degenerate, or its shorter side is not larger than
+          ``2 * pixels``; shrinking would collapse it to zero area, and a
+          zero-area box always yields IoA == 0, so the region could never be
+          matched against any layout element.
+
+        Args:
+            bbox: Original bbox (x0, y0, x1, y1)
+            pixels: Number of pixels to shrink on each side
+
+        Returns:
+            Shrunk bbox (x0, y0, x1, y1), or the original coordinates when the
+            box is too small to shrink
+        """
+        x0, y0, x1, y1 = bbox
+        width = x1 - x0
+        height = y1 - y0
+
+        # Keep the box intact rather than producing an empty (or enlarged) one.
+        if pixels <= 0 or min(width, height) <= 2 * pixels:
+            return (x0, y0, x1, y1)
+
+        return (
+            x0 + pixels,
+            y0 + pixels,
+            x1 - pixels,
+            y1 - pixels
+        )
+
    def _is_region_covered(
        self,
        region: TextRegion,
        pp_structure_elements: List[DocumentElement],
-        skip_table_coverage: bool = True
+        skip_table_coverage: bool = False
    ) -> bool:
        """
        Check if a raw OCR region is covered by any PP-StructureV3 element.

+        Uses IoA (Intersection over Area) instead of IoU for better coverage detection.
+        IoA = intersection_area / ocr_box_area
+        This correctly measures "OCR box is contained in layout region".
+
+        Different element types use different IoA thresholds:
+        - TABLE: 0.1 (strict, any overlap means covered)
+        - FIGURE/IMAGE: 0.8 (preserve text inside figures like axis labels)
+        - TEXT/others: 0.6 (tolerate boundary errors)
+
        Args:
            region: Raw OCR text region
            pp_structure_elements: List of PP-StructureV3 elements
-            skip_table_coverage: If True, don't consider TABLE elements as covering
-                                 (allows raw OCR text inside tables to pass through
-                                 for layered rendering)
+            skip_table_coverage: If True, don't consider TABLE elements as covering.
+                                 Default is False - TABLE elements DO cover regions
+                                 to prevent duplicate rendering of table cell content.

        Returns:
            True if the region is covered
@@ -231,10 +339,13 @@ class GapFillingService:
        center_x, center_y = region.center
        region_bbox = region.normalized_bbox

+        # Apply boundary shrinking to reduce edge duplicates
+        if self.shrink_pixels > 0:
+            region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
+
        for element in pp_structure_elements:
-            # Skip TABLE elements when checking coverage
-            # This allows raw OCR text inside tables to be preserved
-            # PDF generator will render: table borders + raw text positions
+            # Check TABLE elements for coverage (default behavior)
+            # This prevents gap_fill from adding duplicate text inside table areas
            if skip_table_coverage and element.type == ElementType.TABLE:
                continue

@@ -247,9 +358,11 @@ class GapFillingService:
            if self._point_in_bbox(center_x, center_y, elem_bbox):
                return True

-            # Check 2: IoU exceeds threshold
-            iou = self._calculate_iou(region_bbox, elem_bbox)
-            if iou > self.iou_threshold:
+            # Check 2: IoA exceeds element-type-specific threshold
+            # IoA = intersection_area / ocr_box_area
+            ioa = self._calculate_ioa(region_bbox, elem_bbox)
+            threshold = self._get_ioa_threshold_for_element(element.type)
+            if ioa > threshold:
                return True

        return False
@@ -262,6 +375,9 @@ class GapFillingService:
        """
        Remove regions that highly overlap with existing PP-StructureV3 TEXT elements.

+        Uses IoA (Intersection over Area) for deduplication to correctly detect
+        when an OCR region is already covered by an existing TEXT element.
+
        Args:
            uncovered_regions: List of uncovered raw OCR regions
            pp_structure_elements: List of PP-StructureV3 elements
@@ -278,6 +394,11 @@ class GapFillingService:
        deduplicated = []
        for region in uncovered_regions:
            region_bbox = region.normalized_bbox
+
+            # Apply boundary shrinking for deduplication as well
+            if self.shrink_pixels > 0:
+                region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
+
            is_duplicate = False

            for element in text_elements:
@@ -286,10 +407,11 @@ class GapFillingService:
                    element.bbox.x1, element.bbox.y1
                )

-                iou = self._calculate_iou(region_bbox, elem_bbox)
-                if iou > self.dedup_iou_threshold:
+                # Use IoA for deduplication
+                ioa = self._calculate_ioa(region_bbox, elem_bbox)
+                if ioa > self.dedup_ioa_threshold:
                    logger.debug(
-                        f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
+                        f"Skipping duplicate region (IoA={ioa:.2f}): '{region.text[:30]}...'"
                    )
                    is_duplicate = True
                    break
@@ -622,6 +744,52 @@ class GapFillingService:
        x0, y0, x1, y1 = bbox
        return x0 <= x <= x1 and y0 <= y <= y1

+    @staticmethod
+    def _calculate_ioa(
+        ocr_bbox: Tuple[float, float, float, float],
+        layout_bbox: Tuple[float, float, float, float]
+    ) -> float:
+        """
+        Compute the Intersection over Area (IoA) of an OCR bbox against a layout bbox.
+
+        IoA = intersection_area / ocr_box_area
+
+        The measure is asymmetric: it expresses the fraction of the OCR box
+        that lies inside the layout box, regardless of how large the layout
+        box is. A small text line fully inside a large paragraph region
+        therefore scores 1.0, whereas IoU for the same pair would be tiny
+        (e.g. a 100x20 box inside a 500x800 box gives IoU = 0.005).
+
+        Args:
+            ocr_bbox: OCR text region bbox (x0, y0, x1, y1) - typically smaller
+            layout_bbox: Layout element bbox (x0, y0, x1, y1) - typically larger
+
+        Returns:
+            IoA value between 0 and 1; 0.0 when the boxes do not overlap
+            (touching edges count as no overlap) or the OCR box has no area
+        """
+        ocr_x0, ocr_y0, ocr_x1, ocr_y1 = ocr_bbox[0], ocr_bbox[1], ocr_bbox[2], ocr_bbox[3]
+
+        # Edges of the overlapping rectangle (if any)
+        left = max(ocr_x0, layout_bbox[0])
+        top = max(ocr_y0, layout_bbox[1])
+        right = min(ocr_x1, layout_bbox[2])
+        bottom = min(ocr_y1, layout_bbox[3])
+
+        # Empty or edge-only overlap
+        if right <= left or bottom <= top:
+            return 0.0
+
+        overlap_area = (right - left) * (bottom - top)
+
+        # Denominator is the OCR box itself, not the union
+        ocr_area = (ocr_x1 - ocr_x0) * (ocr_y1 - ocr_y0)
+
+        return 0.0 if ocr_area <= 0 else overlap_area / ocr_area
+
    @staticmethod
    def _calculate_iou(
        bbox1: Tuple[float, float, float, float],
@@ -630,6 +798,9 @@ class GapFillingService:
        """
        Calculate Intersection over Union (IoU) of two bboxes.

+        Note: This method is kept for backward compatibility.
+        For coverage detection, use _calculate_ioa() instead.
+
        Args:
            bbox1: First bbox (x0, y0, x1, y1)
            bbox2: Second bbox (x0, y0, x1, y1)