chore: backup before code cleanup
Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -83,12 +83,34 @@ class TextRegion:
|
||||
return ((x0 + x1) / 2, (y0 + y1) / 2)
|
||||
|
||||
|
||||
# Maps element types to an IoA threshold *category* key ('table' / 'figure'),
# not to a numeric threshold. GapFillingService._get_ioa_threshold_for_element()
# resolves the key to the configured value and falls back to the 'text'
# category for every element type that is not listed here.
# TABLE needs strict filtering (low threshold) to prevent duplicate content
# FIGURE allows more text through (high threshold) to preserve axis labels, legends
# TEXT/TITLE uses moderate threshold to tolerate boundary detection errors
ELEMENT_TYPE_IOA_THRESHOLDS = {
    ElementType.TABLE: 'table',
    ElementType.FIGURE: 'figure',
    ElementType.IMAGE: 'figure',
    ElementType.CHART: 'figure',
    ElementType.DIAGRAM: 'figure',
}
|
||||
|
||||
|
||||
class GapFillingService:
|
||||
"""
|
||||
Service for detecting and filling gaps in PP-StructureV3 output.
|
||||
|
||||
This service uses IoA (Intersection over Area) algorithm for coverage detection,
|
||||
which correctly measures "small box contained in large box" relationship.
|
||||
|
||||
Key improvements over IoU:
|
||||
- IoA = intersection_area / ocr_box_area (non-symmetric)
|
||||
- Better for detecting if OCR text is covered by larger layout regions
|
||||
- Different thresholds per element type (TEXT, TABLE, FIGURE)
|
||||
- Optional boundary shrinking to reduce edge duplicates
|
||||
|
||||
This service:
|
||||
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
|
||||
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions using IoA
|
||||
2. Identifies uncovered raw OCR regions
|
||||
3. Supplements uncovered regions as TEXT elements
|
||||
4. Deduplicates against existing PP-StructureV3 TEXT elements
|
||||
@@ -98,9 +120,12 @@ class GapFillingService:
|
||||
def __init__(
|
||||
self,
|
||||
coverage_threshold: float = None,
|
||||
iou_threshold: float = None,
|
||||
confidence_threshold: float = None,
|
||||
dedup_iou_threshold: float = None,
|
||||
ioa_threshold_text: float = None,
|
||||
ioa_threshold_table: float = None,
|
||||
ioa_threshold_figure: float = None,
|
||||
dedup_ioa_threshold: float = None,
|
||||
shrink_pixels: int = None,
|
||||
enabled: bool = None
|
||||
):
|
||||
"""
|
||||
@@ -108,27 +133,48 @@ class GapFillingService:
|
||||
|
||||
Args:
|
||||
coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
|
||||
iou_threshold: IoU threshold for coverage detection (default: 0.15)
|
||||
confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
|
||||
dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
|
||||
ioa_threshold_text: IoA threshold for TEXT/TITLE elements (default: 0.6)
|
||||
ioa_threshold_table: IoA threshold for TABLE elements (default: 0.1)
|
||||
ioa_threshold_figure: IoA threshold for FIGURE/IMAGE elements (default: 0.8)
|
||||
dedup_ioa_threshold: IoA threshold for deduplication (default: 0.5)
|
||||
shrink_pixels: Shrink OCR bbox inward by this many pixels (default: 1)
|
||||
enabled: Whether gap filling is enabled (default: True)
|
||||
"""
|
||||
self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
|
||||
settings, 'gap_filling_coverage_threshold', 0.7
|
||||
)
|
||||
self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
|
||||
settings, 'gap_filling_iou_threshold', 0.15
|
||||
)
|
||||
self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
|
||||
settings, 'gap_filling_confidence_threshold', 0.3
|
||||
)
|
||||
self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
|
||||
settings, 'gap_filling_dedup_iou_threshold', 0.5
|
||||
|
||||
# IoA thresholds per element type
|
||||
self.ioa_threshold_text = ioa_threshold_text if ioa_threshold_text is not None else getattr(
|
||||
settings, 'gap_filling_ioa_threshold_text', 0.6
|
||||
)
|
||||
self.ioa_threshold_table = ioa_threshold_table if ioa_threshold_table is not None else getattr(
|
||||
settings, 'gap_filling_ioa_threshold_table', 0.1
|
||||
)
|
||||
self.ioa_threshold_figure = ioa_threshold_figure if ioa_threshold_figure is not None else getattr(
|
||||
settings, 'gap_filling_ioa_threshold_figure', 0.8
|
||||
)
|
||||
self.dedup_ioa_threshold = dedup_ioa_threshold if dedup_ioa_threshold is not None else getattr(
|
||||
settings, 'gap_filling_dedup_ioa_threshold', 0.5
|
||||
)
|
||||
|
||||
# Boundary shrinking
|
||||
self.shrink_pixels = shrink_pixels if shrink_pixels is not None else getattr(
|
||||
settings, 'gap_filling_shrink_pixels', 1
|
||||
)
|
||||
|
||||
self.enabled = enabled if enabled is not None else getattr(
|
||||
settings, 'gap_filling_enabled', True
|
||||
)
|
||||
|
||||
# Legacy compatibility
|
||||
self.iou_threshold = getattr(settings, 'gap_filling_iou_threshold', 0.15)
|
||||
self.dedup_iou_threshold = getattr(settings, 'gap_filling_dedup_iou_threshold', 0.5)
|
||||
|
||||
def should_activate(
|
||||
self,
|
||||
raw_ocr_regions: List[TextRegion],
|
||||
@@ -209,21 +255,83 @@ class GapFillingService:
|
||||
logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
|
||||
return uncovered
|
||||
|
||||
def _get_ioa_threshold_for_element(self, element_type: ElementType) -> float:
    """
    Resolve the IoA coverage threshold that applies to ``element_type``.

    The element type is first mapped to a category key through
    ``ELEMENT_TYPE_IOA_THRESHOLDS``; types missing from that mapping are
    treated as ``'text'``. The category then selects one of the thresholds
    stored on this instance:

    - ``'table'``  -> ``self.ioa_threshold_table``
    - ``'figure'`` -> ``self.ioa_threshold_figure``
    - otherwise    -> ``self.ioa_threshold_text``

    Args:
        element_type: Element type whose threshold is requested.

    Returns:
        The IoA threshold configured for that element's category.
    """
    category = ELEMENT_TYPE_IOA_THRESHOLDS.get(element_type, 'text')

    if category == 'table':
        return self.ioa_threshold_table
    if category == 'figure':
        return self.ioa_threshold_figure
    # Unmapped element types (and any other category) use the text threshold.
    return self.ioa_threshold_text
|
||||
|
||||
def _shrink_bbox(
|
||||
self,
|
||||
bbox: Tuple[float, float, float, float],
|
||||
pixels: int
|
||||
) -> Tuple[float, float, float, float]:
|
||||
"""
|
||||
Shrink a bounding box inward by the specified number of pixels.
|
||||
|
||||
This reduces false "uncovered" detection at region boundaries.
|
||||
|
||||
Args:
|
||||
bbox: Original bbox (x0, y0, x1, y1)
|
||||
pixels: Number of pixels to shrink on each side
|
||||
|
||||
Returns:
|
||||
Shrunk bbox (x0, y0, x1, y1)
|
||||
"""
|
||||
x0, y0, x1, y1 = bbox
|
||||
# Ensure we don't shrink to negative width/height
|
||||
width = x1 - x0
|
||||
height = y1 - y0
|
||||
max_shrink = min(width / 2, height / 2, pixels)
|
||||
|
||||
return (
|
||||
x0 + max_shrink,
|
||||
y0 + max_shrink,
|
||||
x1 - max_shrink,
|
||||
y1 - max_shrink
|
||||
)
|
||||
|
||||
def _is_region_covered(
|
||||
self,
|
||||
region: TextRegion,
|
||||
pp_structure_elements: List[DocumentElement],
|
||||
skip_table_coverage: bool = True
|
||||
skip_table_coverage: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Check if a raw OCR region is covered by any PP-StructureV3 element.
|
||||
|
||||
Uses IoA (Intersection over Area) instead of IoU for better coverage detection.
|
||||
IoA = intersection_area / ocr_box_area
|
||||
This correctly measures "OCR box is contained in layout region".
|
||||
|
||||
Different element types use different IoA thresholds:
|
||||
- TABLE: 0.1 (strict, any overlap means covered)
|
||||
- FIGURE/IMAGE: 0.8 (preserve text inside figures like axis labels)
|
||||
- TEXT/others: 0.6 (tolerate boundary errors)
|
||||
|
||||
Args:
|
||||
region: Raw OCR text region
|
||||
pp_structure_elements: List of PP-StructureV3 elements
|
||||
skip_table_coverage: If True, don't consider TABLE elements as covering
|
||||
(allows raw OCR text inside tables to pass through
|
||||
for layered rendering)
|
||||
skip_table_coverage: If True, don't consider TABLE elements as covering.
|
||||
Default is False - TABLE elements DO cover regions
|
||||
to prevent duplicate rendering of table cell content.
|
||||
|
||||
Returns:
|
||||
True if the region is covered
|
||||
@@ -231,10 +339,13 @@ class GapFillingService:
|
||||
center_x, center_y = region.center
|
||||
region_bbox = region.normalized_bbox
|
||||
|
||||
# Apply boundary shrinking to reduce edge duplicates
|
||||
if self.shrink_pixels > 0:
|
||||
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
|
||||
|
||||
for element in pp_structure_elements:
|
||||
# Skip TABLE elements when checking coverage
|
||||
# This allows raw OCR text inside tables to be preserved
|
||||
# PDF generator will render: table borders + raw text positions
|
||||
# Check TABLE elements for coverage (default behavior)
|
||||
# This prevents gap_fill from adding duplicate text inside table areas
|
||||
if skip_table_coverage and element.type == ElementType.TABLE:
|
||||
continue
|
||||
|
||||
@@ -247,9 +358,11 @@ class GapFillingService:
|
||||
if self._point_in_bbox(center_x, center_y, elem_bbox):
|
||||
return True
|
||||
|
||||
# Check 2: IoU exceeds threshold
|
||||
iou = self._calculate_iou(region_bbox, elem_bbox)
|
||||
if iou > self.iou_threshold:
|
||||
# Check 2: IoA exceeds element-type-specific threshold
|
||||
# IoA = intersection_area / ocr_box_area
|
||||
ioa = self._calculate_ioa(region_bbox, elem_bbox)
|
||||
threshold = self._get_ioa_threshold_for_element(element.type)
|
||||
if ioa > threshold:
|
||||
return True
|
||||
|
||||
return False
|
||||
@@ -262,6 +375,9 @@ class GapFillingService:
|
||||
"""
|
||||
Remove regions that highly overlap with existing PP-StructureV3 TEXT elements.
|
||||
|
||||
Uses IoA (Intersection over Area) for deduplication to correctly detect
|
||||
when an OCR region is already covered by an existing TEXT element.
|
||||
|
||||
Args:
|
||||
uncovered_regions: List of uncovered raw OCR regions
|
||||
pp_structure_elements: List of PP-StructureV3 elements
|
||||
@@ -278,6 +394,11 @@ class GapFillingService:
|
||||
deduplicated = []
|
||||
for region in uncovered_regions:
|
||||
region_bbox = region.normalized_bbox
|
||||
|
||||
# Apply boundary shrinking for deduplication as well
|
||||
if self.shrink_pixels > 0:
|
||||
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
|
||||
|
||||
is_duplicate = False
|
||||
|
||||
for element in text_elements:
|
||||
@@ -286,10 +407,11 @@ class GapFillingService:
|
||||
element.bbox.x1, element.bbox.y1
|
||||
)
|
||||
|
||||
iou = self._calculate_iou(region_bbox, elem_bbox)
|
||||
if iou > self.dedup_iou_threshold:
|
||||
# Use IoA for deduplication
|
||||
ioa = self._calculate_ioa(region_bbox, elem_bbox)
|
||||
if ioa > self.dedup_ioa_threshold:
|
||||
logger.debug(
|
||||
f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
|
||||
f"Skipping duplicate region (IoA={ioa:.2f}): '{region.text[:30]}...'"
|
||||
)
|
||||
is_duplicate = True
|
||||
break
|
||||
@@ -622,6 +744,52 @@ class GapFillingService:
|
||||
x0, y0, x1, y1 = bbox
|
||||
return x0 <= x <= x1 and y0 <= y <= y1
|
||||
|
||||
@staticmethod
|
||||
def _calculate_ioa(
|
||||
ocr_bbox: Tuple[float, float, float, float],
|
||||
layout_bbox: Tuple[float, float, float, float]
|
||||
) -> float:
|
||||
"""
|
||||
Calculate Intersection over Area (IoA) of OCR bbox relative to layout bbox.
|
||||
|
||||
IoA = intersection_area / ocr_box_area
|
||||
|
||||
This is the recommended algorithm for detecting if an OCR text region
|
||||
is contained within a larger layout region. Unlike IoU which is symmetric,
|
||||
IoA correctly measures "how much of the OCR box is inside the layout region".
|
||||
|
||||
Example:
|
||||
- OCR box: 100x20 pixels (small text line)
|
||||
- Layout box: 500x800 pixels (large paragraph region)
|
||||
- IoU would be very small (~0.005) even if OCR is fully inside layout
|
||||
- IoA would be 1.0 if OCR is fully inside layout, which is correct
|
||||
|
||||
Args:
|
||||
ocr_bbox: OCR text region bbox (x0, y0, x1, y1) - typically smaller
|
||||
layout_bbox: Layout element bbox (x0, y0, x1, y1) - typically larger
|
||||
|
||||
Returns:
|
||||
IoA value between 0 and 1
|
||||
"""
|
||||
# Calculate intersection
|
||||
x0 = max(ocr_bbox[0], layout_bbox[0])
|
||||
y0 = max(ocr_bbox[1], layout_bbox[1])
|
||||
x1 = min(ocr_bbox[2], layout_bbox[2])
|
||||
y1 = min(ocr_bbox[3], layout_bbox[3])
|
||||
|
||||
if x1 <= x0 or y1 <= y0:
|
||||
return 0.0
|
||||
|
||||
intersection = (x1 - x0) * (y1 - y0)
|
||||
|
||||
# Calculate OCR box area (denominator for IoA)
|
||||
ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1])
|
||||
|
||||
if ocr_area <= 0:
|
||||
return 0.0
|
||||
|
||||
return intersection / ocr_area
|
||||
|
||||
@staticmethod
|
||||
def _calculate_iou(
|
||||
bbox1: Tuple[float, float, float, float],
|
||||
@@ -630,6 +798,9 @@ class GapFillingService:
|
||||
"""
|
||||
Calculate Intersection over Union (IoU) of two bboxes.
|
||||
|
||||
Note: This method is kept for backward compatibility.
|
||||
For coverage detection, use _calculate_ioa() instead.
|
||||
|
||||
Args:
|
||||
bbox1: First bbox (x0, y0, x1, y1)
|
||||
bbox2: Second bbox (x0, y0, x1, y1)
|
||||
|
||||
Reference in New Issue
Block a user