chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -83,12 +83,34 @@ class TextRegion:
return ((x0 + x1) / 2, (y0 + y1) / 2)
# Element type to IoA threshold *category* mapping.
# The values are category keys ('table' / 'figure'), not numeric thresholds:
# GapFillingService._get_ioa_threshold_for_element looks an element type up
# here, falls back to 'text' for any type that is not listed, and then returns
# the matching per-instance threshold (ioa_threshold_table / _figure / _text).
# TABLE needs strict filtering (low threshold) to prevent duplicate content
# FIGURE allows more text through (high threshold) to preserve axis labels, legends
# TEXT/TITLE uses moderate threshold to tolerate boundary detection errors
ELEMENT_TYPE_IOA_THRESHOLDS = {
ElementType.TABLE: 'table',
ElementType.FIGURE: 'figure',
ElementType.IMAGE: 'figure',
ElementType.CHART: 'figure',
ElementType.DIAGRAM: 'figure',
}
class GapFillingService:
"""
Service for detecting and filling gaps in PP-StructureV3 output.
This service uses IoA (Intersection over Area) algorithm for coverage detection,
which correctly measures "small box contained in large box" relationship.
Key improvements over IoU:
- IoA = intersection_area / ocr_box_area (non-symmetric)
- Better for detecting if OCR text is covered by larger layout regions
- Different thresholds per element type (TEXT, TABLE, FIGURE)
- Optional boundary shrinking to reduce edge duplicates
This service:
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions using IoA
2. Identifies uncovered raw OCR regions
3. Supplements uncovered regions as TEXT elements
4. Deduplicates against existing PP-StructureV3 TEXT elements
@@ -98,9 +120,12 @@ class GapFillingService:
def __init__(
self,
coverage_threshold: float = None,
iou_threshold: float = None,
confidence_threshold: float = None,
dedup_iou_threshold: float = None,
ioa_threshold_text: float = None,
ioa_threshold_table: float = None,
ioa_threshold_figure: float = None,
dedup_ioa_threshold: float = None,
shrink_pixels: int = None,
enabled: bool = None
):
"""
@@ -108,27 +133,48 @@ class GapFillingService:
Args:
coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
iou_threshold: IoU threshold for coverage detection (default: 0.15)
confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
ioa_threshold_text: IoA threshold for TEXT/TITLE elements (default: 0.6)
ioa_threshold_table: IoA threshold for TABLE elements (default: 0.1)
ioa_threshold_figure: IoA threshold for FIGURE/IMAGE elements (default: 0.8)
dedup_ioa_threshold: IoA threshold for deduplication (default: 0.5)
shrink_pixels: Shrink OCR bbox inward by this many pixels (default: 1)
enabled: Whether gap filling is enabled (default: True)
"""
self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
settings, 'gap_filling_coverage_threshold', 0.7
)
self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
settings, 'gap_filling_iou_threshold', 0.15
)
self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
settings, 'gap_filling_confidence_threshold', 0.3
)
self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
settings, 'gap_filling_dedup_iou_threshold', 0.5
# IoA thresholds per element type
self.ioa_threshold_text = ioa_threshold_text if ioa_threshold_text is not None else getattr(
settings, 'gap_filling_ioa_threshold_text', 0.6
)
self.ioa_threshold_table = ioa_threshold_table if ioa_threshold_table is not None else getattr(
settings, 'gap_filling_ioa_threshold_table', 0.1
)
self.ioa_threshold_figure = ioa_threshold_figure if ioa_threshold_figure is not None else getattr(
settings, 'gap_filling_ioa_threshold_figure', 0.8
)
self.dedup_ioa_threshold = dedup_ioa_threshold if dedup_ioa_threshold is not None else getattr(
settings, 'gap_filling_dedup_ioa_threshold', 0.5
)
# Boundary shrinking
self.shrink_pixels = shrink_pixels if shrink_pixels is not None else getattr(
settings, 'gap_filling_shrink_pixels', 1
)
self.enabled = enabled if enabled is not None else getattr(
settings, 'gap_filling_enabled', True
)
# Legacy compatibility
self.iou_threshold = getattr(settings, 'gap_filling_iou_threshold', 0.15)
self.dedup_iou_threshold = getattr(settings, 'gap_filling_dedup_iou_threshold', 0.5)
def should_activate(
self,
raw_ocr_regions: List[TextRegion],
@@ -209,21 +255,83 @@ class GapFillingService:
logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
return uncovered
def _get_ioa_threshold_for_element(self, element_type: ElementType) -> float:
    """
    Resolve the IoA coverage threshold that applies to an element type.

    The element type is mapped to a threshold category through
    ELEMENT_TYPE_IOA_THRESHOLDS; a type with no entry there is treated as
    'text'. The category then selects one of the per-instance thresholds:
    - 'table'  -> self.ioa_threshold_table  (0.1 unless configured otherwise)
    - 'figure' -> self.ioa_threshold_figure (0.8 unless configured otherwise)
    - anything else -> self.ioa_threshold_text (0.6 unless configured otherwise)

    Args:
        element_type: The element type to get the threshold for

    Returns:
        IoA threshold value for that element type
    """
    category = ELEMENT_TYPE_IOA_THRESHOLDS.get(element_type, 'text')

    if category == 'table':
        return self.ioa_threshold_table
    if category == 'figure':
        return self.ioa_threshold_figure
    # Unlisted element types (TEXT, TITLE, ...) share the text threshold.
    return self.ioa_threshold_text
def _shrink_bbox(
    self,
    bbox: Tuple[float, float, float, float],
    pixels: int
) -> Tuple[float, float, float, float]:
    """
    Shrink a bounding box inward by the specified number of pixels.

    This reduces false "uncovered" detection at region boundaries.

    The shrink amount is limited to half of the smaller side so the box
    never inverts, and it is clamped to be non-negative so that a negative
    ``pixels`` value or a degenerate (inverted) bbox can never make this
    method grow or distort the box.

    NOTE: when ``pixels`` is at least half of the smaller side, the result
    collapses to zero width and/or height.

    Args:
        bbox: Original bbox (x0, y0, x1, y1)
        pixels: Number of pixels to shrink on each side

    Returns:
        Shrunk bbox (x0, y0, x1, y1); the input coordinates unchanged when
        the effective shrink amount is zero
    """
    x0, y0, x1, y1 = bbox

    # Never shrink past the centre of the smaller side ...
    half_min_side = min((x1 - x0) / 2, (y1 - y0) / 2)
    # ... and never "shrink" by a negative amount, which would expand the box.
    shrink = max(0, min(half_min_side, pixels))

    return (
        x0 + shrink,
        y0 + shrink,
        x1 - shrink,
        y1 - shrink
    )
def _is_region_covered(
self,
region: TextRegion,
pp_structure_elements: List[DocumentElement],
skip_table_coverage: bool = True
skip_table_coverage: bool = False
) -> bool:
"""
Check if a raw OCR region is covered by any PP-StructureV3 element.
Uses IoA (Intersection over Area) instead of IoU for better coverage detection.
IoA = intersection_area / ocr_box_area
This correctly measures "OCR box is contained in layout region".
Different element types use different IoA thresholds:
- TABLE: 0.1 (strict, an IoA above 0.1 already counts as covered)
- FIGURE/IMAGE: 0.8 (preserve text inside figures like axis labels)
- TEXT/others: 0.6 (tolerate boundary errors)
Args:
region: Raw OCR text region
pp_structure_elements: List of PP-StructureV3 elements
skip_table_coverage: If True, don't consider TABLE elements as covering
(allows raw OCR text inside tables to pass through
for layered rendering)
skip_table_coverage: If True, don't consider TABLE elements as covering.
Default is False - TABLE elements DO cover regions
to prevent duplicate rendering of table cell content.
Returns:
True if the region is covered
@@ -231,10 +339,13 @@ class GapFillingService:
center_x, center_y = region.center
region_bbox = region.normalized_bbox
# Apply boundary shrinking to reduce edge duplicates
if self.shrink_pixels > 0:
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
for element in pp_structure_elements:
# Skip TABLE elements when checking coverage
# This allows raw OCR text inside tables to be preserved
# PDF generator will render: table borders + raw text positions
# Check TABLE elements for coverage (default behavior)
# This prevents gap_fill from adding duplicate text inside table areas
if skip_table_coverage and element.type == ElementType.TABLE:
continue
@@ -247,9 +358,11 @@ class GapFillingService:
if self._point_in_bbox(center_x, center_y, elem_bbox):
return True
# Check 2: IoU exceeds threshold
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.iou_threshold:
# Check 2: IoA exceeds element-type-specific threshold
# IoA = intersection_area / ocr_box_area
ioa = self._calculate_ioa(region_bbox, elem_bbox)
threshold = self._get_ioa_threshold_for_element(element.type)
if ioa > threshold:
return True
return False
@@ -262,6 +375,9 @@ class GapFillingService:
"""
Remove regions that highly overlap with existing PP-StructureV3 TEXT elements.
Uses IoA (Intersection over Area) for deduplication to correctly detect
when an OCR region is already covered by an existing TEXT element.
Args:
uncovered_regions: List of uncovered raw OCR regions
pp_structure_elements: List of PP-StructureV3 elements
@@ -278,6 +394,11 @@ class GapFillingService:
deduplicated = []
for region in uncovered_regions:
region_bbox = region.normalized_bbox
# Apply boundary shrinking for deduplication as well
if self.shrink_pixels > 0:
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
is_duplicate = False
for element in text_elements:
@@ -286,10 +407,11 @@ class GapFillingService:
element.bbox.x1, element.bbox.y1
)
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.dedup_iou_threshold:
# Use IoA for deduplication
ioa = self._calculate_ioa(region_bbox, elem_bbox)
if ioa > self.dedup_ioa_threshold:
logger.debug(
f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
f"Skipping duplicate region (IoA={ioa:.2f}): '{region.text[:30]}...'"
)
is_duplicate = True
break
@@ -622,6 +744,52 @@ class GapFillingService:
x0, y0, x1, y1 = bbox
return x0 <= x <= x1 and y0 <= y <= y1
@staticmethod
def _calculate_ioa(
ocr_bbox: Tuple[float, float, float, float],
layout_bbox: Tuple[float, float, float, float]
) -> float:
"""
Calculate Intersection over Area (IoA) of OCR bbox relative to layout bbox.
IoA = intersection_area / ocr_box_area
This is the recommended algorithm for detecting if an OCR text region
is contained within a larger layout region. Unlike IoU which is symmetric,
IoA correctly measures "how much of the OCR box is inside the layout region".
Example:
- OCR box: 100x20 pixels (small text line)
- Layout box: 500x800 pixels (large paragraph region)
- IoU would be very small (~0.005) even if OCR is fully inside layout
- IoA would be 1.0 if OCR is fully inside layout, which is correct
Args:
ocr_bbox: OCR text region bbox (x0, y0, x1, y1) - typically smaller
layout_bbox: Layout element bbox (x0, y0, x1, y1) - typically larger
Returns:
IoA value between 0 and 1
"""
# Calculate intersection
x0 = max(ocr_bbox[0], layout_bbox[0])
y0 = max(ocr_bbox[1], layout_bbox[1])
x1 = min(ocr_bbox[2], layout_bbox[2])
y1 = min(ocr_bbox[3], layout_bbox[3])
if x1 <= x0 or y1 <= y0:
return 0.0
intersection = (x1 - x0) * (y1 - y0)
# Calculate OCR box area (denominator for IoA)
ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1])
if ocr_area <= 0:
return 0.0
return intersection / ocr_area
@staticmethod
def _calculate_iou(
bbox1: Tuple[float, float, float, float],
@@ -630,6 +798,9 @@ class GapFillingService:
"""
Calculate Intersection over Union (IoU) of two bboxes.
Note: This method is kept for backward compatibility.
For coverage detection, use _calculate_ioa() instead.
Args:
bbox1: First bbox (x0, y0, x1, y1)
bbox2: Second bbox (x0, y0, x1, y1)