feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -0,0 +1,649 @@
"""
Gap Filling Service for OCR Track
This service detects and fills gaps in PP-StructureV3 output by supplementing
with Raw OCR text regions when significant content loss is detected.
The hybrid approach uses Raw OCR's comprehensive text detection to compensate
for PP-StructureV3's layout model limitations on certain document types.
"""
import logging
from typing import Dict, List, Optional, Tuple, Set, Any
from dataclasses import dataclass
from app.models.unified_document import (
DocumentElement, BoundingBox, ElementType, Dimensions
)
from app.core.config import settings
logger = logging.getLogger(__name__)
# Element types that should NOT be supplemented (preserve structural integrity).
# deduplicate_regions() ignores PP-StructureV3 elements of these types when it
# looks for duplicates, so uncovered raw OCR regions are only compared (by IoU)
# against elements whose type is NOT listed here.
SKIP_ELEMENT_TYPES: Set[ElementType] = {
ElementType.TABLE,
ElementType.IMAGE,
ElementType.FIGURE,
ElementType.CHART,
ElementType.DIAGRAM,
ElementType.HEADER,
ElementType.FOOTER,
ElementType.FORMULA,
ElementType.CODE,
ElementType.BARCODE,
ElementType.QR_CODE,
ElementType.LOGO,
ElementType.STAMP,
ElementType.SIGNATURE,
}
@dataclass
class TextRegion:
    """
    One text detection produced by the raw OCR pass.

    The bounding box may arrive in any of the shapes handled by
    ``normalized_bbox``; use that property (or ``center``) instead of
    reading ``bbox`` directly when an axis-aligned rectangle is needed.
    """
    # Recognized text of the region.
    text: str
    # Flat box [x0, y0, x1, y1], flat polygon [x1, y1, x2, y2, ...] or
    # nested polygon [[x1, y1], [x2, y2], ...].
    bbox: List[Any]
    # Confidence value reported for the region.
    confidence: float
    # Page number the region was taken from.
    page: int = 0

    @property
    def normalized_bbox(self) -> Tuple[float, float, float, float]:
        """
        Return the axis-aligned bounding rectangle as (x0, y0, x1, y1).

        All supported formats are reduced with min/max, so the result always
        satisfies x0 <= x1 and y0 <= y1 - including a 4-value box whose
        corners were supplied in reverse order. Values are returned as
        floats regardless of the input number type.

        Returns:
            (x0, y0, x1, y1); (0.0, 0.0, 0.0, 0.0) when the bbox is empty or
            in an unrecognized shape.
        """
        empty = (0.0, 0.0, 0.0, 0.0)
        if not self.bbox:
            return empty

        if isinstance(self.bbox[0], (list, tuple)):
            # Nested polygon (common PaddleOCR format); points with fewer
            # than two coordinates are ignored.
            points = [pt for pt in self.bbox if len(pt) >= 2]
            xs = [float(pt[0]) for pt in points]
            ys = [float(pt[1]) for pt in points]
        elif len(self.bbox) == 4:
            # Simple [x0, y0, x1, y1] box; min/max below fixes corner order.
            xs = [float(self.bbox[0]), float(self.bbox[2])]
            ys = [float(self.bbox[1]), float(self.bbox[3])]
        elif len(self.bbox) >= 8:
            # Flat polygon: even indices are x values, odd indices are y values.
            xs = [float(value) for value in self.bbox[0::2]]
            ys = [float(value) for value in self.bbox[1::2]]
        else:
            return empty

        if not xs or not ys:
            return empty
        return (min(xs), min(ys), max(xs), max(ys))

    @property
    def center(self) -> Tuple[float, float]:
        """Return the midpoint (x, y) of ``normalized_bbox``."""
        x0, y0, x1, y1 = self.normalized_bbox
        return ((x0 + x1) / 2, (y0 + y1) / 2)
class GapFillingService:
"""
Service for detecting and filling gaps in PP-StructureV3 output.
This service:
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
2. Identifies uncovered raw OCR regions
3. Supplements uncovered regions as TEXT elements
4. Deduplicates against existing PP-StructureV3 TEXT elements
5. Recalculates reading order for the combined result
"""
def __init__(
self,
coverage_threshold: float = None,
iou_threshold: float = None,
confidence_threshold: float = None,
dedup_iou_threshold: float = None,
enabled: bool = None
):
"""
Initialize the gap filling service.
Args:
coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
iou_threshold: IoU threshold for coverage detection (default: 0.15)
confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
enabled: Whether gap filling is enabled (default: True)
"""
self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
settings, 'gap_filling_coverage_threshold', 0.7
)
self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
settings, 'gap_filling_iou_threshold', 0.15
)
self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
settings, 'gap_filling_confidence_threshold', 0.3
)
self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
settings, 'gap_filling_dedup_iou_threshold', 0.5
)
self.enabled = enabled if enabled is not None else getattr(
settings, 'gap_filling_enabled', True
)
def should_activate(
    self,
    raw_ocr_regions: List[TextRegion],
    pp_structure_elements: List[DocumentElement]
) -> Tuple[bool, float]:
    """
    Decide whether gap filling is needed for the given page content.

    The coverage ratio is the fraction of raw OCR regions that
    ``_is_region_covered`` reports as covered by a PP-StructureV3 element.
    Gap filling is requested when that ratio is strictly below
    ``self.coverage_threshold``.

    Args:
        raw_ocr_regions: List of raw OCR text regions
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        Tuple of (should_activate, coverage_ratio). When the service is
        disabled or there are no raw regions, (False, 1.0) is returned.
    """
    # Nothing to do when switched off or when there is nothing to compare.
    if not (self.enabled and raw_ocr_regions):
        return False, 1.0

    hits = sum(
        1
        for candidate in raw_ocr_regions
        if self._is_region_covered(candidate, pp_structure_elements)
    )
    coverage_ratio = hits / len(raw_ocr_regions)
    should_activate = coverage_ratio < self.coverage_threshold

    if not should_activate:
        logger.debug(
            f"Gap filling not needed: coverage={coverage_ratio:.2%} >= threshold={self.coverage_threshold:.0%}"
        )
        return should_activate, coverage_ratio

    logger.info(
        f"Gap filling activated: coverage={coverage_ratio:.2%} < threshold={self.coverage_threshold:.0%}, "
        f"raw_regions={len(raw_ocr_regions)}, pp_elements={len(pp_structure_elements)}"
    )
    return should_activate, coverage_ratio
def find_uncovered_regions(
    self,
    raw_ocr_regions: List[TextRegion],
    pp_structure_elements: List[DocumentElement]
) -> List[TextRegion]:
    """
    Collect the raw OCR regions that no PP-StructureV3 element accounts for.

    Regions whose confidence is below ``self.confidence_threshold`` are
    dropped first. A remaining region counts as covered when
    ``_is_region_covered`` says so, i.e. its center lies inside an element
    bbox or its IoU with an element exceeds ``self.iou_threshold``.

    Args:
        raw_ocr_regions: List of raw OCR text regions
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        Uncovered regions, in their original order
    """
    # The confidence test is spelled `not (a < b)` on purpose so that the
    # comparison performed is the same one as the low-confidence skip.
    uncovered = [
        candidate
        for candidate in raw_ocr_regions
        if not (candidate.confidence < self.confidence_threshold)
        and not self._is_region_covered(candidate, pp_structure_elements)
    ]
    logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
    return uncovered
def _is_region_covered(
    self,
    region: TextRegion,
    pp_structure_elements: List[DocumentElement]
) -> bool:
    """
    Tell whether any PP-StructureV3 element accounts for a raw OCR region.

    Args:
        region: Raw OCR text region
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        True if, for at least one element, the region center lies inside
        the element bbox or the IoU of both boxes exceeds
        ``self.iou_threshold``; False otherwise (also for an empty list).
    """
    center_x, center_y = region.center
    region_box = region.normalized_bbox

    def accounts_for_region(element) -> bool:
        frame = element.bbox
        target = (frame.x0, frame.y0, frame.x1, frame.y1)
        # Cheap containment test first; IoU only if the center is outside.
        return (
            self._point_in_bbox(center_x, center_y, target)
            or self._calculate_iou(region_box, target) > self.iou_threshold
        )

    return any(accounts_for_region(element) for element in pp_structure_elements)
def deduplicate_regions(
    self,
    uncovered_regions: List[TextRegion],
    pp_structure_elements: List[DocumentElement]
) -> List[TextRegion]:
    """
    Drop regions that strongly overlap an existing PP-StructureV3 element.

    Only elements whose type is not in ``SKIP_ELEMENT_TYPES`` take part in
    the comparison. A region is a duplicate as soon as its IoU with one of
    them exceeds ``self.dedup_iou_threshold``.

    Args:
        uncovered_regions: List of uncovered raw OCR regions
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        The non-duplicate regions, in their original order
    """
    comparable = [
        element for element in pp_structure_elements
        if element.type not in SKIP_ELEMENT_TYPES
    ]

    deduplicated = []
    for region in uncovered_regions:
        region_box = region.normalized_bbox
        for element in comparable:
            frame = element.bbox
            iou = self._calculate_iou(
                region_box, (frame.x0, frame.y0, frame.x1, frame.y1)
            )
            if iou > self.dedup_iou_threshold:
                logger.debug(
                    f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
                )
                break
        else:
            # No element overlapped enough: the region is kept.
            deduplicated.append(region)

    removed_count = len(uncovered_regions) - len(deduplicated)
    if removed_count > 0:
        logger.debug(f"Removed {removed_count} duplicate regions")
    return deduplicated
def convert_regions_to_elements(
    self,
    regions: List[TextRegion],
    page_number: int,
    start_element_id: int = 0
) -> List[DocumentElement]:
    """
    Turn raw OCR regions into TEXT ``DocumentElement`` objects.

    Each element gets the id ``gap_fill_<page_number>_<n>`` where ``n``
    counts up from ``start_element_id``, the region's normalized bbox, and
    metadata marking it as produced by gap filling.

    Args:
        regions: List of raw OCR regions to convert
        page_number: Page number for the elements
        start_element_id: Starting ID counter for elements

    Returns:
        List of DocumentElement objects, one per region, in input order
    """
    def to_element(idx: int, region: TextRegion) -> DocumentElement:
        x0, y0, x1, y1 = region.normalized_bbox
        provenance = {
            'source': 'gap_filling',
            'original_confidence': region.confidence
        }
        return DocumentElement(
            element_id=f"gap_fill_{page_number}_{start_element_id + idx}",
            type=ElementType.TEXT,
            content=region.text,
            bbox=BoundingBox(x0=x0, y0=y0, x1=x1, y1=y1),
            confidence=region.confidence,
            metadata=provenance
        )

    return [to_element(idx, region) for idx, region in enumerate(regions)]
def recalculate_reading_order(
    self,
    elements: List[DocumentElement]
) -> List[int]:
    """
    Compute a position-based reading order.

    Elements are ordered top to bottom by ``bbox.y0`` and, for equal
    ``y0``, left to right by ``bbox.x0``. The sort is stable, so elements
    with identical keys keep their input order.

    Args:
        elements: List of DocumentElement objects

    Returns:
        Indices into ``elements`` arranged in reading order
    """
    ranked = sorted(
        enumerate(elements),
        key=lambda pair: (pair[1].bbox.y0, pair[1].bbox.x0)
    )
    return [position for position, _ in ranked]
def merge_adjacent_regions(
    self,
    regions: List[TextRegion],
    max_horizontal_gap: float = 20.0,
    max_vertical_gap: float = 5.0
) -> List[TextRegion]:
    """
    Join fragmented neighbouring regions that sit on the same line.

    Regions are visited sorted by (y0, x0). The next region is merged into
    the current one when the distance between their vertical centers is
    below ``max_vertical_gap`` and it starts between 0 and
    ``max_horizontal_gap`` to the right of the current region's right edge.
    A merged region joins both texts with a space, spans both boxes, keeps
    the lower confidence and the page of the current region.

    Args:
        regions: List of raw OCR regions
        max_horizontal_gap: Maximum horizontal gap to merge (pixels)
        max_vertical_gap: Maximum vertical gap to merge (pixels)

    Returns:
        List of merged regions (the input itself when it is empty)
    """
    if not regions:
        return regions

    def top_left(region: TextRegion) -> Tuple[float, float]:
        box = region.normalized_bbox
        return (box[1], box[0])

    ordered = sorted(regions, key=top_left)

    merged = []
    current = ordered[0]
    for candidate in ordered[1:]:
        c_left, c_top, c_right, c_bottom = current.normalized_bbox
        n_left, n_top, n_right, n_bottom = candidate.normalized_bbox

        # "Same line" is judged by how far apart the vertical centers are.
        center_offset = abs((c_top + c_bottom) / 2 - (n_top + n_bottom) / 2)
        gap = n_left - c_right
        same_line = center_offset < max_vertical_gap
        close_enough = 0 <= gap <= max_horizontal_gap

        if not (same_line and close_enough):
            merged.append(current)
            current = candidate
            continue

        current = TextRegion(
            text=current.text + " " + candidate.text,
            bbox=[
                min(c_left, n_left),
                min(c_top, n_top),
                max(c_right, n_right),
                max(c_bottom, n_bottom)
            ],
            confidence=min(current.confidence, candidate.confidence),
            page=current.page
        )

    # The region still being accumulated has not been emitted yet.
    merged.append(current)

    if len(merged) < len(regions):
        logger.debug(f"Merged {len(regions)} regions into {len(merged)}")
    return merged
def fill_gaps(
    self,
    raw_ocr_regions: List[Dict[str, Any]],
    pp_structure_elements: List[DocumentElement],
    page_number: int,
    ocr_dimensions: Optional[Dict[str, Any]] = None,
    pp_dimensions: Optional[Dimensions] = None
) -> Tuple[List[DocumentElement], Dict[str, Any]]:
    """
    Run the complete gap-filling pipeline for one page.

    The stages are: convert the raw OCR dicts, check the activation
    threshold, collect uncovered regions, remove duplicates, and build
    TEXT elements. Any stage that leaves nothing to do ends the run early
    with an empty element list and the statistics gathered so far.

    Args:
        raw_ocr_regions: Raw OCR results (list of dicts with text, bbox, confidence)
        pp_structure_elements: PP-StructureV3 elements
        page_number: Current page number
        ocr_dimensions: OCR image dimensions for coordinate alignment
        pp_dimensions: PP-Structure dimensions for coordinate alignment

    Returns:
        Tuple of (supplemented_elements, statistics). The list holds only
        the newly created elements. ``statistics['deduplicated_count']`` is
        the number of regions remaining after deduplication.
    """
    statistics = {
        'enabled': self.enabled,
        'activated': False,
        'coverage_ratio': 1.0,
        'raw_ocr_count': len(raw_ocr_regions),
        'pp_structure_count': len(pp_structure_elements),
        'uncovered_count': 0,
        'deduplicated_count': 0,
        'supplemented_count': 0
    }

    if not self.enabled:
        logger.debug("Gap filling is disabled")
        return [], statistics

    # Stage 1: raw dicts -> TextRegion objects (scaled if dimensions given).
    text_regions = self._convert_raw_ocr_regions(
        raw_ocr_regions, page_number, ocr_dimensions, pp_dimensions
    )
    if not text_regions:
        logger.debug("No valid text regions to process")
        return [], statistics

    # Stage 2: activation check based on coverage.
    activate, coverage_ratio = self.should_activate(
        text_regions, pp_structure_elements
    )
    statistics.update(coverage_ratio=coverage_ratio, activated=activate)
    if not activate:
        return [], statistics

    # Stage 3: regions PP-StructureV3 did not account for.
    candidates = self.find_uncovered_regions(text_regions, pp_structure_elements)
    statistics['uncovered_count'] = len(candidates)
    if not candidates:
        logger.debug("No uncovered regions found")
        return [], statistics

    # Stage 4: drop regions that duplicate existing elements.
    unique_regions = self.deduplicate_regions(candidates, pp_structure_elements)
    statistics['deduplicated_count'] = len(unique_regions)
    if not unique_regions:
        logger.debug("All uncovered regions were duplicates")
        return [], statistics

    # merge_adjacent_regions() is deliberately not applied at this point.

    # Stage 5: build elements, numbering them after the existing ones.
    supplemented = self.convert_regions_to_elements(
        unique_regions, page_number, len(pp_structure_elements)
    )
    statistics['supplemented_count'] = len(supplemented)

    logger.info(
        f"Gap filling complete: supplemented {len(supplemented)} elements "
        f"(coverage: {coverage_ratio:.2%} -> estimated {(coverage_ratio + len(supplemented)/len(text_regions) if text_regions else 0):.2%})"
    )
    return supplemented, statistics
def _convert_raw_ocr_regions(
    self,
    raw_regions: List[Dict[str, Any]],
    page_number: int,
    ocr_dimensions: Optional[Dict[str, Any]] = None,
    pp_dimensions: Optional[Dimensions] = None
) -> List[TextRegion]:
    """
    Build TextRegion objects from raw OCR region dictionaries.

    Entries with empty or whitespace-only text, or with a bbox that is
    neither a dict nor a list/tuple, are skipped. When both dimension
    arguments are given, coordinates are multiplied by the ratio of the
    PP-Structure size to the OCR image size (per axis, only for positive
    sizes).

    Args:
        raw_regions: List of raw OCR region dictionaries
        page_number: Current page number
        ocr_dimensions: OCR image dimensions
        pp_dimensions: PP-Structure dimensions

    Returns:
        List of TextRegion objects
    """
    # Per-axis scale from OCR image space to PP-Structure space.
    scale_x, scale_y = 1.0, 1.0
    if ocr_dimensions and pp_dimensions:
        ocr_width = ocr_dimensions.get('width', 0)
        ocr_height = ocr_dimensions.get('height', 0)
        if ocr_width > 0 and pp_dimensions.width > 0:
            scale_x = pp_dimensions.width / ocr_width
        if ocr_height > 0 and pp_dimensions.height > 0:
            scale_y = pp_dimensions.height / ocr_height
    needs_scaling = scale_x != 1.0 or scale_y != 1.0
    if needs_scaling:
        logger.debug(f"Coordinate scaling: x={scale_x:.3f}, y={scale_y:.3f}")

    def rescale(coords: List[Any]) -> List[Any]:
        # Nested polygon [[x1, y1], [x2, y2], ...]
        if len(coords) >= 1 and isinstance(coords[0], (list, tuple)):
            return [
                [pt[0] * scale_x, pt[1] * scale_y]
                for pt in coords if len(pt) >= 2
            ]
        # Simple [x0, y0, x1, y1] box
        if len(coords) == 4:
            return [
                coords[0] * scale_x, coords[1] * scale_y,
                coords[2] * scale_x, coords[3] * scale_y
            ]
        # Flat polygon [x1, y1, x2, y2, ...]: x on even, y on odd indices
        if len(coords) >= 8:
            return [
                value * (scale_x if position % 2 == 0 else scale_y)
                for position, value in enumerate(coords)
            ]
        # Any other shape is passed through untouched.
        return coords

    text_regions = []
    for region in raw_regions:
        text = region.get('text', '')
        if not text or not text.strip():
            continue

        confidence = region.get('confidence', 0.0)
        bbox_raw = region.get('bbox', [])

        if isinstance(bbox_raw, dict):
            # Dict format: {x_min, y_min, x_max, y_max}
            bbox = [
                bbox_raw.get(key, 0)
                for key in ('x_min', 'y_min', 'x_max', 'y_max')
            ]
        elif isinstance(bbox_raw, (list, tuple)):
            bbox = list(bbox_raw)
        else:
            continue

        if needs_scaling:
            bbox = rescale(bbox)

        text_regions.append(TextRegion(
            text=text,
            bbox=bbox,
            confidence=confidence,
            page=page_number
        ))
    return text_regions
@staticmethod
def _point_in_bbox(
x: float, y: float,
bbox: Tuple[float, float, float, float]
) -> bool:
"""Check if point (x, y) is inside bbox (x0, y0, x1, y1)."""
x0, y0, x1, y1 = bbox
return x0 <= x <= x1 and y0 <= y <= y1
@staticmethod
def _calculate_iou(
bbox1: Tuple[float, float, float, float],
bbox2: Tuple[float, float, float, float]
) -> float:
"""
Calculate Intersection over Union (IoU) of two bboxes.
Args:
bbox1: First bbox (x0, y0, x1, y1)
bbox2: Second bbox (x0, y0, x1, y1)
Returns:
IoU value between 0 and 1
"""
# Calculate intersection
x0 = max(bbox1[0], bbox2[0])
y0 = max(bbox1[1], bbox2[1])
x1 = min(bbox1[2], bbox2[2])
y1 = min(bbox1[3], bbox2[3])
if x1 <= x0 or y1 <= y0:
return 0.0
intersection = (x1 - x0) * (y1 - y0)
# Calculate union
area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
union = area1 + area2 - intersection
if union <= 0:
return 0.0
return intersection / union