feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions

View File

@@ -212,7 +212,8 @@ class GapFillingService:
def _is_region_covered(
self,
region: TextRegion,
pp_structure_elements: List[DocumentElement]
pp_structure_elements: List[DocumentElement],
skip_table_coverage: bool = True
) -> bool:
"""
Check if a raw OCR region is covered by any PP-StructureV3 element.
@@ -220,6 +221,9 @@ class GapFillingService:
Args:
region: Raw OCR text region
pp_structure_elements: List of PP-StructureV3 elements
skip_table_coverage: If True, don't consider TABLE elements as covering
(allows raw OCR text inside tables to pass through
for layered rendering)
Returns:
True if the region is covered
@@ -228,6 +232,12 @@ class GapFillingService:
region_bbox = region.normalized_bbox
for element in pp_structure_elements:
# Skip TABLE elements when checking coverage
# This allows raw OCR text inside tables to be preserved
# PDF generator will render: table borders + raw text positions
if skip_table_coverage and element.type == ElementType.TABLE:
continue
elem_bbox = (
element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1