feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions

View File

@@ -104,7 +104,15 @@ class Settings(BaseSettings):
# Now using None to let PaddleX use its optimized defaults.
layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_merge_mode: Optional[str] = Field(default=None) # None = use PaddleX default
# layout_merge_bboxes_mode options:
# - "large": Keep the larger box when boxes overlap (PaddleX default)
# - "small": Keep the smaller box when boxes overlap
# - "union": Keep all boxes (preserve overlapping tables/images)
# Using "union" to prevent tables from being merged together
layout_merge_mode: Optional[str] = Field(
default="union",
description="How to handle overlapping detection boxes. 'union' preserves all detected regions."
)
layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Text Detection Parameters
@@ -161,13 +169,8 @@ class Settings(BaseSettings):
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
)
# Table Cell Boxes Extraction - supplement PPStructureV3 with direct SLANeXt calls
# When enabled, directly invokes SLANeXt models to extract cell bounding boxes
# which are not exposed by the PPStructureV3 high-level API
enable_table_cell_boxes_extraction: bool = Field(
default=True,
description="Enable direct SLANeXt model calls to extract table cell bounding boxes for accurate PDF layout."
)
# Note: Table cell boxes are now extracted from the table_res_list returned by PPStructureV3.
# No additional model calls are needed: PPStructureV3 provides cell_box_list in table_res_list.
# Formula Recognition Model Configuration (Stage 4)
# Available models: