feat: add table detection options and scan artifact removal
- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -104,7 +104,15 @@ class Settings(BaseSettings):
|
||||
# Now using None to let PaddleX use its optimized defaults.
|
||||
layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
layout_merge_mode: Optional[str] = Field(default=None) # None = use PaddleX default
|
||||
# layout_merge_bboxes_mode options:
|
||||
# - "large": Keep the larger box when boxes overlap (default)
|
||||
# - "small": Keep the smaller box when boxes overlap
|
||||
# - "union": Keep all boxes (preserve overlapping tables/images)
|
||||
# Using "union" to prevent tables from being merged together
|
||||
layout_merge_mode: Optional[str] = Field(
|
||||
default="union",
|
||||
description="How to handle overlapping detection boxes. 'union' preserves all detected regions."
|
||||
)
|
||||
layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
|
||||
# Text Detection Parameters
|
||||
@@ -161,13 +169,8 @@ class Settings(BaseSettings):
|
||||
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
|
||||
)
|
||||
|
||||
# Table Cell Boxes Extraction - supplement PPStructureV3 with direct SLANeXt calls
|
||||
# When enabled, directly invokes SLANeXt models to extract cell bounding boxes
|
||||
# which are not exposed by the PPStructureV3 high-level API
|
||||
enable_table_cell_boxes_extraction: bool = Field(
|
||||
default=True,
|
||||
description="Enable direct SLANeXt model calls to extract table cell bounding boxes for accurate PDF layout."
|
||||
)
|
||||
# Note: Table cell boxes are now extracted from table_res_list returned by PPStructureV3
|
||||
# No additional model calls needed - PPStructureV3 provides cell_box_list in table_res_list
|
||||
|
||||
# Formula Recognition Model Configuration (Stage 4)
|
||||
# Available models:
|
||||
|
||||
Reference in New Issue
Block a user