feat: add table detection options and scan artifact removal
- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -30,7 +30,7 @@ from app.services.layout_preprocessing_service import (
|
||||
get_layout_preprocessing_service,
|
||||
LayoutPreprocessingService,
|
||||
)
|
||||
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
|
||||
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig, TableDetectionConfig
|
||||
|
||||
# Import dual-track components
|
||||
try:
|
||||
@@ -454,7 +454,11 @@ class OCRService:
|
||||
|
||||
return self.ocr_engines[lang]
|
||||
|
||||
def _ensure_structure_engine(self, layout_model: Optional[str] = None) -> PPStructureV3:
|
||||
def _ensure_structure_engine(
|
||||
self,
|
||||
layout_model: Optional[str] = None,
|
||||
table_detection_config: Optional[TableDetectionConfig] = None
|
||||
) -> PPStructureV3:
|
||||
"""
|
||||
Get or create PP-Structure engine for layout analysis with GPU support.
|
||||
Supports layout model selection for different document types.
|
||||
@@ -465,6 +469,10 @@ class OCRService:
|
||||
- "default": PubLayNet-based (best for English documents)
|
||||
- "cdla": CDLA model (alternative for Chinese layout)
|
||||
- None: Use config default
|
||||
table_detection_config: Table detection configuration
|
||||
- enable_wired_table: Enable bordered table detection
|
||||
- enable_wireless_table: Enable borderless table detection
|
||||
- enable_region_detection: Enable region detection
|
||||
|
||||
Returns:
|
||||
PPStructure engine instance
|
||||
@@ -492,6 +500,19 @@ class OCRService:
|
||||
logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine")
|
||||
self.structure_engine = None # Force recreation
|
||||
|
||||
# Check if we need to recreate the engine due to different table detection config
|
||||
current_table_config = getattr(self, '_current_table_detection_config', None)
|
||||
if self.structure_engine is not None and table_detection_config:
|
||||
# Compare table detection settings
|
||||
new_config_tuple = (
|
||||
table_detection_config.enable_wired_table,
|
||||
table_detection_config.enable_wireless_table,
|
||||
table_detection_config.enable_region_detection
|
||||
)
|
||||
if current_table_config != new_config_tuple:
|
||||
logger.info(f"Table detection config changed from {current_table_config} to {new_config_tuple}, recreating engine")
|
||||
self.structure_engine = None # Force recreation
|
||||
|
||||
# Use cached engine or create new one
|
||||
if self.structure_engine is None:
|
||||
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
|
||||
@@ -504,6 +525,15 @@ class OCRService:
|
||||
use_table = settings.enable_table_recognition
|
||||
use_seal = settings.enable_seal_recognition
|
||||
use_region = settings.enable_region_detection
|
||||
|
||||
# Apply table detection config overrides if provided
|
||||
if table_detection_config:
|
||||
# If both wired and wireless are disabled, disable table recognition entirely
|
||||
if not table_detection_config.enable_wired_table and not table_detection_config.enable_wireless_table:
|
||||
use_table = False
|
||||
use_region = table_detection_config.enable_region_detection
|
||||
logger.info(f"Table detection config applied: wired={table_detection_config.enable_wired_table}, "
|
||||
f"wireless={table_detection_config.enable_wireless_table}, region={use_region}")
|
||||
layout_threshold = settings.layout_detection_threshold
|
||||
layout_nms = settings.layout_nms_threshold
|
||||
layout_merge = settings.layout_merge_mode
|
||||
@@ -538,6 +568,17 @@ class OCRService:
|
||||
formula_model = settings.formula_recognition_model_name
|
||||
chart_model = settings.chart_recognition_model_name
|
||||
|
||||
# Apply table detection config overrides for individual table types
|
||||
if table_detection_config:
|
||||
if not table_detection_config.enable_wired_table:
|
||||
wired_table_model = None
|
||||
wired_cell_det_model = None
|
||||
logger.info("Wired table detection disabled by config")
|
||||
if not table_detection_config.enable_wireless_table:
|
||||
wireless_table_model = None
|
||||
wireless_cell_det_model = None
|
||||
logger.info("Wireless table detection disabled by config")
|
||||
|
||||
# Text detection/recognition model configuration
|
||||
text_det_model = settings.text_detection_model_name
|
||||
text_rec_model = settings.text_recognition_model_name
|
||||
@@ -641,6 +682,15 @@ class OCRService:
|
||||
# Track model loading for cache management
|
||||
self._model_last_used['structure'] = datetime.now()
|
||||
self._current_layout_model = layout_model # Track current model for recreation check
|
||||
# Track table detection config for recreation check
|
||||
if table_detection_config:
|
||||
self._current_table_detection_config = (
|
||||
table_detection_config.enable_wired_table,
|
||||
table_detection_config.enable_wireless_table,
|
||||
table_detection_config.enable_region_detection
|
||||
)
|
||||
else:
|
||||
self._current_table_detection_config = None
|
||||
|
||||
logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
|
||||
|
||||
@@ -712,6 +762,15 @@ class OCRService:
|
||||
|
||||
self.structure_engine = PPStructureV3(**cpu_kwargs)
|
||||
self._current_layout_model = layout_model # Track current model for recreation check
|
||||
# Track table detection config for recreation check
|
||||
if table_detection_config:
|
||||
self._current_table_detection_config = (
|
||||
table_detection_config.enable_wired_table,
|
||||
table_detection_config.enable_wireless_table,
|
||||
table_detection_config.enable_region_detection
|
||||
)
|
||||
else:
|
||||
self._current_table_detection_config = None
|
||||
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
|
||||
else:
|
||||
raise
|
||||
@@ -956,7 +1015,8 @@ class OCRService:
|
||||
current_page: int = 0,
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||
table_detection_config: Optional[TableDetectionConfig] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
@@ -971,6 +1031,7 @@ class OCRService:
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
table_detection_config: Table detection config (wired/wireless/region options)
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results and metadata
|
||||
@@ -1041,7 +1102,8 @@ class OCRService:
|
||||
current_page=page_num - 1, # Convert to 0-based page number for layout data
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config
|
||||
preprocessing_config=preprocessing_config,
|
||||
table_detection_config=table_detection_config
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
@@ -1189,7 +1251,8 @@ class OCRService:
|
||||
current_page=current_page,
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config
|
||||
preprocessing_config=preprocessing_config,
|
||||
table_detection_config=table_detection_config
|
||||
)
|
||||
|
||||
# Generate Markdown
|
||||
@@ -1347,7 +1410,8 @@ class OCRService:
|
||||
current_page: int = 0,
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||
table_detection_config: Optional[TableDetectionConfig] = None
|
||||
) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3 with enhanced element extraction
|
||||
@@ -1359,6 +1423,7 @@ class OCRService:
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
table_detection_config: Table detection config (wired/wireless/region options)
|
||||
|
||||
Returns:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
@@ -1376,7 +1441,7 @@ class OCRService:
|
||||
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
|
||||
)
|
||||
|
||||
structure_engine = self._ensure_structure_engine(layout_model)
|
||||
structure_engine = self._ensure_structure_engine(layout_model, table_detection_config)
|
||||
|
||||
# Apply image preprocessing for layout detection
|
||||
# Preprocessing includes:
|
||||
@@ -1432,10 +1497,19 @@ class OCRService:
|
||||
# Get scaling info for bbox coordinate restoration
|
||||
scaling_info = preprocessing_result.scaling_info if preprocessing_result else None
|
||||
|
||||
# CV table detection is disabled due to poor performance on complex tables
|
||||
# Issues: 1) Detected boundaries smaller than content
|
||||
# 2) Incorrectly splits merged cells
|
||||
# The ML-based RT-DETR-L detection is currently more reliable.
|
||||
# TODO: Improve CV algorithm with better line detection and grid alignment
|
||||
use_cv_table_detection = False
|
||||
|
||||
result = enhanced_processor.analyze_with_full_structure(
|
||||
image_path, output_dir, current_page,
|
||||
preprocessed_image=preprocessed_image,
|
||||
scaling_info=scaling_info
|
||||
scaling_info=scaling_info,
|
||||
save_visualization=True, # Save layout detection visualization images
|
||||
use_cv_table_detection=use_cv_table_detection
|
||||
)
|
||||
|
||||
if result.get('has_parsing_res_list'):
|
||||
@@ -1673,7 +1747,8 @@ class OCRService:
|
||||
force_track: Optional[str] = None,
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||
table_detection_config: Optional[TableDetectionConfig] = None
|
||||
) -> Union[UnifiedDocument, Dict]:
|
||||
"""
|
||||
Process document using dual-track approach.
|
||||
@@ -1688,6 +1763,7 @@ class OCRService:
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
table_detection_config: Table detection config (wired/wireless/region options)
|
||||
|
||||
Returns:
|
||||
UnifiedDocument if dual-track is enabled, Dict otherwise
|
||||
@@ -1696,7 +1772,7 @@ class OCRService:
|
||||
# Fallback to traditional OCR processing
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
|
||||
start_time = datetime.now()
|
||||
@@ -1770,7 +1846,8 @@ class OCRService:
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir, layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config
|
||||
preprocessing_config=preprocessing_config,
|
||||
table_detection_config=table_detection_config
|
||||
)
|
||||
|
||||
# Convert OCR result to extract images
|
||||
@@ -1804,7 +1881,7 @@ class OCRService:
|
||||
logger.info("Using OCR track (PaddleOCR)")
|
||||
ocr_result = self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
|
||||
# Convert OCR result to UnifiedDocument using the converter
|
||||
@@ -1835,7 +1912,7 @@ class OCRService:
|
||||
# Fallback to traditional OCR
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
|
||||
def _merge_ocr_images_into_direct(
|
||||
@@ -1916,7 +1993,8 @@ class OCRService:
|
||||
output_dir: Optional[Path] = None,
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||
table_detection_config: Optional[TableDetectionConfig] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Traditional OCR processing (legacy method).
|
||||
@@ -1930,6 +2008,7 @@ class OCRService:
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
table_detection_config: Table detection config (wired/wireless/region options)
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results in legacy format
|
||||
@@ -1943,7 +2022,7 @@ class OCRService:
|
||||
for i, image_path in enumerate(image_paths):
|
||||
result = self.process_image(
|
||||
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
all_results.append(result)
|
||||
|
||||
@@ -1960,7 +2039,7 @@ class OCRService:
|
||||
# Single image or other file
|
||||
return self.process_image(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
|
||||
def _combine_results(self, results: List[Dict]) -> Dict:
|
||||
@@ -2047,7 +2126,8 @@ class OCRService:
|
||||
force_track: Optional[str] = None,
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||
table_detection_config: Optional[TableDetectionConfig] = None
|
||||
) -> Union[UnifiedDocument, Dict]:
|
||||
"""
|
||||
Main processing method with dual-track support.
|
||||
@@ -2063,6 +2143,7 @@ class OCRService:
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
table_detection_config: Table detection config (wired/wireless/region options)
|
||||
|
||||
Returns:
|
||||
UnifiedDocument if dual-track is enabled and use_dual_track=True,
|
||||
@@ -2075,13 +2156,13 @@ class OCRService:
|
||||
# Use dual-track processing (or forced track)
|
||||
return self.process_with_dual_track(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
else:
|
||||
# Use traditional OCR processing (no force_track support)
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
preprocessing_mode, preprocessing_config, table_detection_config
|
||||
)
|
||||
|
||||
def process_legacy(
|
||||
|
||||
Reference in New Issue
Block a user