feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions
--- a/backend/app/services/ocr_to_unified_converter.py
+++ b/backend/app/services/ocr_to_unified_converter.py
@@ -590,8 +590,17 @@ class OCRToUnifiedConverter:
            # Prepare content based on element type
            if element_type == ElementType.TABLE:
                # For tables, use TableData as content
+                # Pass cell_boxes for accurate cell positioning
                table_data = self._extract_table_data(elem_data)
                content = table_data if table_data else elem_data.get('content', '')
+
+                # Preserve cell_boxes and embedded_images in metadata for PDF generation
+                # These are extracted by PP-StructureV3 and provide accurate cell positioning
+                if 'cell_boxes' in elem_data:
+                    elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
+                    elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
+                if 'embedded_images' in elem_data:
+                    elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
            elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
                # For images, use metadata dict as content
                content = {