chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -6,7 +6,7 @@ Supports both PaddleOCR (for scanned documents) and direct extraction (for edita
 import json
 import logging
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 from datetime import datetime
 import uuid
 import gc  # For garbage collection
@@ -446,6 +446,47 @@ class OCRService:
        except Exception as e:
            logger.warning(f"Failed to clear GPU cache: {e}")

+    def _apply_ocr_config(self, ocr_config: 'OCRConfig'):
+        """
+        Apply OCR configuration from preset or custom settings.
+        This modifies the runtime settings used by PP-Structure.
+
+        Args:
+            ocr_config: OCRConfig object with processing settings
+        """
+        logger.info(f"Applying OCR config: {ocr_config.model_dump()}")
+
+        # Store the config for use in PP-Structure initialization
+        self._runtime_ocr_config = ocr_config
+
+        # Apply table parsing mode settings
+        # These will be used when initializing PP-StructureV3
+        settings.table_parsing_mode = ocr_config.table_parsing_mode.value if hasattr(ocr_config.table_parsing_mode, 'value') else ocr_config.table_parsing_mode
+
+        # Apply preprocessing settings
+        settings.use_doc_orientation_classify = ocr_config.use_doc_orientation_classify
+        settings.use_doc_unwarping = ocr_config.use_doc_unwarping
+        settings.use_textline_orientation = ocr_config.use_textline_orientation
+
+        # Apply recognition module settings
+        settings.enable_chart_recognition = ocr_config.enable_chart_recognition
+        settings.enable_formula_recognition = ocr_config.enable_formula_recognition
+        settings.enable_seal_recognition = ocr_config.enable_seal_recognition
+        settings.enable_region_detection = ocr_config.enable_region_detection
+
+        # Apply layout threshold if specified
+        if ocr_config.layout_threshold is not None:
+            settings.layout_detection_threshold = ocr_config.layout_threshold
+        if ocr_config.layout_nms_threshold is not None:
+            settings.layout_nms_threshold = ocr_config.layout_nms_threshold
+
+        # Invalidate existing structure engine to force re-initialization with new settings
+        if self.structure_engine is not None:
+            logger.info("Invalidating PP-StructureV3 engine to apply new OCR config")
+            self._unload_structure_engine()
+
+        logger.info(f"OCR config applied: table_parsing_mode={settings.table_parsing_mode}")
+
    def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
        """
        Get or create OCR engine for specified language with GPU support
@@ -615,6 +656,39 @@ class OCRService:
                formula_model = settings.formula_recognition_model_name
                chart_model = settings.chart_recognition_model_name

+                # Apply table_parsing_mode settings
+                # This is the KEY configuration to prevent "cell explosion" on datasheet-type documents
+                table_parsing_mode = settings.table_parsing_mode
+                logger.info(f"Table parsing mode: {table_parsing_mode}")
+
+                if table_parsing_mode == "disabled":
+                    # 方案A: 完全關閉 table recognition
+                    use_table = False
+                    wired_table_model = None
+                    wireless_table_model = None
+                    wired_cell_det_model = None
+                    wireless_cell_det_model = None
+                    logger.info("Table parsing DISABLED - no cell segmentation")
+
+                elif table_parsing_mode == "classification_only":
+                    # 方案C: 只做 table classification，不做 cell segmentation
+                    use_table = False  # Don't parse table structure
+                    wired_table_model = None
+                    wireless_table_model = None
+                    wired_cell_det_model = None
+                    wireless_cell_det_model = None
+                    # Keep table_cls_model to identify table regions
+                    logger.info("Table parsing CLASSIFICATION_ONLY - regions identified but no cell parsing")
+
+                elif table_parsing_mode == "conservative":
+                    # 方案B: 保守模式 - 只禁用 wireless tables (aggressive)
+                    # 注意：不要修改 layout_threshold，它會影響所有元素偵測，不只是表格
+                    wireless_table_model = None
+                    wireless_cell_det_model = None
+                    logger.info(f"Table parsing CONSERVATIVE - wireless disabled (layout_threshold unchanged)")
+
+                # else: "full" mode - use all default settings (aggressive)
+
                # Apply table detection config overrides for individual table types
                if table_detection_config:
                    if not table_detection_config.enable_wired_table:
@@ -1343,6 +1417,7 @@ class OCRService:

            if detect_layout:
                # Pass current_page to analyze_layout for correct page numbering
+                # Also pass text_regions for table content rebuilding
                layout_data, images_metadata = self.analyze_layout(
                    image_path,
                    output_dir=output_dir,
@@ -1350,7 +1425,8 @@ class OCRService:
                    layout_model=layout_model,
                    preprocessing_mode=preprocessing_mode,
                    preprocessing_config=preprocessing_config,
-                    table_detection_config=table_detection_config
+                    table_detection_config=table_detection_config,
+                    raw_ocr_regions=text_regions  # For table content rebuilding
                )

            # Generate Markdown
@@ -1379,6 +1455,12 @@ class OCRService:

            # If layout data is enhanced, add enhanced results for converter
            if layout_data and layout_data.get('enhanced'):
+                # Debug: check if table elements have rebuild_stats
+                for elem in layout_data.get('elements', []):
+                    if elem.get('type') == 'table':
+                        has_rebuild = 'rebuild_stats' in elem
+                        logger.info(f"[OCR_SERVICE] Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}, keys={list(elem.keys())[:10]}")
+
                result['enhanced_results'] = [{
                    'elements': layout_data.get('elements', []),
                    'reading_order': layout_data.get('reading_order', []),
@@ -1509,7 +1591,8 @@ class OCRService:
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
        preprocessing_config: Optional[PreprocessingConfig] = None,
-        table_detection_config: Optional[TableDetectionConfig] = None
+        table_detection_config: Optional[TableDetectionConfig] = None,
+        raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
    ) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1522,6 +1605,7 @@ class OCRService:
            preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
            table_detection_config: Table detection config (wired/wireless/region options)
+            raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding

        Returns:
            Tuple of (layout_data, images_metadata)
@@ -1607,7 +1691,8 @@ class OCRService:
                    preprocessed_image=preprocessed_image,
                    scaling_info=scaling_info,
                    save_visualization=True,  # Save layout detection visualization images
-                    use_cv_table_detection=use_cv_table_detection
+                    use_cv_table_detection=use_cv_table_detection,
+                    raw_ocr_regions=raw_ocr_regions  # For table content rebuilding
                )

                if result.get('has_parsing_res_list'):
@@ -2225,7 +2310,8 @@ class OCRService:
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
        preprocessing_config: Optional[PreprocessingConfig] = None,
-        table_detection_config: Optional[TableDetectionConfig] = None
+        table_detection_config: Optional[TableDetectionConfig] = None,
+        ocr_config: Optional['OCRConfig'] = None
    ) -> Union[UnifiedDocument, Dict]:
        """
        Main processing method with dual-track support.
@@ -2242,11 +2328,16 @@ class OCRService:
            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
            table_detection_config: Table detection config (wired/wireless/region options)
+            ocr_config: OCR processing config from preset or custom settings

        Returns:
            UnifiedDocument if dual-track is enabled and use_dual_track=True,
            Dict with legacy format otherwise
        """
+        # Apply OCR config to settings if provided
+        if ocr_config:
+            self._apply_ocr_config(ocr_config)
+
        # Use dual-track processing if:
        # 1. use_dual_track is True (auto-detection), OR
        # 2. force_track is specified (explicit track selection)