feat: upgrade PP-StructureV3 models to latest versions

- Layout: PP-DocLayout-S → PP-DocLayout_plus-L (83.2% mAP)
- Table: Single model → Dual SLANeXt (wired/wireless)
- Formula: PP-FormulaNet_plus-L for enhanced recognition
- Add preprocessing flags support (orientation, unwarping)
- Update frontend i18n descriptions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 14:21:24 +08:00
parent 59206a6ab8
commit 6235280c45
9 changed files with 504 additions and 25 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -91,6 +91,11 @@ class Settings(BaseSettings):
    enable_table_recognition: bool = Field(default=True)  # Table structure recognition
    enable_seal_recognition: bool = Field(default=True)  # Seal/stamp recognition
    enable_text_recognition: bool = Field(default=True)  # General text recognition
+
+    # PP-StructureV3 Preprocessing (Stage 1)
+    use_doc_orientation_classify: bool = Field(default=True)  # Auto-detect and correct document rotation
+    use_doc_unwarping: bool = Field(default=True)  # Correct document warping from photos
+    use_textline_orientation: bool = Field(default=True)  # Detect textline orientation
    layout_detection_threshold: float = Field(default=0.2)  # Lower threshold for more sensitive detection
    layout_nms_threshold: float = Field(default=0.2)  # Lower NMS to preserve more individual elements
    layout_merge_mode: str = Field(default="small")  # Use 'small' to minimize bbox merging
@@ -99,20 +104,48 @@ class Settings(BaseSettings):
    text_det_box_thresh: float = Field(default=0.3)  # Lower box threshold for better detection
    text_det_unclip_ratio: float = Field(default=1.2)  # Smaller unclip for tighter text boxes

-    # Layout Detection Model Configuration
+    # Layout Detection Model Configuration (Stage 3)
    # Available models:
    # - None: Use PP-StructureV3's built-in model (PubLayNet-based); no model name is passed to the engine
-    # - "PP-DocLayout-S": Better for Chinese docs, papers, contracts, exams (23 categories)
+    # - "PP-DocLayout_plus-L": Best for Chinese docs (83.2% mAP, 20 categories) - complex layouts
+    # - "PP-DocLayout-L": High accuracy (90.4% mAP, 23 categories) - general purpose
    # - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout
    layout_detection_model_name: Optional[str] = Field(
-        default="PP-DocLayout-S",
-        description="Layout detection model name. Set to 'PP-DocLayout-S' for better Chinese document support."
+        default="PP-DocLayout_plus-L",
+        description="Layout detection model name. PP-DocLayout_plus-L recommended for complex Chinese documents."
    )
    layout_detection_model_dir: Optional[str] = Field(
        default=None,
        description="Custom layout detection model directory. If None, downloads official model."
    )

+    # Table Structure Recognition Model Configuration (Stage 4)
+    # PP-StructureV3 uses separate models for wired (bordered) and wireless (borderless) tables
+    # Both models should be configured for comprehensive table detection
+    # Available models:
+    # - "SLANeXt_wired": Best for wired/bordered tables (69.65% accuracy, 351MB)
+    # - "SLANeXt_wireless": Best for wireless/borderless tables (69.65% accuracy, 351MB)
+    # - "SLANet": Legacy model (59.52% accuracy, 6.9MB)
+    # - "SLANet_plus": Improved legacy (63.69% accuracy, 6.9MB)
+    wired_table_model_name: Optional[str] = Field(
+        default="SLANeXt_wired",
+        description="Table structure model for bordered tables. SLANeXt_wired recommended."
+    )
+    wireless_table_model_name: Optional[str] = Field(
+        default="SLANeXt_wireless",
+        description="Table structure model for borderless tables. SLANeXt_wireless recommended."
+    )
+
+    # Formula Recognition Model Configuration (Stage 4)
+    # Available models:
+    # - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
+    # - "PP-FormulaNet-L": Good for English formulas (90.36% English BLEU)
+    # - "PP-FormulaNet-S": Fast inference (87% English BLEU)
+    formula_recognition_model_name: Optional[str] = Field(
+        default="PP-FormulaNet_plus-L",
+        description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
+    )
+
    # ===== Gap Filling Configuration =====
    # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
    gap_filling_enabled: bool = Field(default=True)  # Enable gap filling for OCR track
--- a/backend/app/schemas/task.py
+++ b/backend/app/schemas/task.py
@@ -28,11 +28,11 @@ class LayoutModelEnum(str, Enum):
    """Layout detection model selection for OCR track.

    Different models are optimized for different document types:
-    - CHINESE: PP-DocLayout-S, optimized for Chinese documents (forms, contracts, invoices)
-    - DEFAULT: PubLayNet-based, optimized for English academic papers
-    - CDLA: CDLA model, specialized Chinese document layout analysis
+    - CHINESE: PP-DocLayout_plus-L (83.2% mAP), optimized for complex Chinese documents
+    - DEFAULT: PubLayNet-based (~94% mAP), optimized for English academic papers
+    - CDLA: CDLA model (~86% mAP), specialized Chinese document layout analysis
    """
-    CHINESE = "chinese"   # PP-DocLayout-S - Best for Chinese documents (recommended)
+    CHINESE = "chinese"   # PP-DocLayout_plus-L - Best for Chinese documents (recommended)
    DEFAULT = "default"   # PubLayNet-based - Best for English documents
    CDLA = "cdla"         # CDLA model - Alternative for Chinese layout

--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -50,11 +50,11 @@ logger = logging.getLogger(__name__)
 _USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__"

 # Layout model mapping: user-friendly names to actual model names
-# - "chinese": PP-DocLayout-S - Best for Chinese documents (forms, contracts, invoices)
+# - "chinese": PP-DocLayout_plus-L - Best for Chinese documents (83.2% mAP, complex layouts)
 # - "default": PubLayNet-based default model - Best for English documents
 # - "cdla": picodet_lcnet_x1_0_fgd_layout_cdla - Alternative for Chinese layout
 LAYOUT_MODEL_MAPPING = {
-    "chinese": "PP-DocLayout-S",
+    "chinese": "PP-DocLayout_plus-L",
    "default": _USE_PUBLAYNET_DEFAULT,  # Uses default PubLayNet-based model (no custom model)
    "cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
 }
@@ -517,34 +517,63 @@ class OCRService:
                    layout_model_name = settings.layout_detection_model_name
                layout_model_dir = settings.layout_detection_model_dir

+                # Preprocessing configuration (Stage 1)
+                use_orientation = settings.use_doc_orientation_classify
+                use_unwarping = settings.use_doc_unwarping
+                use_textline = settings.use_textline_orientation
+
+                # Table and formula model configuration (Stage 4)
+                wired_table_model = settings.wired_table_model_name
+                wireless_table_model = settings.wireless_table_model_name
+                formula_model = settings.formula_recognition_model_name
+
                logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
+                logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
                logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
+                logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
+                logger.info(f"Formula model: {formula_model}")
                logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
                logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")

                # Build PPStructureV3 kwargs
                pp_kwargs = {
-                    'use_doc_orientation_classify': False,
-                    'use_doc_unwarping': False,
-                    'use_textline_orientation': False,
+                    # Preprocessing (Stage 1)
+                    'use_doc_orientation_classify': use_orientation,
+                    'use_doc_unwarping': use_unwarping,
+                    'use_textline_orientation': use_textline,
+                    # Element recognition (Stage 4)
                    'use_table_recognition': use_table,
                    'use_formula_recognition': use_formula,
                    'use_chart_recognition': use_chart,
+                    # Layout detection parameters
                    'layout_threshold': layout_threshold,
                    'layout_nms': layout_nms,
                    'layout_unclip_ratio': layout_unclip,
                    'layout_merge_bboxes_mode': layout_merge,
+                    # Text detection parameters
                    'text_det_thresh': text_thresh,
                    'text_det_box_thresh': text_box_thresh,
                    'text_det_unclip_ratio': text_unclip,
                }

-                # Add layout model configuration if specified
+                # Add layout model configuration if specified (Stage 3)
                if layout_model_name:
                    pp_kwargs['layout_detection_model_name'] = layout_model_name
                if layout_model_dir:
                    pp_kwargs['layout_detection_model_dir'] = layout_model_dir

+                # Add table structure model configuration (Stage 4)
+                # PPStructureV3 uses separate models for wired (bordered) and wireless (borderless) tables
+                # Both models should be configured for comprehensive table detection
+                if wired_table_model:
+                    pp_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
+                if wireless_table_model:
+                    pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
+
+                # Add formula recognition model configuration (Stage 4)
+                if formula_model:
+                    pp_kwargs['formula_recognition_model_name'] = formula_model
+
                self.structure_engine = PPStructureV3(**pp_kwargs)

                # Track model loading for cache management
@@ -571,12 +600,15 @@ class OCRService:
                    layout_threshold = settings.layout_detection_threshold
                    layout_model_name = settings.layout_detection_model_name
                    layout_model_dir = settings.layout_detection_model_dir
+                    wired_table_model = settings.wired_table_model_name
+                    wireless_table_model = settings.wireless_table_model_name
+                    formula_model = settings.formula_recognition_model_name

                    # Build CPU fallback kwargs
                    cpu_kwargs = {
-                        'use_doc_orientation_classify': False,
-                        'use_doc_unwarping': False,
-                        'use_textline_orientation': False,
+                        'use_doc_orientation_classify': settings.use_doc_orientation_classify,
+                        'use_doc_unwarping': settings.use_doc_unwarping,
+                        'use_textline_orientation': settings.use_textline_orientation,
                        'use_table_recognition': use_table,
                        'use_formula_recognition': use_formula,
                        'use_chart_recognition': use_chart,
@@ -586,6 +618,12 @@ class OCRService:
                        cpu_kwargs['layout_detection_model_name'] = layout_model_name
                    if layout_model_dir:
                        cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
+                    if wired_table_model:
+                        cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
+                    if wireless_table_model:
+                        cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
+                    if formula_model:
+                        cpu_kwargs['formula_recognition_model_name'] = formula_model

                    self.structure_engine = PPStructureV3(**cpu_kwargs)
                    self._current_layout_model = layout_model  # Track current model for recreation check
--- a/backend/tests/services/test_layout_model.py
+++ b/backend/tests/services/test_layout_model.py
@@ -40,8 +40,8 @@ class TestLayoutModelMapping:
        assert 'cdla' in LAYOUT_MODEL_MAPPING

    def test_chinese_model_maps_to_pp_doclayout(self):
-        """Verify 'chinese' maps to PP-DocLayout-S"""
-        assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout-S'
+        """Verify 'chinese' maps to PP-DocLayout_plus-L"""
+        assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout_plus-L'

    def test_default_model_maps_to_publaynet_sentinel(self):
        """Verify 'default' maps to sentinel value for PubLayNet default"""
@@ -57,7 +57,7 @@ class TestLayoutModelEngine:
    """Test engine creation with different layout models"""

    def test_chinese_model_creates_engine_with_pp_doclayout(self):
-        """Verify 'chinese' layout model uses PP-DocLayout-S"""
+        """Verify 'chinese' layout model uses PP-DocLayout_plus-L"""
        ocr_service = OCRService()

        with patch.object(ocr_service, 'structure_engine', None):
@@ -70,7 +70,7 @@ class TestLayoutModelEngine:
                mock_ppstructure.assert_called_once()
                call_kwargs = mock_ppstructure.call_args[1]

-                assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
+                assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout_plus-L'

    def test_default_model_creates_engine_without_model_name(self):
        """Verify 'default' layout model does not specify model name (uses default)"""
@@ -121,7 +121,7 @@ class TestLayoutModelEngine:
                call_kwargs = mock_ppstructure.call_args[1]

                # Should use 'chinese' model as default
-                assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
+                assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout_plus-L'


 class TestLayoutModelCaching: