feat: enhance layout preprocessing and unify image scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 09:23:19 +08:00
parent 86bbea6fbf
commit dda9621e17
17 changed files with 826 additions and 104 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -90,19 +90,27 @@ class Settings(BaseSettings):
    enable_formula_recognition: bool = Field(default=True)  # Math formula recognition
    enable_table_recognition: bool = Field(default=True)  # Table structure recognition
    enable_seal_recognition: bool = Field(default=True)  # Seal/stamp recognition
+    enable_region_detection: bool = Field(default=True)  # Region detection for better table structure
    enable_text_recognition: bool = Field(default=True)  # General text recognition

    # PP-StructureV3 Preprocessing (Stage 1)
    use_doc_orientation_classify: bool = Field(default=True)  # Auto-detect and correct document rotation
    use_doc_unwarping: bool = Field(default=True)  # Correct document warping from photos
    use_textline_orientation: bool = Field(default=True)  # Detect textline orientation
-    layout_detection_threshold: float = Field(default=0.2)  # Lower threshold for more sensitive detection
-    layout_nms_threshold: float = Field(default=0.2)  # Lower NMS to preserve more individual elements
-    layout_merge_mode: str = Field(default="small")  # Use 'small' to minimize bbox merging
-    layout_unclip_ratio: float = Field(default=1.2)  # Smaller unclip to preserve element boundaries
-    text_det_thresh: float = Field(default=0.2)  # More sensitive text detection
-    text_det_box_thresh: float = Field(default=0.3)  # Lower box threshold for better detection
-    text_det_unclip_ratio: float = Field(default=1.2)  # Smaller unclip for tighter text boxes
+
+    # Layout Detection Parameters (Stage 3)
+    # NOTE: Testing showed that PaddleX defaults work better for table detection.
+    # Previously we used aggressive low thresholds (0.2) which caused table detection failures.
+    # Now using None to let PaddleX use its optimized defaults.
+    layout_detection_threshold: Optional[float] = Field(default=None)  # None = use PaddleX default
+    layout_nms_threshold: Optional[float] = Field(default=None)  # None = use PaddleX default
+    layout_merge_mode: Optional[str] = Field(default=None)  # None = use PaddleX default
+    layout_unclip_ratio: Optional[float] = Field(default=None)  # None = use PaddleX default
+
+    # Text Detection Parameters
+    text_det_thresh: Optional[float] = Field(default=None)  # None = use PaddleX default
+    text_det_box_thresh: Optional[float] = Field(default=None)  # None = use PaddleX default
+    text_det_unclip_ratio: Optional[float] = Field(default=None)  # None = use PaddleX default

    # Layout Detection Model Configuration (Stage 3)
    # Available models:
@@ -136,6 +144,23 @@ class Settings(BaseSettings):
        description="Table structure model for borderless tables. SLANeXt_wireless recommended."
    )

+    # Table Classification Model - determines if table is wired or wireless
+    table_classification_model_name: Optional[str] = Field(
+        default="PP-LCNet_x1_0_table_cls",
+        description="Model to classify table type (wired vs wireless). Enables automatic model selection."
+    )
+
+    # Table Cell Detection Models - detect individual cells within tables
+    # These are crucial for accurate cell boundary detection in complex tables
+    wired_table_cells_detection_model_name: Optional[str] = Field(
+        default="RT-DETR-L_wired_table_cell_det",
+        description="Cell detection model for bordered tables. RT-DETR-L provides best accuracy."
+    )
+    wireless_table_cells_detection_model_name: Optional[str] = Field(
+        default="RT-DETR-L_wireless_table_cell_det",
+        description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
+    )
+
    # Formula Recognition Model Configuration (Stage 4)
    # Available models:
    # - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
@@ -146,6 +171,37 @@ class Settings(BaseSettings):
        description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
    )

+    # Chart Recognition Model Configuration
+    chart_recognition_model_name: Optional[str] = Field(
+        default="PP-Chart2Table",
+        description="Chart to table recognition model."
+    )
+
+    # Text Detection and Recognition Model Configuration
+    # PP-OCRv5_server provides best accuracy for document OCR
+    text_detection_model_name: Optional[str] = Field(
+        default="PP-OCRv5_server_det",
+        description="Text detection model. PP-OCRv5_server_det recommended for documents."
+    )
+    text_recognition_model_name: Optional[str] = Field(
+        default="PP-OCRv5_server_rec",
+        description="Text recognition model. PP-OCRv5_server_rec recommended for documents."
+    )
+
+    # Document Preprocessing Model Configuration (Stage 1)
+    doc_orientation_classify_model_name: Optional[str] = Field(
+        default="PP-LCNet_x1_0_doc_ori",
+        description="Document orientation classification model for auto-rotation."
+    )
+    doc_unwarping_model_name: Optional[str] = Field(
+        default="UVDoc",
+        description="Document unwarping model for correcting perspective distortion."
+    )
+    textline_orientation_model_name: Optional[str] = Field(
+        default="PP-LCNet_x1_0_textline_ori",
+        description="Textline orientation model for detecting text direction."
+    )
+
    # ===== Layout Preprocessing Configuration =====
    # Image preprocessing to enhance layout detection for documents with faint lines/borders
    # Preprocessing only affects layout detection input; original image is preserved for extraction
@@ -179,6 +235,31 @@ class Settings(BaseSettings):
        description="Contrast below this triggers binarization in auto mode"
    )

+    # Layout image scaling for better table detection
+    # Automatic bidirectional scaling for layout detection
+    # PDF conversion now uses 150 DPI (~1240x1754 for A4), which falls within optimal range
+    # Scaling acts as a safety net for:
+    # - Very large images (>2000px): Downscale to target
+    # - Very small images (<1200px): Upscale to target
+    # - 150 DPI A4 (1240x1754): No scaling needed (already optimal)
+    layout_image_scaling_enabled: bool = Field(
+        default=True,
+        description="Enable automatic bidirectional scaling for layout detection. "
+                    "Images outside optimal range are scaled to target dimension."
+    )
+    layout_image_scaling_max_dimension: int = Field(
+        default=2000,
+        description="Max dimension (pixels) before downscaling. Images larger than this will be scaled down."
+    )
+    layout_image_scaling_min_dimension: int = Field(
+        default=1200,
+        description="Min dimension (pixels) before upscaling. Images smaller than this will be scaled up."
+    )
+    layout_image_scaling_target_dimension: int = Field(
+        default=1600,
+        description="Target dimension (pixels) for scaling. Optimal size for PP-Structure layout detection."
+    )
+
    # ===== Gap Filling Configuration =====
    # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
    gap_filling_enabled: bool = Field(default=True)  # Enable gap filling for OCR track