chore: backup before code cleanup

Backup commit before executing the remove-unused-code proposal. This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -97,9 +97,115 @@ class Settings(BaseSettings):
    enable_region_detection: bool = Field(default=True)  # Region detection for better table structure
    enable_text_recognition: bool = Field(default=True)  # General text recognition

+    # Table Parsing Mode - Controls how aggressively tables are parsed
+    # This is the KEY setting to prevent "cell explosion" on datasheet-type documents
+    # Options:
+    # - "full": Full table recognition with cell segmentation (aggressive, may over-detect)
+    # - "conservative": Conservative models + disable wireless (borderless) table recognition + higher layout threshold
+    # - "classification_only": Only classify table regions, no cell segmentation (recommended for datasheets)
+    # - "disabled": Completely disable table recognition (safest for text-heavy documents)
+    table_parsing_mode: str = Field(
+        default="conservative",
+        description="Table parsing mode: 'full', 'conservative', 'classification_only', 'disabled'"
+    )
+    # Layout threshold for table detection (higher = stricter, less false positives)
+    # WARNING: This affects ALL layout detection, not just tables. Use with caution.
+    # Default None uses PaddleX default. Only set this if you understand the impact.
+    table_layout_threshold: Optional[float] = Field(
+        default=None,
+        description="Layout threshold for ALL element detection. Higher values = fewer elements detected."
+    )
+
+    # Cell Validation (filter over-detected table cells)
+    # DISABLED: This is a stopgap workaround - focus on getting PP-Structure output right first
+    cell_validation_enabled: bool = Field(
+        default=False,
+        description="Enable cell validation to filter over-detected tables"
+    )
+    cell_validation_max_density: float = Field(
+        default=3.0,
+        description="Max cells per 10,000px². Tables exceeding this are reclassified as TEXT."
+    )
+    cell_validation_min_cell_area: float = Field(
+        default=3000.0,
+        description="Min average cell area in px². Tables below this are reclassified as TEXT."
+    )
+    cell_validation_min_cell_height: float = Field(
+        default=10.0,
+        description="Min average cell height in px. Tables below this are reclassified as TEXT."
+    )
+
+    # Table Content Rebuilder (rebuild table HTML from raw OCR)
+    # DISABLED: This is a stopgap workaround - focus on getting PP-Structure output right first
+    table_content_rebuilder_enabled: bool = Field(
+        default=False,
+        description="Enable table content rebuilder to fix PP-Structure table HTML"
+    )
+
+    # Table Quality Check (determines rendering strategy based on cell_boxes overlap)
+    # When enabled, tables with overlapping cell_boxes are marked as 'bad' quality
+    # and rendered with border-only mode instead of full cell_boxes rendering.
+    # Disable this to always use cell_boxes rendering regardless of quality.
+    table_quality_check_enabled: bool = Field(
+        default=False,
+        description="Enable cell_boxes quality check. When disabled, all tables use cell_boxes rendering."
+    )
+
+    # Table Rendering: cell_boxes-first approach
+    # When enabled, uses cell_boxes coordinates as the primary source for table structure
+    # instead of relying on HTML colspan/rowspan, which often causes grid mismatch issues
+    # DISABLED: Algorithm needs improvement - clustering produces incorrect grid dimensions
+    table_rendering_prefer_cellboxes: bool = Field(
+        default=False,
+        description="Use cell_boxes coordinates as primary table structure source for PDF rendering"
+    )
+    table_cellboxes_row_threshold: float = Field(
+        default=15.0,
+        description="Y-coordinate threshold for row clustering when inferring grid from cell_boxes"
+    )
+    table_cellboxes_col_threshold: float = Field(
+        default=15.0,
+        description="X-coordinate threshold for column clustering when inferring grid from cell_boxes"
+    )
+
+    # Table Column Alignment Correction (Header-Anchor Algorithm)
+    # Corrects PP-Structure's column assignment errors using header row X-coordinates as reference
+    table_column_correction_enabled: bool = Field(
+        default=True,
+        description="Enable header-anchor column correction for table cells"
+    )
+    table_column_correction_threshold: float = Field(
+        default=0.5,
+        description="Minimum X-overlap ratio (0-1) to trigger column correction"
+    )
+
+    # Vertical Text Fragment Merging
+    # Detects and merges narrow vertical text blocks that were split by OCR
+    vertical_fragment_merge_enabled: bool = Field(
+        default=True,
+        description="Enable vertical text fragment merging for Chinese vertical text"
+    )
+    vertical_fragment_aspect_ratio: float = Field(
+        default=0.3,
+        description="Max width/height ratio to consider as vertical text (lower = narrower)"
+    )
+
+    # Simple Text Positioning Mode (OCR Track)
+    # When enabled, bypasses complex table structure reconstruction and renders
+    # raw OCR text directly at detected positions with rotation correction.
+    # This is more reliable for documents where PP-Structure fails to parse tables correctly.
+    simple_text_positioning_enabled: bool = Field(
+        default=True,
+        description="Use simple text positioning instead of complex table reconstruction for OCR track"
+    )
+    simple_text_positioning_debug: bool = Field(
+        default=False,
+        description="Enable debug logging for simple text positioning"
+    )
+
    # PP-StructureV3 Preprocessing (Stage 1)
    use_doc_orientation_classify: bool = Field(default=True)  # Auto-detect and correct document rotation
-    use_doc_unwarping: bool = Field(default=True)  # Correct document warping from photos
+    use_doc_unwarping: bool = Field(default=False)  # Disabled: can cause document distortion/skewing
    use_textline_orientation: bool = Field(default=True)  # Detect textline orientation

    # Layout Detection Parameters (Stage 3)
@@ -277,11 +383,43 @@ class Settings(BaseSettings):

    # ===== Gap Filling Configuration =====
    # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
-    gap_filling_enabled: bool = Field(default=True)  # Enable gap filling for OCR track
+    # Uses IoA (Intersection over Area) instead of IoU for better coverage detection
+    gap_filling_enabled: bool = Field(default=False)  # Enable gap filling for OCR track
    gap_filling_coverage_threshold: float = Field(default=0.7)  # Activate when coverage < 70%
-    gap_filling_iou_threshold: float = Field(default=0.15)  # IoU threshold for coverage detection
    gap_filling_confidence_threshold: float = Field(default=0.3)  # Min confidence for raw OCR regions
-    gap_filling_dedup_iou_threshold: float = Field(default=0.5)  # IoU threshold for deduplication
+
+    # IoA (Intersection over Area) thresholds - different thresholds per element type
+    # IoA = intersection_area / ocr_box_area (measures how much of OCR box is inside layout region)
+    gap_filling_ioa_threshold_text: float = Field(
+        default=0.6,
+        description="IoA threshold for TEXT/TITLE elements. Tolerates boundary errors."
+    )
+    gap_filling_ioa_threshold_table: float = Field(
+        default=0.1,
+        description="IoA threshold for TABLE elements. Strict to prevent duplicate table content."
+    )
+    gap_filling_ioa_threshold_figure: float = Field(
+        default=0.8,
+        description="IoA threshold for FIGURE/IMAGE elements. Preserves text inside figures."
+    )
+    gap_filling_dedup_ioa_threshold: float = Field(
+        default=0.5,
+        description="IoA threshold for deduplication against existing TEXT elements."
+    )
+    gap_filling_shrink_pixels: int = Field(
+        default=1,
+        description="Shrink OCR bbox inward by this many pixels to reduce edge duplicates."
+    )
+
+    # Use PP-StructureV3's internal OCR (overall_ocr_res) instead of separate Raw OCR
+    gap_filling_use_overall_ocr: bool = Field(
+        default=True,
+        description="Use PP-StructureV3's internal OCR results instead of separate inference."
+    )
+
+    # Legacy IoU thresholds (deprecated, kept for backward compatibility)
+    gap_filling_iou_threshold: float = Field(default=0.15)  # Deprecated: use IoA thresholds
+    gap_filling_dedup_iou_threshold: float = Field(default=0.5)  # Deprecated: use gap_filling_dedup_ioa_threshold

    # ===== Debug Configuration =====
    # Enable debug outputs for PP-StructureV3 analysis