chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -97,9 +97,115 @@ class Settings(BaseSettings):
# Region detection, used to obtain better table structure.
enable_region_detection: bool = Field(default=True)
# General text recognition.
enable_text_recognition: bool = Field(default=True)
# Table parsing mode: controls how aggressively tables are parsed.
# This is the KEY setting for preventing "cell explosion" on datasheet-type
# documents. Documented values:
#   "full"                - full table recognition with cell segmentation
#                           (aggressive, may over-detect)
#   "conservative"        - conservative models, wireless tables disabled,
#                           higher layout threshold
#   "classification_only" - only classify table regions, no cell segmentation
#                           (recommended for datasheets)
#   "disabled"            - table recognition completely off
#                           (safest for text-heavy documents)
# NOTE(review): the annotation is a plain `str`, so this declaration by itself
# does not reject values outside the list above - confirm where it is validated.
table_parsing_mode: str = Field(
    default="conservative",
    description="Table parsing mode: 'full', 'conservative', 'classification_only', 'disabled'",
)
# Layout threshold used for table detection: a higher value is stricter and
# gives fewer false positives.
# WARNING: despite the "table_" prefix this affects ALL layout detection, not
# only tables. Use with caution.
# The default of None keeps the PaddleX default; only set a value if you
# understand the impact.
table_layout_threshold: Optional[float] = Field(
    default=None,
    description="Layout threshold for ALL element detection. Higher values = fewer elements detected.",
)
# Cell validation: filters over-detected table cells.
# DISABLED by default - this is a patch behavior; the focus is on getting the
# PP-Structure output right first.
cell_validation_enabled: bool = Field(
    default=False,
    description="Enable cell validation to filter over-detected tables",
)
# Upper bound on cell density.
cell_validation_max_density: float = Field(
    default=3.0,
    description="Max cells per 10,000px². Tables exceeding this are reclassified as TEXT.",
)
# Lower bound on the average cell area.
cell_validation_min_cell_area: float = Field(
    default=3000.0,
    description="Min average cell area in px². Tables below this are reclassified as TEXT.",
)
# Lower bound on the average cell height.
cell_validation_min_cell_height: float = Field(
    default=10.0,
    description="Min average cell height in px. Tables below this are reclassified as TEXT.",
)
# Table content rebuilder: rebuilds table HTML from the raw OCR output.
# DISABLED by default - this is a patch behavior; the focus is on getting the
# PP-Structure output right first.
table_content_rebuilder_enabled: bool = Field(
    default=False,
    description="Enable table content rebuilder to fix PP-Structure table HTML",
)
# Table quality check: picks the rendering strategy from cell_boxes overlap.
# With the check enabled, a table whose cell_boxes overlap is marked as 'bad'
# quality and is rendered in border-only mode rather than with full cell_boxes
# rendering. With the check disabled, cell_boxes rendering is always used,
# regardless of quality.
table_quality_check_enabled: bool = Field(
    default=False,
    description="Enable cell_boxes quality check. When disabled, all tables use cell_boxes rendering.",
)
# Table rendering, cell_boxes-first approach.
# When enabled, the cell_boxes coordinates are the primary source for table
# structure, instead of the HTML colspan/rowspan values, which often cause grid
# mismatch issues.
# DISABLED by default: the algorithm needs improvement - clustering produces
# incorrect grid dimensions.
table_rendering_prefer_cellboxes: bool = Field(
    default=False,
    description="Use cell_boxes coordinates as primary table structure source for PDF rendering",
)
# Clustering threshold along Y, used when rows are inferred from cell_boxes.
table_cellboxes_row_threshold: float = Field(
    default=15.0,
    description="Y-coordinate threshold for row clustering when inferring grid from cell_boxes",
)
# Clustering threshold along X, used when columns are inferred from cell_boxes.
table_cellboxes_col_threshold: float = Field(
    default=15.0,
    description="X-coordinate threshold for column clustering when inferring grid from cell_boxes",
)
# Table column alignment correction (header-anchor algorithm).
# Uses the X-coordinates of the header row as the reference for correcting
# PP-Structure's column assignment errors.
table_column_correction_enabled: bool = Field(
    default=True,
    description="Enable header-anchor column correction for table cells",
)
# Overlap ratio that has to be reached before a correction is triggered.
table_column_correction_threshold: float = Field(
    default=0.5,
    description="Minimum X-overlap ratio (0-1) to trigger column correction",
)
# Vertical text fragment merging.
# Finds narrow vertical text blocks that OCR split apart and merges them.
vertical_fragment_merge_enabled: bool = Field(
    default=True,
    description="Enable vertical text fragment merging for Chinese vertical text",
)
# Width/height ratio limit for treating a block as vertical text.
vertical_fragment_aspect_ratio: float = Field(
    default=0.3,
    description="Max width/height ratio to consider as vertical text (lower = narrower)",
)
# Simple text positioning mode (OCR track).
# When enabled, the complex table structure reconstruction is bypassed and the
# raw OCR text is rendered directly at its detected positions, with rotation
# correction. This is more reliable for documents where PP-Structure fails to
# parse tables correctly.
simple_text_positioning_enabled: bool = Field(
    default=True,
    description="Use simple text positioning instead of complex table reconstruction for OCR track",
)
# Debug logging switch for the mode above.
simple_text_positioning_debug: bool = Field(
    default=False,
    description="Enable debug logging for simple text positioning",
)
# PP-StructureV3 Preprocessing (Stage 1)
# Auto-detect and correct document rotation.
use_doc_orientation_classify: bool = Field(default=True)
# Correction of document warping from photos.
# Disabled by default: it can cause document distortion/skewing.
use_doc_unwarping: bool = Field(default=False)
# Detect textline orientation.
use_textline_orientation: bool = Field(default=True)
# Layout Detection Parameters (Stage 3)
@@ -277,11 +383,43 @@ class Settings(BaseSettings):
# ===== Gap Filling Configuration =====
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete.
# Uses IoA (Intersection over Area) instead of IoU for better coverage detection.

# Gap filling for the OCR track; off by default.
gap_filling_enabled: bool = Field(default=False)
# Activate when coverage < 70%.
gap_filling_coverage_threshold: float = Field(default=0.7)
# Min confidence for raw OCR regions.
gap_filling_confidence_threshold: float = Field(default=0.3)

# IoA (Intersection over Area) thresholds - a different threshold per element type.
# IoA = intersection_area / ocr_box_area
# (measures how much of the OCR box lies inside the layout region).
gap_filling_ioa_threshold_text: float = Field(
    default=0.6,
    description="IoA threshold for TEXT/TITLE elements. Tolerates boundary errors.",
)
gap_filling_ioa_threshold_table: float = Field(
    default=0.1,
    description="IoA threshold for TABLE elements. Strict to prevent duplicate table content.",
)
gap_filling_ioa_threshold_figure: float = Field(
    default=0.8,
    description="IoA threshold for FIGURE/IMAGE elements. Preserves text inside figures.",
)
gap_filling_dedup_ioa_threshold: float = Field(
    default=0.5,
    description="IoA threshold for deduplication against existing TEXT elements.",
)
gap_filling_shrink_pixels: int = Field(
    default=1,
    description="Shrink OCR bbox inward by this many pixels to reduce edge duplicates.",
)

# Use PP-StructureV3's internal OCR (overall_ocr_res) instead of separate Raw OCR.
gap_filling_use_overall_ocr: bool = Field(
    default=True,
    description="Use PP-StructureV3's internal OCR results instead of separate inference.",
)

# Legacy IoU thresholds (deprecated, kept for backward compatibility).
# Each setting is declared exactly once in this section; a repeated declaration
# would silently override the earlier one.
gap_filling_iou_threshold: float = Field(default=0.15)  # Deprecated: use IoA thresholds
gap_filling_dedup_iou_threshold: float = Field(default=0.5)  # Deprecated: use gap_filling_dedup_ioa_threshold
# ===== Debug Configuration =====
# Enable debug outputs for PP-StructureV3 analysis