feat: enhance layout preprocessing and add unify-image-scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Improve PP-Structure enhanced processing

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 09:23:19 +08:00
parent 86bbea6fbf
commit dda9621e17
17 changed files with 826 additions and 104 deletions

View File

@@ -90,19 +90,27 @@ class Settings(BaseSettings):
enable_formula_recognition: bool = Field(default=True) # Math formula recognition
enable_table_recognition: bool = Field(default=True) # Table structure recognition
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
enable_region_detection: bool = Field(default=True) # Region detection for better table structure
enable_text_recognition: bool = Field(default=True) # General text recognition
# PP-StructureV3 Preprocessing (Stage 1)
use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation
use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos
use_textline_orientation: bool = Field(default=True) # Detect textline orientation
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
layout_unclip_ratio: float = Field(default=1.2) # Smaller unclip to preserve element boundaries
text_det_thresh: float = Field(default=0.2) # More sensitive text detection
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
# Layout Detection Parameters (Stage 3)
# NOTE: Testing showed that PaddleX defaults work better for table detection.
# Previously we used aggressively low thresholds (0.2), which caused table detection failures.
# Now using None to let PaddleX use its optimized defaults.
layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_merge_mode: Optional[str] = Field(default=None) # None = use PaddleX default
layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Text Detection Parameters
text_det_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
text_det_box_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
text_det_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Layout Detection Model Configuration (Stage 3)
# Available models:
@@ -136,6 +144,23 @@ class Settings(BaseSettings):
description="Table structure model for borderless tables. SLANeXt_wireless recommended."
)
# Table Classification Model - determines if table is wired or wireless
table_classification_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_table_cls",
description="Model to classify table type (wired vs wireless). Enables automatic model selection."
)
# Table Cell Detection Models - detect individual cells within tables
# These are crucial for accurate cell boundary detection in complex tables
wired_table_cells_detection_model_name: Optional[str] = Field(
default="RT-DETR-L_wired_table_cell_det",
description="Cell detection model for bordered tables. RT-DETR-L provides best accuracy."
)
wireless_table_cells_detection_model_name: Optional[str] = Field(
default="RT-DETR-L_wireless_table_cell_det",
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
)
# Formula Recognition Model Configuration (Stage 4)
# Available models:
# - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
@@ -146,6 +171,37 @@ class Settings(BaseSettings):
description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
)
# Chart Recognition Model Configuration
chart_recognition_model_name: Optional[str] = Field(
default="PP-Chart2Table",
description="Chart to table recognition model."
)
# Text Detection and Recognition Model Configuration
# PP-OCRv5_server provides best accuracy for document OCR
text_detection_model_name: Optional[str] = Field(
default="PP-OCRv5_server_det",
description="Text detection model. PP-OCRv5_server_det recommended for documents."
)
text_recognition_model_name: Optional[str] = Field(
default="PP-OCRv5_server_rec",
description="Text recognition model. PP-OCRv5_server_rec recommended for documents."
)
# Document Preprocessing Model Configuration (Stage 1)
doc_orientation_classify_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_doc_ori",
description="Document orientation classification model for auto-rotation."
)
doc_unwarping_model_name: Optional[str] = Field(
default="UVDoc",
description="Document unwarping model for correcting perspective distortion."
)
textline_orientation_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_textline_ori",
description="Textline orientation model for detecting text direction."
)
# ===== Layout Preprocessing Configuration =====
# Image preprocessing to enhance layout detection for documents with faint lines/borders
# Preprocessing only affects layout detection input; original image is preserved for extraction
@@ -179,6 +235,31 @@ class Settings(BaseSettings):
description="Contrast below this triggers binarization in auto mode"
)
# Layout image scaling: automatic bidirectional scaling of the layout detection
# input for better table detection.
# PDF conversion now uses 150 DPI (~1240x1754 for A4), which falls within the optimal range (1200-2000 px).
# Scaling acts as a safety net for:
# - Very large images (>2000px): Downscale to target
# - Very small images (<1200px): Upscale to target
# - 150 DPI A4 (1240x1754): No scaling needed (already optimal)
layout_image_scaling_enabled: bool = Field(
default=True,
description="Enable automatic bidirectional scaling for layout detection. "
"Images outside optimal range are scaled to target dimension."
)
layout_image_scaling_max_dimension: int = Field(
default=2000,
description="Max dimension (pixels) before downscaling. Images larger than this will be scaled down."
)
layout_image_scaling_min_dimension: int = Field(
default=1200,
description="Min dimension (pixels) before upscaling. Images smaller than this will be scaled up."
)
layout_image_scaling_target_dimension: int = Field(
default=1600,
description="Target dimension (pixels) for scaling. Optimal size for PP-Structure layout detection."
)
# ===== Gap Filling Configuration =====
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track