feat: enhance layout preprocessing and unify image scaling proposal
Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -90,19 +90,27 @@ class Settings(BaseSettings):
|
||||
enable_formula_recognition: bool = Field(default=True) # Math formula recognition
|
||||
enable_table_recognition: bool = Field(default=True) # Table structure recognition
|
||||
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
|
||||
enable_region_detection: bool = Field(default=True) # Region detection for better table structure
|
||||
enable_text_recognition: bool = Field(default=True) # General text recognition
|
||||
|
||||
# PP-StructureV3 Preprocessing (Stage 1)
|
||||
use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation
|
||||
use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos
|
||||
use_textline_orientation: bool = Field(default=True) # Detect textline orientation
|
||||
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
|
||||
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
|
||||
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
|
||||
layout_unclip_ratio: float = Field(default=1.2) # Smaller unclip to preserve element boundaries
|
||||
text_det_thresh: float = Field(default=0.2) # More sensitive text detection
|
||||
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
|
||||
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
|
||||
|
||||
# Layout Detection Parameters (Stage 3)
|
||||
# NOTE: Testing showed that PaddleX defaults work better for table detection.
|
||||
# Previously we used aggressively low thresholds (0.2), which caused table detection failures.
|
||||
# Now using None to let PaddleX use its optimized defaults.
|
||||
layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
layout_merge_mode: Optional[str] = Field(default=None) # None = use PaddleX default
|
||||
layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
|
||||
# Text Detection Parameters
|
||||
text_det_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
text_det_box_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
text_det_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
|
||||
|
||||
# Layout Detection Model Configuration (Stage 3)
|
||||
# Available models:
|
||||
@@ -136,6 +144,23 @@ class Settings(BaseSettings):
|
||||
description="Table structure model for borderless tables. SLANeXt_wireless recommended."
|
||||
)
|
||||
|
||||
# Table Classification Model - determines if table is wired or wireless
|
||||
table_classification_model_name: Optional[str] = Field(
|
||||
default="PP-LCNet_x1_0_table_cls",
|
||||
description="Model to classify table type (wired vs wireless). Enables automatic model selection."
|
||||
)
|
||||
|
||||
# Table Cell Detection Models - detect individual cells within tables
|
||||
# These are crucial for accurate cell boundary detection in complex tables
|
||||
wired_table_cells_detection_model_name: Optional[str] = Field(
|
||||
default="RT-DETR-L_wired_table_cell_det",
|
||||
description="Cell detection model for bordered tables. RT-DETR-L provides best accuracy."
|
||||
)
|
||||
wireless_table_cells_detection_model_name: Optional[str] = Field(
|
||||
default="RT-DETR-L_wireless_table_cell_det",
|
||||
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
|
||||
)
|
||||
|
||||
# Formula Recognition Model Configuration (Stage 4)
|
||||
# Available models:
|
||||
# - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
|
||||
@@ -146,6 +171,37 @@ class Settings(BaseSettings):
|
||||
description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
|
||||
)
|
||||
|
||||
# Chart Recognition Model Configuration
|
||||
chart_recognition_model_name: Optional[str] = Field(
|
||||
default="PP-Chart2Table",
|
||||
description="Chart to table recognition model."
|
||||
)
|
||||
|
||||
# Text Detection and Recognition Model Configuration
|
||||
# PP-OCRv5_server provides best accuracy for document OCR
|
||||
text_detection_model_name: Optional[str] = Field(
|
||||
default="PP-OCRv5_server_det",
|
||||
description="Text detection model. PP-OCRv5_server_det recommended for documents."
|
||||
)
|
||||
text_recognition_model_name: Optional[str] = Field(
|
||||
default="PP-OCRv5_server_rec",
|
||||
description="Text recognition model. PP-OCRv5_server_rec recommended for documents."
|
||||
)
|
||||
|
||||
# Document Preprocessing Model Configuration (Stage 1)
|
||||
doc_orientation_classify_model_name: Optional[str] = Field(
|
||||
default="PP-LCNet_x1_0_doc_ori",
|
||||
description="Document orientation classification model for auto-rotation."
|
||||
)
|
||||
doc_unwarping_model_name: Optional[str] = Field(
|
||||
default="UVDoc",
|
||||
description="Document unwarping model for correcting perspective distortion."
|
||||
)
|
||||
textline_orientation_model_name: Optional[str] = Field(
|
||||
default="PP-LCNet_x1_0_textline_ori",
|
||||
description="Textline orientation model for detecting text direction."
|
||||
)
|
||||
|
||||
# ===== Layout Preprocessing Configuration =====
|
||||
# Image preprocessing to enhance layout detection for documents with faint lines/borders
|
||||
# Preprocessing only affects layout detection input; original image is preserved for extraction
|
||||
@@ -179,6 +235,31 @@ class Settings(BaseSettings):
|
||||
description="Contrast below this triggers binarization in auto mode"
|
||||
)
|
||||
|
||||
# Layout image scaling for better table detection
|
||||
# Automatic bidirectional scaling for layout detection
|
||||
# PDF conversion now uses 150 DPI (~1240x1754 for A4), which falls within the optimal range
|
||||
# Scaling acts as a safety net for:
|
||||
# - Very large images (>2000px): Downscale to target
|
||||
# - Very small images (<1200px): Upscale to target
|
||||
# - 150 DPI A4 (1240x1754): No scaling needed (already optimal)
|
||||
layout_image_scaling_enabled: bool = Field(
|
||||
default=True,
|
||||
description="Enable automatic bidirectional scaling for layout detection. "
|
||||
"Images outside optimal range are scaled to target dimension."
|
||||
)
|
||||
layout_image_scaling_max_dimension: int = Field(
|
||||
default=2000,
|
||||
description="Max dimension (pixels) before downscaling. Images larger than this will be scaled down."
|
||||
)
|
||||
layout_image_scaling_min_dimension: int = Field(
|
||||
default=1200,
|
||||
description="Min dimension (pixels) before upscaling. Images smaller than this will be scaled up."
|
||||
)
|
||||
layout_image_scaling_target_dimension: int = Field(
|
||||
default=1600,
|
||||
description="Target dimension (pixels) for scaling. Optimal size for PP-Structure layout detection."
|
||||
)
|
||||
|
||||
# ===== Gap Filling Configuration =====
|
||||
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
|
||||
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track
|
||||
|
||||
Reference in New Issue
Block a user