feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions

View File

@@ -65,139 +65,6 @@ class PreprocessingContrastEnum(str, Enum):
DOCUMENT = "document"
class OCRPresetEnum(str, Enum):
    """OCR processing preset for different document types.

    Presets provide optimized PP-Structure configurations for common
    document types:

    - TEXT_HEAVY: Reports, articles, manuals (disable table recognition)
    - DATASHEET: Technical datasheets, TDS (conservative table parsing)
    - TABLE_HEAVY: Financial reports, spreadsheets (full table recognition)
    - FORM: Applications, surveys (conservative table parsing)
    - MIXED: General documents (classification only)
    - CUSTOM: User-defined settings (use ocr_config)

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    TEXT_HEAVY = "text_heavy"    # Reports, articles, manuals
    DATASHEET = "datasheet"      # Technical datasheets, TDS
    TABLE_HEAVY = "table_heavy"  # Financial reports, spreadsheets
    FORM = "form"                # Applications, surveys
    MIXED = "mixed"              # General documents
    CUSTOM = "custom"            # User-defined settings
class TableParsingModeEnum(str, Enum):
    """Table parsing mode controlling how aggressively tables are parsed.

    - FULL: Full table recognition with cell segmentation (aggressive)
    - CONSERVATIVE: Disable wireless tables to prevent cell explosion
    - CLASSIFICATION_ONLY: Only classify table regions, no cell segmentation
    - DISABLED: Completely disable table recognition

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    FULL = "full"
    CONSERVATIVE = "conservative"
    CLASSIFICATION_ONLY = "classification_only"
    DISABLED = "disabled"
class OCRConfig(BaseModel):
    """OCR processing configuration for PP-Structure.

    Allows fine-grained control over PP-Structure parameters.
    Use with ocr_preset=CUSTOM or to override specific preset values.

    Every field has a default, so ``OCRConfig()`` is a valid configuration.
    This class declares no cross-field validation: ``table_parsing_mode`` and
    the ``enable_*_table`` flags are stored independently of each other.
    """

    # --- Table Processing ---
    table_parsing_mode: TableParsingModeEnum = Field(
        default=TableParsingModeEnum.CONSERVATIVE,
        description="Table parsing mode: full, conservative, classification_only, disabled"
    )
    enable_wired_table: bool = Field(
        default=True,
        description="Enable wired (bordered) table detection"
    )
    enable_wireless_table: bool = Field(
        default=False,
        description="Enable wireless (borderless) table detection. Can cause cell explosion."
    )

    # --- Layout Detection ---
    # Both thresholds are optional and constrained to [0.0, 1.0] when given.
    layout_threshold: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Layout detection threshold. Higher = stricter. None uses default."
    )
    layout_nms_threshold: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Layout NMS threshold. None uses default."
    )

    # --- Preprocessing ---
    use_doc_orientation_classify: bool = Field(
        default=True,
        description="Auto-detect and correct document rotation"
    )
    use_doc_unwarping: bool = Field(
        default=False,
        description="Correct document warping. Can cause distortion."
    )
    use_textline_orientation: bool = Field(
        default=True,
        description="Detect textline orientation"
    )

    # --- Recognition Modules ---
    enable_chart_recognition: bool = Field(
        default=True,
        description="Enable chart/diagram recognition"
    )
    enable_formula_recognition: bool = Field(
        default=True,
        description="Enable math formula recognition"
    )
    enable_seal_recognition: bool = Field(
        default=False,
        description="Enable seal/stamp recognition"
    )
    enable_region_detection: bool = Field(
        default=True,
        description="Enable region detection for better structure"
    )
# Preset configurations mapping.
#
# Maps each non-custom OCRPresetEnum member to a ready-made OCRConfig. Fields
# not listed in an entry keep the defaults declared on OCRConfig.
# There is deliberately no entry for OCRPresetEnum.CUSTOM, so a plain
# ``OCR_PRESET_CONFIGS[preset]`` lookup raises KeyError for that member.
OCR_PRESET_CONFIGS = {
    # Tables, charts and formulas all switched off.
    OCRPresetEnum.TEXT_HEAVY: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.DISABLED,
        enable_wired_table=False,
        enable_wireless_table=False,
        enable_chart_recognition=False,
        enable_formula_recognition=False,
    ),
    # Wired tables only, conservative parsing.
    OCRPresetEnum.DATASHEET: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.CONSERVATIVE,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
    # Both wired and wireless tables, full parsing.
    OCRPresetEnum.TABLE_HEAVY: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.FULL,
        enable_wired_table=True,
        enable_wireless_table=True,
    ),
    # Same table settings as DATASHEET.
    OCRPresetEnum.FORM: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.CONSERVATIVE,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
    # Table regions are classified only; wireless detection stays off.
    OCRPresetEnum.MIXED: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.CLASSIFICATION_ONLY,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
    # CUSTOM uses user-provided config directly
}
class PreprocessingConfig(BaseModel):
"""Preprocessing configuration for layout detection enhancement.
@@ -235,31 +102,6 @@ class PreprocessingConfig(BaseModel):
)
class TableDetectionConfig(BaseModel):
    """Table detection configuration for PP-StructureV3.

    Controls which table detection modes to enable. PP-StructureV3 uses
    specialized models for different table types:

    - Wired (bordered): Tables with visible cell borders/grid lines
    - Wireless (borderless): Tables without visible borders, relying on alignment
    - Region detection: Detect table-like regions for better cell structure

    Multiple options can be enabled simultaneously for comprehensive detection.
    All three flags default to ``True``.
    """

    enable_wired_table: bool = Field(
        default=True,
        description="Enable wired (bordered) table detection. Best for tables with visible grid lines."
    )
    enable_wireless_table: bool = Field(
        default=True,
        description="Enable wireless (borderless) table detection. Best for tables without visible borders."
    )
    enable_region_detection: bool = Field(
        default=True,
        description="Enable region detection for better table structure inference."
    )
class ImageQualityMetrics(BaseModel):
"""Image quality metrics from auto-analysis."""
contrast: float = Field(..., description="Contrast level (std dev of grayscale)")
@@ -456,23 +298,6 @@ class ProcessingOptions(BaseModel):
description="Manual preprocessing config (only used when preprocessing_mode='manual')"
)
# Table detection configuration (OCR track only)
table_detection: Optional[TableDetectionConfig] = Field(
None,
description="Table detection config. If None, all table detection modes are enabled."
)
# OCR Processing Preset (OCR track only)
# Use presets for optimized configurations or CUSTOM with ocr_config for fine-tuning
ocr_preset: Optional[OCRPresetEnum] = Field(
default=OCRPresetEnum.DATASHEET,
description="OCR processing preset: text_heavy, datasheet, table_heavy, form, mixed, custom"
)
ocr_config: Optional[OCRConfig] = Field(
None,
description="Custom OCR config. Used when ocr_preset=custom or to override preset values."
)
class AnalyzeRequest(BaseModel):
"""Document analysis request"""