feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions

View File

@@ -90,11 +90,16 @@ class Settings(BaseSettings):
max_concurrent_pages: int = Field(default=2) # Process 2 pages concurrently
# PP-StructureV3 optimization
# Strategy: Use raw OCR positioning (simple-text-positioning) instead of table structure reconstruction
# - Layout Detection: ON (detect regions)
# - General OCR: ON (text recognition)
# - Table Recognition: OFF (no cell/structure parsing - use raw OCR bbox instead)
# - Seal/Formula/Chart: ON (specialized recognition)
enable_chart_recognition: bool = Field(default=True) # Chart/diagram recognition
enable_formula_recognition: bool = Field(default=True) # Math formula recognition
enable_table_recognition: bool = Field(default=True) # Table structure recognition
enable_table_recognition: bool = Field(default=False) # Table structure recognition - DISABLED (use raw OCR)
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
enable_region_detection: bool = Field(default=True) # Region detection for better table structure
enable_region_detection: bool = Field(default=True) # Region detection for layout
enable_text_recognition: bool = Field(default=True) # General text recognition
# Table Parsing Mode - Controls how aggressively tables are parsed
@@ -116,57 +121,6 @@ class Settings(BaseSettings):
description="Layout threshold for ALL element detection. Higher values = fewer elements detected."
)
# Cell Validation (filter over-detected table cells)
# DISABLED: This is a patch behavior - focus on getting PP-Structure output right first
cell_validation_enabled: bool = Field(
default=False,
description="Enable cell validation to filter over-detected tables"
)
cell_validation_max_density: float = Field(
default=3.0,
description="Max cells per 10,000px². Tables exceeding this are reclassified as TEXT."
)
cell_validation_min_cell_area: float = Field(
default=3000.0,
description="Min average cell area in px². Tables below this are reclassified as TEXT."
)
cell_validation_min_cell_height: float = Field(
default=10.0,
description="Min average cell height in px. Tables below this are reclassified as TEXT."
)
# Table Content Rebuilder (rebuild table HTML from raw OCR)
# DISABLED: This is a patch behavior - focus on getting PP-Structure output right first
table_content_rebuilder_enabled: bool = Field(
default=False,
description="Enable table content rebuilder to fix PP-Structure table HTML"
)
# Table Quality Check (determines rendering strategy based on cell_boxes overlap)
# When enabled, tables with overlapping cell_boxes are marked as 'bad' quality
# and rendered with border-only mode instead of full cell_boxes rendering.
# Disable this to always use cell_boxes rendering regardless of quality.
table_quality_check_enabled: bool = Field(
default=False,
description="Enable cell_boxes quality check. When disabled, all tables use cell_boxes rendering."
)
# Table Rendering: cell_boxes-first approach
# When enabled, uses cell_boxes coordinates as the primary source for table structure
# instead of relying on HTML colspan/rowspan, which often causes grid mismatch issues
# DISABLED: Algorithm needs improvement - clustering produces incorrect grid dimensions
table_rendering_prefer_cellboxes: bool = Field(
default=False,
description="Use cell_boxes coordinates as primary table structure source for PDF rendering"
)
table_cellboxes_row_threshold: float = Field(
default=15.0,
description="Y-coordinate threshold for row clustering when inferring grid from cell_boxes"
)
table_cellboxes_col_threshold: float = Field(
default=15.0,
description="X-coordinate threshold for column clustering when inferring grid from cell_boxes"
)
# Table Column Alignment Correction (Header-Anchor Algorithm)
# Corrects PP-Structure's column assignment errors using header row X-coordinates as reference
@@ -204,7 +158,10 @@ class Settings(BaseSettings):
)
# PP-StructureV3 Preprocessing (Stage 1)
use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation
# NOTE: use_doc_orientation_classify is ENABLED - it detects and corrects document orientation
# for scanned PDFs whose content orientation differs from the PDF page metadata.
# When a 90°/270° rotation is detected, page dimensions (width <-> height) are swapped accordingly.
use_doc_orientation_classify: bool = Field(default=True) # Enabled: auto-detect and correct page orientation
use_doc_unwarping: bool = Field(default=False) # Disabled: can cause document distortion/skewing
use_textline_orientation: bool = Field(default=True) # Detect textline orientation
@@ -417,10 +374,6 @@ class Settings(BaseSettings):
description="Use PP-StructureV3's internal OCR results instead of separate inference."
)
# Legacy IoU threshold (deprecated, kept for backward compatibility)
gap_filling_iou_threshold: float = Field(default=0.15) # Deprecated: use IoA thresholds
gap_filling_dedup_iou_threshold: float = Field(default=0.5) # Deprecated: use gap_filling_dedup_ioa_threshold
# ===== Debug Configuration =====
# Enable debug outputs for PP-StructureV3 analysis
pp_structure_debug_enabled: bool = Field(default=True) # Save debug files for PP-StructureV3