feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions

View File

@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from datetime import datetime
import uuid
import gc # For garbage collection
import warnings # For suppressing PaddleX deprecation warnings
from paddleocr import PaddleOCR, PPStructureV3
from PIL import Image
@@ -34,7 +35,21 @@ from app.services.layout_preprocessing_service import (
get_layout_preprocessing_service,
LayoutPreprocessingService,
)
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig, TableDetectionConfig
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
from dataclasses import dataclass
@dataclass
class TableDetectionConfig:
    """Table detection switches used internally by the OCR service.

    This configuration used to live in ``app.schemas.task``. It was moved
    into the OCR service because the frontend no longer configures these
    options.
    """

    # Every switch defaults to enabled. The names suggest they toggle wired
    # (bordered) table handling, wireless (borderless) table handling and
    # region detection respectively -- TODO confirm against the consumers.
    enable_wired_table: bool = True
    enable_wireless_table: bool = True
    enable_region_detection: bool = True
# Import dual-track components
try:
@@ -798,7 +813,12 @@ class OCRService:
if textline_ori_model:
pp_kwargs['textline_orientation_model_name'] = textline_ori_model
self.structure_engine = PPStructureV3(**pp_kwargs)
# Suppress DeprecationWarning during PPStructureV3 initialization
# Workaround for PaddleX bug: it incorrectly treats Python's datetime.utcnow()
# deprecation warning as a model loading error in PP-Chart2Table
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
self.structure_engine = PPStructureV3(**pp_kwargs)
# Track model loading for cache management
self._model_last_used['structure'] = datetime.now()
@@ -881,7 +901,10 @@ class OCRService:
if settings.textline_orientation_model_name:
cpu_kwargs['textline_orientation_model_name'] = settings.textline_orientation_model_name
self.structure_engine = PPStructureV3(**cpu_kwargs)
# Suppress DeprecationWarning during PPStructureV3 initialization (CPU fallback)
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
self.structure_engine = PPStructureV3(**cpu_kwargs)
self._current_layout_model = layout_model # Track current model for recreation check
# Track table detection config for recreation check
if table_detection_config:
@@ -1429,6 +1452,22 @@ class OCRService:
raw_ocr_regions=text_regions # For table content rebuilding
)
# Get detected rotation from layout analysis (default: "0" = no rotation)
detected_rotation = "0"
if layout_data:
detected_rotation = layout_data.get('detected_rotation', '0')
# Adjust page dimensions based on detected rotation
# When rotation is 90° or 270°, the page orientation changes (portrait <-> landscape)
# PP-StructureV3 returns coordinates based on the rotated image, so we need to swap dimensions
if detected_rotation in ['90', '270']:
original_width, original_height = ocr_width, ocr_height
ocr_width, ocr_height = original_height, original_width
logger.info(
f"Page dimensions adjusted for {detected_rotation}° rotation: "
f"{original_width}x{original_height} -> {ocr_width}x{ocr_height}"
)
# Generate Markdown
markdown_content = self.generate_markdown(text_regions, layout_data)
@@ -1450,7 +1489,8 @@ class OCRService:
'ocr_dimensions': {
'width': ocr_width,
'height': ocr_height
}
},
'detected_rotation': detected_rotation # Document orientation: "0", "90", "180", "270"
}
# If layout data is enhanced, add enhanced results for converter
@@ -1705,7 +1745,8 @@ class OCRService:
'total_elements': result['total_elements'],
'reading_order': result['reading_order'],
'element_types': result.get('element_types', {}),
'enhanced': True
'enhanced': True,
'detected_rotation': result.get('detected_rotation', '0') # Document orientation
}
# Extract images metadata