feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions

View File

@@ -15,6 +15,8 @@ from typing import Dict, List, Optional, Set, Tuple
from reportlab.pdfgen import canvas
from reportlab.lib.colors import black
from app.utils.bbox_utils import normalize_bbox
logger = logging.getLogger(__name__)
@@ -162,6 +164,7 @@ class TextRegionRenderer:
def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
"""
Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).
Uses shared bbox utility.
Args:
bbox: List of 4 [x, y] coordinate pairs
@@ -169,12 +172,8 @@ class TextRegionRenderer:
Returns:
Tuple of (x0, y0, x1, y1) - min/max coordinates
"""
if len(bbox) < 4:
return (0.0, 0.0, 0.0, 0.0)
x_coords = [p[0] for p in bbox]
y_coords = [p[1] for p in bbox]
return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
result = normalize_bbox(bbox)
return result if result else (0.0, 0.0, 0.0, 0.0)
def get_bbox_left_baseline(
self,
@@ -646,19 +645,26 @@ def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[D
from pathlib import Path
import json
# Construct filename pattern
filename = f"{task_id}_edit_page_{page_num}_raw_ocr_regions.json"
file_path = Path(result_dir) / filename
result_path = Path(result_dir)
if not file_path.exists():
logger.warning(f"Raw OCR regions file not found: {file_path}")
return []
# Use glob pattern to find raw OCR regions file
# Filename format: {task_id}_{original_filename}_page_{page_num}_raw_ocr_regions.json
# The original_filename part varies with the uploaded file (e.g., scan, document).
glob_pattern = f"{task_id}_*_page_{page_num}_raw_ocr_regions.json"
matching_files = list(result_path.glob(glob_pattern))
try:
with open(file_path, 'r', encoding='utf-8') as f:
regions = json.load(f)
logger.info(f"Loaded {len(regions)} raw OCR regions from {filename}")
return regions
except Exception as e:
logger.error(f"Failed to load raw OCR regions: {e}")
return []
if matching_files:
# Use the first matching file (there should only be one per page)
file_path = matching_files[0]
try:
with open(file_path, 'r', encoding='utf-8') as f:
regions = json.load(f)
logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path.name}")
return regions
except Exception as e:
logger.error(f"Failed to load raw OCR regions from {file_path}: {e}")
return []
logger.warning(f"Raw OCR regions file not found for task {task_id} page {page_num}. "
f"Glob pattern: {glob_pattern}")
return []