feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions
--- a/backend/app/services/text_region_renderer.py
+++ b/backend/app/services/text_region_renderer.py
@@ -15,6 +15,8 @@ from typing import Dict, List, Optional, Set, Tuple
 from reportlab.pdfgen import canvas
 from reportlab.lib.colors import black

+from app.utils.bbox_utils import normalize_bbox
+
 logger = logging.getLogger(__name__)


@@ -162,6 +164,7 @@ class TextRegionRenderer:
    def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
        """
        Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).
+        Uses shared bbox utility.

        Args:
            bbox: List of 4 [x, y] coordinate pairs
@@ -169,12 +172,8 @@ class TextRegionRenderer:
        Returns:
            Tuple of (x0, y0, x1, y1) - min/max coordinates
        """
-        if len(bbox) < 4:
-            return (0.0, 0.0, 0.0, 0.0)
-
-        x_coords = [p[0] for p in bbox]
-        y_coords = [p[1] for p in bbox]
-        return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+        result = normalize_bbox(bbox)
+        return result if result else (0.0, 0.0, 0.0, 0.0)

    def get_bbox_left_baseline(
        self,
@@ -646,19 +645,26 @@ def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[D
    from pathlib import Path
    import json

-    # Construct filename pattern
-    filename = f"{task_id}_edit_page_{page_num}_raw_ocr_regions.json"
-    file_path = Path(result_dir) / filename
+    result_path = Path(result_dir)

-    if not file_path.exists():
-        logger.warning(f"Raw OCR regions file not found: {file_path}")
-        return []
+    # Use glob pattern to find raw OCR regions file
+    # Filename format: {task_id}_{original_filename}_page_{page_num}_raw_ocr_regions.json
+    # The original_filename varies based on uploaded file (e.g., scan, document, etc.)
+    glob_pattern = f"{task_id}_*_page_{page_num}_raw_ocr_regions.json"
+    matching_files = list(result_path.glob(glob_pattern))

-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            regions = json.load(f)
-            logger.info(f"Loaded {len(regions)} raw OCR regions from {filename}")
-            return regions
-    except Exception as e:
-        logger.error(f"Failed to load raw OCR regions: {e}")
-        return []
+    if matching_files:
+        # Use the first matching file (there should only be one per page)
+        file_path = matching_files[0]
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                regions = json.load(f)
+                logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path.name}")
+                return regions
+        except Exception as e:
+            logger.error(f"Failed to load raw OCR regions from {file_path}: {e}")
+            return []
+
+    logger.warning(f"Raw OCR regions file not found for task {task_id} page {page_num}. "
+                   f"Glob pattern: {glob_pattern}")
+    return []