feat: enhance layout preprocessing and unify image scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 09:23:19 +08:00
parent 86bbea6fbf
commit dda9621e17
17 changed files with 826 additions and 104 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -7,10 +7,14 @@ This module provides enhanced PP-StructureV3 processing that extracts all

 import logging
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Any
+from typing import Dict, List, Optional, Tuple, Any, TYPE_CHECKING
 import json
 import gc

+# Import ScalingInfo for type checking (avoid circular imports at runtime)
+if TYPE_CHECKING:
+    from app.services.layout_preprocessing_service import ScalingInfo
+
 # Optional torch import for additional GPU memory management
 try:
    import torch
@@ -81,7 +85,8 @@ class PPStructureEnhanced:
        image_path: Path,
        output_dir: Optional[Path] = None,
        current_page: int = 0,
-        preprocessed_image: Optional[Image.Image] = None
+        preprocessed_image: Optional[Image.Image] = None,
+        scaling_info: Optional['ScalingInfo'] = None
    ) -> Dict[str, Any]:
        """
        Analyze document with full PP-StructureV3 capabilities.
@@ -93,10 +98,13 @@ class PPStructureEnhanced:
            preprocessed_image: Optional preprocessed PIL Image for layout detection.
                               If provided, this is used for PP-Structure prediction,
                               but original image_path is still used for cropping images.
+            scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
+                         for layout detection, all bbox coordinates will be scaled back
+                         to original image coordinates for proper cropping.

        Returns:
            Dictionary with complete structure information including:
-            - elements: List of all detected elements with types and bbox
+            - elements: List of all detected elements with types and bbox (in original coords)
            - reading_order: Reading order indices
            - images: Extracted images with metadata
            - tables: Extracted tables with structure
@@ -184,7 +192,7 @@ class PPStructureEnhanced:
                # Process parsing_res_list if found
                if parsing_res_list:
                    elements = self._process_parsing_res_list(
-                        parsing_res_list, current_page, output_dir, image_path
+                        parsing_res_list, current_page, output_dir, image_path, scaling_info
                    )
                    all_elements.extend(elements)

@@ -247,13 +255,15 @@ class PPStructureEnhanced:
        parsing_res_list: List[Dict],
        current_page: int,
        output_dir: Optional[Path],
-        source_image_path: Optional[Path] = None
+        source_image_path: Optional[Path] = None,
+        scaling_info: Optional['ScalingInfo'] = None
    ) -> List[Dict[str, Any]]:
        """
        Process parsing_res_list to extract all elements.

        Args:
            parsing_res_list: List of parsed elements from PP-StructureV3
+            scaling_info: Scaling information for bbox coordinate restoration
            current_page: Current page number
            output_dir: Optional output directory
            source_image_path: Path to source image for cropping image regions
@@ -285,11 +295,28 @@ class PPStructureEnhanced:

            # Ensure bbox has 4 values
            if len(layout_bbox) >= 4:
-                bbox = layout_bbox[:4]  # [x1, y1, x2, y2]
+                bbox = list(layout_bbox[:4])  # [x1, y1, x2, y2]
            else:
                bbox = [0, 0, 0, 0]  # Default if bbox missing
                logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")

+            # Scale bbox back to original image coordinates if image was scaled
+            # This is critical for proper cropping from original high-resolution image
+            if scaling_info and scaling_info.was_scaled and bbox != [0, 0, 0, 0]:
+                scale_factor = scaling_info.scale_factor
+                bbox = [
+                    bbox[0] * scale_factor,  # x1
+                    bbox[1] * scale_factor,  # y1
+                    bbox[2] * scale_factor,  # x2
+                    bbox[3] * scale_factor   # y2
+                ]
+                if idx == 0:  # Log only for first element to avoid spam
+                    logger.info(
+                        f"Scaled bbox to original coords: "
+                        f"{[round(x, 1) for x in layout_bbox[:4]]} -> {[round(x, 1) for x in bbox]} "
+                        f"(factor={scale_factor:.3f})"
+                    )
+
            # Extract content (check multiple possible keys)
            content = (
                item.get('content', '') or