feat: enhance layout preprocessing and add unify-image-scaling proposal
Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
@@ -32,6 +32,15 @@ from app.schemas.task import (

logger = logging.getLogger(__name__)


@dataclass
class ScalingInfo:
    """Information about image scaling applied for layout detection."""
    was_scaled: bool
    scale_factor: float  # Factor to multiply bbox coords to get original size (1.0 / actual_scale)
    original_size: Tuple[int, int]  # (width, height) of original image
    scaled_size: Tuple[int, int]  # (width, height) after scaling


@dataclass
class PreprocessingResult:
    """Result of preprocessing operation."""
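Reviewer note: the `scale_factor` stored here is the inverse of the resize that was applied, so callers multiply detected coordinates by it to get back to original-image space. A minimal sketch with illustrative numbers (not taken from the codebase):

```python
# Illustrative only: a 3200x2400 scan resized to 1600px on the long side.
info = ScalingInfo(
    was_scaled=True,
    scale_factor=2.0,            # 1.0 / 0.5 (the resize that was applied)
    original_size=(3200, 2400),
    scaled_size=(1600, 1200),
)

# A box detected on the scaled image maps back by multiplying through:
x1, y1, x2, y2 = 100.0, 50.0, 400.0, 300.0
original = (x1 * info.scale_factor, y1 * info.scale_factor,
            x2 * info.scale_factor, y2 * info.scale_factor)
assert original == (200.0, 100.0, 800.0, 600.0)
```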
@@ -39,6 +48,7 @@ class PreprocessingResult:
    config_used: PreprocessingConfig
    quality_metrics: ImageQualityMetrics
    was_processed: bool
    scaling_info: Optional[ScalingInfo] = None  # Info about any scaling applied


class LayoutPreprocessingService:
@@ -60,10 +70,23 @@ class LayoutPreprocessingService:
        self.edge_threshold = settings.layout_preprocessing_edge_threshold
        self.binarize_threshold = settings.layout_preprocessing_binarize_threshold

        # Image scaling settings for layout detection (bidirectional)
        self.scaling_enabled = settings.layout_image_scaling_enabled
        self.scaling_max_dimension = settings.layout_image_scaling_max_dimension
        self.scaling_min_dimension = settings.layout_image_scaling_min_dimension
        self.scaling_target_dimension = settings.layout_image_scaling_target_dimension

        # CLAHE parameters
        self.clahe_clip_limit = 2.0
        self.clahe_tile_grid_size = (8, 8)

        # Document-specific CLAHE parameters (larger tiles for documents)
        self.document_clahe_clip_limit = 3.0
        self.document_clahe_tile_grid_size = (16, 16)

        # Background normalization parameters for scanned documents
        self.background_kernel_size = 51  # Morphological kernel size

        # Sharpening kernel (unsharp mask style)
        self.sharpen_kernel = np.array([
            [0, -1, 0],
@@ -74,7 +97,9 @@ class LayoutPreprocessingService:
        logger.info(
            f"LayoutPreprocessingService initialized with thresholds: "
            f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
-            f"binarize={self.binarize_threshold}"
+            f"binarize={self.binarize_threshold}, "
+            f"scaling={'enabled' if self.scaling_enabled else 'disabled'} "
+            f"(min={self.scaling_min_dimension}, max={self.scaling_max_dimension}, target={self.scaling_target_dimension})"
        )

    def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
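The four `layout_image_scaling_*` values are read from the app settings object, which is not part of this diff. A hypothetical pydantic-style declaration, with defaults mirroring the thresholds quoted in the docstrings below (1200/2000/1600); only the attribute names appear in the diff:

```python
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # Hypothetical sketch: only the attribute names are taken from this diff.
    layout_image_scaling_enabled: bool = True
    layout_image_scaling_max_dimension: int = 2000     # scale DOWN above this
    layout_image_scaling_min_dimension: int = 1200     # scale UP below this
    layout_image_scaling_target_dimension: int = 1600  # target for both directions
```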
@@ -106,6 +131,180 @@ class LayoutPreprocessingService:
            edge_strength=round(edge_strength, 2)
        )

    def _normalize_background(self, gray: np.ndarray) -> np.ndarray:
        """
        Normalize image background to remove uneven illumination.

        This is particularly effective for scanned documents where scanner
        lighting may be uneven, or where paper has yellowed/stained areas.

        Method:
            1. Estimate background using morphological closing (fills in text/details)
            2. Divide original by background estimate
            3. Rescale to full 0-255 range

        Args:
            gray: Grayscale image (L channel or grayscale)

        Returns:
            Normalized grayscale image with uniform background
        """
        # Create structuring element for morphological operations
        kernel_size = self.background_kernel_size
        # Ensure kernel size is odd
        if kernel_size % 2 == 0:
            kernel_size += 1

        kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE,
            (kernel_size, kernel_size)
        )

        # Morphological closing estimates the background
        # (dilate then erode - fills in dark features like text)
        background = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)

        # Apply Gaussian blur to smooth the background estimate
        background = cv2.GaussianBlur(background, (kernel_size, kernel_size), 0)

        # Avoid division by zero
        background = np.maximum(background, 1).astype(np.float32)

        # Normalize: divide by background and rescale to 0-255
        # This removes uneven illumination while preserving text/content
        normalized = (gray.astype(np.float32) / background) * 255.0

        # Clip and convert back to uint8
        normalized = np.clip(normalized, 0, 255).astype(np.uint8)

        logger.debug(
            f"Background normalization applied: kernel={kernel_size}, "
            f"background range=[{background.min():.0f}, {background.max():.0f}]"
        )

        return normalized
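The divide-by-background step is what flattens illumination: each pixel is expressed relative to its local background estimate, so paper lands near white everywhere while ink stays dark. A tiny numeric sketch of that arithmetic:

```python
import numpy as np

# Toy 1x3 strip: ink, paper, paper - the paper darkens under uneven light.
gray = np.array([[40, 120, 165]], dtype=np.float32)
background = np.array([[80, 160, 220]], dtype=np.float32)  # closing estimate

flat = np.clip(gray / background * 255.0, 0, 255).astype(np.uint8)
print(flat)  # [[127 191 191]] - both paper pixels align; ink stays dark
```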
    def scale_for_layout_detection(
        self,
        image: np.ndarray,
        force_scale: bool = False
    ) -> Tuple[np.ndarray, ScalingInfo]:
        """
        Apply bidirectional scaling for optimal layout detection.

        PP-Structure's layout detection model (RT-DETR based) works best with images
        around 1600px on the longest side. Both too-large and too-small images
        reduce detection accuracy:

        - Too large (>2000px): Model's receptive field cannot capture entire structures
        - Too small (<1200px): Insufficient detail for accurate detection

        Scaling behavior:
        - max_dim > max_dimension (2000): Scale DOWN to target (1600)
        - max_dim < min_dimension (1200): Scale UP to target (1600)
        - min_dimension <= max_dim <= max_dimension: No scaling (optimal range)

        Args:
            image: Input image (BGR)
            force_scale: Force scaling to target even if in optimal range

        Returns:
            Tuple of (scaled_image, ScalingInfo)
            ScalingInfo.scale_factor is the multiplier to convert scaled bbox
            coordinates back to original image coordinates.
        """
        h, w = image.shape[:2]
        original_size = (w, h)
        max_dim = max(h, w)

        # Determine if scaling is needed and direction
        should_downscale = self.scaling_enabled and max_dim > self.scaling_max_dimension
        should_upscale = self.scaling_enabled and max_dim < self.scaling_min_dimension
        should_scale = should_downscale or should_upscale or force_scale

        if not should_scale:
            return image, ScalingInfo(
                was_scaled=False,
                scale_factor=1.0,
                original_size=original_size,
                scaled_size=original_size
            )

        # Calculate scale factor to reach target dimension
        actual_scale = self.scaling_target_dimension / max_dim
        new_w = int(w * actual_scale)
        new_h = int(h * actual_scale)

        # Choose interpolation method based on scale direction
        if actual_scale < 1.0:
            # Downscaling: INTER_AREA is best for shrinking (anti-aliasing)
            interpolation = cv2.INTER_AREA
            direction = "DOWN"
        else:
            # Upscaling: INTER_CUBIC provides smooth enlargement
            interpolation = cv2.INTER_CUBIC
            direction = "UP"

        scaled_image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)

        # scale_factor is the inverse - used to scale bbox coords back to original
        scale_factor = 1.0 / actual_scale

        logger.info(
            f"Scaled {direction} for layout detection: {w}x{h} -> {new_w}x{new_h} "
            f"(scale_factor={scale_factor:.3f} to restore original coords)"
        )

        return scaled_image, ScalingInfo(
            was_scaled=True,
            scale_factor=scale_factor,
            original_size=original_size,
            scaled_size=(new_w, new_h)
        )
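Worked outcomes of the bidirectional rule for a few long-side sizes, using the thresholds quoted in the docstring (min=1200, max=2000, target=1600). A small mirror of the decision logic, for eyeballing only:

```python
def expected_scaling(max_dim: int, lo: int = 1200, hi: int = 2000, target: int = 1600):
    """Standalone mirror of the decision above; not part of the service."""
    if lo <= max_dim <= hi:
        return "none", 1.0
    actual_scale = target / max_dim
    return ("DOWN" if actual_scale < 1.0 else "UP"), 1.0 / actual_scale

assert expected_scaling(3200) == ("DOWN", 2.0)  # shrink; bboxes multiply by 2.0
assert expected_scaling(1600) == ("none", 1.0)  # optimal range, untouched
assert expected_scaling(800) == ("UP", 0.5)     # enlarge; bboxes multiply by 0.5
```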
    @staticmethod
    def scale_bbox_to_original(
        bbox: Tuple[float, float, float, float],
        scale_factor: float
    ) -> Tuple[float, float, float, float]:
        """
        Scale a bounding box from scaled coordinates back to original image coordinates.

        Args:
            bbox: Bounding box as (x1, y1, x2, y2) in scaled image coordinates
            scale_factor: Factor to multiply (from ScalingInfo.scale_factor)

        Returns:
            Bounding box in original image coordinates
        """
        x1, y1, x2, y2 = bbox
        return (
            x1 * scale_factor,
            y1 * scale_factor,
            x2 * scale_factor,
            y2 * scale_factor
        )

    @staticmethod
    def scale_bboxes_to_original(
        bboxes: list,
        scale_factor: float
    ) -> list:
        """
        Scale multiple bounding boxes from scaled coordinates to original.

        Args:
            bboxes: List of bounding boxes, each as (x1, y1, x2, y2)
            scale_factor: Factor to multiply (from ScalingInfo.scale_factor)

        Returns:
            List of bounding boxes in original image coordinates
        """
        return [
            LayoutPreprocessingService.scale_bbox_to_original(bbox, scale_factor)
            for bbox in bboxes
        ]
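A minimal usage sketch of the two helpers, assuming a scale_factor of 2.0 from an earlier downscale (the detection boxes are made up):

```python
# Hypothetical detections in scaled-image coordinates.
detected = [(100.0, 50.0, 400.0, 300.0), (420.0, 60.0, 780.0, 500.0)]

restored = LayoutPreprocessingService.scale_bboxes_to_original(detected, 2.0)
# -> [(200.0, 100.0, 800.0, 600.0), (840.0, 120.0, 1560.0, 1000.0)]
```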
    def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
        """
        Determine optimal preprocessing config based on image quality.
@@ -203,6 +402,18 @@ class LayoutPreprocessingService:
                tileGridSize=self.clahe_tile_grid_size
            )
            l_enhanced = clahe.apply(l_channel)
        elif method == PreprocessingContrastEnum.DOCUMENT:
            # Document-specific enhancement for scanned documents
            # Step 1: Background normalization to remove uneven illumination
            l_normalized = self._normalize_background(l_channel)

            # Step 2: CLAHE with larger tiles optimized for documents
            clip_limit = self.document_clahe_clip_limit * strength
            clahe = cv2.createCLAHE(
                clipLimit=clip_limit,
                tileGridSize=self.document_clahe_tile_grid_size
            )
            l_enhanced = clahe.apply(l_normalized)
        else:
            return image
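For comparison, the two CLAHE configurations used by the service as a standalone sketch: the larger document tiles average over bigger regions so mostly-blank paper is not over-amplified, while the higher clip limit lifts faint text.

```python
import cv2

# General-purpose CLAHE (self.clahe_*): small tiles, conservative clipping.
clahe_default = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

# Document CLAHE (self.document_clahe_*): larger tiles, stronger clipping.
clahe_document = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(16, 16))

# Both apply to a single-channel uint8 image, e.g. the L channel of LAB:
# l_enhanced = clahe_document.apply(l_channel)
```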
@@ -277,15 +488,29 @@ class LayoutPreprocessingService:
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
-        config: Optional[PreprocessingConfig] = None
+        config: Optional[PreprocessingConfig] = None,
+        apply_scaling: bool = True
    ) -> PreprocessingResult:
        """
        Preprocess image for layout detection.

        The preprocessing pipeline:
        1. Load image from path/PIL if needed
        2. Analyze image quality (on the original image for accurate metrics)
        3. Scale the image into the optimal range for layout detection (bidirectional)
        4. Apply contrast enhancement if needed
        5. Apply sharpening if needed
        6. Apply binarization if requested (not recommended)

        IMPORTANT: When scaling is applied, all bounding boxes from layout detection
        must be scaled back to original coordinates using ScalingInfo.scale_factor.
        The original image should be used for element extraction (cropping).

        Args:
            image: Input image (numpy array, PIL Image, or path)
            mode: Preprocessing mode (auto, manual, disabled)
            config: Manual configuration (required if mode='manual')
            apply_scaling: Whether to apply automatic scaling (default True)

        Returns:
            PreprocessingResult with preprocessed image and metadata
@@ -299,21 +524,37 @@ class LayoutPreprocessingService:
            # Convert PIL to OpenCV format (BGR)
            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

-        # Analyze quality
+        # Analyze quality on ORIGINAL image (before scaling) for accurate metrics
        metrics = self.analyze_image_quality(image)
        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")

        # Apply scaling for layout detection (even if preprocessing is disabled)
        if apply_scaling:
            scaled_image, scaling_info = self.scale_for_layout_detection(image)
        else:
            h, w = image.shape[:2]
            scaled_image = image
            scaling_info = ScalingInfo(
                was_scaled=False,
                scale_factor=1.0,
                original_size=(w, h),
                scaled_size=(w, h)
            )

        # Determine configuration
        if mode == PreprocessingModeEnum.DISABLED:
            # Even when preprocessing is disabled, we still return the scaled image
            # for better layout detection. The original image is preserved for cropping.
            return PreprocessingResult(
-                image=image,
+                image=scaled_image,
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
                    binarize=False
                ),
                quality_metrics=metrics,
-                was_processed=False
+                was_processed=scaling_info.was_scaled,  # True if scaling was applied
+                scaling_info=scaling_info
            )

        if mode == PreprocessingModeEnum.AUTO:
@@ -323,9 +564,9 @@ class LayoutPreprocessingService:
            # Manual mode but no config provided, use defaults
            config = PreprocessingConfig()

-        # Apply preprocessing pipeline
-        processed = image.copy()
-        was_processed = False
+        # Apply preprocessing pipeline on the SCALED image
+        processed = scaled_image.copy()
+        was_processed = scaling_info.was_scaled  # Start with True if already scaled

        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
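Putting the contract together: detection runs on result.image (possibly scaled), while element cropping must use the original image with restored coordinates. A hedged end-to-end sketch; the file name and the detection call are placeholders:

```python
service = LayoutPreprocessingService()
result = service.preprocess("page_001.png", mode=PreprocessingModeEnum.AUTO)

# boxes = layout_engine.detect(result.image)   # placeholder detection call
boxes = [(120.0, 80.0, 640.0, 400.0)]          # pretend output, scaled coords

if result.scaling_info and result.scaling_info.was_scaled:
    boxes = LayoutPreprocessingService.scale_bboxes_to_original(
        boxes, result.scaling_info.scale_factor
    )
# 'boxes' is now in original-image coordinates, safe for cropping.
```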
@@ -353,29 +594,37 @@ class LayoutPreprocessingService:
            image=processed,
            config_used=config,
            quality_metrics=metrics,
-            was_processed=was_processed
+            was_processed=was_processed,
+            scaling_info=scaling_info
        )

    def preprocess_to_pil(
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
-        config: Optional[PreprocessingConfig] = None
+        config: Optional[PreprocessingConfig] = None,
+        apply_scaling: bool = True
    ) -> Tuple[Image.Image, PreprocessingResult]:
        """
        Preprocess image and return as PIL Image.

        Convenience method for integration with PP-Structure, which accepts PIL images.

        IMPORTANT: When result.scaling_info.was_scaled is True, all bounding boxes
        from PP-Structure must be scaled back to original coordinates using:
            original_bbox = (x1 * scale_factor, y1 * scale_factor, x2 * scale_factor, y2 * scale_factor)
        where scale_factor = result.scaling_info.scale_factor

        Args:
            image: Input image
            mode: Preprocessing mode
            config: Manual configuration
            apply_scaling: Whether to apply automatic scaling (default True)

        Returns:
-            Tuple of (PIL Image, PreprocessingResult)
+            Tuple of (PIL Image for PP-Structure, PreprocessingResult with scaling info)
        """
-        result = self.preprocess(image, mode, config)
+        result = self.preprocess(image, mode, config, apply_scaling=apply_scaling)

        # Convert BGR to RGB for PIL
        rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
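And the intended PP-Structure flow per the docstring above; the engine call and its result shape are placeholders, only the rescaling step is prescribed by this diff:

```python
pil_image, result = service.preprocess_to_pil("page_001.png")

# items = pp_structure_engine(pil_image)   # placeholder for the real engine
items = [{"type": "table", "bbox": (50.0, 40.0, 700.0, 500.0)}]

if result.scaling_info and result.scaling_info.was_scaled:
    sf = result.scaling_info.scale_factor
    for item in items:
        item["bbox"] = LayoutPreprocessingService.scale_bbox_to_original(
            item["bbox"], sf
        )
# Crop each element from the ORIGINAL image using item["bbox"].
```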