feat: implement layout preprocessing backend

Backend implementation for the add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpen, binarize
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects layout detection input.
Original images preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
egg
2025-11-27 15:17:20 +08:00
parent 06a5973f2e
commit ea0dd7456c
7 changed files with 800 additions and 22 deletions
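
For reviewers who want to poke at the new endpoint, a minimal request sketch. Only the route and the PreprocessingConfig field names (contrast, sharpen, binarize) come from this commit; the host, task id, exact payload shape, and response handling are assumptions.

import requests

# Sketch only: host, task id, and body shape are assumed;
# the route and config field names are taken from this commit.
resp = requests.post(
    "http://localhost:8000/api/v2/tasks/42/preview/preprocessing",
    json={
        "mode": "manual",  # PreprocessingModeEnum: auto | manual | disabled
        "config": {"contrast": "clahe", "sharpen": True, "binarize": False},
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # expected to match PreprocessingPreviewResponse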

app/services/layout_preprocessing_service.py

@@ -0,0 +1,370 @@
"""
Tool_OCR - Layout Preprocessing Service
Image preprocessing to enhance layout detection for documents with faint lines/borders.
This service provides:
1. Image quality analysis (contrast, edge strength)
2. Contrast enhancement (histogram equalization, CLAHE)
3. Sharpening for faint lines
4. Optional binarization for very low contrast documents
IMPORTANT: Preprocessing only affects layout detection input.
Original images are preserved for element extraction.
"""
import logging
from pathlib import Path
from typing import Optional, Tuple, Union
from dataclasses import dataclass
import cv2
import numpy as np
from PIL import Image
from app.core.config import settings
from app.schemas.task import (
PreprocessingConfig,
PreprocessingContrastEnum,
PreprocessingModeEnum,
ImageQualityMetrics,
)
logger = logging.getLogger(__name__)
@dataclass
class PreprocessingResult:
"""Result of preprocessing operation."""
image: np.ndarray
config_used: PreprocessingConfig
quality_metrics: ImageQualityMetrics
was_processed: bool
class LayoutPreprocessingService:
"""
Service for preprocessing images to improve layout detection.
The preprocessing pipeline:
1. Analyze image quality (contrast, edge strength)
2. Apply contrast enhancement if needed (CLAHE or histogram)
3. Apply sharpening if edge strength is low
4. Apply binarization if contrast is very low (optional)
All operations preserve the original color image dimensions.
"""
def __init__(self):
# Load thresholds from config
self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
self.edge_threshold = settings.layout_preprocessing_edge_threshold
self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
# CLAHE parameters
self.clahe_clip_limit = 2.0
self.clahe_tile_grid_size = (8, 8)
# Sharpening kernel (unsharp mask style)
self.sharpen_kernel = np.array([
[0, -1, 0],
[-1, 5, -1],
[0, -1, 0]
], dtype=np.float32)
logger.info(
f"LayoutPreprocessingService initialized with thresholds: "
f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
f"binarize={self.binarize_threshold}"
)
def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
"""
Analyze image quality to determine preprocessing needs.
Args:
image: Input image (BGR or grayscale)
Returns:
ImageQualityMetrics with contrast and edge_strength
"""
# Convert to grayscale if needed
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image
# Calculate contrast (standard deviation of pixel values)
contrast = float(np.std(gray))
# Calculate edge strength (mean of Sobel gradient magnitude)
sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))
return ImageQualityMetrics(
contrast=round(contrast, 2),
edge_strength=round(edge_strength, 2)
)
def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
"""
Determine optimal preprocessing config based on image quality.
Args:
metrics: Image quality metrics from analyze_image_quality()
Returns:
PreprocessingConfig with recommended settings
"""
# Determine contrast enhancement
if metrics.contrast < self.contrast_threshold:
contrast = PreprocessingContrastEnum.CLAHE
else:
contrast = PreprocessingContrastEnum.NONE
# Determine sharpening
sharpen = metrics.edge_strength < self.edge_threshold
# Determine binarization (only for very low contrast)
binarize = metrics.contrast < self.binarize_threshold
return PreprocessingConfig(
contrast=contrast,
sharpen=sharpen,
binarize=binarize
)
def apply_contrast_enhancement(
self,
image: np.ndarray,
method: PreprocessingContrastEnum
) -> np.ndarray:
"""
Apply contrast enhancement to image.
Args:
image: Input image (BGR)
method: Enhancement method (none, histogram, clahe)
Returns:
Enhanced image (BGR)
"""
if method == PreprocessingContrastEnum.NONE:
return image
# Convert to LAB color space for better enhancement
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
l_channel, a_channel, b_channel = cv2.split(lab)
if method == PreprocessingContrastEnum.HISTOGRAM:
# Standard histogram equalization
l_enhanced = cv2.equalizeHist(l_channel)
elif method == PreprocessingContrastEnum.CLAHE:
# Contrast Limited Adaptive Histogram Equalization
clahe = cv2.createCLAHE(
clipLimit=self.clahe_clip_limit,
tileGridSize=self.clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_channel)
else:
return image
# Merge channels and convert back to BGR
enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
return enhanced_bgr
def apply_sharpening(self, image: np.ndarray) -> np.ndarray:
"""
Apply sharpening to enhance edges and faint lines.
Args:
image: Input image (BGR)
Returns:
Sharpened image (BGR)
"""
# Apply unsharp mask style sharpening
sharpened = cv2.filter2D(image, -1, self.sharpen_kernel)
# Clip values to valid range
sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)
return sharpened
def apply_binarization(self, image: np.ndarray) -> np.ndarray:
"""
Apply adaptive binarization for very low contrast documents.
Args:
image: Input image (BGR)
Returns:
Binarized image (3-channel BGR layout, all channels identical)
"""
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply adaptive thresholding
binary = cv2.adaptiveThreshold(
gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11,
C=2
)
# Convert back to BGR for consistency
binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
return binary_bgr
def preprocess(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
) -> PreprocessingResult:
"""
Preprocess image for layout detection.
Args:
image: Input image (numpy array, PIL Image, or path)
mode: Preprocessing mode (auto, manual, disabled)
config: Manual configuration (required if mode='manual')
Returns:
PreprocessingResult with preprocessed image and metadata
"""
        # Load image if a path was provided
        if isinstance(image, (str, Path)):
            source_path = Path(image)
            image = cv2.imread(str(source_path))
            if image is None:
                # Report the path; `image` itself has been overwritten with None here
                raise ValueError(f"Failed to load image: {source_path}")
elif isinstance(image, Image.Image):
# Convert PIL to OpenCV format (BGR)
image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Analyze quality
metrics = self.analyze_image_quality(image)
logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
# Determine configuration
if mode == PreprocessingModeEnum.DISABLED:
return PreprocessingResult(
image=image,
config_used=PreprocessingConfig(
contrast=PreprocessingContrastEnum.NONE,
sharpen=False,
binarize=False
),
quality_metrics=metrics,
was_processed=False
)
if mode == PreprocessingModeEnum.AUTO:
config = self.get_auto_config(metrics)
logger.debug(f"Auto config: {config}")
elif config is None:
# Manual mode but no config provided, use defaults
config = PreprocessingConfig()
# Apply preprocessing pipeline
processed = image.copy()
was_processed = False
# Step 1: Contrast enhancement
if config.contrast != PreprocessingContrastEnum.NONE:
processed = self.apply_contrast_enhancement(processed, config.contrast)
was_processed = True
logger.debug(f"Applied contrast enhancement: {config.contrast}")
# Step 2: Sharpening
if config.sharpen:
processed = self.apply_sharpening(processed)
was_processed = True
logger.debug("Applied sharpening")
# Step 3: Binarization (last step, overwrites color)
if config.binarize:
processed = self.apply_binarization(processed)
was_processed = True
logger.debug("Applied binarization")
return PreprocessingResult(
image=processed,
config_used=config,
quality_metrics=metrics,
was_processed=was_processed
)
def preprocess_to_pil(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
) -> Tuple[Image.Image, PreprocessingResult]:
"""
Preprocess image and return as PIL Image.
Convenience method for integration with PP-Structure which accepts PIL images.
Args:
image: Input image
mode: Preprocessing mode
config: Manual configuration
Returns:
Tuple of (PIL Image, PreprocessingResult)
"""
result = self.preprocess(image, mode, config)
# Convert BGR to RGB for PIL
rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb_image)
return pil_image, result
def save_preview(
self,
original: np.ndarray,
preprocessed: np.ndarray,
output_dir: Path,
prefix: str = "preview"
) -> Tuple[Path, Path]:
"""
Save original and preprocessed images for preview.
Args:
original: Original image (BGR)
preprocessed: Preprocessed image (BGR)
output_dir: Directory to save images
prefix: Filename prefix
Returns:
Tuple of (original_path, preprocessed_path)
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
original_path = output_dir / f"{prefix}_original.png"
preprocessed_path = output_dir / f"{prefix}_preprocessed.png"
cv2.imwrite(str(original_path), original)
cv2.imwrite(str(preprocessed_path), preprocessed)
return original_path, preprocessed_path
# Singleton instance
_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None
def get_layout_preprocessing_service() -> LayoutPreprocessingService:
"""Get or create the layout preprocessing service singleton."""
global _layout_preprocessing_service
if _layout_preprocessing_service is None:
_layout_preprocessing_service = LayoutPreprocessingService()
return _layout_preprocessing_service
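
For orientation, a minimal usage sketch of the service above (not part of the commit). The image path is a placeholder, and the three thresholds are read from settings:

from app.schemas.task import PreprocessingModeEnum
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
# "scan.png" is a placeholder path; auto mode analyzes quality and decides what to apply
pil_image, result = service.preprocess_to_pil("scan.png", mode=PreprocessingModeEnum.AUTO)
print(result.quality_metrics)  # contrast / edge_strength that drove the decision
print(result.config_used)      # the PreprocessingConfig auto mode selected
print(result.was_processed)    # False if the image already met both thresholds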

app/services/ocr_service.py

@@ -26,6 +26,11 @@ except ImportError:
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
from app.services.layout_preprocessing_service import (
get_layout_preprocessing_service,
LayoutPreprocessingService,
)
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
# Import dual-track components
try:
@@ -865,7 +870,9 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
current_page: int = 0,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
@@ -878,6 +885,8 @@ class OCRService:
output_dir: Optional output directory for saving extracted images
current_page: Current page number (0-based) for multi-page documents
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Dictionary with OCR results and metadata
@@ -946,7 +955,9 @@ class OCRService:
confidence_threshold=confidence_threshold,
output_dir=output_dir,
current_page=page_num - 1, # Convert to 0-based page number for layout data
layout_model=layout_model
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Accumulate results
@@ -1092,7 +1103,9 @@ class OCRService:
image_path,
output_dir=output_dir,
current_page=current_page,
layout_model=layout_model
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Generate Markdown
@@ -1248,7 +1261,9 @@ class OCRService:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1258,6 +1273,8 @@ class OCRService:
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
current_page: Current page number (0-based) for multi-page documents
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Tuple of (layout_data, images_metadata)
@@ -1277,13 +1294,45 @@ class OCRService:
structure_engine = self._ensure_structure_engine(layout_model)
# Apply image preprocessing for layout detection
# Preprocessing enhances faint lines/borders to improve table detection
# Original image is preserved for element extraction
preprocessed_image = None
preprocessing_result = None
# Determine preprocessing mode (default from config if not specified)
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
if mode != PreprocessingModeEnum.DISABLED:
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}"
)
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
)
if result.get('has_parsing_res_list'):
@@ -1337,7 +1386,17 @@ class OCRService:
logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
return None, []
results = structure_engine.predict(str(image_path))
# Use preprocessed image if available, otherwise original path
if preprocessed_image is not None:
import numpy as np
# Convert PIL to numpy array (BGR format for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = structure_engine.predict(predict_input)
else:
results = structure_engine.predict(str(image_path))
layout_elements = []
images_metadata = []
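
One subtlety in the conversion above: predict_input[:, :, ::-1] yields a negative-stride NumPy view rather than a copy. If the Paddle predictor requires a C-contiguous buffer (an assumption, not something this diff confirms), an explicit copy is the safer form:

import numpy as np

# Equivalent RGB -> BGR conversion that guarantees a C-contiguous array
predict_input = np.ascontiguousarray(np.array(preprocessed_image)[:, :, ::-1])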
@@ -1509,7 +1568,9 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
force_track: Optional[str] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Union[UnifiedDocument, Dict]:
"""
Process document using dual-track approach.
@@ -1522,6 +1583,8 @@ class OCRService:
output_dir: Optional output directory for extracted images
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
UnifiedDocument if dual-track is enabled, Dict otherwise
@@ -1529,7 +1592,8 @@ class OCRService:
if not self.dual_track_enabled:
# Fallback to traditional OCR processing
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
start_time = datetime.now()
@@ -1601,7 +1665,9 @@ class OCRService:
ocr_result = self.process_file_traditional(
actual_file_path, lang, detect_layout=True,
confidence_threshold=confidence_threshold,
output_dir=output_dir, layout_model=layout_model
output_dir=output_dir, layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Convert OCR result to extract images
@@ -1634,7 +1700,8 @@ class OCRService:
# Use OCR for scanned documents, images, etc.
logger.info("Using OCR track (PaddleOCR)")
ocr_result = self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
# Convert OCR result to UnifiedDocument using the converter
@@ -1664,7 +1731,8 @@ class OCRService:
logger.error(f"Error in dual-track processing: {e}")
# Fallback to traditional OCR
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
def _merge_ocr_images_into_direct(
@@ -1743,7 +1811,9 @@ class OCRService:
detect_layout: bool = True,
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Dict:
"""
Traditional OCR processing (legacy method).
@@ -1755,6 +1825,8 @@ class OCRService:
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Dictionary with OCR results in legacy format
@@ -1767,7 +1839,8 @@ class OCRService:
all_results = []
for i, image_path in enumerate(image_paths):
result = self.process_image(
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
preprocessing_mode, preprocessing_config
)
all_results.append(result)
@@ -1783,7 +1856,8 @@ class OCRService:
else:
# Single image or other file
return self.process_image(
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
preprocessing_mode, preprocessing_config
)
def _combine_results(self, results: List[Dict]) -> Dict:
@@ -1868,7 +1942,9 @@ class OCRService:
output_dir: Optional[Path] = None,
use_dual_track: bool = True,
force_track: Optional[str] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -1882,6 +1958,8 @@ class OCRService:
use_dual_track: Whether to use dual-track processing (default True)
force_track: Force specific track ("ocr" or "direct")
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -1893,12 +1971,14 @@ class OCRService:
if (use_dual_track or force_track) and self.dual_track_enabled:
# Use dual-track processing (or forced track)
return self.process_with_dual_track(
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
preprocessing_mode, preprocessing_config
)
else:
# Use traditional OCR processing (no force_track support)
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
def process_legacy(

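To tie the threaded-through parameters together, a hedged call sketch against the traditional entry point. The service instance, file paths, and the "ch" language code are placeholders; the keyword names match the signature in this diff, and the MANUAL member of PreprocessingModeEnum is inferred from its documented values:

from pathlib import Path
from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
)

# Force CLAHE + sharpening for a scan with faint table borders (sketch only)
result = ocr_service.process_file_traditional(
    Path("faint_tables.pdf"),          # placeholder input
    lang="ch",                         # assumed language code
    detect_layout=True,
    output_dir=Path("out"),
    layout_model="chinese",
    preprocessing_mode=PreprocessingModeEnum.MANUAL,  # member name inferred from 'manual'
    preprocessing_config=PreprocessingConfig(
        contrast=PreprocessingContrastEnum.CLAHE,
        sharpen=True,
        binarize=False,
    ),
)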
app/services/pp_structure_enhanced.py

@@ -20,6 +20,8 @@ except ImportError:
import paddle
from paddleocr import PPStructureV3
from PIL import Image
import numpy as np
from app.models.unified_document import ElementType
from app.core.config import settings
from app.services.memory_manager import prediction_context
@@ -78,15 +80,19 @@ class PPStructureEnhanced:
self,
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
Args:
image_path: Path to image file
image_path: Path to original image file (used for cropping extracted images)
output_dir: Optional output directory for saving extracted content
current_page: Current page number (0-based)
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
Returns:
Dictionary with complete structure information including:
@@ -97,6 +103,8 @@ class PPStructureEnhanced:
"""
try:
logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
if preprocessed_image:
logger.info("Using preprocessed image for layout detection")
# Perform structure analysis with semaphore control
# This prevents OOM errors from multiple simultaneous predictions
@@ -113,7 +121,16 @@ class PPStructureEnhanced:
'error': 'Prediction slot timeout'
}
results = self.structure_engine.predict(str(image_path))
# Use preprocessed image if provided, otherwise use original path
if preprocessed_image is not None:
# Convert PIL to numpy array (BGR format for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = self.structure_engine.predict(predict_input)
else:
results = self.structure_engine.predict(str(image_path))
all_elements = []
all_images = []
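
Closing the loop, a sketch of the enhanced-path call as the new parameter intends it: layout detection sees the preprocessed image while crops still come from the original file. The engine construction and file names are placeholders:

from pathlib import Path
from PIL import Image

enhanced = PPStructureEnhanced(structure_engine)  # structure_engine assumed already built
result = enhanced.analyze_with_full_structure(
    Path("page_001.png"),                               # original file, still used for cropping
    output_dir=Path("out"),
    current_page=0,
    preprocessed_image=Image.open("page_001_pre.png"),  # layout-detection input only
)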