feat: implement layout preprocessing backend

Backend implementation for the add-layout-preprocessing proposal:

- Add LayoutPreprocessingService with CLAHE, sharpen, binarize
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects layout detection input. Original images are preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 15:17:20 +08:00
parent 06a5973f2e
commit ea0dd7456c
7 changed files with 800 additions and 22 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -20,6 +20,8 @@ except ImportError:

 import paddle
 from paddleocr import PPStructureV3
+from PIL import Image
+import numpy as np
 from app.models.unified_document import ElementType
 from app.core.config import settings
 from app.services.memory_manager import prediction_context
@@ -78,15 +80,19 @@ class PPStructureEnhanced:
        self,
        image_path: Path,
        output_dir: Optional[Path] = None,
-        current_page: int = 0
+        current_page: int = 0,
+        preprocessed_image: Optional[Image.Image] = None
    ) -> Dict[str, Any]:
        """
        Analyze document with full PP-StructureV3 capabilities.

        Args:
-            image_path: Path to image file
+            image_path: Path to original image file (used for cropping extracted images)
            output_dir: Optional output directory for saving extracted content
            current_page: Current page number (0-based)
+            preprocessed_image: Optional preprocessed PIL Image for layout detection.
+                               If provided, this is used for PP-Structure prediction,
+                               but original image_path is still used for cropping images.

        Returns:
            Dictionary with complete structure information including:
@@ -97,6 +103,8 @@ class PPStructureEnhanced:
        """
        try:
            logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
+            if preprocessed_image:
+                logger.info("Using preprocessed image for layout detection")

            # Perform structure analysis with semaphore control
            # This prevents OOM errors from multiple simultaneous predictions
@@ -113,7 +121,16 @@ class PPStructureEnhanced:
                        'error': 'Prediction slot timeout'
                    }

-                results = self.structure_engine.predict(str(image_path))
+                # Use preprocessed image if provided, otherwise use original path
+                if preprocessed_image is not None:
+                    # Convert PIL to numpy array (BGR format for PP-Structure)
+                    predict_input = np.array(preprocessed_image)
+                    if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
+                        # Convert RGB to BGR
+                        predict_input = predict_input[:, :, ::-1]
+                    results = self.structure_engine.predict(predict_input)
+                else:
+                    results = self.structure_engine.predict(str(image_path))

            all_elements = []
            all_images = []