feat: implement layout preprocessing backend

Backend implementation for add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpening, and binarization
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing affects only the input to layout detection.
The original images are preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 15:17:20 +08:00
parent 06a5973f2e
commit ea0dd7456c
7 changed files with 800 additions and 22 deletions

View File

@@ -20,6 +20,8 @@ except ImportError:
import paddle
from paddleocr import PPStructureV3
from PIL import Image
import numpy as np
from app.models.unified_document import ElementType
from app.core.config import settings
from app.services.memory_manager import prediction_context
@@ -78,15 +80,19 @@ class PPStructureEnhanced:
self,
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
Args:
image_path: Path to image file
image_path: Path to original image file (used for cropping extracted images)
output_dir: Optional output directory for saving extracted content
current_page: Current page number (0-based)
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
Returns:
Dictionary with complete structure information including:
@@ -97,6 +103,8 @@ class PPStructureEnhanced:
"""
try:
logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
if preprocessed_image:
logger.info("Using preprocessed image for layout detection")
# Perform structure analysis with semaphore control
# This prevents OOM errors from multiple simultaneous predictions
@@ -113,7 +121,16 @@ class PPStructureEnhanced:
'error': 'Prediction slot timeout'
}
results = self.structure_engine.predict(str(image_path))
# Use preprocessed image if provided, otherwise use original path
if preprocessed_image is not None:
# Convert PIL image to a numpy array (converted to BGR below for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = self.structure_engine.predict(predict_input)
else:
results = self.structure_engine.predict(str(image_path))
all_elements = []
all_images = []