feat: implement layout preprocessing backend

Backend implementation for add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpen, binarize
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects layout detection input.
Original images preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 15:17:20 +08:00
parent 06a5973f2e
commit ea0dd7456c
7 changed files with 800 additions and 22 deletions

View File

@@ -0,0 +1,370 @@
"""
Tool_OCR - Layout Preprocessing Service
Image preprocessing to enhance layout detection for documents with faint lines/borders.
This service provides:
1. Image quality analysis (contrast, edge strength)
2. Contrast enhancement (histogram equalization, CLAHE)
3. Sharpening for faint lines
4. Optional binarization for very low contrast documents
IMPORTANT: Preprocessing only affects layout detection input.
Original images are preserved for element extraction.
"""
import logging
from pathlib import Path
from typing import Optional, Tuple, Union
from dataclasses import dataclass
import cv2
import numpy as np
from PIL import Image
from app.core.config import settings
from app.schemas.task import (
PreprocessingConfig,
PreprocessingContrastEnum,
PreprocessingModeEnum,
ImageQualityMetrics,
)
logger = logging.getLogger(__name__)
@dataclass
class PreprocessingResult:
    """Outcome of one layout-preprocessing run.

    Attributes:
        image: Image array handed back by the run.
        config_used: Preprocessing configuration reported for the run.
        quality_metrics: Quality metrics measured on the input image.
        was_processed: Whether at least one preprocessing step was applied.
    """

    image: np.ndarray
    config_used: PreprocessingConfig
    quality_metrics: ImageQualityMetrics
    was_processed: bool
class LayoutPreprocessingService:
    """
    Service for preprocessing images to improve layout detection.

    The preprocessing pipeline:
    1. Analyze image quality (contrast, edge strength)
    2. Apply contrast enhancement if needed (CLAHE or histogram)
    3. Apply sharpening if edge strength is low
    4. Apply binarization if contrast is very low (optional)

    All operations preserve the original image width and height. The
    pipeline works on 3-channel BGR arrays; 2-D grayscale input is expanded
    to BGR before any step is applied.
    """

    def __init__(self):
        """Read the decision thresholds from settings and set fixed filter parameters."""
        # Load thresholds from config
        self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
        self.edge_threshold = settings.layout_preprocessing_edge_threshold
        self.binarize_threshold = settings.layout_preprocessing_binarize_threshold

        # CLAHE parameters
        self.clahe_clip_limit = 2.0
        self.clahe_tile_grid_size = (8, 8)

        # 3x3 sharpening kernel: centre weight 5, four direct neighbours -1.
        # The weights sum to 1, so uniform regions keep their brightness.
        self.sharpen_kernel = np.array([
            [0, -1, 0],
            [-1, 5, -1],
            [0, -1, 0]
        ], dtype=np.float32)

        logger.info(
            f"LayoutPreprocessingService initialized with thresholds: "
            f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
            f"binarize={self.binarize_threshold}"
        )

    @staticmethod
    def _ensure_bgr(image: np.ndarray) -> np.ndarray:
        """Expand a 2-D grayscale array to 3-channel BGR; return other arrays unchanged."""
        if image.ndim == 2:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        return image

    def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
        """
        Analyze image quality to determine preprocessing needs.

        Args:
            image: Input image (BGR or grayscale)

        Returns:
            ImageQualityMetrics with contrast and edge_strength, both rounded
            to two decimals
        """
        # Convert to grayscale if needed
        if image.ndim == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Contrast: standard deviation of the gray levels
        contrast = float(np.std(gray))

        # Edge strength: mean magnitude of the 3x3 Sobel gradient
        sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
        edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))

        return ImageQualityMetrics(
            contrast=round(contrast, 2),
            edge_strength=round(edge_strength, 2)
        )

    def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
        """
        Determine optimal preprocessing config based on image quality.

        Args:
            metrics: Image quality metrics from analyze_image_quality()

        Returns:
            PreprocessingConfig with recommended settings
        """
        # CLAHE only when contrast falls below the configured threshold
        if metrics.contrast < self.contrast_threshold:
            contrast = PreprocessingContrastEnum.CLAHE
        else:
            contrast = PreprocessingContrastEnum.NONE

        # Sharpen when edges are weak
        sharpen = metrics.edge_strength < self.edge_threshold

        # Binarize only below the separate binarize threshold
        binarize = metrics.contrast < self.binarize_threshold

        return PreprocessingConfig(
            contrast=contrast,
            sharpen=sharpen,
            binarize=binarize
        )

    def apply_contrast_enhancement(
        self,
        image: np.ndarray,
        method: PreprocessingContrastEnum
    ) -> np.ndarray:
        """
        Apply contrast enhancement to image.

        Args:
            image: Input image (BGR)
            method: Enhancement method (none, histogram, clahe)

        Returns:
            Enhanced image (BGR). The input is returned unchanged when the
            method is NONE or not recognized.
        """
        # Resolve the equalizer first so the LAB round trip is skipped
        # entirely when there is nothing to apply.
        if method == PreprocessingContrastEnum.HISTOGRAM:
            # Standard histogram equalization
            equalize = cv2.equalizeHist
        elif method == PreprocessingContrastEnum.CLAHE:
            # Contrast Limited Adaptive Histogram Equalization
            equalize = cv2.createCLAHE(
                clipLimit=self.clahe_clip_limit,
                tileGridSize=self.clahe_tile_grid_size
            ).apply
        else:
            return image

        # Enhance only the lightness channel in LAB space; the two colour
        # channels are merged back untouched.
        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        l_channel, a_channel, b_channel = cv2.split(lab)
        l_enhanced = equalize(l_channel)

        # Merge channels and convert back to BGR
        enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
        return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

    def apply_sharpening(self, image: np.ndarray) -> np.ndarray:
        """
        Apply sharpening to enhance edges and faint lines.

        Args:
            image: Input image (BGR)

        Returns:
            Sharpened image (BGR, uint8)
        """
        # Convolve with the sharpening kernel; ddepth=-1 keeps the input depth
        sharpened = cv2.filter2D(image, -1, self.sharpen_kernel)

        # Clamp to the 8-bit range and force uint8 (matters for non-uint8 input)
        return np.clip(sharpened, 0, 255).astype(np.uint8)

    def apply_binarization(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive binarization for very low contrast documents.

        Args:
            image: Input image (BGR)

        Returns:
            Binarized image (BGR, but grayscale values)
        """
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Gaussian-weighted adaptive threshold over 11x11 neighbourhoods
        binary = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=11,
            C=2
        )

        # Convert back to BGR for consistency
        return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

    def preprocess(
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
        config: Optional[PreprocessingConfig] = None
    ) -> PreprocessingResult:
        """
        Preprocess image for layout detection.

        Args:
            image: Input image (numpy array, PIL Image, or path)
            mode: Preprocessing mode (auto, manual, disabled)
            config: Manual configuration (used when mode is manual; a default
                PreprocessingConfig is used if omitted)

        Returns:
            PreprocessingResult with preprocessed image and metadata. In
            disabled mode the loaded image is returned untouched.

        Raises:
            ValueError: If a path is given and the image cannot be read.
        """
        # Load image if path provided
        if isinstance(image, (str, Path)):
            # Keep the path separately: imread returns None on failure, and
            # the error message must name the file, not the failed result.
            source_path = str(image)
            image = cv2.imread(source_path)
            if image is None:
                raise ValueError(f"Failed to load image: {source_path}")
        elif isinstance(image, Image.Image):
            # Normalize the PIL mode first so grayscale/palette images also
            # yield a 3-channel array, then convert to OpenCV format (BGR).
            image = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)

        # Analyze quality
        metrics = self.analyze_image_quality(image)
        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")

        # Determine configuration
        if mode == PreprocessingModeEnum.DISABLED:
            return PreprocessingResult(
                image=image,
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
                    binarize=False
                ),
                quality_metrics=metrics,
                was_processed=False
            )

        if mode == PreprocessingModeEnum.AUTO:
            config = self.get_auto_config(metrics)
            logger.debug(f"Auto config: {config}")
        elif config is None:
            # Manual mode but no config provided, use defaults
            config = PreprocessingConfig()

        # The pipeline steps expect BGR; expand 2-D grayscale input. Work on
        # a copy so the caller's array is never returned or modified.
        processed = self._ensure_bgr(image)
        if processed is image:
            processed = image.copy()
        was_processed = False

        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
            processed = self.apply_contrast_enhancement(processed, config.contrast)
            was_processed = True
            logger.debug(f"Applied contrast enhancement: {config.contrast}")

        # Step 2: Sharpening
        if config.sharpen:
            processed = self.apply_sharpening(processed)
            was_processed = True
            logger.debug("Applied sharpening")

        # Step 3: Binarization (last step, overwrites color)
        if config.binarize:
            processed = self.apply_binarization(processed)
            was_processed = True
            logger.debug("Applied binarization")

        return PreprocessingResult(
            image=processed,
            config_used=config,
            quality_metrics=metrics,
            was_processed=was_processed
        )

    def preprocess_to_pil(
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
        config: Optional[PreprocessingConfig] = None
    ) -> Tuple[Image.Image, PreprocessingResult]:
        """
        Preprocess image and return as PIL Image.

        Convenience method for integration with PP-Structure which accepts PIL images.

        Args:
            image: Input image
            mode: Preprocessing mode
            config: Manual configuration

        Returns:
            Tuple of (PIL Image, PreprocessingResult)
        """
        result = self.preprocess(image, mode, config)

        # Disabled mode can hand back a 2-D grayscale array unchanged; expand
        # it so the BGR -> RGB conversion below is valid.
        bgr_image = self._ensure_bgr(result.image)

        # Convert BGR to RGB for PIL
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        return Image.fromarray(rgb_image), result

    def save_preview(
        self,
        original: np.ndarray,
        preprocessed: np.ndarray,
        output_dir: Path,
        prefix: str = "preview"
    ) -> Tuple[Path, Path]:
        """
        Save original and preprocessed images for preview.

        Args:
            original: Original image (BGR)
            preprocessed: Preprocessed image (BGR)
            output_dir: Directory to save images (created if missing)
            prefix: Filename prefix

        Returns:
            Tuple of (original_path, preprocessed_path)

        Raises:
            OSError: If either image cannot be written.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        original_path = output_dir / f"{prefix}_original.png"
        preprocessed_path = output_dir / f"{prefix}_preprocessed.png"

        # cv2.imwrite signals failure through its return value; check it so
        # callers never receive a path to a file that was not written.
        for target, pixels in (
            (original_path, original),
            (preprocessed_path, preprocessed),
        ):
            if not cv2.imwrite(str(target), pixels):
                raise OSError(f"Failed to write preview image: {target}")

        return original_path, preprocessed_path
# Module-level cache holding the one shared service instance (created lazily)
_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None


def get_layout_preprocessing_service() -> LayoutPreprocessingService:
    """Return the shared LayoutPreprocessingService, building it on first use."""
    global _layout_preprocessing_service
    service = _layout_preprocessing_service
    if service is None:
        # First call: construct the service and remember it for later calls
        service = LayoutPreprocessingService()
        _layout_preprocessing_service = service
    return service