"""
|
|
Tool_OCR - Layout Preprocessing Service
|
|
Image preprocessing to enhance layout detection for documents with faint lines/borders.
|
|
|
|
This service provides:
|
|
1. Image quality analysis (contrast, edge strength)
|
|
2. Contrast enhancement (histogram equalization, CLAHE)
|
|
3. Sharpening for faint lines
|
|
4. Optional binarization for very low contrast documents
|
|
|
|
IMPORTANT: Preprocessing only affects layout detection input.
|
|
Original images are preserved for element extraction.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple, Union
|
|
from dataclasses import dataclass
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
from app.core.config import settings
|
|
from app.schemas.task import (
|
|
PreprocessingConfig,
|
|
PreprocessingContrastEnum,
|
|
PreprocessingModeEnum,
|
|
ImageQualityMetrics,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ScalingInfo:
    """Information about image scaling applied for layout detection."""
    was_scaled: bool
    scale_factor: float  # Factor to multiply bbox coords to get original size (1.0 / actual_scale)
    original_size: Tuple[int, int]  # (width, height) of original image
    scaled_size: Tuple[int, int]  # (width, height) after scaling


@dataclass
class PreprocessingResult:
    """Result of preprocessing operation."""
    image: np.ndarray
    config_used: PreprocessingConfig
    quality_metrics: ImageQualityMetrics
    was_processed: bool
    scaling_info: Optional[ScalingInfo] = None  # Info about any scaling applied

class LayoutPreprocessingService:
    """
    Service for preprocessing images to improve layout detection.

    The preprocessing pipeline:
    1. Analyze image quality (contrast, edge strength)
    2. Apply contrast enhancement if needed (CLAHE or histogram)
    3. Apply sharpening if edge strength is low
    4. Apply binarization if contrast is very low (optional)

    All operations preserve the original color image dimensions.
    """

    def __init__(self):
        # Load thresholds from config
        self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
        self.edge_threshold = settings.layout_preprocessing_edge_threshold
        self.binarize_threshold = settings.layout_preprocessing_binarize_threshold

        # Image scaling settings for layout detection (bidirectional)
        self.scaling_enabled = settings.layout_image_scaling_enabled
        self.scaling_max_dimension = settings.layout_image_scaling_max_dimension
        self.scaling_min_dimension = settings.layout_image_scaling_min_dimension
        self.scaling_target_dimension = settings.layout_image_scaling_target_dimension

        # CLAHE parameters
        self.clahe_clip_limit = 2.0
        self.clahe_tile_grid_size = (8, 8)

        # Document-specific CLAHE parameters (larger tiles for documents)
        self.document_clahe_clip_limit = 3.0
        self.document_clahe_tile_grid_size = (16, 16)

        # Background normalization parameters for scanned documents
        self.background_kernel_size = 51  # Morphological kernel size

        # Sharpening kernel (unsharp mask style; kept for reference - apply_sharpening
        # below uses a Gaussian unsharp mask rather than this fixed kernel)
        self.sharpen_kernel = np.array([
            [0, -1, 0],
            [-1, 5, -1],
            [0, -1, 0]
        ], dtype=np.float32)

        logger.info(
            f"LayoutPreprocessingService initialized with thresholds: "
            f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
            f"binarize={self.binarize_threshold}, "
            f"scaling={'enabled' if self.scaling_enabled else 'disabled'} "
            f"(min={self.scaling_min_dimension}, max={self.scaling_max_dimension}, target={self.scaling_target_dimension})"
        )

    def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
        """
        Analyze image quality to determine preprocessing needs.

        Args:
            image: Input image (BGR or grayscale)

        Returns:
            ImageQualityMetrics with contrast and edge_strength
        """
        # Convert to grayscale if needed
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Calculate contrast (standard deviation of pixel values)
        contrast = float(np.std(gray))

        # Calculate edge strength (mean of Sobel gradient magnitude)
        sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
        edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))

        return ImageQualityMetrics(
            contrast=round(contrast, 2),
            edge_strength=round(edge_strength, 2)
        )

    def _normalize_background(self, gray: np.ndarray) -> np.ndarray:
        """
        Normalize image background to remove uneven illumination.

        This is particularly effective for scanned documents where scanner
        lighting may be uneven, or where paper has yellowed/stained areas.

        Method:
        1. Estimate background using morphological closing (fills in text/details)
        2. Divide original by background estimate
        3. Rescale to full 0-255 range

        Args:
            gray: Grayscale image (L channel or grayscale)

        Returns:
            Normalized grayscale image with uniform background
        """
        # Create structuring element for morphological operations
        kernel_size = self.background_kernel_size
        # Ensure kernel size is odd
        if kernel_size % 2 == 0:
            kernel_size += 1

        kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE,
            (kernel_size, kernel_size)
        )

        # Morphological closing estimates the background
        # (dilate then erode - fills in dark features like text)
        background = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)

        # Apply Gaussian blur to smooth the background estimate
        background = cv2.GaussianBlur(background, (kernel_size, kernel_size), 0)

        # Avoid division by zero
        background = np.maximum(background, 1).astype(np.float32)

        # Normalize: divide by background and rescale to 0-255
        # This removes uneven illumination while preserving text/content
        normalized = (gray.astype(np.float32) / background) * 255.0

        # Clip and convert back to uint8
        normalized = np.clip(normalized, 0, 255).astype(np.uint8)

        logger.debug(
            f"Background normalization applied: kernel={kernel_size}, "
            f"background range=[{background.min():.0f}, {background.max():.0f}]"
        )

        return normalized

    def scale_for_layout_detection(
        self,
        image: np.ndarray,
        force_scale: bool = False
    ) -> Tuple[np.ndarray, ScalingInfo]:
        """
        Apply bidirectional scaling for optimal layout detection.

        PP-Structure's layout detection model (RT-DETR based) works best with images
        around 1600px on the longest side. Both too-large and too-small images
        reduce detection accuracy:

        - Too large (>2000px): Model's receptive field cannot capture entire structures
        - Too small (<1200px): Insufficient detail for accurate detection

        Scaling behavior:
        - max_dim > max_dimension (2000): Scale DOWN to target (1600)
        - max_dim < min_dimension (1200): Scale UP to target (1600)
        - min_dimension <= max_dim <= max_dimension: No scaling (optimal range)

        Args:
            image: Input image (BGR)
            force_scale: Force scaling to target even if in optimal range

        Returns:
            Tuple of (scaled_image, ScalingInfo).
            ScalingInfo.scale_factor is the multiplier to convert scaled bbox
            coordinates back to original image coordinates.
        """
        h, w = image.shape[:2]
        original_size = (w, h)
        max_dim = max(h, w)

        # Determine if scaling is needed and direction
        should_downscale = self.scaling_enabled and max_dim > self.scaling_max_dimension
        should_upscale = self.scaling_enabled and max_dim < self.scaling_min_dimension
        should_scale = should_downscale or should_upscale or force_scale

        if not should_scale:
            return image, ScalingInfo(
                was_scaled=False,
                scale_factor=1.0,
                original_size=original_size,
                scaled_size=original_size
            )

        # Calculate scale factor to reach target dimension
        actual_scale = self.scaling_target_dimension / max_dim
        new_w = int(w * actual_scale)
        new_h = int(h * actual_scale)

        # Choose interpolation method based on scale direction
        if actual_scale < 1.0:
            # Downscaling: INTER_AREA is best for shrinking (anti-aliasing)
            interpolation = cv2.INTER_AREA
            direction = "DOWN"
        else:
            # Upscaling: INTER_CUBIC provides smooth enlargement
            interpolation = cv2.INTER_CUBIC
            direction = "UP"

        scaled_image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)

        # scale_factor is the inverse - used to scale bbox coords back to original
        scale_factor = 1.0 / actual_scale

        logger.info(
            f"Scaled {direction} for layout detection: {w}x{h} -> {new_w}x{new_h} "
            f"(scale_factor={scale_factor:.3f} to restore original coords)"
        )

        return scaled_image, ScalingInfo(
            was_scaled=True,
            scale_factor=scale_factor,
            original_size=original_size,
            scaled_size=(new_w, new_h)
        )

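    # Worked illustration of the scaling rules above (not executed; assumes the
    # documented defaults min=1200, max=2000, target=1600 from settings):
    #   3200x2400 scan  -> long side 3200 > 2000 -> resized DOWN to 1600x1200,
    #                      actual_scale = 0.5, ScalingInfo.scale_factor = 2.0.
    #   1000x750 image  -> long side 1000 < 1200 -> resized UP to 1600x1200,
    #                      actual_scale = 1.6, ScalingInfo.scale_factor = 0.625.
    #   1800x1400 image -> already within 1200-2000 -> returned unchanged.
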
    @staticmethod
    def scale_bbox_to_original(
        bbox: Tuple[float, float, float, float],
        scale_factor: float
    ) -> Tuple[float, float, float, float]:
        """
        Scale a bounding box from scaled coordinates back to original image coordinates.

        Args:
            bbox: Bounding box as (x1, y1, x2, y2) in scaled image coordinates
            scale_factor: Factor to multiply (from ScalingInfo.scale_factor)

        Returns:
            Bounding box in original image coordinates
        """
        x1, y1, x2, y2 = bbox
        return (
            x1 * scale_factor,
            y1 * scale_factor,
            x2 * scale_factor,
            y2 * scale_factor
        )

    @staticmethod
    def scale_bboxes_to_original(
        bboxes: list,
        scale_factor: float
    ) -> list:
        """
        Scale multiple bounding boxes from scaled coordinates to original.

        Args:
            bboxes: List of bounding boxes, each as (x1, y1, x2, y2)
            scale_factor: Factor to multiply (from ScalingInfo.scale_factor)

        Returns:
            List of bounding boxes in original image coordinates
        """
        return [
            LayoutPreprocessingService.scale_bbox_to_original(bbox, scale_factor)
            for bbox in bboxes
        ]

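    # Usage sketch for the helpers above (illustration only; the variable names
    # and box values are made up, not part of this module's API):
    #
    #   scaled_img, info = service.scale_for_layout_detection(page_bgr)
    #   detected = [(120.0, 80.0, 640.0, 400.0)]  # bbox from layout detection on scaled_img
    #   restored = LayoutPreprocessingService.scale_bboxes_to_original(
    #       detected, info.scale_factor
    #   )
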
    def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
        """
        Determine optimal preprocessing config based on image quality.

        Auto-detection calculates appropriate strength values:
        - Lower image contrast → Higher contrast_strength
        - Lower edge strength → Higher sharpen_strength
        - Binarization is disabled by default (rarely beneficial)

        Args:
            metrics: Image quality metrics from analyze_image_quality()

        Returns:
            PreprocessingConfig with recommended settings
        """
        # Determine contrast enhancement and strength
        if metrics.contrast < self.contrast_threshold:
            contrast = PreprocessingContrastEnum.CLAHE
            # Calculate strength based on how far below threshold
            # contrast=40 threshold, contrast=20 → strength=2.0, contrast=30 → strength=1.5
            contrast_ratio = (self.contrast_threshold - metrics.contrast) / self.contrast_threshold
            contrast_strength = min(1.0 + contrast_ratio * 2.0, 3.0)  # Range: 1.0 to 3.0
        else:
            contrast = PreprocessingContrastEnum.NONE
            contrast_strength = 1.0

        # Determine sharpening and strength
        if metrics.edge_strength < self.edge_threshold:
            sharpen = True
            # Calculate strength based on how far below threshold
            # edge=15 threshold, edge=5 → strength=1.67, edge=10 → strength=1.33
            edge_ratio = (self.edge_threshold - metrics.edge_strength) / self.edge_threshold
            sharpen_strength = min(1.0 + edge_ratio * 1.0, 2.0)  # Range: 1.0 to 2.0
        else:
            sharpen = False
            sharpen_strength = 1.0

        # Binarization is disabled by default - it rarely helps and often hurts
        # Only enable for extremely low contrast (< 15) which indicates a scan quality issue
        binarize = False  # Disabled by default

        logger.debug(
            f"Auto config: contrast={contrast} strength={contrast_strength:.2f}, "
            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}"
        )

        return PreprocessingConfig(
            contrast=contrast,
            contrast_strength=round(contrast_strength, 2),
            sharpen=sharpen,
            sharpen_strength=round(sharpen_strength, 2),
            binarize=binarize
        )

    def apply_contrast_enhancement(
        self,
        image: np.ndarray,
        method: PreprocessingContrastEnum,
        strength: float = 1.0
    ) -> np.ndarray:
        """
        Apply contrast enhancement to image.

        Args:
            image: Input image (BGR)
            method: Enhancement method (none, histogram, clahe, document)
            strength: Enhancement strength (0.5-3.0, default 1.0)
                - 0.5: Subtle enhancement
                - 1.0: Normal enhancement
                - 2.0: Strong enhancement
                - 3.0: Maximum enhancement

        Returns:
            Enhanced image (BGR)
        """
        if method == PreprocessingContrastEnum.NONE:
            return image

        # Convert to LAB color space for better enhancement
        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        l_channel, a_channel, b_channel = cv2.split(lab)

        if method == PreprocessingContrastEnum.HISTOGRAM:
            # Standard histogram equalization (strength affects blending)
            l_equalized = cv2.equalizeHist(l_channel)
            # Blend original with equalized based on strength
            alpha = min(strength, 1.0)  # Cap at 1.0 for histogram
            l_enhanced = cv2.addWeighted(l_equalized, alpha, l_channel, 1 - alpha, 0)
        elif method == PreprocessingContrastEnum.CLAHE:
            # Contrast Limited Adaptive Histogram Equalization
            # clipLimit controls contrast amplification: 2.0 is default, up to 6.0 for strong
            clip_limit = self.clahe_clip_limit * strength  # 2.0 * 1.0 = 2.0, 2.0 * 2.0 = 4.0
            clahe = cv2.createCLAHE(
                clipLimit=clip_limit,
                tileGridSize=self.clahe_tile_grid_size
            )
            l_enhanced = clahe.apply(l_channel)
        elif method == PreprocessingContrastEnum.DOCUMENT:
            # Document-specific enhancement for scanned documents
            # Step 1: Background normalization to remove uneven illumination
            l_normalized = self._normalize_background(l_channel)

            # Step 2: CLAHE with larger tiles optimized for documents
            clip_limit = self.document_clahe_clip_limit * strength
            clahe = cv2.createCLAHE(
                clipLimit=clip_limit,
                tileGridSize=self.document_clahe_tile_grid_size
            )
            l_enhanced = clahe.apply(l_normalized)
        else:
            return image

        # Merge channels and convert back to BGR
        enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
        enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

        return enhanced_bgr

    def apply_sharpening(self, image: np.ndarray, strength: float = 1.0) -> np.ndarray:
        """
        Apply sharpening to enhance edges and faint lines using unsharp mask.

        Args:
            image: Input image (BGR)
            strength: Sharpening strength (0.5-2.0, default 1.0)
                - 0.5: Subtle sharpening
                - 1.0: Normal sharpening
                - 1.5: Strong sharpening
                - 2.0: Maximum sharpening

        Returns:
            Sharpened image (BGR)
        """
        # Use unsharp mask technique for better control:
        # 1. Create blurred version
        # 2. Subtract from original (scaled by strength)
        # 3. Add back to original

        # Gaussian blur with a fixed sigma; strength controls the blend weight below
        sigma = 1.0
        blurred = cv2.GaussianBlur(image, (0, 0), sigma)

        # Unsharp mask: original + (original - blurred) * strength
        # This is equivalent to: original * (1 + strength) - blurred * strength
        sharpened = cv2.addWeighted(image, 1.0 + strength, blurred, -strength, 0)

        # Clip values to valid range
        sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)

        return sharpened

    def apply_binarization(self, image: np.ndarray) -> np.ndarray:
        """
        Apply adaptive binarization for very low contrast documents.

        Args:
            image: Input image (BGR)

        Returns:
            Binarized image (BGR, but grayscale values)
        """
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Apply adaptive thresholding
        binary = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=11,
            C=2
        )

        # Convert back to BGR for consistency
        binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

        return binary_bgr

    def preprocess(
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
        config: Optional[PreprocessingConfig] = None,
        apply_scaling: bool = True
    ) -> PreprocessingResult:
        """
        Preprocess image for layout detection.

        The preprocessing pipeline:
        1. Load image from path/PIL if needed
        2. Analyze image quality (on original image for accurate metrics)
        3. Scale the image into the optimal range for layout detection
        4. Apply contrast enhancement if needed
        5. Apply sharpening if needed
        6. Apply binarization if requested (not recommended)

        IMPORTANT: When scaling is applied, all bounding boxes from layout detection
        must be scaled back to original coordinates using ScalingInfo.scale_factor.
        The original image should be used for element extraction (cropping).

        Args:
            image: Input image (numpy array, PIL Image, or path)
            mode: Preprocessing mode (auto, manual, disabled)
            config: Manual configuration (required if mode='manual')
            apply_scaling: Whether to apply automatic scaling (default True)

        Returns:
            PreprocessingResult with preprocessed image and metadata
        """
        # Load image if path provided
        if isinstance(image, (str, Path)):
            image_path = image
            image = cv2.imread(str(image_path))
            if image is None:
                raise ValueError(f"Failed to load image: {image_path}")
        elif isinstance(image, Image.Image):
            # Convert PIL to OpenCV format (BGR)
            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        # Analyze quality on ORIGINAL image (before scaling) for accurate metrics
        metrics = self.analyze_image_quality(image)
        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")

        # Apply scaling for layout detection (even if preprocessing is disabled)
        if apply_scaling:
            scaled_image, scaling_info = self.scale_for_layout_detection(image)
        else:
            h, w = image.shape[:2]
            scaled_image = image
            scaling_info = ScalingInfo(
                was_scaled=False,
                scale_factor=1.0,
                original_size=(w, h),
                scaled_size=(w, h)
            )

        # Determine configuration
        if mode == PreprocessingModeEnum.DISABLED:
            # Even when preprocessing is disabled, we still return the scaled image
            # for better layout detection. The original image is preserved for cropping.
            return PreprocessingResult(
                image=scaled_image,
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
                    binarize=False
                ),
                quality_metrics=metrics,
                was_processed=scaling_info.was_scaled,  # True if scaling was applied
                scaling_info=scaling_info
            )

        if mode == PreprocessingModeEnum.AUTO:
            config = self.get_auto_config(metrics)
            logger.debug(f"Auto config: {config}")
        elif config is None:
            # Manual mode but no config provided, use defaults
            config = PreprocessingConfig()

        # Apply preprocessing pipeline on SCALED image
        processed = scaled_image.copy()
        was_processed = scaling_info.was_scaled  # Starts True if scaling was applied

        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
            processed = self.apply_contrast_enhancement(
                processed,
                config.contrast,
                strength=config.contrast_strength
            )
            was_processed = True
            logger.debug(f"Applied contrast enhancement: {config.contrast} (strength={config.contrast_strength})")

        # Step 2: Sharpening
        if config.sharpen:
            processed = self.apply_sharpening(processed, strength=config.sharpen_strength)
            was_processed = True
            logger.debug(f"Applied sharpening (strength={config.sharpen_strength})")

        # Step 3: Binarization (last step, overwrites color)
        if config.binarize:
            processed = self.apply_binarization(processed)
            was_processed = True
            logger.debug("Applied binarization")

        return PreprocessingResult(
            image=processed,
            config_used=config,
            quality_metrics=metrics,
            was_processed=was_processed,
            scaling_info=scaling_info
        )

    def preprocess_to_pil(
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
        config: Optional[PreprocessingConfig] = None,
        apply_scaling: bool = True
    ) -> Tuple[Image.Image, PreprocessingResult]:
        """
        Preprocess image and return as PIL Image.

        Convenience method for integration with PP-Structure, which accepts PIL images.

        IMPORTANT: When result.scaling_info.was_scaled is True, all bounding boxes
        from PP-Structure must be scaled back to original coordinates using:
            original_bbox = (x1 * scale_factor, y1 * scale_factor, x2 * scale_factor, y2 * scale_factor)
        where scale_factor = result.scaling_info.scale_factor

        Args:
            image: Input image
            mode: Preprocessing mode
            config: Manual configuration
            apply_scaling: Whether to apply automatic scaling (default True)

        Returns:
            Tuple of (PIL Image for PP-Structure, PreprocessingResult with scaling info)
        """
        result = self.preprocess(image, mode, config, apply_scaling=apply_scaling)

        # Convert BGR to RGB for PIL
        rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_image)

        return pil_image, result

    def save_preview(
        self,
        original: np.ndarray,
        preprocessed: np.ndarray,
        output_dir: Path,
        prefix: str = "preview"
    ) -> Tuple[Path, Path]:
        """
        Save original and preprocessed images for preview.

        Args:
            original: Original image (BGR)
            preprocessed: Preprocessed image (BGR)
            output_dir: Directory to save images
            prefix: Filename prefix

        Returns:
            Tuple of (original_path, preprocessed_path)
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        original_path = output_dir / f"{prefix}_original.png"
        preprocessed_path = output_dir / f"{prefix}_preprocessed.png"

        cv2.imwrite(str(original_path), original)
        cv2.imwrite(str(preprocessed_path), preprocessed)

        return original_path, preprocessed_path


# Singleton instance
_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None


def get_layout_preprocessing_service() -> LayoutPreprocessingService:
    """Get or create the layout preprocessing service singleton."""
    global _layout_preprocessing_service
    if _layout_preprocessing_service is None:
        _layout_preprocessing_service = LayoutPreprocessingService()
    return _layout_preprocessing_service
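

# ---------------------------------------------------------------------------
# Usage sketch (illustration only). The file name and `run_pp_structure` are
# hypothetical stand-ins for the caller's inputs and the PP-Structure call;
# the real integration lives in the OCR service.
#
#   service = get_layout_preprocessing_service()
#   pil_image, result = service.preprocess_to_pil("page_001.png")  # hypothetical path
#   regions = run_pp_structure(pil_image)  # placeholder for layout detection
#   if result.scaling_info and result.scaling_info.was_scaled:
#       boxes = [region["bbox"] for region in regions]
#       boxes = LayoutPreprocessingService.scale_bboxes_to_original(
#           boxes, result.scaling_info.scale_factor
#       )
#   # Crop elements from the ORIGINAL image using the restored coordinates.
# ---------------------------------------------------------------------------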