feat: enhance layout preprocessing and add unify-image-scaling proposal
Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
@@ -32,6 +32,15 @@ from app.schemas.task import (

logger = logging.getLogger(__name__)


@dataclass
class ScalingInfo:
    """Information about image scaling applied for layout detection."""
    was_scaled: bool
    scale_factor: float  # Factor to multiply bbox coords to get original size (1.0 / actual_scale)
    original_size: Tuple[int, int]  # (width, height) of original image
    scaled_size: Tuple[int, int]  # (width, height) after scaling


@dataclass
class PreprocessingResult:
    """Result of preprocessing operation."""
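Reviewer note: the `scale_factor` stored here is the inverse of the resize that was applied, so callers multiply detected coordinates by it to get back to original-image space. A minimal sketch with illustrative numbers (not taken from the codebase):

```python
# Illustrative only: a 3200x2400 scan resized to 1600px on the long side.
info = ScalingInfo(
    was_scaled=True,
    scale_factor=2.0,            # 1.0 / 0.5 (the resize that was applied)
    original_size=(3200, 2400),
    scaled_size=(1600, 1200),
)

# A box detected on the scaled image maps back by multiplying through:
x1, y1, x2, y2 = 100.0, 50.0, 400.0, 300.0
original = (x1 * info.scale_factor, y1 * info.scale_factor,
            x2 * info.scale_factor, y2 * info.scale_factor)
assert original == (200.0, 100.0, 800.0, 600.0)
```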
@@ -39,6 +48,7 @@ class PreprocessingResult:
    config_used: PreprocessingConfig
    quality_metrics: ImageQualityMetrics
    was_processed: bool
    scaling_info: Optional[ScalingInfo] = None  # Info about any scaling applied


class LayoutPreprocessingService:
@@ -60,10 +70,23 @@ class LayoutPreprocessingService:
        self.edge_threshold = settings.layout_preprocessing_edge_threshold
        self.binarize_threshold = settings.layout_preprocessing_binarize_threshold

        # Image scaling settings for layout detection (bidirectional)
        self.scaling_enabled = settings.layout_image_scaling_enabled
        self.scaling_max_dimension = settings.layout_image_scaling_max_dimension
        self.scaling_min_dimension = settings.layout_image_scaling_min_dimension
        self.scaling_target_dimension = settings.layout_image_scaling_target_dimension

        # CLAHE parameters
        self.clahe_clip_limit = 2.0
        self.clahe_tile_grid_size = (8, 8)

        # Document-specific CLAHE parameters (larger tiles for documents)
        self.document_clahe_clip_limit = 3.0
        self.document_clahe_tile_grid_size = (16, 16)

        # Background normalization parameters for scanned documents
        self.background_kernel_size = 51  # Morphological kernel size

        # Sharpening kernel (unsharp mask style)
        self.sharpen_kernel = np.array([
            [0, -1, 0],
@@ -74,7 +97,9 @@ class LayoutPreprocessingService:
        logger.info(
            f"LayoutPreprocessingService initialized with thresholds: "
            f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
-            f"binarize={self.binarize_threshold}"
+            f"binarize={self.binarize_threshold}, "
+            f"scaling={'enabled' if self.scaling_enabled else 'disabled'} "
+            f"(min={self.scaling_min_dimension}, max={self.scaling_max_dimension}, target={self.scaling_target_dimension})"
        )

    def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
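The four `layout_image_scaling_*` values are read from the app settings object, which is not part of this diff. A hypothetical pydantic-style declaration, with defaults mirroring the thresholds quoted in the docstrings below (1200/2000/1600); only the attribute names appear in the diff:

```python
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # Hypothetical sketch: only the attribute names are taken from this diff.
    layout_image_scaling_enabled: bool = True
    layout_image_scaling_max_dimension: int = 2000     # scale DOWN above this
    layout_image_scaling_min_dimension: int = 1200     # scale UP below this
    layout_image_scaling_target_dimension: int = 1600  # target for both directions
```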
@@ -106,6 +131,180 @@ class LayoutPreprocessingService:
            edge_strength=round(edge_strength, 2)
        )

    def _normalize_background(self, gray: np.ndarray) -> np.ndarray:
        """
        Normalize image background to remove uneven illumination.

        This is particularly effective for scanned documents where scanner
        lighting may be uneven, or where paper has yellowed/stained areas.

        Method:
            1. Estimate background using morphological closing (fills in text/details)
            2. Divide original by background estimate
            3. Rescale to full 0-255 range

        Args:
            gray: Grayscale image (L channel or grayscale)

        Returns:
            Normalized grayscale image with uniform background
        """
        # Create structuring element for morphological operations
        kernel_size = self.background_kernel_size
        # Ensure kernel size is odd
        if kernel_size % 2 == 0:
            kernel_size += 1

        kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE,
            (kernel_size, kernel_size)
        )

        # Morphological closing estimates the background
        # (dilate then erode - fills in dark features like text)
        background = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)

        # Apply Gaussian blur to smooth the background estimate
        background = cv2.GaussianBlur(background, (kernel_size, kernel_size), 0)

        # Avoid division by zero
        background = np.maximum(background, 1).astype(np.float32)

        # Normalize: divide by background and rescale to 0-255
        # This removes uneven illumination while preserving text/content
        normalized = (gray.astype(np.float32) / background) * 255.0

        # Clip and convert back to uint8
        normalized = np.clip(normalized, 0, 255).astype(np.uint8)

        logger.debug(
            f"Background normalization applied: kernel={kernel_size}, "
            f"background range=[{background.min():.0f}, {background.max():.0f}]"
        )

        return normalized
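The divide-by-background step is what flattens illumination: each pixel is expressed relative to its local background estimate, so paper lands near white everywhere while ink stays dark. A tiny numeric sketch of that arithmetic:

```python
import numpy as np

# Toy 1x3 strip: ink, paper, paper - the paper darkens under uneven light.
gray = np.array([[40, 120, 165]], dtype=np.float32)
background = np.array([[80, 160, 220]], dtype=np.float32)  # closing estimate

flat = np.clip(gray / background * 255.0, 0, 255).astype(np.uint8)
print(flat)  # [[127 191 191]] - both paper pixels align; ink stays dark
```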
    def scale_for_layout_detection(
        self,
        image: np.ndarray,
        force_scale: bool = False
    ) -> Tuple[np.ndarray, ScalingInfo]:
        """
        Apply bidirectional scaling for optimal layout detection.

        PP-Structure's layout detection model (RT-DETR based) works best with images
        around 1600px on the longest side. Both too-large and too-small images
        reduce detection accuracy:

        - Too large (>2000px): Model's receptive field cannot capture entire structures
        - Too small (<1200px): Insufficient detail for accurate detection

        Scaling behavior:
        - max_dim > max_dimension (2000): Scale DOWN to target (1600)
        - max_dim < min_dimension (1200): Scale UP to target (1600)
        - min_dimension <= max_dim <= max_dimension: No scaling (optimal range)

        Args:
            image: Input image (BGR)
            force_scale: Force scaling to target even if in optimal range

        Returns:
            Tuple of (scaled_image, ScalingInfo)
            ScalingInfo.scale_factor is the multiplier to convert scaled bbox
            coordinates back to original image coordinates.
        """
        h, w = image.shape[:2]
        original_size = (w, h)
        max_dim = max(h, w)

        # Determine if scaling is needed and direction
        should_downscale = self.scaling_enabled and max_dim > self.scaling_max_dimension
        should_upscale = self.scaling_enabled and max_dim < self.scaling_min_dimension
        should_scale = should_downscale or should_upscale or force_scale

        if not should_scale:
            return image, ScalingInfo(
                was_scaled=False,
                scale_factor=1.0,
                original_size=original_size,
                scaled_size=original_size
            )

        # Calculate scale factor to reach target dimension
        actual_scale = self.scaling_target_dimension / max_dim
        new_w = int(w * actual_scale)
        new_h = int(h * actual_scale)

        # Choose interpolation method based on scale direction
        if actual_scale < 1.0:
            # Downscaling: INTER_AREA is best for shrinking (anti-aliasing)
            interpolation = cv2.INTER_AREA
            direction = "DOWN"
        else:
            # Upscaling: INTER_CUBIC provides smooth enlargement
            interpolation = cv2.INTER_CUBIC
            direction = "UP"

        scaled_image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)

        # scale_factor is the inverse - used to scale bbox coords back to original
        scale_factor = 1.0 / actual_scale

        logger.info(
            f"Scaled {direction} for layout detection: {w}x{h} -> {new_w}x{new_h} "
            f"(scale_factor={scale_factor:.3f} to restore original coords)"
        )

        return scaled_image, ScalingInfo(
            was_scaled=True,
            scale_factor=scale_factor,
            original_size=original_size,
            scaled_size=(new_w, new_h)
        )
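Worked outcomes of the bidirectional rule for a few long-side sizes, using the thresholds quoted in the docstring (min=1200, max=2000, target=1600). A small mirror of the decision logic, for eyeballing only:

```python
def expected_scaling(max_dim: int, lo: int = 1200, hi: int = 2000, target: int = 1600):
    """Standalone mirror of the decision above; not part of the service."""
    if lo <= max_dim <= hi:
        return "none", 1.0
    actual_scale = target / max_dim
    return ("DOWN" if actual_scale < 1.0 else "UP"), 1.0 / actual_scale

assert expected_scaling(3200) == ("DOWN", 2.0)  # shrink; bboxes multiply by 2.0
assert expected_scaling(1600) == ("none", 1.0)  # optimal range, untouched
assert expected_scaling(800) == ("UP", 0.5)     # enlarge; bboxes multiply by 0.5
```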
    @staticmethod
    def scale_bbox_to_original(
        bbox: Tuple[float, float, float, float],
        scale_factor: float
    ) -> Tuple[float, float, float, float]:
        """
        Scale a bounding box from scaled coordinates back to original image coordinates.

        Args:
            bbox: Bounding box as (x1, y1, x2, y2) in scaled image coordinates
            scale_factor: Factor to multiply (from ScalingInfo.scale_factor)

        Returns:
            Bounding box in original image coordinates
        """
        x1, y1, x2, y2 = bbox
        return (
            x1 * scale_factor,
            y1 * scale_factor,
            x2 * scale_factor,
            y2 * scale_factor
        )

    @staticmethod
    def scale_bboxes_to_original(
        bboxes: list,
        scale_factor: float
    ) -> list:
        """
        Scale multiple bounding boxes from scaled coordinates to original.

        Args:
            bboxes: List of bounding boxes, each as (x1, y1, x2, y2)
            scale_factor: Factor to multiply (from ScalingInfo.scale_factor)

        Returns:
            List of bounding boxes in original image coordinates
        """
        return [
            LayoutPreprocessingService.scale_bbox_to_original(bbox, scale_factor)
            for bbox in bboxes
        ]
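A minimal usage sketch of the two helpers, assuming a scale_factor of 2.0 from an earlier downscale (the detection boxes are made up):

```python
# Hypothetical detections in scaled-image coordinates.
detected = [(100.0, 50.0, 400.0, 300.0), (420.0, 60.0, 780.0, 500.0)]

restored = LayoutPreprocessingService.scale_bboxes_to_original(detected, 2.0)
# -> [(200.0, 100.0, 800.0, 600.0), (840.0, 120.0, 1560.0, 1000.0)]
```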
    def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
        """
        Determine optimal preprocessing config based on image quality.
@@ -203,6 +402,18 @@ class LayoutPreprocessingService:
                tileGridSize=self.clahe_tile_grid_size
            )
            l_enhanced = clahe.apply(l_channel)
        elif method == PreprocessingContrastEnum.DOCUMENT:
            # Document-specific enhancement for scanned documents
            # Step 1: Background normalization to remove uneven illumination
            l_normalized = self._normalize_background(l_channel)

            # Step 2: CLAHE with larger tiles optimized for documents
            clip_limit = self.document_clahe_clip_limit * strength
            clahe = cv2.createCLAHE(
                clipLimit=clip_limit,
                tileGridSize=self.document_clahe_tile_grid_size
            )
            l_enhanced = clahe.apply(l_normalized)
        else:
            return image
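For comparison, the two CLAHE configurations used by the service as a standalone sketch: the larger document tiles average over bigger regions so mostly-blank paper is not over-amplified, while the higher clip limit lifts faint text.

```python
import cv2

# General-purpose CLAHE (self.clahe_*): small tiles, conservative clipping.
clahe_default = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

# Document CLAHE (self.document_clahe_*): larger tiles, stronger clipping.
clahe_document = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(16, 16))

# Both apply to a single-channel uint8 image, e.g. the L channel of LAB:
# l_enhanced = clahe_document.apply(l_channel)
```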
@@ -277,15 +488,29 @@ class LayoutPreprocessingService:
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
-        config: Optional[PreprocessingConfig] = None
+        config: Optional[PreprocessingConfig] = None,
+        apply_scaling: bool = True
    ) -> PreprocessingResult:
        """
        Preprocess image for layout detection.

        The preprocessing pipeline:
        1. Load image from path/PIL if needed
        2. Analyze image quality (on the original image for accurate metrics)
        3. Scale the image into the optimal range for layout detection (bidirectional)
        4. Apply contrast enhancement if needed
        5. Apply sharpening if needed
        6. Apply binarization if requested (not recommended)

        IMPORTANT: When scaling is applied, all bounding boxes from layout detection
        must be scaled back to original coordinates using ScalingInfo.scale_factor.
        The original image should be used for element extraction (cropping).

        Args:
            image: Input image (numpy array, PIL Image, or path)
            mode: Preprocessing mode (auto, manual, disabled)
            config: Manual configuration (required if mode='manual')
            apply_scaling: Whether to apply automatic scaling (default True)

        Returns:
            PreprocessingResult with preprocessed image and metadata
@@ -299,21 +524,37 @@ class LayoutPreprocessingService:
            # Convert PIL to OpenCV format (BGR)
            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

-        # Analyze quality
+        # Analyze quality on ORIGINAL image (before scaling) for accurate metrics
        metrics = self.analyze_image_quality(image)
        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")

        # Apply scaling for layout detection (even if preprocessing is disabled)
        if apply_scaling:
            scaled_image, scaling_info = self.scale_for_layout_detection(image)
        else:
            h, w = image.shape[:2]
            scaled_image = image
            scaling_info = ScalingInfo(
                was_scaled=False,
                scale_factor=1.0,
                original_size=(w, h),
                scaled_size=(w, h)
            )

        # Determine configuration
        if mode == PreprocessingModeEnum.DISABLED:
            # Even when preprocessing is disabled, we still return the scaled image
            # for better layout detection. The original image is preserved for cropping.
            return PreprocessingResult(
-                image=image,
+                image=scaled_image,
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
                    binarize=False
                ),
                quality_metrics=metrics,
-                was_processed=False
+                was_processed=scaling_info.was_scaled,  # True if scaling was applied
+                scaling_info=scaling_info
            )

        if mode == PreprocessingModeEnum.AUTO:
@@ -323,9 +564,9 @@ class LayoutPreprocessingService:
            # Manual mode but no config provided, use defaults
            config = PreprocessingConfig()

-        # Apply preprocessing pipeline
-        processed = image.copy()
-        was_processed = False
+        # Apply preprocessing pipeline on the SCALED image
+        processed = scaled_image.copy()
+        was_processed = scaling_info.was_scaled  # Start with True if already scaled

        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
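Putting the contract together: detection runs on result.image (possibly scaled), while element cropping must use the original image with restored coordinates. A hedged end-to-end sketch; the file name and the detection call are placeholders:

```python
service = LayoutPreprocessingService()
result = service.preprocess("page_001.png", mode=PreprocessingModeEnum.AUTO)

# boxes = layout_engine.detect(result.image)   # placeholder detection call
boxes = [(120.0, 80.0, 640.0, 400.0)]          # pretend output, scaled coords

if result.scaling_info and result.scaling_info.was_scaled:
    boxes = LayoutPreprocessingService.scale_bboxes_to_original(
        boxes, result.scaling_info.scale_factor
    )
# 'boxes' is now in original-image coordinates, safe for cropping.
```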
@@ -353,29 +594,37 @@ class LayoutPreprocessingService:
            image=processed,
            config_used=config,
            quality_metrics=metrics,
-            was_processed=was_processed
+            was_processed=was_processed,
+            scaling_info=scaling_info
        )

    def preprocess_to_pil(
        self,
        image: Union[np.ndarray, Image.Image, str, Path],
        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
-        config: Optional[PreprocessingConfig] = None
+        config: Optional[PreprocessingConfig] = None,
+        apply_scaling: bool = True
    ) -> Tuple[Image.Image, PreprocessingResult]:
        """
        Preprocess image and return as PIL Image.

        Convenience method for integration with PP-Structure, which accepts PIL images.

        IMPORTANT: When result.scaling_info.was_scaled is True, all bounding boxes
        from PP-Structure must be scaled back to original coordinates using:
            original_bbox = (x1 * scale_factor, y1 * scale_factor, x2 * scale_factor, y2 * scale_factor)
        where scale_factor = result.scaling_info.scale_factor

        Args:
            image: Input image
            mode: Preprocessing mode
            config: Manual configuration
            apply_scaling: Whether to apply automatic scaling (default True)

        Returns:
-            Tuple of (PIL Image, PreprocessingResult)
+            Tuple of (PIL Image for PP-Structure, PreprocessingResult with scaling info)
        """
-        result = self.preprocess(image, mode, config)
+        result = self.preprocess(image, mode, config, apply_scaling=apply_scaling)

        # Convert BGR to RGB for PIL
        rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
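And the intended PP-Structure flow per the docstring above; the engine call and its result shape are placeholders, only the rescaling step is prescribed by this diff:

```python
pil_image, result = service.preprocess_to_pil("page_001.png")

# items = pp_structure_engine(pil_image)   # placeholder for the real engine
items = [{"type": "table", "bbox": (50.0, 40.0, 700.0, 500.0)}]

if result.scaling_info and result.scaling_info.was_scaled:
    sf = result.scaling_info.scale_factor
    for item in items:
        item["bbox"] = LayoutPreprocessingService.scale_bbox_to_original(
            item["bbox"], sf
        )
# Crop each element from the ORIGINAL image using item["bbox"].
```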