feat: implement layout preprocessing backend

Backend implementation for the add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpen, binarize
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects layout detection input.
Original images preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
egg
2025-11-27 15:17:20 +08:00
parent 06a5973f2e
commit ea0dd7456c
7 changed files with 800 additions and 22 deletions
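
For reviewers who want to poke at the new endpoint, a minimal request sketch. Only the route and the PreprocessingConfig field names (contrast, sharpen, binarize) come from this commit; the host, task id, exact payload shape, and response handling are assumptions.

import requests

# Sketch only: host, task id, and body shape are assumed;
# the route and config field names are taken from this commit.
resp = requests.post(
    "http://localhost:8000/api/v2/tasks/42/preview/preprocessing",
    json={
        "mode": "manual",  # PreprocessingModeEnum: auto | manual | disabled
        "config": {"contrast": "clahe", "sharpen": True, "binarize": False},
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # expected to match PreprocessingPreviewResponse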

app/services/layout_preprocessing_service.py

@@ -0,0 +1,370 @@
"""
Tool_OCR - Layout Preprocessing Service
Image preprocessing to enhance layout detection for documents with faint lines/borders.
This service provides:
1. Image quality analysis (contrast, edge strength)
2. Contrast enhancement (histogram equalization, CLAHE)
3. Sharpening for faint lines
4. Optional binarization for very low contrast documents
IMPORTANT: Preprocessing only affects layout detection input.
Original images are preserved for element extraction.
"""
import logging
from pathlib import Path
from typing import Optional, Tuple, Union
from dataclasses import dataclass
import cv2
import numpy as np
from PIL import Image
from app.core.config import settings
from app.schemas.task import (
PreprocessingConfig,
PreprocessingContrastEnum,
PreprocessingModeEnum,
ImageQualityMetrics,
)
logger = logging.getLogger(__name__)
@dataclass
class PreprocessingResult:
"""Result of preprocessing operation."""
image: np.ndarray
config_used: PreprocessingConfig
quality_metrics: ImageQualityMetrics
was_processed: bool
class LayoutPreprocessingService:
"""
Service for preprocessing images to improve layout detection.
The preprocessing pipeline:
1. Analyze image quality (contrast, edge strength)
2. Apply contrast enhancement if needed (CLAHE or histogram)
3. Apply sharpening if edge strength is low
4. Apply binarization if contrast is very low (optional)
All operations preserve the original color image dimensions.
"""
def __init__(self):
# Load thresholds from config
self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
self.edge_threshold = settings.layout_preprocessing_edge_threshold
self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
# CLAHE parameters
self.clahe_clip_limit = 2.0
self.clahe_tile_grid_size = (8, 8)
# Sharpening kernel (unsharp mask style)
self.sharpen_kernel = np.array([
[0, -1, 0],
[-1, 5, -1],
[0, -1, 0]
], dtype=np.float32)
logger.info(
f"LayoutPreprocessingService initialized with thresholds: "
f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
f"binarize={self.binarize_threshold}"
)
def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
"""
Analyze image quality to determine preprocessing needs.
Args:
image: Input image (BGR or grayscale)
Returns:
ImageQualityMetrics with contrast and edge_strength
"""
# Convert to grayscale if needed
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image
# Calculate contrast (standard deviation of pixel values)
contrast = float(np.std(gray))
# Calculate edge strength (mean of Sobel gradient magnitude)
sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))
return ImageQualityMetrics(
contrast=round(contrast, 2),
edge_strength=round(edge_strength, 2)
)
def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
"""
Determine optimal preprocessing config based on image quality.
Args:
metrics: Image quality metrics from analyze_image_quality()
Returns:
PreprocessingConfig with recommended settings
"""
# Determine contrast enhancement
if metrics.contrast < self.contrast_threshold:
contrast = PreprocessingContrastEnum.CLAHE
else:
contrast = PreprocessingContrastEnum.NONE
# Determine sharpening
sharpen = metrics.edge_strength < self.edge_threshold
# Determine binarization (only for very low contrast)
binarize = metrics.contrast < self.binarize_threshold
return PreprocessingConfig(
contrast=contrast,
sharpen=sharpen,
binarize=binarize
)
def apply_contrast_enhancement(
self,
image: np.ndarray,
method: PreprocessingContrastEnum
) -> np.ndarray:
"""
Apply contrast enhancement to image.
Args:
image: Input image (BGR)
method: Enhancement method (none, histogram, clahe)
Returns:
Enhanced image (BGR)
"""
if method == PreprocessingContrastEnum.NONE:
return image
# Convert to LAB color space for better enhancement
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
l_channel, a_channel, b_channel = cv2.split(lab)
if method == PreprocessingContrastEnum.HISTOGRAM:
# Standard histogram equalization
l_enhanced = cv2.equalizeHist(l_channel)
elif method == PreprocessingContrastEnum.CLAHE:
# Contrast Limited Adaptive Histogram Equalization
clahe = cv2.createCLAHE(
clipLimit=self.clahe_clip_limit,
tileGridSize=self.clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_channel)
else:
return image
# Merge channels and convert back to BGR
enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
return enhanced_bgr
def apply_sharpening(self, image: np.ndarray) -> np.ndarray:
"""
Apply sharpening to enhance edges and faint lines.
Args:
image: Input image (BGR)
Returns:
Sharpened image (BGR)
"""
# Apply unsharp mask style sharpening
sharpened = cv2.filter2D(image, -1, self.sharpen_kernel)
# Clip values to valid range
sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)
return sharpened
def apply_binarization(self, image: np.ndarray) -> np.ndarray:
"""
Apply adaptive binarization for very low contrast documents.
Args:
image: Input image (BGR)
Returns:
Binarized image (3-channel BGR layout, all channels identical)
"""
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply adaptive thresholding
binary = cv2.adaptiveThreshold(
gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11,
C=2
)
# Convert back to BGR for consistency
binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
return binary_bgr
def preprocess(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
) -> PreprocessingResult:
"""
Preprocess image for layout detection.
Args:
image: Input image (numpy array, PIL Image, or path)
mode: Preprocessing mode (auto, manual, disabled)
config: Manual configuration (required if mode='manual')
Returns:
PreprocessingResult with preprocessed image and metadata
"""
        # Load image if a path was provided
        if isinstance(image, (str, Path)):
            source_path = Path(image)
            image = cv2.imread(str(source_path))
            if image is None:
                # Report the path; `image` itself has been overwritten with None here
                raise ValueError(f"Failed to load image: {source_path}")
elif isinstance(image, Image.Image):
# Convert PIL to OpenCV format (BGR)
image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Analyze quality
metrics = self.analyze_image_quality(image)
logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
# Determine configuration
if mode == PreprocessingModeEnum.DISABLED:
return PreprocessingResult(
image=image,
config_used=PreprocessingConfig(
contrast=PreprocessingContrastEnum.NONE,
sharpen=False,
binarize=False
),
quality_metrics=metrics,
was_processed=False
)
if mode == PreprocessingModeEnum.AUTO:
config = self.get_auto_config(metrics)
logger.debug(f"Auto config: {config}")
elif config is None:
# Manual mode but no config provided, use defaults
config = PreprocessingConfig()
# Apply preprocessing pipeline
processed = image.copy()
was_processed = False
# Step 1: Contrast enhancement
if config.contrast != PreprocessingContrastEnum.NONE:
processed = self.apply_contrast_enhancement(processed, config.contrast)
was_processed = True
logger.debug(f"Applied contrast enhancement: {config.contrast}")
# Step 2: Sharpening
if config.sharpen:
processed = self.apply_sharpening(processed)
was_processed = True
logger.debug("Applied sharpening")
# Step 3: Binarization (last step, overwrites color)
if config.binarize:
processed = self.apply_binarization(processed)
was_processed = True
logger.debug("Applied binarization")
return PreprocessingResult(
image=processed,
config_used=config,
quality_metrics=metrics,
was_processed=was_processed
)
def preprocess_to_pil(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
) -> Tuple[Image.Image, PreprocessingResult]:
"""
Preprocess image and return as PIL Image.
Convenience method for integration with PP-Structure which accepts PIL images.
Args:
image: Input image
mode: Preprocessing mode
config: Manual configuration
Returns:
Tuple of (PIL Image, PreprocessingResult)
"""
result = self.preprocess(image, mode, config)
# Convert BGR to RGB for PIL
rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb_image)
return pil_image, result
def save_preview(
self,
original: np.ndarray,
preprocessed: np.ndarray,
output_dir: Path,
prefix: str = "preview"
) -> Tuple[Path, Path]:
"""
Save original and preprocessed images for preview.
Args:
original: Original image (BGR)
preprocessed: Preprocessed image (BGR)
output_dir: Directory to save images
prefix: Filename prefix
Returns:
Tuple of (original_path, preprocessed_path)
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
original_path = output_dir / f"{prefix}_original.png"
preprocessed_path = output_dir / f"{prefix}_preprocessed.png"
cv2.imwrite(str(original_path), original)
cv2.imwrite(str(preprocessed_path), preprocessed)
return original_path, preprocessed_path
# Singleton instance
_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None
def get_layout_preprocessing_service() -> LayoutPreprocessingService:
"""Get or create the layout preprocessing service singleton."""
global _layout_preprocessing_service
if _layout_preprocessing_service is None:
_layout_preprocessing_service = LayoutPreprocessingService()
return _layout_preprocessing_service
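
For orientation, a minimal usage sketch of the service above (not part of the commit). The image path is a placeholder, and the three thresholds are read from settings:

from app.schemas.task import PreprocessingModeEnum
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
# "scan.png" is a placeholder path; auto mode analyzes quality and decides what to apply
pil_image, result = service.preprocess_to_pil("scan.png", mode=PreprocessingModeEnum.AUTO)
print(result.quality_metrics)  # contrast / edge_strength that drove the decision
print(result.config_used)      # the PreprocessingConfig auto mode selected
print(result.was_processed)    # False if the image already met both thresholds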

app/services/ocr_service.py

@@ -26,6 +26,11 @@ except ImportError:
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
from app.services.layout_preprocessing_service import (
get_layout_preprocessing_service,
LayoutPreprocessingService,
)
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
# Import dual-track components
try:
@@ -865,7 +870,9 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
current_page: int = 0,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
@@ -878,6 +885,8 @@ class OCRService:
output_dir: Optional output directory for saving extracted images
current_page: Current page number (0-based) for multi-page documents
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Dictionary with OCR results and metadata
@@ -946,7 +955,9 @@ class OCRService:
confidence_threshold=confidence_threshold,
output_dir=output_dir,
current_page=page_num - 1, # Convert to 0-based page number for layout data
layout_model=layout_model
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Accumulate results
@@ -1092,7 +1103,9 @@ class OCRService:
image_path,
output_dir=output_dir,
current_page=current_page,
layout_model=layout_model
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Generate Markdown
@@ -1248,7 +1261,9 @@ class OCRService:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1258,6 +1273,8 @@ class OCRService:
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
current_page: Current page number (0-based) for multi-page documents
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Tuple of (layout_data, images_metadata)
@@ -1277,13 +1294,45 @@ class OCRService:
structure_engine = self._ensure_structure_engine(layout_model)
# Apply image preprocessing for layout detection
# Preprocessing enhances faint lines/borders to improve table detection
# Original image is preserved for element extraction
preprocessed_image = None
preprocessing_result = None
# Determine preprocessing mode (default from config if not specified)
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
if mode != PreprocessingModeEnum.DISABLED:
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}"
)
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
)
if result.get('has_parsing_res_list'):
@@ -1337,7 +1386,17 @@ class OCRService:
logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
return None, []
results = structure_engine.predict(str(image_path))
# Use preprocessed image if available, otherwise original path
if preprocessed_image is not None:
import numpy as np
# Convert PIL to numpy array (BGR format for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = structure_engine.predict(predict_input)
else:
results = structure_engine.predict(str(image_path))
layout_elements = []
images_metadata = []
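
One subtlety in the conversion above: predict_input[:, :, ::-1] yields a negative-stride NumPy view rather than a copy. If the Paddle predictor requires a C-contiguous buffer (an assumption, not something this diff confirms), an explicit copy is the safer form:

import numpy as np

# Equivalent RGB -> BGR conversion that guarantees a C-contiguous array
predict_input = np.ascontiguousarray(np.array(preprocessed_image)[:, :, ::-1])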
@@ -1509,7 +1568,9 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
force_track: Optional[str] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Union[UnifiedDocument, Dict]:
"""
Process document using dual-track approach.
@@ -1522,6 +1583,8 @@ class OCRService:
output_dir: Optional output directory for extracted images
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
UnifiedDocument if dual-track is enabled, Dict otherwise
@@ -1529,7 +1592,8 @@ class OCRService:
if not self.dual_track_enabled:
# Fallback to traditional OCR processing
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
start_time = datetime.now()
@@ -1601,7 +1665,9 @@ class OCRService:
ocr_result = self.process_file_traditional(
actual_file_path, lang, detect_layout=True,
confidence_threshold=confidence_threshold,
output_dir=output_dir, layout_model=layout_model
output_dir=output_dir, layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Convert OCR result to extract images
@@ -1634,7 +1700,8 @@ class OCRService:
# Use OCR for scanned documents, images, etc.
logger.info("Using OCR track (PaddleOCR)")
ocr_result = self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
# Convert OCR result to UnifiedDocument using the converter
@@ -1664,7 +1731,8 @@ class OCRService:
logger.error(f"Error in dual-track processing: {e}")
# Fallback to traditional OCR
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
def _merge_ocr_images_into_direct(
@@ -1743,7 +1811,9 @@ class OCRService:
detect_layout: bool = True,
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Dict:
"""
Traditional OCR processing (legacy method).
@@ -1755,6 +1825,8 @@ class OCRService:
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Dictionary with OCR results in legacy format
@@ -1767,7 +1839,8 @@ class OCRService:
all_results = []
for i, image_path in enumerate(image_paths):
result = self.process_image(
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
preprocessing_mode, preprocessing_config
)
all_results.append(result)
@@ -1783,7 +1856,8 @@ class OCRService:
else:
# Single image or other file
return self.process_image(
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
preprocessing_mode, preprocessing_config
)
def _combine_results(self, results: List[Dict]) -> Dict:
@@ -1868,7 +1942,9 @@ class OCRService:
output_dir: Optional[Path] = None,
use_dual_track: bool = True,
force_track: Optional[str] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -1882,6 +1958,8 @@ class OCRService:
use_dual_track: Whether to use dual-track processing (default True)
force_track: Force specific track ("ocr" or "direct")
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -1893,12 +1971,14 @@ class OCRService:
if (use_dual_track or force_track) and self.dual_track_enabled:
# Use dual-track processing (or forced track)
return self.process_with_dual_track(
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
preprocessing_mode, preprocessing_config
)
else:
# Use traditional OCR processing (no force_track support)
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
def process_legacy(

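To tie the threaded-through parameters together, a hedged call sketch against the traditional entry point. The service instance, file paths, and the "ch" language code are placeholders; the keyword names match the signature in this diff, and the MANUAL member of PreprocessingModeEnum is inferred from its documented values:

from pathlib import Path
from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
)

# Force CLAHE + sharpening for a scan with faint table borders (sketch only)
result = ocr_service.process_file_traditional(
    Path("faint_tables.pdf"),          # placeholder input
    lang="ch",                         # assumed language code
    detect_layout=True,
    output_dir=Path("out"),
    layout_model="chinese",
    preprocessing_mode=PreprocessingModeEnum.MANUAL,  # member name inferred from 'manual'
    preprocessing_config=PreprocessingConfig(
        contrast=PreprocessingContrastEnum.CLAHE,
        sharpen=True,
        binarize=False,
    ),
)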
app/services/pp_structure_enhanced.py

@@ -20,6 +20,8 @@ except ImportError:
import paddle
from paddleocr import PPStructureV3
from PIL import Image
import numpy as np
from app.models.unified_document import ElementType
from app.core.config import settings
from app.services.memory_manager import prediction_context
@@ -78,15 +80,19 @@ class PPStructureEnhanced:
self,
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
Args:
image_path: Path to image file
image_path: Path to original image file (used for cropping extracted images)
output_dir: Optional output directory for saving extracted content
current_page: Current page number (0-based)
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
Returns:
Dictionary with complete structure information including:
@@ -97,6 +103,8 @@ class PPStructureEnhanced:
"""
try:
logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
if preprocessed_image:
logger.info("Using preprocessed image for layout detection")
# Perform structure analysis with semaphore control
# This prevents OOM errors from multiple simultaneous predictions
@@ -113,7 +121,16 @@ class PPStructureEnhanced:
'error': 'Prediction slot timeout'
}
results = self.structure_engine.predict(str(image_path))
# Use preprocessed image if provided, otherwise use original path
if preprocessed_image is not None:
# Convert PIL to numpy array (BGR format for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = self.structure_engine.predict(predict_input)
else:
results = self.structure_engine.predict(str(image_path))
all_elements = []
all_images = []
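
Closing the loop, a sketch of the enhanced-path call as the new parameter intends it: layout detection sees the preprocessed image while crops still come from the original file. The engine construction and file names are placeholders:

from pathlib import Path
from PIL import Image

enhanced = PPStructureEnhanced(structure_engine)  # structure_engine assumed already built
result = enhanced.analyze_with_full_structure(
    Path("page_001.png"),                               # original file, still used for cropping
    output_dir=Path("out"),
    current_page=0,
    preprocessed_image=Image.open("page_001_pre.png"),  # layout-detection input only
)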