feat: implement layout preprocessing backend
Backend implementation for the add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpening, and binarization
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse
Preprocessing affects only the layout detection input.
Original images are preserved for element extraction.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,11 @@ except ImportError:
|
||||
from app.core.config import settings
|
||||
from app.services.office_converter import OfficeConverter, OfficeConverterError
|
||||
from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
|
||||
from app.services.layout_preprocessing_service import (
|
||||
get_layout_preprocessing_service,
|
||||
LayoutPreprocessingService,
|
||||
)
|
||||
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
|
||||
|
||||
# Import dual-track components
|
||||
try:
|
||||
@@ -865,7 +870,9 @@ class OCRService:
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None,
|
||||
current_page: int = 0,
|
||||
layout_model: Optional[str] = None
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
@@ -878,6 +885,8 @@ class OCRService:
|
||||
output_dir: Optional output directory for saving extracted images
|
||||
current_page: Current page number (0-based) for multi-page documents
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results and metadata
|
||||
@@ -946,7 +955,9 @@ class OCRService:
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir,
|
||||
current_page=page_num - 1, # Convert to 0-based page number for layout data
|
||||
layout_model=layout_model
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
@@ -1092,7 +1103,9 @@ class OCRService:
|
||||
image_path,
|
||||
output_dir=output_dir,
|
||||
current_page=current_page,
|
||||
layout_model=layout_model
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config
|
||||
)
|
||||
|
||||
# Generate Markdown
|
||||
@@ -1248,7 +1261,9 @@ class OCRService:
|
||||
image_path: Path,
|
||||
output_dir: Optional[Path] = None,
|
||||
current_page: int = 0,
|
||||
layout_model: Optional[str] = None
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3 with enhanced element extraction
|
||||
@@ -1258,6 +1273,8 @@ class OCRService:
|
||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
||||
current_page: Current page number (0-based) for multi-page documents
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
|
||||
Returns:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
@@ -1277,13 +1294,45 @@ class OCRService:
|
||||
|
||||
structure_engine = self._ensure_structure_engine(layout_model)
|
||||
|
||||
# Apply image preprocessing for layout detection
|
||||
# Preprocessing enhances faint lines/borders to improve table detection
|
||||
# Original image is preserved for element extraction
|
||||
preprocessed_image = None
|
||||
preprocessing_result = None
|
||||
|
||||
# Determine preprocessing mode (default from config if not specified)
|
||||
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
|
||||
|
||||
if mode != PreprocessingModeEnum.DISABLED:
|
||||
try:
|
||||
preprocessing_service = get_layout_preprocessing_service()
|
||||
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
|
||||
image_path,
|
||||
mode=mode,
|
||||
config=preprocessing_config
|
||||
)
|
||||
|
||||
if preprocessing_result.was_processed:
|
||||
preprocessed_image = preprocessed_pil
|
||||
logger.info(
|
||||
f"Layout preprocessing applied: mode={mode.value}, "
|
||||
f"config={preprocessing_result.config_used}, "
|
||||
f"metrics={preprocessing_result.quality_metrics}"
|
||||
)
|
||||
else:
|
||||
logger.info(f"No preprocessing needed (mode={mode.value})")
|
||||
|
||||
except Exception as preprocess_error:
|
||||
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
|
||||
preprocessed_image = None
|
||||
|
||||
# Try enhanced processing first
|
||||
try:
|
||||
from app.services.pp_structure_enhanced import PPStructureEnhanced
|
||||
|
||||
enhanced_processor = PPStructureEnhanced(structure_engine)
|
||||
result = enhanced_processor.analyze_with_full_structure(
|
||||
image_path, output_dir, current_page
|
||||
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
|
||||
)
|
||||
|
||||
if result.get('has_parsing_res_list'):
|
||||
@@ -1337,7 +1386,17 @@ class OCRService:
|
||||
logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
|
||||
return None, []
|
||||
|
||||
results = structure_engine.predict(str(image_path))
|
||||
# Use preprocessed image if available, otherwise original path
|
||||
if preprocessed_image is not None:
|
||||
import numpy as np
|
||||
# Convert PIL to numpy array (BGR format for PP-Structure)
|
||||
predict_input = np.array(preprocessed_image)
|
||||
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
|
||||
# Convert RGB to BGR
|
||||
predict_input = predict_input[:, :, ::-1]
|
||||
results = structure_engine.predict(predict_input)
|
||||
else:
|
||||
results = structure_engine.predict(str(image_path))
|
||||
|
||||
layout_elements = []
|
||||
images_metadata = []
|
||||
@@ -1509,7 +1568,9 @@ class OCRService:
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None,
|
||||
force_track: Optional[str] = None,
|
||||
layout_model: Optional[str] = None
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
) -> Union[UnifiedDocument, Dict]:
|
||||
"""
|
||||
Process document using dual-track approach.
|
||||
@@ -1522,6 +1583,8 @@ class OCRService:
|
||||
output_dir: Optional output directory for extracted images
|
||||
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
|
||||
Returns:
|
||||
UnifiedDocument if dual-track is enabled, Dict otherwise
|
||||
@@ -1529,7 +1592,8 @@ class OCRService:
|
||||
if not self.dual_track_enabled:
|
||||
# Fallback to traditional OCR processing
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
|
||||
start_time = datetime.now()
|
||||
@@ -1601,7 +1665,9 @@ class OCRService:
|
||||
ocr_result = self.process_file_traditional(
|
||||
actual_file_path, lang, detect_layout=True,
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir, layout_model=layout_model
|
||||
output_dir=output_dir, layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config
|
||||
)
|
||||
|
||||
# Convert OCR result to extract images
|
||||
@@ -1634,7 +1700,8 @@ class OCRService:
|
||||
# Use OCR for scanned documents, images, etc.
|
||||
logger.info("Using OCR track (PaddleOCR)")
|
||||
ocr_result = self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
|
||||
# Convert OCR result to UnifiedDocument using the converter
|
||||
@@ -1664,7 +1731,8 @@ class OCRService:
|
||||
logger.error(f"Error in dual-track processing: {e}")
|
||||
# Fallback to traditional OCR
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
|
||||
def _merge_ocr_images_into_direct(
|
||||
@@ -1743,7 +1811,9 @@ class OCRService:
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None,
|
||||
layout_model: Optional[str] = None
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Traditional OCR processing (legacy method).
|
||||
@@ -1755,6 +1825,8 @@ class OCRService:
|
||||
confidence_threshold: Minimum confidence threshold
|
||||
output_dir: Optional output directory
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results in legacy format
|
||||
@@ -1767,7 +1839,8 @@ class OCRService:
|
||||
all_results = []
|
||||
for i, image_path in enumerate(image_paths):
|
||||
result = self.process_image(
|
||||
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
|
||||
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
all_results.append(result)
|
||||
|
||||
@@ -1783,7 +1856,8 @@ class OCRService:
|
||||
else:
|
||||
# Single image or other file
|
||||
return self.process_image(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
|
||||
def _combine_results(self, results: List[Dict]) -> Dict:
|
||||
@@ -1868,7 +1942,9 @@ class OCRService:
|
||||
output_dir: Optional[Path] = None,
|
||||
use_dual_track: bool = True,
|
||||
force_track: Optional[str] = None,
|
||||
layout_model: Optional[str] = None
|
||||
layout_model: Optional[str] = None,
|
||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
||||
) -> Union[UnifiedDocument, Dict]:
|
||||
"""
|
||||
Main processing method with dual-track support.
|
||||
@@ -1882,6 +1958,8 @@ class OCRService:
|
||||
use_dual_track: Whether to use dual-track processing (default True)
|
||||
force_track: Force specific track ("ocr" or "direct")
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||
|
||||
Returns:
|
||||
UnifiedDocument if dual-track is enabled and use_dual_track=True,
|
||||
@@ -1893,12 +1971,14 @@ class OCRService:
|
||||
if (use_dual_track or force_track) and self.dual_track_enabled:
|
||||
# Use dual-track processing (or forced track)
|
||||
return self.process_with_dual_track(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
else:
|
||||
# Use traditional OCR processing (no force_track support)
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||
preprocessing_mode, preprocessing_config
|
||||
)
|
||||
|
||||
def process_legacy(
|
||||
|
||||
Reference in New Issue
Block a user