feat: implement layout preprocessing backend

Backend implementation for add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpen, binarize
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects layout detection input.
Original images preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-27 15:17:20 +08:00
Parent: 06a5973f2e
Commit: ea0dd7456c
7 changed files with 800 additions and 22 deletions


@@ -146,6 +146,39 @@ class Settings(BaseSettings):
description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
)
# ===== Layout Preprocessing Configuration =====
# Image preprocessing to enhance layout detection for documents with faint lines/borders
# Preprocessing only affects layout detection input; original image is preserved for extraction
layout_preprocessing_mode: str = Field(
default="auto",
description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
)
layout_preprocessing_contrast: str = Field(
default="clahe",
description="Contrast enhancement method: 'none', 'histogram', 'clahe' (recommended)"
)
layout_preprocessing_sharpen: bool = Field(
default=True,
description="Enable sharpening to enhance faint lines and borders"
)
layout_preprocessing_binarize: bool = Field(
default=False,
description="Enable binarization (aggressive, use for very low contrast documents only)"
)
# Auto-detection thresholds
layout_preprocessing_contrast_threshold: float = Field(
default=40.0,
description="Contrast (std dev) below this triggers CLAHE in auto mode"
)
layout_preprocessing_edge_threshold: float = Field(
default=15.0,
description="Edge strength below this triggers sharpening in auto mode"
)
layout_preprocessing_binarize_threshold: float = Field(
default=20.0,
description="Contrast below this triggers binarization in auto mode"
)
# ===== Gap Filling Configuration =====
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track
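
For intuition, the three auto-mode thresholds above combine as follows. This is a sketch mirroring get_auto_config in the new service further down; the sample metric values are illustrative:

# Illustrative only: how the default thresholds drive auto mode.
contrast, edge_strength = 32.5, 18.0  # sample output of analyze_image_quality()

use_clahe    = contrast < 40.0       # layout_preprocessing_contrast_threshold
use_sharpen  = edge_strength < 15.0  # layout_preprocessing_edge_threshold
use_binarize = contrast < 20.0       # layout_preprocessing_binarize_threshold
# -> CLAHE only: contrast is low, but edges are strong enough and
#    contrast is not low enough to justify binarization.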


@@ -35,6 +35,11 @@ from app.schemas.task import (
ProcessingMetadata,
TaskResponseWithMetadata,
ExportOptions,
PreprocessingModeEnum,
PreprocessingConfig,
PreprocessingPreviewRequest,
PreprocessingPreviewResponse,
ImageQualityMetrics,
)
from app.services.task_service import task_service
from app.services.file_access_service import file_access_service
@@ -1131,3 +1136,193 @@ async def download_unified(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to download: {str(e)}"
)
# ===== Preprocessing Preview Endpoints =====
@router.post("/{task_id}/preview/preprocessing", response_model=PreprocessingPreviewResponse, summary="Preview preprocessing effect")
async def preview_preprocessing(
task_id: str,
request: PreprocessingPreviewRequest,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user)
):
"""
Preview the effect of image preprocessing before OCR processing.
Shows side-by-side comparison of original and preprocessed images,
along with image quality metrics and auto-detected configuration.
- **task_id**: Task UUID
- **page**: Page number to preview (1-based)
- **mode**: Preprocessing mode ('auto', 'manual', 'disabled')
- **config**: Manual preprocessing config (only used when mode='manual')
"""
from pdf2image import convert_from_path
import base64
import io
from PIL import Image
from app.services.layout_preprocessing_service import get_layout_preprocessing_service
try:
# Get task details
task = task_service.get_task_by_id(
db=db,
task_id=task_id,
user_id=current_user.id
)
if not task:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Task not found"
)
# Get task file
task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
if not task_file:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Task file not found"
)
file_path = Path(task_file.stored_path)
if not file_path.exists():
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Source file not found"
)
# Get the page image
page_num = request.page
if file_path.suffix.lower() == '.pdf':
# Convert specific page from PDF
images = convert_from_path(
str(file_path),
first_page=page_num,
last_page=page_num,
dpi=150
)
if not images:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Page {page_num} not found in PDF"
)
original_image = images[0]
else:
# Direct image file
if page_num != 1:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Single image file only has page 1"
)
original_image = Image.open(file_path)
# Get preprocessing service
preprocessing_service = get_layout_preprocessing_service()
# Apply preprocessing
preprocessed_image, preprocess_result = preprocessing_service.preprocess_to_pil(
original_image,
mode=request.mode,
config=request.config
)
# Create result directory for preview images
preview_dir = Path(settings.result_dir) / task_id / "preview"
preview_dir.mkdir(parents=True, exist_ok=True)
# Save preview images
original_filename = f"page_{page_num}_original.png"
preprocessed_filename = f"page_{page_num}_preprocessed.png"
original_path = preview_dir / original_filename
preprocessed_path = preview_dir / preprocessed_filename
original_image.save(str(original_path), "PNG")
preprocessed_image.save(str(preprocessed_path), "PNG")
# Build URLs (relative paths that can be served)
base_url = f"/api/v2/tasks/{task_id}/preview/image"
original_url = f"{base_url}?type=original&page={page_num}"
preprocessed_url = f"{base_url}?type=preprocessed&page={page_num}"
return PreprocessingPreviewResponse(
original_url=original_url,
preprocessed_url=preprocessed_url,
quality_metrics=preprocess_result.quality_metrics,
auto_config=preprocess_result.config_used,
mode_used=request.mode
)
except HTTPException:
raise
except Exception as e:
logger.exception(f"Failed to preview preprocessing for task {task_id}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to preview preprocessing: {str(e)}"
)
@router.get("/{task_id}/preview/image", summary="Get preview image")
async def get_preview_image(
task_id: str,
type: str = Query(..., description="Image type: 'original' or 'preprocessed'"),
page: int = Query(1, ge=1, description="Page number"),
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user)
):
"""
Get a preview image (original or preprocessed).
- **task_id**: Task UUID
- **type**: Image type ('original' or 'preprocessed')
- **page**: Page number
"""
try:
# Verify task ownership
task = task_service.get_task_by_id(
db=db,
task_id=task_id,
user_id=current_user.id
)
if not task:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Task not found"
)
# Validate type parameter
if type not in ['original', 'preprocessed']:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Invalid type. Must be 'original' or 'preprocessed'"
)
# Build image path
preview_dir = Path(settings.result_dir) / task_id / "preview"
image_filename = f"page_{page}_{type}.png"
image_path = preview_dir / image_filename
if not image_path.exists():
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Preview image not found. Please call preview/preprocessing first."
)
return FileResponse(
path=str(image_path),
media_type="image/png",
filename=image_filename
)
except HTTPException:
raise
except Exception as e:
logger.exception(f"Failed to get preview image for task {task_id}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to get preview image: {str(e)}"
)
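
A client-side sketch of the full preview round trip. The base URL, bearer-token auth scheme, and task id are placeholders/assumptions; the endpoint paths and JSON fields come from the decorators and schemas in this commit:

import requests

BASE = "http://localhost:8000"                  # hypothetical deployment
HEADERS = {"Authorization": "Bearer <token>"}   # auth scheme assumed
task_id = "<task-uuid>"

resp = requests.post(
    f"{BASE}/api/v2/tasks/{task_id}/preview/preprocessing",
    headers=HEADERS,
    json={
        "page": 1,
        "mode": "manual",
        "config": {"contrast": "clahe", "sharpen": True, "binarize": False},
    },
)
preview = resp.json()

# The returned URLs point at the GET preview/image endpoint above.
img = requests.get(f"{BASE}{preview['preprocessed_url']}", headers=HEADERS)
open("preprocessed.png", "wb").write(img.content)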


@@ -37,6 +37,79 @@ class LayoutModelEnum(str, Enum):
CDLA = "cdla" # CDLA model - Alternative for Chinese layout
class PreprocessingModeEnum(str, Enum):
"""Preprocessing mode for layout detection enhancement.
- AUTO: Analyze image quality and automatically apply optimal preprocessing
- MANUAL: Use user-specified preprocessing configuration
- DISABLED: Skip preprocessing entirely
"""
AUTO = "auto" # Analyze and apply automatically (default)
MANUAL = "manual" # Use specified configuration
DISABLED = "disabled" # Skip preprocessing
class PreprocessingContrastEnum(str, Enum):
"""Contrast enhancement method for preprocessing.
- NONE: No contrast enhancement
- HISTOGRAM: Standard histogram equalization
- CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended)
"""
NONE = "none"
HISTOGRAM = "histogram"
CLAHE = "clahe"
class PreprocessingConfig(BaseModel):
"""Preprocessing configuration for layout detection enhancement.
Used to configure image preprocessing before PP-Structure layout detection.
Preprocessing helps detect tables with faint lines or low contrast borders.
Original image is preserved for element extraction.
"""
contrast: PreprocessingContrastEnum = Field(
default=PreprocessingContrastEnum.CLAHE,
description="Contrast enhancement method"
)
sharpen: bool = Field(
default=True,
description="Enable sharpening for faint lines"
)
binarize: bool = Field(
default=False,
description="Enable binarization (aggressive, for very low contrast)"
)
class ImageQualityMetrics(BaseModel):
"""Image quality metrics from auto-analysis."""
contrast: float = Field(..., description="Contrast level (std dev of grayscale)")
edge_strength: float = Field(..., description="Edge strength (Sobel gradient mean)")
class PreprocessingPreviewRequest(BaseModel):
"""Request for preprocessing preview."""
page: int = Field(default=1, ge=1, description="Page number to preview")
mode: PreprocessingModeEnum = Field(
default=PreprocessingModeEnum.AUTO,
description="Preprocessing mode"
)
config: Optional[PreprocessingConfig] = Field(
None,
description="Manual configuration (only used when mode='manual')"
)
class PreprocessingPreviewResponse(BaseModel):
"""Response for preprocessing preview."""
original_url: str = Field(..., description="URL to original image")
preprocessed_url: str = Field(..., description="URL to preprocessed image")
quality_metrics: ImageQualityMetrics = Field(..., description="Image quality analysis")
auto_config: PreprocessingConfig = Field(..., description="Auto-detected configuration")
mode_used: PreprocessingModeEnum = Field(..., description="Mode that was applied")
class TaskCreate(BaseModel):
"""Task creation request"""
filename: Optional[str] = Field(None, description="Original filename")
@@ -195,6 +268,16 @@ class ProcessingOptions(BaseModel):
description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)"
)
# Layout preprocessing (OCR track only)
preprocessing_mode: PreprocessingModeEnum = Field(
default=PreprocessingModeEnum.AUTO,
description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
)
preprocessing_config: Optional[PreprocessingConfig] = Field(
None,
description="Manual preprocessing config (only used when preprocessing_mode='manual')"
)
class AnalyzeRequest(BaseModel):
"""Document analysis request"""


@@ -0,0 +1,370 @@
"""
Tool_OCR - Layout Preprocessing Service
Image preprocessing to enhance layout detection for documents with faint lines/borders.
This service provides:
1. Image quality analysis (contrast, edge strength)
2. Contrast enhancement (histogram equalization, CLAHE)
3. Sharpening for faint lines
4. Optional binarization for very low contrast documents
IMPORTANT: Preprocessing only affects layout detection input.
Original images are preserved for element extraction.
"""
import logging
from pathlib import Path
from typing import Optional, Tuple, Union
from dataclasses import dataclass
import cv2
import numpy as np
from PIL import Image
from app.core.config import settings
from app.schemas.task import (
PreprocessingConfig,
PreprocessingContrastEnum,
PreprocessingModeEnum,
ImageQualityMetrics,
)
logger = logging.getLogger(__name__)
@dataclass
class PreprocessingResult:
"""Result of preprocessing operation."""
image: np.ndarray
config_used: PreprocessingConfig
quality_metrics: ImageQualityMetrics
was_processed: bool
class LayoutPreprocessingService:
"""
Service for preprocessing images to improve layout detection.
The preprocessing pipeline:
1. Analyze image quality (contrast, edge strength)
2. Apply contrast enhancement if needed (CLAHE or histogram)
3. Apply sharpening if edge strength is low
4. Apply binarization if contrast is very low (optional)
All operations preserve the original color image dimensions.
"""
def __init__(self):
# Load thresholds from config
self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
self.edge_threshold = settings.layout_preprocessing_edge_threshold
self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
# CLAHE parameters
self.clahe_clip_limit = 2.0
self.clahe_tile_grid_size = (8, 8)
# Sharpening kernel (unsharp mask style)
self.sharpen_kernel = np.array([
[0, -1, 0],
[-1, 5, -1],
[0, -1, 0]
], dtype=np.float32)
logger.info(
f"LayoutPreprocessingService initialized with thresholds: "
f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
f"binarize={self.binarize_threshold}"
)
def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
"""
Analyze image quality to determine preprocessing needs.
Args:
image: Input image (BGR or grayscale)
Returns:
ImageQualityMetrics with contrast and edge_strength
"""
# Convert to grayscale if needed
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image
# Calculate contrast (standard deviation of pixel values)
contrast = float(np.std(gray))
# Calculate edge strength (mean of Sobel gradient magnitude)
sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))
return ImageQualityMetrics(
contrast=round(contrast, 2),
edge_strength=round(edge_strength, 2)
)
def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
"""
Determine optimal preprocessing config based on image quality.
Args:
metrics: Image quality metrics from analyze_image_quality()
Returns:
PreprocessingConfig with recommended settings
"""
# Determine contrast enhancement
if metrics.contrast < self.contrast_threshold:
contrast = PreprocessingContrastEnum.CLAHE
else:
contrast = PreprocessingContrastEnum.NONE
# Determine sharpening
sharpen = metrics.edge_strength < self.edge_threshold
# Determine binarization (only for very low contrast)
binarize = metrics.contrast < self.binarize_threshold
return PreprocessingConfig(
contrast=contrast,
sharpen=sharpen,
binarize=binarize
)
def apply_contrast_enhancement(
self,
image: np.ndarray,
method: PreprocessingContrastEnum
) -> np.ndarray:
"""
Apply contrast enhancement to image.
Args:
image: Input image (BGR)
method: Enhancement method (none, histogram, clahe)
Returns:
Enhanced image (BGR)
"""
if method == PreprocessingContrastEnum.NONE:
return image
# Convert to LAB color space for better enhancement
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
l_channel, a_channel, b_channel = cv2.split(lab)
if method == PreprocessingContrastEnum.HISTOGRAM:
# Standard histogram equalization
l_enhanced = cv2.equalizeHist(l_channel)
elif method == PreprocessingContrastEnum.CLAHE:
# Contrast Limited Adaptive Histogram Equalization
clahe = cv2.createCLAHE(
clipLimit=self.clahe_clip_limit,
tileGridSize=self.clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_channel)
else:
return image
# Merge channels and convert back to BGR
enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
return enhanced_bgr
def apply_sharpening(self, image: np.ndarray) -> np.ndarray:
"""
Apply sharpening to enhance edges and faint lines.
Args:
image: Input image (BGR)
Returns:
Sharpened image (BGR)
"""
# Apply unsharp mask style sharpening
sharpened = cv2.filter2D(image, -1, self.sharpen_kernel)
# Clip values to valid range
sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)
return sharpened
def apply_binarization(self, image: np.ndarray) -> np.ndarray:
"""
Apply adaptive binarization for very low contrast documents.
Args:
image: Input image (BGR)
Returns:
Binarized image (BGR, but grayscale values)
"""
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply adaptive thresholding
binary = cv2.adaptiveThreshold(
gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=11,
C=2
)
# Convert back to BGR for consistency
binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
return binary_bgr
def preprocess(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
) -> PreprocessingResult:
"""
Preprocess image for layout detection.
Args:
image: Input image (numpy array, PIL Image, or path)
mode: Preprocessing mode (auto, manual, disabled)
config: Manual configuration (required if mode='manual')
Returns:
PreprocessingResult with preprocessed image and metadata
"""
# Load image if path provided
if isinstance(image, (str, Path)):
    source_path = image
    image = cv2.imread(str(source_path))
    if image is None:
        raise ValueError(f"Failed to load image: {source_path}")
elif isinstance(image, Image.Image):
# Convert PIL to OpenCV format (BGR)
image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Analyze quality
metrics = self.analyze_image_quality(image)
logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
# Determine configuration
if mode == PreprocessingModeEnum.DISABLED:
return PreprocessingResult(
image=image,
config_used=PreprocessingConfig(
contrast=PreprocessingContrastEnum.NONE,
sharpen=False,
binarize=False
),
quality_metrics=metrics,
was_processed=False
)
if mode == PreprocessingModeEnum.AUTO:
config = self.get_auto_config(metrics)
logger.debug(f"Auto config: {config}")
elif config is None:
# Manual mode but no config provided, use defaults
config = PreprocessingConfig()
# Apply preprocessing pipeline
processed = image.copy()
was_processed = False
# Step 1: Contrast enhancement
if config.contrast != PreprocessingContrastEnum.NONE:
processed = self.apply_contrast_enhancement(processed, config.contrast)
was_processed = True
logger.debug(f"Applied contrast enhancement: {config.contrast}")
# Step 2: Sharpening
if config.sharpen:
processed = self.apply_sharpening(processed)
was_processed = True
logger.debug("Applied sharpening")
# Step 3: Binarization (last step, overwrites color)
if config.binarize:
processed = self.apply_binarization(processed)
was_processed = True
logger.debug("Applied binarization")
return PreprocessingResult(
image=processed,
config_used=config,
quality_metrics=metrics,
was_processed=was_processed
)
def preprocess_to_pil(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
) -> Tuple[Image.Image, PreprocessingResult]:
"""
Preprocess image and return as PIL Image.
Convenience method for integration with PP-Structure which accepts PIL images.
Args:
image: Input image
mode: Preprocessing mode
config: Manual configuration
Returns:
Tuple of (PIL Image, PreprocessingResult)
"""
result = self.preprocess(image, mode, config)
# Convert BGR to RGB for PIL
rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb_image)
return pil_image, result
def save_preview(
self,
original: np.ndarray,
preprocessed: np.ndarray,
output_dir: Path,
prefix: str = "preview"
) -> Tuple[Path, Path]:
"""
Save original and preprocessed images for preview.
Args:
original: Original image (BGR)
preprocessed: Preprocessed image (BGR)
output_dir: Directory to save images
prefix: Filename prefix
Returns:
Tuple of (original_path, preprocessed_path)
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
original_path = output_dir / f"{prefix}_original.png"
preprocessed_path = output_dir / f"{prefix}_preprocessed.png"
cv2.imwrite(str(original_path), original)
cv2.imwrite(str(preprocessed_path), preprocessed)
return original_path, preprocessed_path
# Singleton instance
_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None
def get_layout_preprocessing_service() -> LayoutPreprocessingService:
"""Get or create the layout preprocessing service singleton."""
global _layout_preprocessing_service
if _layout_preprocessing_service is None:
_layout_preprocessing_service = LayoutPreprocessingService()
return _layout_preprocessing_service
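
Typical standalone usage of the service (the input path is illustrative):

from pathlib import Path

from app.schemas.task import PreprocessingModeEnum
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
pil_image, result = service.preprocess_to_pil(
    Path("/tmp/page_1.png"),  # illustrative path
    mode=PreprocessingModeEnum.AUTO,
)
print(result.quality_metrics)  # e.g. contrast=31.2 edge_strength=12.8
print(result.config_used)      # auto-selected PreprocessingConfig
print(result.was_processed)    # False when the image already looks good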


@@ -26,6 +26,11 @@ except ImportError:
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
from app.services.layout_preprocessing_service import (
get_layout_preprocessing_service,
LayoutPreprocessingService,
)
from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
# Import dual-track components
try:
@@ -865,7 +870,9 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
current_page: int = 0,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
@@ -878,6 +885,8 @@ class OCRService:
output_dir: Optional output directory for saving extracted images
current_page: Current page number (0-based) for multi-page documents
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Dictionary with OCR results and metadata
@@ -946,7 +955,9 @@ class OCRService:
confidence_threshold=confidence_threshold,
output_dir=output_dir,
current_page=page_num - 1, # Convert to 0-based page number for layout data
layout_model=layout_model
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Accumulate results
@@ -1092,7 +1103,9 @@ class OCRService:
image_path,
output_dir=output_dir,
current_page=current_page,
layout_model=layout_model
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Generate Markdown
@@ -1248,7 +1261,9 @@ class OCRService:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1258,6 +1273,8 @@ class OCRService:
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
current_page: Current page number (0-based) for multi-page documents
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Tuple of (layout_data, images_metadata)
@@ -1277,13 +1294,45 @@ class OCRService:
structure_engine = self._ensure_structure_engine(layout_model)
# Apply image preprocessing for layout detection
# Preprocessing enhances faint lines/borders to improve table detection
# Original image is preserved for element extraction
preprocessed_image = None
preprocessing_result = None
# Determine preprocessing mode (default from config if not specified)
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
if mode != PreprocessingModeEnum.DISABLED:
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}"
)
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
)
if result.get('has_parsing_res_list'):
@@ -1337,7 +1386,17 @@ class OCRService:
logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
return None, []
results = structure_engine.predict(str(image_path))
# Use preprocessed image if available, otherwise original path
if preprocessed_image is not None:
import numpy as np
# Convert PIL to numpy array (BGR format for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = structure_engine.predict(predict_input)
else:
results = structure_engine.predict(str(image_path))
layout_elements = []
images_metadata = []
@@ -1509,7 +1568,9 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
force_track: Optional[str] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Union[UnifiedDocument, Dict]:
"""
Process document using dual-track approach.
@@ -1522,6 +1583,8 @@ class OCRService:
output_dir: Optional output directory for extracted images
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
UnifiedDocument if dual-track is enabled, Dict otherwise
@@ -1529,7 +1592,8 @@ class OCRService:
if not self.dual_track_enabled:
# Fallback to traditional OCR processing
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
start_time = datetime.now()
@@ -1601,7 +1665,9 @@ class OCRService:
ocr_result = self.process_file_traditional(
actual_file_path, lang, detect_layout=True,
confidence_threshold=confidence_threshold,
output_dir=output_dir, layout_model=layout_model
output_dir=output_dir, layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config
)
# Convert OCR result to extract images
@@ -1634,7 +1700,8 @@ class OCRService:
# Use OCR for scanned documents, images, etc.
logger.info("Using OCR track (PaddleOCR)")
ocr_result = self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
# Convert OCR result to UnifiedDocument using the converter
@@ -1664,7 +1731,8 @@ class OCRService:
logger.error(f"Error in dual-track processing: {e}")
# Fallback to traditional OCR
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
def _merge_ocr_images_into_direct(
@@ -1743,7 +1811,9 @@ class OCRService:
detect_layout: bool = True,
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Dict:
"""
Traditional OCR processing (legacy method).
@@ -1755,6 +1825,8 @@ class OCRService:
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory
layout_model: Layout detection model ('chinese', 'default', 'cdla')
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
Dictionary with OCR results in legacy format
@@ -1767,7 +1839,8 @@ class OCRService:
all_results = []
for i, image_path in enumerate(image_paths):
result = self.process_image(
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
preprocessing_mode, preprocessing_config
)
all_results.append(result)
@@ -1783,7 +1856,8 @@ class OCRService:
else:
# Single image or other file
return self.process_image(
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
preprocessing_mode, preprocessing_config
)
def _combine_results(self, results: List[Dict]) -> Dict:
@@ -1868,7 +1942,9 @@ class OCRService:
output_dir: Optional[Path] = None,
use_dual_track: bool = True,
force_track: Optional[str] = None,
layout_model: Optional[str] = None
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -1882,6 +1958,8 @@ class OCRService:
use_dual_track: Whether to use dual-track processing (default True)
force_track: Force specific track ("ocr" or "direct")
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -1893,12 +1971,14 @@ class OCRService:
if (use_dual_track or force_track) and self.dual_track_enabled:
# Use dual-track processing (or forced track)
return self.process_with_dual_track(
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
preprocessing_mode, preprocessing_config
)
else:
# Use traditional OCR processing (no force_track support)
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config
)
def process_legacy(
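
End to end, the new parameters thread through the call chain shown in this diff. A hedged caller sketch; the ocr_service instance name is assumed, and the remaining parameters are assumed to keep their defaults:

from pathlib import Path

from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
)

file_path = Path("scan.pdf")  # illustrative input
result = ocr_service.process_with_dual_track(  # ocr_service: OCRService instance (assumed)
    file_path,
    preprocessing_mode=PreprocessingModeEnum.MANUAL,
    preprocessing_config=PreprocessingConfig(
        contrast=PreprocessingContrastEnum.CLAHE,
        sharpen=True,
        binarize=False,
    ),
)
# OCR track: process_file_traditional -> process_image -> analyze_layout,
# where the preprocessed image feeds structure_engine.predict() while the
# original file is still used for element extraction.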


@@ -20,6 +20,8 @@ except ImportError:
import paddle
from paddleocr import PPStructureV3
from PIL import Image
import numpy as np
from app.models.unified_document import ElementType
from app.core.config import settings
from app.services.memory_manager import prediction_context
@@ -78,15 +80,19 @@ class PPStructureEnhanced:
self,
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
Args:
image_path: Path to image file
image_path: Path to original image file (used for cropping extracted images)
output_dir: Optional output directory for saving extracted content
current_page: Current page number (0-based)
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
Returns:
Dictionary with complete structure information including:
@@ -97,6 +103,8 @@ class PPStructureEnhanced:
"""
try:
logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
if preprocessed_image:
logger.info("Using preprocessed image for layout detection")
# Perform structure analysis with semaphore control
# This prevents OOM errors from multiple simultaneous predictions
@@ -113,7 +121,16 @@ class PPStructureEnhanced:
'error': 'Prediction slot timeout'
}
results = self.structure_engine.predict(str(image_path))
# Use preprocessed image if provided, otherwise use original path
if preprocessed_image is not None:
# Convert PIL to numpy array (BGR format for PP-Structure)
predict_input = np.array(preprocessed_image)
if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
# Convert RGB to BGR
predict_input = predict_input[:, :, ::-1]
results = self.structure_engine.predict(predict_input)
else:
results = self.structure_engine.predict(str(image_path))
all_elements = []
all_images = []
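
The RGB-to-BGR slice used in both predict paths simply reverses the channel axis; a quick self-contained check:

import numpy as np
from PIL import Image

pil = Image.new("RGB", (2, 2), (255, 0, 0))  # pure red in RGB
arr = np.array(pil)                          # shape (2, 2, 3), RGB order
bgr = arr[:, :, ::-1]                        # reverse channels -> BGR
assert bgr[0, 0].tolist() == [0, 0, 255]     # red lands in the R slot of BGR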


@@ -93,7 +93,7 @@
- `frontend/src/i18n/locales/zh-TW.json` - Traditional Chinese
- `frontend/src/i18n/locales/en.json` - English (if exists)
## 6. Testing
## 6. Testing (with env)
- [ ] 6.1 Unit tests for preprocessing_service (see the sketch after this list)
- Test contrast enhancement methods
@@ -106,7 +106,7 @@
- Test preview endpoint returns correct images
- Test auto-detection returns sensible config
- [ ] 6.3 Integration tests
- [ ] 6.3 Integration tests (account: ymirliu@panjit.com.tw ; password: 4RFV5tgb6yhn)
- Test OCR track with preprocessing modes (auto/manual/disabled)
- Verify image element quality is preserved
- Test with known problematic documents (faint table borders)
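
A minimal sketch of a 6.1 unit test, assuming a synthetic flat image is enough to exercise the auto-detection path (thresholds are the config defaults 40 / 15 / 20):

import numpy as np

from app.schemas.task import PreprocessingContrastEnum
from app.services.layout_preprocessing_service import LayoutPreprocessingService

def test_auto_config_flat_image_triggers_everything():
    service = LayoutPreprocessingService()
    flat = np.full((64, 64, 3), 128, dtype=np.uint8)  # zero contrast, zero edges
    metrics = service.analyze_image_quality(flat)
    config = service.get_auto_config(metrics)
    assert config.contrast == PreprocessingContrastEnum.CLAHE  # contrast 0 < 40
    assert config.sharpen                                      # edges 0 < 15
    assert config.binarize                                     # contrast 0 < 20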