feat: enhance layout preprocessing and add unify-image-scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-28 09:23:19 +08:00
Parent: 86bbea6fbf
Commit: dda9621e17
17 changed files with 826 additions and 104 deletions

View File

@@ -90,19 +90,27 @@ class Settings(BaseSettings):
enable_formula_recognition: bool = Field(default=True) # Math formula recognition
enable_table_recognition: bool = Field(default=True) # Table structure recognition
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
enable_region_detection: bool = Field(default=True) # Region detection for better table structure
enable_text_recognition: bool = Field(default=True) # General text recognition
# PP-StructureV3 Preprocessing (Stage 1)
use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation
use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos
use_textline_orientation: bool = Field(default=True) # Detect textline orientation
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
layout_unclip_ratio: float = Field(default=1.2) # Smaller unclip to preserve element boundaries
text_det_thresh: float = Field(default=0.2) # More sensitive text detection
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
# Layout Detection Parameters (Stage 3)
# NOTE: Testing showed that PaddleX defaults work better for table detection.
# Previously we used aggressive low thresholds (0.2) which caused table detection failures.
# Now using None to let PaddleX use its optimized defaults.
layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_merge_mode: Optional[str] = Field(default=None) # None = use PaddleX default
layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Text Detection Parameters
text_det_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
text_det_box_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
text_det_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Layout Detection Model Configuration (Stage 3)
# Available models:
@@ -136,6 +144,23 @@ class Settings(BaseSettings):
description="Table structure model for borderless tables. SLANeXt_wireless recommended."
)
# Table Classification Model - determines if table is wired or wireless
table_classification_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_table_cls",
description="Model to classify table type (wired vs wireless). Enables automatic model selection."
)
# Table Cell Detection Models - detect individual cells within tables
# These are crucial for accurate cell boundary detection in complex tables
wired_table_cells_detection_model_name: Optional[str] = Field(
default="RT-DETR-L_wired_table_cell_det",
description="Cell detection model for bordered tables. RT-DETR-L provides best accuracy."
)
wireless_table_cells_detection_model_name: Optional[str] = Field(
default="RT-DETR-L_wireless_table_cell_det",
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
)
# Formula Recognition Model Configuration (Stage 4)
# Available models:
# - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
@@ -146,6 +171,37 @@ class Settings(BaseSettings):
description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
)
# Chart Recognition Model Configuration
chart_recognition_model_name: Optional[str] = Field(
default="PP-Chart2Table",
description="Chart to table recognition model."
)
# Text Detection and Recognition Model Configuration
# PP-OCRv5_server provides best accuracy for document OCR
text_detection_model_name: Optional[str] = Field(
default="PP-OCRv5_server_det",
description="Text detection model. PP-OCRv5_server_det recommended for documents."
)
text_recognition_model_name: Optional[str] = Field(
default="PP-OCRv5_server_rec",
description="Text recognition model. PP-OCRv5_server_rec recommended for documents."
)
# Document Preprocessing Model Configuration (Stage 1)
doc_orientation_classify_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_doc_ori",
description="Document orientation classification model for auto-rotation."
)
doc_unwarping_model_name: Optional[str] = Field(
default="UVDoc",
description="Document unwarping model for correcting perspective distortion."
)
textline_orientation_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_textline_ori",
description="Textline orientation model for detecting text direction."
)
# ===== Layout Preprocessing Configuration =====
# Image preprocessing to enhance layout detection for documents with faint lines/borders
# Preprocessing only affects layout detection input; original image is preserved for extraction
@@ -179,6 +235,31 @@ class Settings(BaseSettings):
description="Contrast below this triggers binarization in auto mode"
)
# Layout image scaling for better table detection
# Automatic bidirectional scaling for layout detection
# PDF conversion now uses 150 DPI (~1240x1754 for A4), which falls within the optimal range
# Scaling acts as a safety net for:
# - Very large images (>2000px): Downscale to target
# - Very small images (<1200px): Upscale to target
# - 150 DPI A4 (1240x1754): No scaling needed (already optimal)
layout_image_scaling_enabled: bool = Field(
default=True,
description="Enable automatic bidirectional scaling for layout detection. "
"Images outside optimal range are scaled to target dimension."
)
layout_image_scaling_max_dimension: int = Field(
default=2000,
description="Max dimension (pixels) before downscaling. Images larger than this will be scaled down."
)
layout_image_scaling_min_dimension: int = Field(
default=1200,
description="Min dimension (pixels) before upscaling. Images smaller than this will be scaled up."
)
layout_image_scaling_target_dimension: int = Field(
default=1600,
description="Target dimension (pixels) for scaling. Optimal size for PP-Structure layout detection."
)
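# Illustration (not part of this commit): since Settings is a pydantic BaseSettings,
# the scaling fields above can typically be overridden via environment variables,
# assuming the default case-insensitive field-name mapping and no env_prefix:
#   LAYOUT_IMAGE_SCALING_ENABLED=true
#   LAYOUT_IMAGE_SCALING_MAX_DIMENSION=2000
#   LAYOUT_IMAGE_SCALING_MIN_DIMENSION=1200
#   LAYOUT_IMAGE_SCALING_TARGET_DIMENSION=1600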
# ===== Gap Filling Configuration =====
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track

View File

@@ -54,11 +54,15 @@ class PreprocessingContrastEnum(str, Enum):
- NONE: No contrast enhancement
- HISTOGRAM: Standard histogram equalization
- CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended)
- CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended for most cases)
- DOCUMENT: Background normalization + CLAHE (recommended for scanned documents)
Removes uneven illumination before enhancement. Best for scans with
yellowed paper, shadows, or scanner lighting issues.
"""
NONE = "none"
HISTOGRAM = "histogram"
CLAHE = "clahe"
DOCUMENT = "document"
class PreprocessingConfig(BaseModel):

View File

@@ -32,6 +32,15 @@ from app.schemas.task import (
logger = logging.getLogger(__name__)
@dataclass
class ScalingInfo:
"""Information about image scaling applied for layout detection."""
was_scaled: bool
scale_factor: float # Factor to multiply bbox coords to get original size (1.0 / actual_scale)
original_size: Tuple[int, int] # (width, height) of original image
scaled_size: Tuple[int, int] # (width, height) after scaling
@dataclass
class PreprocessingResult:
"""Result of preprocessing operation."""
@@ -39,6 +48,7 @@ class PreprocessingResult:
config_used: PreprocessingConfig
quality_metrics: ImageQualityMetrics
was_processed: bool
scaling_info: Optional[ScalingInfo] = None # Info about any scaling applied
class LayoutPreprocessingService:
@@ -60,10 +70,23 @@ class LayoutPreprocessingService:
self.edge_threshold = settings.layout_preprocessing_edge_threshold
self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
# Image scaling settings for layout detection (bidirectional)
self.scaling_enabled = settings.layout_image_scaling_enabled
self.scaling_max_dimension = settings.layout_image_scaling_max_dimension
self.scaling_min_dimension = settings.layout_image_scaling_min_dimension
self.scaling_target_dimension = settings.layout_image_scaling_target_dimension
# CLAHE parameters
self.clahe_clip_limit = 2.0
self.clahe_tile_grid_size = (8, 8)
# Document-specific CLAHE parameters (larger tiles for documents)
self.document_clahe_clip_limit = 3.0
self.document_clahe_tile_grid_size = (16, 16)
# Background normalization parameters for scanned documents
self.background_kernel_size = 51 # Morphological kernel size
# Sharpening kernel (unsharp mask style)
self.sharpen_kernel = np.array([
[0, -1, 0],
@@ -74,7 +97,9 @@ class LayoutPreprocessingService:
logger.info(
f"LayoutPreprocessingService initialized with thresholds: "
f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
f"binarize={self.binarize_threshold}"
f"binarize={self.binarize_threshold}, "
f"scaling={'enabled' if self.scaling_enabled else 'disabled'} "
f"(min={self.scaling_min_dimension}, max={self.scaling_max_dimension}, target={self.scaling_target_dimension})"
)
def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
@@ -106,6 +131,180 @@ class LayoutPreprocessingService:
edge_strength=round(edge_strength, 2)
)
def _normalize_background(self, gray: np.ndarray) -> np.ndarray:
"""
Normalize image background to remove uneven illumination.
This is particularly effective for scanned documents where scanner
lighting may be uneven, or where paper has yellowed/stained areas.
Method:
1. Estimate background using morphological closing (fills in text/details)
2. Divide original by background estimate
3. Rescale to full 0-255 range
Args:
gray: Grayscale image (L channel or grayscale)
Returns:
Normalized grayscale image with uniform background
"""
# Create structuring element for morphological operations
kernel_size = self.background_kernel_size
# Ensure kernel size is odd
if kernel_size % 2 == 0:
kernel_size += 1
kernel = cv2.getStructuringElement(
cv2.MORPH_ELLIPSE,
(kernel_size, kernel_size)
)
# Morphological closing estimates the background
# (dilate then erode - fills in dark features like text)
background = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)
# Apply Gaussian blur to smooth the background estimate
background = cv2.GaussianBlur(background, (kernel_size, kernel_size), 0)
# Avoid division by zero
background = np.maximum(background, 1).astype(np.float32)
# Normalize: divide by background and rescale to 0-255
# This removes uneven illumination while preserving text/content
normalized = (gray.astype(np.float32) / background) * 255.0
# Clip and convert back to uint8
normalized = np.clip(normalized, 0, 255).astype(np.uint8)
logger.debug(
f"Background normalization applied: kernel={kernel_size}, "
f"background range=[{background.min():.0f}, {background.max():.0f}]"
)
return normalized
def scale_for_layout_detection(
self,
image: np.ndarray,
force_scale: bool = False
) -> Tuple[np.ndarray, ScalingInfo]:
"""
Apply bidirectional scaling for optimal layout detection.
PP-Structure's layout detection model (RT-DETR based) works best with images
around 1600px on the longest side. Both too-large and too-small images
reduce detection accuracy:
- Too large (>2000px): Model's receptive field cannot capture entire structures
- Too small (<1200px): Insufficient detail for accurate detection
Scaling behavior:
- max_dim > max_dimension (2000): Scale DOWN to target (1600)
- max_dim < min_dimension (1200): Scale UP to target (1600)
- min_dimension <= max_dim <= max_dimension: No scaling (optimal range)
Args:
image: Input image (BGR)
force_scale: Force scaling to target even if in optimal range
Returns:
Tuple of (scaled_image, ScalingInfo)
ScalingInfo.scale_factor is the multiplier to convert scaled bbox
coordinates back to original image coordinates.
"""
h, w = image.shape[:2]
original_size = (w, h)
max_dim = max(h, w)
# Determine if scaling is needed and direction
should_downscale = self.scaling_enabled and max_dim > self.scaling_max_dimension
should_upscale = self.scaling_enabled and max_dim < self.scaling_min_dimension
should_scale = should_downscale or should_upscale or force_scale
if not should_scale:
return image, ScalingInfo(
was_scaled=False,
scale_factor=1.0,
original_size=original_size,
scaled_size=original_size
)
# Calculate scale factor to reach target dimension
actual_scale = self.scaling_target_dimension / max_dim
new_w = int(w * actual_scale)
new_h = int(h * actual_scale)
# Choose interpolation method based on scale direction
if actual_scale < 1.0:
# Downscaling: INTER_AREA is best for shrinking (anti-aliasing)
interpolation = cv2.INTER_AREA
direction = "DOWN"
else:
# Upscaling: INTER_CUBIC provides smooth enlargement
interpolation = cv2.INTER_CUBIC
direction = "UP"
scaled_image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)
# scale_factor is the inverse - used to scale bbox coords back to original
scale_factor = 1.0 / actual_scale
logger.info(
f"Scaled {direction} for layout detection: {w}x{h} -> {new_w}x{new_h} "
f"(scale_factor={scale_factor:.3f} to restore original coords)"
)
return scaled_image, ScalingInfo(
was_scaled=True,
scale_factor=scale_factor,
original_size=original_size,
scaled_size=(new_w, new_h)
)
@staticmethod
def scale_bbox_to_original(
bbox: Tuple[float, float, float, float],
scale_factor: float
) -> Tuple[float, float, float, float]:
"""
Scale a bounding box from scaled coordinates back to original image coordinates.
Args:
bbox: Bounding box as (x1, y1, x2, y2) in scaled image coordinates
scale_factor: Factor to multiply (from ScalingInfo.scale_factor)
Returns:
Bounding box in original image coordinates
"""
x1, y1, x2, y2 = bbox
return (
x1 * scale_factor,
y1 * scale_factor,
x2 * scale_factor,
y2 * scale_factor
)
@staticmethod
def scale_bboxes_to_original(
bboxes: list,
scale_factor: float
) -> list:
"""
Scale multiple bounding boxes from scaled coordinates to original.
Args:
bboxes: List of bounding boxes, each as (x1, y1, x2, y2)
scale_factor: Factor to multiply (from ScalingInfo.scale_factor)
Returns:
List of bounding boxes in original image coordinates
"""
return [
LayoutPreprocessingService.scale_bbox_to_original(bbox, scale_factor)
for bbox in bboxes
]
def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
"""
Determine optimal preprocessing config based on image quality.
@@ -203,6 +402,18 @@ class LayoutPreprocessingService:
tileGridSize=self.clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_channel)
elif method == PreprocessingContrastEnum.DOCUMENT:
# Document-specific enhancement for scanned documents
# Step 1: Background normalization to remove uneven illumination
l_normalized = self._normalize_background(l_channel)
# Step 2: CLAHE with larger tiles optimized for documents
clip_limit = self.document_clahe_clip_limit * strength
clahe = cv2.createCLAHE(
clipLimit=clip_limit,
tileGridSize=self.document_clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_normalized)
else:
return image
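A minimal usage sketch for the new DOCUMENT contrast path. Import paths and the get_layout_preprocessing_service accessor are taken from elsewhere in this diff; the MANUAL enum member is assumed alongside the AUTO/DISABLED members shown:

from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
)
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
# Force background normalization + document-tuned CLAHE on a scanned page
config = PreprocessingConfig(
    contrast=PreprocessingContrastEnum.DOCUMENT,
    sharpen=True,
    binarize=False,  # binarization is generally not recommended
)
result = service.preprocess("scan.png", mode=PreprocessingModeEnum.MANUAL, config=config)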
@@ -277,15 +488,29 @@ class LayoutPreprocessingService:
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
config: Optional[PreprocessingConfig] = None,
apply_scaling: bool = True
) -> PreprocessingResult:
"""
Preprocess image for layout detection.
The preprocessing pipeline:
1. Load image from path/PIL if needed
2. Analyze image quality (on original image for accurate metrics)
3. Scale images outside the optimal size range (bidirectional, see scale_for_layout_detection)
4. Apply contrast enhancement if needed
5. Apply sharpening if needed
6. Apply binarization if requested (not recommended)
IMPORTANT: When scaling is applied, all bounding boxes from layout detection
must be scaled back to original coordinates using ScalingInfo.scale_factor.
The original image should be used for element extraction (cropping).
Args:
image: Input image (numpy array, PIL Image, or path)
mode: Preprocessing mode (auto, manual, disabled)
config: Manual configuration (required if mode='manual')
apply_scaling: Whether to apply automatic bidirectional scaling (default True)
Returns:
PreprocessingResult with preprocessed image and metadata
@@ -299,21 +524,37 @@ class LayoutPreprocessingService:
# Convert PIL to OpenCV format (BGR)
image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Analyze quality
# Analyze quality on ORIGINAL image (before scaling) for accurate metrics
metrics = self.analyze_image_quality(image)
logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
# Apply scaling for layout detection (even if preprocessing is disabled)
if apply_scaling:
scaled_image, scaling_info = self.scale_for_layout_detection(image)
else:
h, w = image.shape[:2]
scaled_image = image
scaling_info = ScalingInfo(
was_scaled=False,
scale_factor=1.0,
original_size=(w, h),
scaled_size=(w, h)
)
# Determine configuration
if mode == PreprocessingModeEnum.DISABLED:
# Even when preprocessing is disabled, we still return scaled image
# for better layout detection. Original image is preserved for cropping.
return PreprocessingResult(
image=image,
image=scaled_image,
config_used=PreprocessingConfig(
contrast=PreprocessingContrastEnum.NONE,
sharpen=False,
binarize=False
),
quality_metrics=metrics,
was_processed=False
was_processed=scaling_info.was_scaled, # True if scaling was applied
scaling_info=scaling_info
)
if mode == PreprocessingModeEnum.AUTO:
@@ -323,9 +564,9 @@ class LayoutPreprocessingService:
# Manual mode but no config provided, use defaults
config = PreprocessingConfig()
# Apply preprocessing pipeline
processed = image.copy()
was_processed = False
# Apply preprocessing pipeline on SCALED image
processed = scaled_image.copy()
was_processed = scaling_info.was_scaled # Start with True if already scaled
# Step 1: Contrast enhancement
if config.contrast != PreprocessingContrastEnum.NONE:
@@ -353,29 +594,37 @@ class LayoutPreprocessingService:
image=processed,
config_used=config,
quality_metrics=metrics,
was_processed=was_processed
was_processed=was_processed,
scaling_info=scaling_info
)
def preprocess_to_pil(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
config: Optional[PreprocessingConfig] = None,
apply_scaling: bool = True
) -> Tuple[Image.Image, PreprocessingResult]:
"""
Preprocess image and return as PIL Image.
Convenience method for integration with PP-Structure which accepts PIL images.
IMPORTANT: When result.scaling_info.was_scaled is True, all bounding boxes
from PP-Structure must be scaled back to original coordinates using:
scaled_bbox = (x1 * scale_factor, y1 * scale_factor, x2 * scale_factor, y2 * scale_factor)
where scale_factor = result.scaling_info.scale_factor
Args:
image: Input image
mode: Preprocessing mode
config: Manual configuration
apply_scaling: Whether to apply automatic bidirectional scaling (default True)
Returns:
Tuple of (PIL Image, PreprocessingResult)
Tuple of (PIL Image for PP-Structure, PreprocessingResult with scaling info)
"""
result = self.preprocess(image, mode, config)
result = self.preprocess(image, mode, config, apply_scaling=apply_scaling)
# Convert BGR to RGB for PIL
rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
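A sketch of the intended round trip (names are from this diff; the module location of the accessor is assumed and the detection output is hypothetical):

from app.services.layout_preprocessing_service import (
    LayoutPreprocessingService,
    get_layout_preprocessing_service,  # accessor used in the OCR service changes below
)

preprocessing = get_layout_preprocessing_service()
pil_image, result = preprocessing.preprocess_to_pil("page_001.png")

# ... run PP-Structure on pil_image; suppose it yields bboxes in scaled coordinates ...
detected_bboxes = [(120.0, 80.0, 640.0, 420.0)]  # hypothetical detection output

info = result.scaling_info
if info is not None and info.was_scaled:
    # Map detections back to original coordinates before cropping the original image
    detected_bboxes = LayoutPreprocessingService.scale_bboxes_to_original(
        detected_bboxes, info.scale_factor
    )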

View File

@@ -502,6 +502,8 @@ class OCRService:
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
use_seal = settings.enable_seal_recognition
use_region = settings.enable_region_detection
layout_threshold = settings.layout_detection_threshold
layout_nms = settings.layout_nms_threshold
layout_merge = settings.layout_merge_mode
@@ -530,17 +532,32 @@ class OCRService:
# Table and formula model configuration (Stage 4)
wired_table_model = settings.wired_table_model_name
wireless_table_model = settings.wireless_table_model_name
table_cls_model = settings.table_classification_model_name
wired_cell_det_model = settings.wired_table_cells_detection_model_name
wireless_cell_det_model = settings.wireless_table_cells_detection_model_name
formula_model = settings.formula_recognition_model_name
chart_model = settings.chart_recognition_model_name
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
# Text detection/recognition model configuration
text_det_model = settings.text_detection_model_name
text_rec_model = settings.text_recognition_model_name
# Document preprocessing model configuration (Stage 1)
doc_ori_model = settings.doc_orientation_classify_model_name
doc_unwarp_model = settings.doc_unwarping_model_name
textline_ori_model = settings.textline_orientation_model_name
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}, seal={use_seal}, region={use_region}")
logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
logger.info(f"Table structure models: wired={wired_table_model}, wireless={wireless_table_model}")
logger.info(f"Table cell detection: cls={table_cls_model}, wired_det={wired_cell_det_model}, wireless_det={wireless_cell_det_model}")
logger.info(f"Formula model: {formula_model}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
# Build PPStructureV3 kwargs
# Only include parameters that are not None (let PaddleX use defaults for None values)
pp_kwargs = {
# Preprocessing (Stage 1)
'use_doc_orientation_classify': use_orientation,
@@ -550,17 +567,29 @@ class OCRService:
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
# Layout detection parameters
'layout_threshold': layout_threshold,
'layout_nms': layout_nms,
'layout_unclip_ratio': layout_unclip,
'layout_merge_bboxes_mode': layout_merge,
# Text detection parameters
'text_det_thresh': text_thresh,
'text_det_box_thresh': text_box_thresh,
'text_det_unclip_ratio': text_unclip,
'use_seal_recognition': use_seal,
'use_region_detection': use_region,
}
# Add layout detection parameters only if explicitly configured
# (None = use PaddleX optimized defaults, which work better for table detection)
if layout_threshold is not None:
pp_kwargs['layout_threshold'] = layout_threshold
if layout_nms is not None:
pp_kwargs['layout_nms'] = layout_nms
if layout_unclip is not None:
pp_kwargs['layout_unclip_ratio'] = layout_unclip
if layout_merge is not None:
pp_kwargs['layout_merge_bboxes_mode'] = layout_merge
# Add text detection parameters only if explicitly configured
if text_thresh is not None:
pp_kwargs['text_det_thresh'] = text_thresh
if text_box_thresh is not None:
pp_kwargs['text_det_box_thresh'] = text_box_thresh
if text_unclip is not None:
pp_kwargs['text_det_unclip_ratio'] = text_unclip
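# Illustration (not part of this commit): the if-chains here implement
# "pass only explicitly configured values". An equivalent compact form would be:
#   optional = {'layout_threshold': layout_threshold, 'layout_nms': layout_nms,
#               'layout_unclip_ratio': layout_unclip, 'layout_merge_bboxes_mode': layout_merge,
#               'text_det_thresh': text_thresh, 'text_det_box_thresh': text_box_thresh,
#               'text_det_unclip_ratio': text_unclip}
#   pp_kwargs.update({k: v for k, v in optional.items() if v is not None})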
# Add layout model configuration if specified (Stage 3)
if layout_model_name:
pp_kwargs['layout_detection_model_name'] = layout_model_name
@@ -575,10 +604,38 @@ class OCRService:
if wireless_table_model:
pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
# Add table classification model (determines wired vs wireless automatically)
if table_cls_model:
pp_kwargs['table_classification_model_name'] = table_cls_model
# Add table cell detection models (crucial for accurate cell boundary detection)
if wired_cell_det_model:
pp_kwargs['wired_table_cells_detection_model_name'] = wired_cell_det_model
if wireless_cell_det_model:
pp_kwargs['wireless_table_cells_detection_model_name'] = wireless_cell_det_model
# Add formula recognition model configuration (Stage 4)
if formula_model:
pp_kwargs['formula_recognition_model_name'] = formula_model
# Add chart recognition model configuration
if chart_model:
pp_kwargs['chart_recognition_model_name'] = chart_model
# Add text detection/recognition model configuration
if text_det_model:
pp_kwargs['text_detection_model_name'] = text_det_model
if text_rec_model:
pp_kwargs['text_recognition_model_name'] = text_rec_model
# Add document preprocessing model configuration (Stage 1)
if doc_ori_model:
pp_kwargs['doc_orientation_classify_model_name'] = doc_ori_model
if doc_unwarp_model:
pp_kwargs['doc_unwarping_model_name'] = doc_unwarp_model
if textline_ori_model:
pp_kwargs['textline_orientation_model_name'] = textline_ori_model
self.structure_engine = PPStructureV3(**pp_kwargs)
# Track model loading for cache management
@@ -599,40 +656,63 @@ class OCRService:
# Switch to CPU device globally
paddle.set_device('cpu')
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
layout_threshold = settings.layout_detection_threshold
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
wired_table_model = settings.wired_table_model_name
wireless_table_model = settings.wireless_table_model_name
formula_model = settings.formula_recognition_model_name
# Build CPU fallback kwargs
# Build CPU fallback kwargs (same logic as GPU mode)
cpu_kwargs = {
'use_doc_orientation_classify': settings.use_doc_orientation_classify,
'use_doc_unwarping': settings.use_doc_unwarping,
'use_textline_orientation': settings.use_textline_orientation,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
'use_table_recognition': settings.enable_table_recognition,
'use_formula_recognition': settings.enable_formula_recognition,
'use_chart_recognition': settings.enable_chart_recognition,
'use_seal_recognition': settings.enable_seal_recognition,
'use_region_detection': settings.enable_region_detection,
}
if layout_model_name:
cpu_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
if wired_table_model:
cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
if wireless_table_model:
cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
if formula_model:
cpu_kwargs['formula_recognition_model_name'] = formula_model
# Add layout detection parameters only if explicitly configured
if settings.layout_detection_threshold is not None:
cpu_kwargs['layout_threshold'] = settings.layout_detection_threshold
# Add layout model configuration
if settings.layout_detection_model_name:
cpu_kwargs['layout_detection_model_name'] = settings.layout_detection_model_name
if settings.layout_detection_model_dir:
cpu_kwargs['layout_detection_model_dir'] = settings.layout_detection_model_dir
# Add table structure model configuration
if settings.wired_table_model_name:
cpu_kwargs['wired_table_structure_recognition_model_name'] = settings.wired_table_model_name
if settings.wireless_table_model_name:
cpu_kwargs['wireless_table_structure_recognition_model_name'] = settings.wireless_table_model_name
if settings.table_classification_model_name:
cpu_kwargs['table_classification_model_name'] = settings.table_classification_model_name
if settings.wired_table_cells_detection_model_name:
cpu_kwargs['wired_table_cells_detection_model_name'] = settings.wired_table_cells_detection_model_name
if settings.wireless_table_cells_detection_model_name:
cpu_kwargs['wireless_table_cells_detection_model_name'] = settings.wireless_table_cells_detection_model_name
# Add formula and chart recognition model configuration
if settings.formula_recognition_model_name:
cpu_kwargs['formula_recognition_model_name'] = settings.formula_recognition_model_name
if settings.chart_recognition_model_name:
cpu_kwargs['chart_recognition_model_name'] = settings.chart_recognition_model_name
# Add text detection/recognition model configuration
if settings.text_detection_model_name:
cpu_kwargs['text_detection_model_name'] = settings.text_detection_model_name
if settings.text_recognition_model_name:
cpu_kwargs['text_recognition_model_name'] = settings.text_recognition_model_name
# Add document preprocessing model configuration
if settings.doc_orientation_classify_model_name:
cpu_kwargs['doc_orientation_classify_model_name'] = settings.doc_orientation_classify_model_name
if settings.doc_unwarping_model_name:
cpu_kwargs['doc_unwarping_model_name'] = settings.doc_unwarping_model_name
if settings.textline_orientation_model_name:
cpu_kwargs['textline_orientation_model_name'] = settings.textline_orientation_model_name
self.structure_engine = PPStructureV3(**cpu_kwargs)
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
else:
raise
@@ -840,10 +920,14 @@ class OCRService:
logger.info(f"Converting PDF {pdf_path.name} to images")
# Convert PDF to images (300 DPI for good quality)
# Convert PDF to images
# Use 150 DPI - testing showed this produces optimal results for PP-Structure:
# - 150 DPI produces ~1240x1754 for A4, which is ideal for layout detection
# - 300 DPI produces ~2480x3508, which requires scaling down and degrades quality
# - Table line detection works better at 150 DPI without scaling artifacts
images = convert_from_path(
str(pdf_path),
dpi=300,
dpi=150,
fmt='png'
)
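# Sanity check for the DPI choice above (A4 is 8.27 x 11.69 inches):
#   150 DPI -> round(8.27 * 150) x round(11.69 * 150) = 1240 x 1754 (inside the 1200-2000px optimal range)
#   300 DPI -> ~2480 x 3508 (would be downscaled toward the 1600px target, degrading fine table lines)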
@@ -1295,44 +1379,63 @@ class OCRService:
structure_engine = self._ensure_structure_engine(layout_model)
# Apply image preprocessing for layout detection
# Preprocessing enhances faint lines/borders to improve table detection
# Original image is preserved for element extraction
# Preprocessing includes:
# 1. Automatic bidirectional scaling into the optimal size range for better table detection
# 2. Optional contrast/sharpen enhancement for faint lines/borders
# Original image is preserved for element extraction (cropping uses original coords)
preprocessed_image = None
preprocessing_result = None
# Determine preprocessing mode (default from config if not specified)
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
if mode != PreprocessingModeEnum.DISABLED:
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
# Always call preprocessing service (even when DISABLED) because:
# - Scaling is applied regardless of mode for better layout detection
# - When DISABLED, only scaling is applied, no contrast/sharpen/binarize
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
scaling_info = preprocessing_result.scaling_info
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}, "
f"scaled={scaling_info.was_scaled if scaling_info else False}"
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
if scaling_info and scaling_info.was_scaled:
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}"
f"Image scaled for layout detection: "
f"{scaling_info.original_size} -> {scaling_info.scaled_size} "
f"(scale_factor={scaling_info.scale_factor:.3f} for bbox restoration)"
)
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
preprocessing_result = None
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
# Get scaling info for bbox coordinate restoration
scaling_info = preprocessing_result.scaling_info if preprocessing_result else None
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
image_path, output_dir, current_page,
preprocessed_image=preprocessed_image,
scaling_info=scaling_info
)
if result.get('has_parsing_res_list'):

View File

@@ -7,10 +7,14 @@ This module provides enhanced PP-StructureV3 processing that extracts all
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from typing import Dict, List, Optional, Tuple, Any, TYPE_CHECKING
import json
import gc
# Import ScalingInfo for type checking (avoid circular imports at runtime)
if TYPE_CHECKING:
from app.services.layout_preprocessing_service import ScalingInfo
# Optional torch import for additional GPU memory management
try:
import torch
@@ -81,7 +85,8 @@ class PPStructureEnhanced:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -93,10 +98,13 @@ class PPStructureEnhanced:
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
for layout detection, all bbox coordinates will be scaled back
to original image coordinates for proper cropping.
Returns:
Dictionary with complete structure information including:
- elements: List of all detected elements with types and bbox
- elements: List of all detected elements with types and bbox (in original coords)
- reading_order: Reading order indices
- images: Extracted images with metadata
- tables: Extracted tables with structure
@@ -184,7 +192,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path
parsing_res_list, current_page, output_dir, image_path, scaling_info
)
all_elements.extend(elements)
@@ -247,13 +255,15 @@ class PPStructureEnhanced:
parsing_res_list: List[Dict],
current_page: int,
output_dir: Optional[Path],
source_image_path: Optional[Path] = None
source_image_path: Optional[Path] = None,
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
Args:
parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
scaling_info: Scaling information for bbox coordinate restoration
@@ -285,11 +295,28 @@ class PPStructureEnhanced:
# Ensure bbox has 4 values
if len(layout_bbox) >= 4:
bbox = layout_bbox[:4] # [x1, y1, x2, y2]
bbox = list(layout_bbox[:4]) # [x1, y1, x2, y2]
else:
bbox = [0, 0, 0, 0] # Default if bbox missing
logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")
# Scale bbox back to original image coordinates if image was scaled
# This is critical for proper cropping from original high-resolution image
if scaling_info and scaling_info.was_scaled and bbox != [0, 0, 0, 0]:
scale_factor = scaling_info.scale_factor
bbox = [
bbox[0] * scale_factor, # x1
bbox[1] * scale_factor, # y1
bbox[2] * scale_factor, # x2
bbox[3] * scale_factor # y2
]
if idx == 0: # Log only for first element to avoid spam
logger.info(
f"Scaled bbox to original coords: "
f"{[round(x, 1) for x in layout_bbox[:4]]} -> {[round(x, 1) for x in bbox]} "
f"(factor={scale_factor:.3f})"
)
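# Illustration: with numpy, the per-coordinate scaling above collapses to
#   bbox = (np.asarray(layout_bbox[:4], dtype=float) * scale_factor).tolist()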
# Extract content (check multiple possible keys)
content = (
item.get('content', '') or