feat: enhance layout preprocessing and unify image scaling proposal
Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -502,6 +502,8 @@ class OCRService:
|
||||
use_chart = settings.enable_chart_recognition
|
||||
use_formula = settings.enable_formula_recognition
|
||||
use_table = settings.enable_table_recognition
|
||||
use_seal = settings.enable_seal_recognition
|
||||
use_region = settings.enable_region_detection
|
||||
layout_threshold = settings.layout_detection_threshold
|
||||
layout_nms = settings.layout_nms_threshold
|
||||
layout_merge = settings.layout_merge_mode
|
||||
@@ -530,17 +532,32 @@ class OCRService:
|
||||
# Table and formula model configuration (Stage 4)
|
||||
wired_table_model = settings.wired_table_model_name
|
||||
wireless_table_model = settings.wireless_table_model_name
|
||||
table_cls_model = settings.table_classification_model_name
|
||||
wired_cell_det_model = settings.wired_table_cells_detection_model_name
|
||||
wireless_cell_det_model = settings.wireless_table_cells_detection_model_name
|
||||
formula_model = settings.formula_recognition_model_name
|
||||
chart_model = settings.chart_recognition_model_name
|
||||
|
||||
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
||||
# Text detection/recognition model configuration
|
||||
text_det_model = settings.text_detection_model_name
|
||||
text_rec_model = settings.text_recognition_model_name
|
||||
|
||||
# Document preprocessing model configuration (Stage 1)
|
||||
doc_ori_model = settings.doc_orientation_classify_model_name
|
||||
doc_unwarp_model = settings.doc_unwarping_model_name
|
||||
textline_ori_model = settings.textline_orientation_model_name
|
||||
|
||||
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}, seal={use_seal}, region={use_region}")
|
||||
logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
|
||||
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
|
||||
logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
|
||||
logger.info(f"Table structure models: wired={wired_table_model}, wireless={wireless_table_model}")
|
||||
logger.info(f"Table cell detection: cls={table_cls_model}, wired_det={wired_cell_det_model}, wireless_det={wireless_cell_det_model}")
|
||||
logger.info(f"Formula model: {formula_model}")
|
||||
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
||||
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
||||
|
||||
# Build PPStructureV3 kwargs
|
||||
# Only include parameters that are not None (let PaddleX use defaults for None values)
|
||||
pp_kwargs = {
|
||||
# Preprocessing (Stage 1)
|
||||
'use_doc_orientation_classify': use_orientation,
|
||||
@@ -550,17 +567,29 @@ class OCRService:
|
||||
'use_table_recognition': use_table,
|
||||
'use_formula_recognition': use_formula,
|
||||
'use_chart_recognition': use_chart,
|
||||
# Layout detection parameters
|
||||
'layout_threshold': layout_threshold,
|
||||
'layout_nms': layout_nms,
|
||||
'layout_unclip_ratio': layout_unclip,
|
||||
'layout_merge_bboxes_mode': layout_merge,
|
||||
# Text detection parameters
|
||||
'text_det_thresh': text_thresh,
|
||||
'text_det_box_thresh': text_box_thresh,
|
||||
'text_det_unclip_ratio': text_unclip,
|
||||
'use_seal_recognition': use_seal,
|
||||
'use_region_detection': use_region,
|
||||
}
|
||||
|
||||
# Add layout detection parameters only if explicitly configured
|
||||
# (None = use PaddleX optimized defaults, which work better for table detection)
|
||||
if layout_threshold is not None:
|
||||
pp_kwargs['layout_threshold'] = layout_threshold
|
||||
if layout_nms is not None:
|
||||
pp_kwargs['layout_nms'] = layout_nms
|
||||
if layout_unclip is not None:
|
||||
pp_kwargs['layout_unclip_ratio'] = layout_unclip
|
||||
if layout_merge is not None:
|
||||
pp_kwargs['layout_merge_bboxes_mode'] = layout_merge
|
||||
|
||||
# Add text detection parameters only if explicitly configured
|
||||
if text_thresh is not None:
|
||||
pp_kwargs['text_det_thresh'] = text_thresh
|
||||
if text_box_thresh is not None:
|
||||
pp_kwargs['text_det_box_thresh'] = text_box_thresh
|
||||
if text_unclip is not None:
|
||||
pp_kwargs['text_det_unclip_ratio'] = text_unclip
|
||||
|
||||
# Add layout model configuration if specified (Stage 3)
|
||||
if layout_model_name:
|
||||
pp_kwargs['layout_detection_model_name'] = layout_model_name
|
||||
@@ -575,10 +604,38 @@ class OCRService:
|
||||
if wireless_table_model:
|
||||
pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
|
||||
|
||||
# Add table classification model (determines wired vs wireless automatically)
|
||||
if table_cls_model:
|
||||
pp_kwargs['table_classification_model_name'] = table_cls_model
|
||||
|
||||
# Add table cell detection models (crucial for accurate cell boundary detection)
|
||||
if wired_cell_det_model:
|
||||
pp_kwargs['wired_table_cells_detection_model_name'] = wired_cell_det_model
|
||||
if wireless_cell_det_model:
|
||||
pp_kwargs['wireless_table_cells_detection_model_name'] = wireless_cell_det_model
|
||||
|
||||
# Add formula recognition model configuration (Stage 4)
|
||||
if formula_model:
|
||||
pp_kwargs['formula_recognition_model_name'] = formula_model
|
||||
|
||||
# Add chart recognition model configuration
|
||||
if chart_model:
|
||||
pp_kwargs['chart_recognition_model_name'] = chart_model
|
||||
|
||||
# Add text detection/recognition model configuration
|
||||
if text_det_model:
|
||||
pp_kwargs['text_detection_model_name'] = text_det_model
|
||||
if text_rec_model:
|
||||
pp_kwargs['text_recognition_model_name'] = text_rec_model
|
||||
|
||||
# Add document preprocessing model configuration (Stage 1)
|
||||
if doc_ori_model:
|
||||
pp_kwargs['doc_orientation_classify_model_name'] = doc_ori_model
|
||||
if doc_unwarp_model:
|
||||
pp_kwargs['doc_unwarping_model_name'] = doc_unwarp_model
|
||||
if textline_ori_model:
|
||||
pp_kwargs['textline_orientation_model_name'] = textline_ori_model
|
||||
|
||||
self.structure_engine = PPStructureV3(**pp_kwargs)
|
||||
|
||||
# Track model loading for cache management
|
||||
@@ -599,40 +656,63 @@ class OCRService:
|
||||
# Switch to CPU device globally
|
||||
paddle.set_device('cpu')
|
||||
|
||||
use_chart = settings.enable_chart_recognition
|
||||
use_formula = settings.enable_formula_recognition
|
||||
use_table = settings.enable_table_recognition
|
||||
layout_threshold = settings.layout_detection_threshold
|
||||
layout_model_name = settings.layout_detection_model_name
|
||||
layout_model_dir = settings.layout_detection_model_dir
|
||||
wired_table_model = settings.wired_table_model_name
|
||||
wireless_table_model = settings.wireless_table_model_name
|
||||
formula_model = settings.formula_recognition_model_name
|
||||
|
||||
# Build CPU fallback kwargs
|
||||
# Build CPU fallback kwargs (same logic as GPU mode)
|
||||
cpu_kwargs = {
|
||||
'use_doc_orientation_classify': settings.use_doc_orientation_classify,
|
||||
'use_doc_unwarping': settings.use_doc_unwarping,
|
||||
'use_textline_orientation': settings.use_textline_orientation,
|
||||
'use_table_recognition': use_table,
|
||||
'use_formula_recognition': use_formula,
|
||||
'use_chart_recognition': use_chart,
|
||||
'layout_threshold': layout_threshold,
|
||||
'use_table_recognition': settings.enable_table_recognition,
|
||||
'use_formula_recognition': settings.enable_formula_recognition,
|
||||
'use_chart_recognition': settings.enable_chart_recognition,
|
||||
'use_seal_recognition': settings.enable_seal_recognition,
|
||||
'use_region_detection': settings.enable_region_detection,
|
||||
}
|
||||
if layout_model_name:
|
||||
cpu_kwargs['layout_detection_model_name'] = layout_model_name
|
||||
if layout_model_dir:
|
||||
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
|
||||
if wired_table_model:
|
||||
cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
|
||||
if wireless_table_model:
|
||||
cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
|
||||
if formula_model:
|
||||
cpu_kwargs['formula_recognition_model_name'] = formula_model
|
||||
|
||||
# Add layout detection parameters only if explicitly configured
|
||||
if settings.layout_detection_threshold is not None:
|
||||
cpu_kwargs['layout_threshold'] = settings.layout_detection_threshold
|
||||
|
||||
# Add layout model configuration
|
||||
if settings.layout_detection_model_name:
|
||||
cpu_kwargs['layout_detection_model_name'] = settings.layout_detection_model_name
|
||||
if settings.layout_detection_model_dir:
|
||||
cpu_kwargs['layout_detection_model_dir'] = settings.layout_detection_model_dir
|
||||
|
||||
# Add table structure model configuration
|
||||
if settings.wired_table_model_name:
|
||||
cpu_kwargs['wired_table_structure_recognition_model_name'] = settings.wired_table_model_name
|
||||
if settings.wireless_table_model_name:
|
||||
cpu_kwargs['wireless_table_structure_recognition_model_name'] = settings.wireless_table_model_name
|
||||
if settings.table_classification_model_name:
|
||||
cpu_kwargs['table_classification_model_name'] = settings.table_classification_model_name
|
||||
if settings.wired_table_cells_detection_model_name:
|
||||
cpu_kwargs['wired_table_cells_detection_model_name'] = settings.wired_table_cells_detection_model_name
|
||||
if settings.wireless_table_cells_detection_model_name:
|
||||
cpu_kwargs['wireless_table_cells_detection_model_name'] = settings.wireless_table_cells_detection_model_name
|
||||
|
||||
# Add formula and chart recognition model configuration
|
||||
if settings.formula_recognition_model_name:
|
||||
cpu_kwargs['formula_recognition_model_name'] = settings.formula_recognition_model_name
|
||||
if settings.chart_recognition_model_name:
|
||||
cpu_kwargs['chart_recognition_model_name'] = settings.chart_recognition_model_name
|
||||
|
||||
# Add text detection/recognition model configuration
|
||||
if settings.text_detection_model_name:
|
||||
cpu_kwargs['text_detection_model_name'] = settings.text_detection_model_name
|
||||
if settings.text_recognition_model_name:
|
||||
cpu_kwargs['text_recognition_model_name'] = settings.text_recognition_model_name
|
||||
|
||||
# Add document preprocessing model configuration
|
||||
if settings.doc_orientation_classify_model_name:
|
||||
cpu_kwargs['doc_orientation_classify_model_name'] = settings.doc_orientation_classify_model_name
|
||||
if settings.doc_unwarping_model_name:
|
||||
cpu_kwargs['doc_unwarping_model_name'] = settings.doc_unwarping_model_name
|
||||
if settings.textline_orientation_model_name:
|
||||
cpu_kwargs['textline_orientation_model_name'] = settings.textline_orientation_model_name
|
||||
|
||||
self.structure_engine = PPStructureV3(**cpu_kwargs)
|
||||
self._current_layout_model = layout_model # Track current model for recreation check
|
||||
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
|
||||
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
|
||||
else:
|
||||
raise
|
||||
|
||||
@@ -840,10 +920,14 @@ class OCRService:
|
||||
|
||||
logger.info(f"Converting PDF {pdf_path.name} to images")
|
||||
|
||||
# Convert PDF to images (300 DPI for good quality)
|
||||
# Convert PDF to images
|
||||
# Use 150 DPI - testing showed this produces optimal results for PP-Structure:
|
||||
# - 150 DPI produces ~1240x1754 for A4, which is ideal for layout detection
|
||||
# - 300 DPI produces ~2480x3508, which requires scaling down and degrades quality
|
||||
# - Table line detection works better at 150 DPI without scaling artifacts
|
||||
images = convert_from_path(
|
||||
str(pdf_path),
|
||||
dpi=300,
|
||||
dpi=150,
|
||||
fmt='png'
|
||||
)
|
||||
|
||||
@@ -1295,44 +1379,63 @@ class OCRService:
|
||||
structure_engine = self._ensure_structure_engine(layout_model)
|
||||
|
||||
# Apply image preprocessing for layout detection
|
||||
# Preprocessing enhances faint lines/borders to improve table detection
|
||||
# Original image is preserved for element extraction
|
||||
# Preprocessing includes:
|
||||
# 1. Automatic downscaling of high-resolution images for better table detection
|
||||
# 2. Optional contrast/sharpen enhancement for faint lines/borders
|
||||
# Original image is preserved for element extraction (cropping uses original coords)
|
||||
preprocessed_image = None
|
||||
preprocessing_result = None
|
||||
|
||||
# Determine preprocessing mode (default from config if not specified)
|
||||
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
|
||||
|
||||
if mode != PreprocessingModeEnum.DISABLED:
|
||||
try:
|
||||
preprocessing_service = get_layout_preprocessing_service()
|
||||
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
|
||||
image_path,
|
||||
mode=mode,
|
||||
config=preprocessing_config
|
||||
# Always call preprocessing service (even when DISABLED) because:
|
||||
# - Scaling is applied regardless of mode for better layout detection
|
||||
# - When DISABLED, only scaling is applied, no contrast/sharpen/binarize
|
||||
try:
|
||||
preprocessing_service = get_layout_preprocessing_service()
|
||||
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
|
||||
image_path,
|
||||
mode=mode,
|
||||
config=preprocessing_config
|
||||
)
|
||||
|
||||
if preprocessing_result.was_processed:
|
||||
preprocessed_image = preprocessed_pil
|
||||
scaling_info = preprocessing_result.scaling_info
|
||||
logger.info(
|
||||
f"Layout preprocessing applied: mode={mode.value}, "
|
||||
f"config={preprocessing_result.config_used}, "
|
||||
f"metrics={preprocessing_result.quality_metrics}, "
|
||||
f"scaled={scaling_info.was_scaled if scaling_info else False}"
|
||||
)
|
||||
|
||||
if preprocessing_result.was_processed:
|
||||
preprocessed_image = preprocessed_pil
|
||||
if scaling_info and scaling_info.was_scaled:
|
||||
logger.info(
|
||||
f"Layout preprocessing applied: mode={mode.value}, "
|
||||
f"config={preprocessing_result.config_used}, "
|
||||
f"metrics={preprocessing_result.quality_metrics}"
|
||||
f"Image scaled for layout detection: "
|
||||
f"{scaling_info.original_size} -> {scaling_info.scaled_size} "
|
||||
f"(scale_factor={scaling_info.scale_factor:.3f} for bbox restoration)"
|
||||
)
|
||||
else:
|
||||
logger.info(f"No preprocessing needed (mode={mode.value})")
|
||||
else:
|
||||
logger.info(f"No preprocessing needed (mode={mode.value})")
|
||||
|
||||
except Exception as preprocess_error:
|
||||
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
|
||||
preprocessed_image = None
|
||||
except Exception as preprocess_error:
|
||||
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
|
||||
preprocessed_image = None
|
||||
preprocessing_result = None
|
||||
|
||||
# Try enhanced processing first
|
||||
try:
|
||||
from app.services.pp_structure_enhanced import PPStructureEnhanced
|
||||
|
||||
enhanced_processor = PPStructureEnhanced(structure_engine)
|
||||
|
||||
# Get scaling info for bbox coordinate restoration
|
||||
scaling_info = preprocessing_result.scaling_info if preprocessing_result else None
|
||||
|
||||
result = enhanced_processor.analyze_with_full_structure(
|
||||
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
|
||||
image_path, output_dir, current_page,
|
||||
preprocessed_image=preprocessed_image,
|
||||
scaling_info=scaling_info
|
||||
)
|
||||
|
||||
if result.get('has_parsing_res_list'):
|
||||
|
||||
Reference in New Issue
Block a user