feat: enhance layout preprocessing and add unify-image-scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Improve PP-Structure enhanced processing

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 09:23:19 +08:00
parent 86bbea6fbf
commit dda9621e17
17 changed files with 826 additions and 104 deletions

View File

@@ -502,6 +502,8 @@ class OCRService:
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
use_seal = settings.enable_seal_recognition
use_region = settings.enable_region_detection
layout_threshold = settings.layout_detection_threshold
layout_nms = settings.layout_nms_threshold
layout_merge = settings.layout_merge_mode
@@ -530,17 +532,32 @@ class OCRService:
# Table and formula model configuration (Stage 4)
wired_table_model = settings.wired_table_model_name
wireless_table_model = settings.wireless_table_model_name
table_cls_model = settings.table_classification_model_name
wired_cell_det_model = settings.wired_table_cells_detection_model_name
wireless_cell_det_model = settings.wireless_table_cells_detection_model_name
formula_model = settings.formula_recognition_model_name
chart_model = settings.chart_recognition_model_name
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
# Text detection/recognition model configuration
text_det_model = settings.text_detection_model_name
text_rec_model = settings.text_recognition_model_name
# Document preprocessing model configuration (Stage 1)
doc_ori_model = settings.doc_orientation_classify_model_name
doc_unwarp_model = settings.doc_unwarping_model_name
textline_ori_model = settings.textline_orientation_model_name
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}, seal={use_seal}, region={use_region}")
logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
logger.info(f"Table structure models: wired={wired_table_model}, wireless={wireless_table_model}")
logger.info(f"Table cell detection: cls={table_cls_model}, wired_det={wired_cell_det_model}, wireless_det={wireless_cell_det_model}")
logger.info(f"Formula model: {formula_model}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
# Build PPStructureV3 kwargs
# Only include parameters that are not None (let PaddleX use defaults for None values)
pp_kwargs = {
# Preprocessing (Stage 1)
'use_doc_orientation_classify': use_orientation,
@@ -550,17 +567,29 @@ class OCRService:
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
# Layout detection parameters
'layout_threshold': layout_threshold,
'layout_nms': layout_nms,
'layout_unclip_ratio': layout_unclip,
'layout_merge_bboxes_mode': layout_merge,
# Text detection parameters
'text_det_thresh': text_thresh,
'text_det_box_thresh': text_box_thresh,
'text_det_unclip_ratio': text_unclip,
'use_seal_recognition': use_seal,
'use_region_detection': use_region,
}
# Add layout detection parameters only if explicitly configured
# (None = use PaddleX optimized defaults, which work better for table detection)
if layout_threshold is not None:
pp_kwargs['layout_threshold'] = layout_threshold
if layout_nms is not None:
pp_kwargs['layout_nms'] = layout_nms
if layout_unclip is not None:
pp_kwargs['layout_unclip_ratio'] = layout_unclip
if layout_merge is not None:
pp_kwargs['layout_merge_bboxes_mode'] = layout_merge
# Add text detection parameters only if explicitly configured
if text_thresh is not None:
pp_kwargs['text_det_thresh'] = text_thresh
if text_box_thresh is not None:
pp_kwargs['text_det_box_thresh'] = text_box_thresh
if text_unclip is not None:
pp_kwargs['text_det_unclip_ratio'] = text_unclip
# Add layout model configuration if specified (Stage 3)
if layout_model_name:
pp_kwargs['layout_detection_model_name'] = layout_model_name
@@ -575,10 +604,38 @@ class OCRService:
if wireless_table_model:
pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
# Add table classification model (determines wired vs wireless automatically)
if table_cls_model:
pp_kwargs['table_classification_model_name'] = table_cls_model
# Add table cell detection models (crucial for accurate cell boundary detection)
if wired_cell_det_model:
pp_kwargs['wired_table_cells_detection_model_name'] = wired_cell_det_model
if wireless_cell_det_model:
pp_kwargs['wireless_table_cells_detection_model_name'] = wireless_cell_det_model
# Add formula recognition model configuration (Stage 4)
if formula_model:
pp_kwargs['formula_recognition_model_name'] = formula_model
# Add chart recognition model configuration
if chart_model:
pp_kwargs['chart_recognition_model_name'] = chart_model
# Add text detection/recognition model configuration
if text_det_model:
pp_kwargs['text_detection_model_name'] = text_det_model
if text_rec_model:
pp_kwargs['text_recognition_model_name'] = text_rec_model
# Add document preprocessing model configuration (Stage 1)
if doc_ori_model:
pp_kwargs['doc_orientation_classify_model_name'] = doc_ori_model
if doc_unwarp_model:
pp_kwargs['doc_unwarping_model_name'] = doc_unwarp_model
if textline_ori_model:
pp_kwargs['textline_orientation_model_name'] = textline_ori_model
self.structure_engine = PPStructureV3(**pp_kwargs)
# Track model loading for cache management
@@ -599,40 +656,63 @@ class OCRService:
# Switch to CPU device globally
paddle.set_device('cpu')
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
layout_threshold = settings.layout_detection_threshold
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
wired_table_model = settings.wired_table_model_name
wireless_table_model = settings.wireless_table_model_name
formula_model = settings.formula_recognition_model_name
# Build CPU fallback kwargs
# Build CPU fallback kwargs (same logic as GPU mode)
cpu_kwargs = {
'use_doc_orientation_classify': settings.use_doc_orientation_classify,
'use_doc_unwarping': settings.use_doc_unwarping,
'use_textline_orientation': settings.use_textline_orientation,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
'use_table_recognition': settings.enable_table_recognition,
'use_formula_recognition': settings.enable_formula_recognition,
'use_chart_recognition': settings.enable_chart_recognition,
'use_seal_recognition': settings.enable_seal_recognition,
'use_region_detection': settings.enable_region_detection,
}
if layout_model_name:
cpu_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
if wired_table_model:
cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
if wireless_table_model:
cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
if formula_model:
cpu_kwargs['formula_recognition_model_name'] = formula_model
# Add layout detection parameters only if explicitly configured
if settings.layout_detection_threshold is not None:
cpu_kwargs['layout_threshold'] = settings.layout_detection_threshold
# Add layout model configuration
if settings.layout_detection_model_name:
cpu_kwargs['layout_detection_model_name'] = settings.layout_detection_model_name
if settings.layout_detection_model_dir:
cpu_kwargs['layout_detection_model_dir'] = settings.layout_detection_model_dir
# Add table structure model configuration
if settings.wired_table_model_name:
cpu_kwargs['wired_table_structure_recognition_model_name'] = settings.wired_table_model_name
if settings.wireless_table_model_name:
cpu_kwargs['wireless_table_structure_recognition_model_name'] = settings.wireless_table_model_name
if settings.table_classification_model_name:
cpu_kwargs['table_classification_model_name'] = settings.table_classification_model_name
if settings.wired_table_cells_detection_model_name:
cpu_kwargs['wired_table_cells_detection_model_name'] = settings.wired_table_cells_detection_model_name
if settings.wireless_table_cells_detection_model_name:
cpu_kwargs['wireless_table_cells_detection_model_name'] = settings.wireless_table_cells_detection_model_name
# Add formula and chart recognition model configuration
if settings.formula_recognition_model_name:
cpu_kwargs['formula_recognition_model_name'] = settings.formula_recognition_model_name
if settings.chart_recognition_model_name:
cpu_kwargs['chart_recognition_model_name'] = settings.chart_recognition_model_name
# Add text detection/recognition model configuration
if settings.text_detection_model_name:
cpu_kwargs['text_detection_model_name'] = settings.text_detection_model_name
if settings.text_recognition_model_name:
cpu_kwargs['text_recognition_model_name'] = settings.text_recognition_model_name
# Add document preprocessing model configuration
if settings.doc_orientation_classify_model_name:
cpu_kwargs['doc_orientation_classify_model_name'] = settings.doc_orientation_classify_model_name
if settings.doc_unwarping_model_name:
cpu_kwargs['doc_unwarping_model_name'] = settings.doc_unwarping_model_name
if settings.textline_orientation_model_name:
cpu_kwargs['textline_orientation_model_name'] = settings.textline_orientation_model_name
self.structure_engine = PPStructureV3(**cpu_kwargs)
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
else:
raise
@@ -840,10 +920,14 @@ class OCRService:
logger.info(f"Converting PDF {pdf_path.name} to images")
# Convert PDF to images (300 DPI for good quality)
# Convert PDF to images
# Use 150 DPI - testing showed this produces optimal results for PP-Structure:
# - 150 DPI produces ~1240x1754 for A4, which is ideal for layout detection
# - 300 DPI produces ~2480x3508, which requires scaling down and degrades quality
# - Table line detection works better at 150 DPI without scaling artifacts
images = convert_from_path(
str(pdf_path),
dpi=300,
dpi=150,
fmt='png'
)
@@ -1295,44 +1379,63 @@ class OCRService:
structure_engine = self._ensure_structure_engine(layout_model)
# Apply image preprocessing for layout detection
# Preprocessing enhances faint lines/borders to improve table detection
# Original image is preserved for element extraction
# Preprocessing includes:
# 1. Automatic downscaling of high-resolution images for better table detection
# 2. Optional contrast/sharpen enhancement for faint lines/borders
# Original image is preserved for element extraction (cropping uses original coords)
preprocessed_image = None
preprocessing_result = None
# Determine preprocessing mode (default from config if not specified)
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
if mode != PreprocessingModeEnum.DISABLED:
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
# Always call preprocessing service (even when DISABLED) because:
# - Scaling is applied regardless of mode for better layout detection
# - When DISABLED, only scaling is applied, no contrast/sharpen/binarize
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
scaling_info = preprocessing_result.scaling_info
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}, "
f"scaled={scaling_info.was_scaled if scaling_info else False}"
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
if scaling_info and scaling_info.was_scaled:
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}"
f"Image scaled for layout detection: "
f"{scaling_info.original_size} -> {scaling_info.scaled_size} "
f"(scale_factor={scaling_info.scale_factor:.3f} for bbox restoration)"
)
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
preprocessing_result = None
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
# Get scaling info for bbox coordinate restoration
scaling_info = preprocessing_result.scaling_info if preprocessing_result else None
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
image_path, output_dir, current_page,
preprocessed_image=preprocessed_image,
scaling_info=scaling_info
)
if result.get('has_parsing_res_list'):