feat: enhance layout preprocessing and unify image scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 09:23:19 +08:00
parent 86bbea6fbf
commit dda9621e17
17 changed files with 826 additions and 104 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -502,6 +502,8 @@ class OCRService:
                use_chart = settings.enable_chart_recognition
                use_formula = settings.enable_formula_recognition
                use_table = settings.enable_table_recognition
+                use_seal = settings.enable_seal_recognition
+                use_region = settings.enable_region_detection
                layout_threshold = settings.layout_detection_threshold
                layout_nms = settings.layout_nms_threshold
                layout_merge = settings.layout_merge_mode
@@ -530,17 +532,32 @@ class OCRService:
                # Table and formula model configuration (Stage 4)
                wired_table_model = settings.wired_table_model_name
                wireless_table_model = settings.wireless_table_model_name
+                table_cls_model = settings.table_classification_model_name
+                wired_cell_det_model = settings.wired_table_cells_detection_model_name
+                wireless_cell_det_model = settings.wireless_table_cells_detection_model_name
                formula_model = settings.formula_recognition_model_name
+                chart_model = settings.chart_recognition_model_name

-                logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
+                # Text detection/recognition model configuration
+                text_det_model = settings.text_detection_model_name
+                text_rec_model = settings.text_recognition_model_name
+
+                # Document preprocessing model configuration (Stage 1)
+                doc_ori_model = settings.doc_orientation_classify_model_name
+                doc_unwarp_model = settings.doc_unwarping_model_name
+                textline_ori_model = settings.textline_orientation_model_name
+
+                logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}, seal={use_seal}, region={use_region}")
                logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
                logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
-                logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
+                logger.info(f"Table structure models: wired={wired_table_model}, wireless={wireless_table_model}")
+                logger.info(f"Table cell detection: cls={table_cls_model}, wired_det={wired_cell_det_model}, wireless_det={wireless_cell_det_model}")
                logger.info(f"Formula model: {formula_model}")
                logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
                logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")

                # Build PPStructureV3 kwargs
+                # Only include parameters that are not None (let PaddleX use defaults for None values)
                pp_kwargs = {
                    # Preprocessing (Stage 1)
                    'use_doc_orientation_classify': use_orientation,
@@ -550,17 +567,29 @@ class OCRService:
                    'use_table_recognition': use_table,
                    'use_formula_recognition': use_formula,
                    'use_chart_recognition': use_chart,
-                    # Layout detection parameters
-                    'layout_threshold': layout_threshold,
-                    'layout_nms': layout_nms,
-                    'layout_unclip_ratio': layout_unclip,
-                    'layout_merge_bboxes_mode': layout_merge,
-                    # Text detection parameters
-                    'text_det_thresh': text_thresh,
-                    'text_det_box_thresh': text_box_thresh,
-                    'text_det_unclip_ratio': text_unclip,
+                    'use_seal_recognition': use_seal,
+                    'use_region_detection': use_region,
                }

+                # Add layout detection parameters only if explicitly configured
+                # (None = use PaddleX optimized defaults, which work better for table detection)
+                if layout_threshold is not None:
+                    pp_kwargs['layout_threshold'] = layout_threshold
+                if layout_nms is not None:
+                    pp_kwargs['layout_nms'] = layout_nms
+                if layout_unclip is not None:
+                    pp_kwargs['layout_unclip_ratio'] = layout_unclip
+                if layout_merge is not None:
+                    pp_kwargs['layout_merge_bboxes_mode'] = layout_merge
+
+                # Add text detection parameters only if explicitly configured
+                if text_thresh is not None:
+                    pp_kwargs['text_det_thresh'] = text_thresh
+                if text_box_thresh is not None:
+                    pp_kwargs['text_det_box_thresh'] = text_box_thresh
+                if text_unclip is not None:
+                    pp_kwargs['text_det_unclip_ratio'] = text_unclip
+
                # Add layout model configuration if specified (Stage 3)
                if layout_model_name:
                    pp_kwargs['layout_detection_model_name'] = layout_model_name
@@ -575,10 +604,38 @@ class OCRService:
                if wireless_table_model:
                    pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model

+                # Add table classification model (determines wired vs wireless automatically)
+                if table_cls_model:
+                    pp_kwargs['table_classification_model_name'] = table_cls_model
+
+                # Add table cell detection models (crucial for accurate cell boundary detection)
+                if wired_cell_det_model:
+                    pp_kwargs['wired_table_cells_detection_model_name'] = wired_cell_det_model
+                if wireless_cell_det_model:
+                    pp_kwargs['wireless_table_cells_detection_model_name'] = wireless_cell_det_model
+
                # Add formula recognition model configuration (Stage 4)
                if formula_model:
                    pp_kwargs['formula_recognition_model_name'] = formula_model

+                # Add chart recognition model configuration
+                if chart_model:
+                    pp_kwargs['chart_recognition_model_name'] = chart_model
+
+                # Add text detection/recognition model configuration
+                if text_det_model:
+                    pp_kwargs['text_detection_model_name'] = text_det_model
+                if text_rec_model:
+                    pp_kwargs['text_recognition_model_name'] = text_rec_model
+
+                # Add document preprocessing model configuration (Stage 1)
+                if doc_ori_model:
+                    pp_kwargs['doc_orientation_classify_model_name'] = doc_ori_model
+                if doc_unwarp_model:
+                    pp_kwargs['doc_unwarping_model_name'] = doc_unwarp_model
+                if textline_ori_model:
+                    pp_kwargs['textline_orientation_model_name'] = textline_ori_model
+
                self.structure_engine = PPStructureV3(**pp_kwargs)

                # Track model loading for cache management
@@ -599,40 +656,63 @@ class OCRService:
                    # Switch to CPU device globally
                    paddle.set_device('cpu')

-                    use_chart = settings.enable_chart_recognition
-                    use_formula = settings.enable_formula_recognition
-                    use_table = settings.enable_table_recognition
-                    layout_threshold = settings.layout_detection_threshold
-                    layout_model_name = settings.layout_detection_model_name
-                    layout_model_dir = settings.layout_detection_model_dir
-                    wired_table_model = settings.wired_table_model_name
-                    wireless_table_model = settings.wireless_table_model_name
-                    formula_model = settings.formula_recognition_model_name
-
-                    # Build CPU fallback kwargs
+                    # Build CPU fallback kwargs (mirrors GPU mode, but of the layout/text-detection tuning parameters only layout_threshold is forwarded)
                    cpu_kwargs = {
                        'use_doc_orientation_classify': settings.use_doc_orientation_classify,
                        'use_doc_unwarping': settings.use_doc_unwarping,
                        'use_textline_orientation': settings.use_textline_orientation,
-                        'use_table_recognition': use_table,
-                        'use_formula_recognition': use_formula,
-                        'use_chart_recognition': use_chart,
-                        'layout_threshold': layout_threshold,
+                        'use_table_recognition': settings.enable_table_recognition,
+                        'use_formula_recognition': settings.enable_formula_recognition,
+                        'use_chart_recognition': settings.enable_chart_recognition,
+                        'use_seal_recognition': settings.enable_seal_recognition,
+                        'use_region_detection': settings.enable_region_detection,
                    }
-                    if layout_model_name:
-                        cpu_kwargs['layout_detection_model_name'] = layout_model_name
-                    if layout_model_dir:
-                        cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
-                    if wired_table_model:
-                        cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
-                    if wireless_table_model:
-                        cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
-                    if formula_model:
-                        cpu_kwargs['formula_recognition_model_name'] = formula_model
+
+                    # Add layout detection parameters only if explicitly configured
+                    if settings.layout_detection_threshold is not None:
+                        cpu_kwargs['layout_threshold'] = settings.layout_detection_threshold
+
+                    # Add layout model configuration
+                    if settings.layout_detection_model_name:
+                        cpu_kwargs['layout_detection_model_name'] = settings.layout_detection_model_name
+                    if settings.layout_detection_model_dir:
+                        cpu_kwargs['layout_detection_model_dir'] = settings.layout_detection_model_dir
+
+                    # Add table structure model configuration
+                    if settings.wired_table_model_name:
+                        cpu_kwargs['wired_table_structure_recognition_model_name'] = settings.wired_table_model_name
+                    if settings.wireless_table_model_name:
+                        cpu_kwargs['wireless_table_structure_recognition_model_name'] = settings.wireless_table_model_name
+                    if settings.table_classification_model_name:
+                        cpu_kwargs['table_classification_model_name'] = settings.table_classification_model_name
+                    if settings.wired_table_cells_detection_model_name:
+                        cpu_kwargs['wired_table_cells_detection_model_name'] = settings.wired_table_cells_detection_model_name
+                    if settings.wireless_table_cells_detection_model_name:
+                        cpu_kwargs['wireless_table_cells_detection_model_name'] = settings.wireless_table_cells_detection_model_name
+
+                    # Add formula and chart recognition model configuration
+                    if settings.formula_recognition_model_name:
+                        cpu_kwargs['formula_recognition_model_name'] = settings.formula_recognition_model_name
+                    if settings.chart_recognition_model_name:
+                        cpu_kwargs['chart_recognition_model_name'] = settings.chart_recognition_model_name
+
+                    # Add text detection/recognition model configuration
+                    if settings.text_detection_model_name:
+                        cpu_kwargs['text_detection_model_name'] = settings.text_detection_model_name
+                    if settings.text_recognition_model_name:
+                        cpu_kwargs['text_recognition_model_name'] = settings.text_recognition_model_name
+
+                    # Add document preprocessing model configuration
+                    if settings.doc_orientation_classify_model_name:
+                        cpu_kwargs['doc_orientation_classify_model_name'] = settings.doc_orientation_classify_model_name
+                    if settings.doc_unwarping_model_name:
+                        cpu_kwargs['doc_unwarping_model_name'] = settings.doc_unwarping_model_name
+                    if settings.textline_orientation_model_name:
+                        cpu_kwargs['textline_orientation_model_name'] = settings.textline_orientation_model_name

                    self.structure_engine = PPStructureV3(**cpu_kwargs)
                    self._current_layout_model = layout_model  # Track current model for recreation check
-                    logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
+                    logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
                else:
                    raise

@@ -840,10 +920,14 @@ class OCRService:

            logger.info(f"Converting PDF {pdf_path.name} to images")

-            # Convert PDF to images (300 DPI for good quality)
+            # Convert PDF to images
+            # Use 150 DPI - testing showed this produces optimal results for PP-Structure:
+            # - 150 DPI produces ~1240x1754 for A4, which is ideal for layout detection
+            # - 300 DPI produces ~2480x3508, which requires scaling down and degrades quality
+            # - Table line detection works better at 150 DPI without scaling artifacts
            images = convert_from_path(
                str(pdf_path),
-                dpi=300,
+                dpi=150,
                fmt='png'
            )

@@ -1295,44 +1379,63 @@ class OCRService:
            structure_engine = self._ensure_structure_engine(layout_model)

            # Apply image preprocessing for layout detection
-            # Preprocessing enhances faint lines/borders to improve table detection
-            # Original image is preserved for element extraction
+            # Preprocessing includes:
+            # 1. Automatic downscaling of high-resolution images for better table detection
+            # 2. Optional contrast/sharpen enhancement for faint lines/borders
+            # Original image is preserved for element extraction (cropping uses original coords)
            preprocessed_image = None
            preprocessing_result = None

            # Determine preprocessing mode (default from config if not specified)
            mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)

-            if mode != PreprocessingModeEnum.DISABLED:
-                try:
-                    preprocessing_service = get_layout_preprocessing_service()
-                    preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
-                        image_path,
-                        mode=mode,
-                        config=preprocessing_config
+            # Always call preprocessing service (even when DISABLED) because:
+            # - Scaling is applied regardless of mode for better layout detection
+            # - When DISABLED, only scaling is applied, no contrast/sharpen/binarize
+            try:
+                preprocessing_service = get_layout_preprocessing_service()
+                preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
+                    image_path,
+                    mode=mode,
+                    config=preprocessing_config
+                )
+
+                if preprocessing_result.was_processed:
+                    preprocessed_image = preprocessed_pil
+                    scaling_info = preprocessing_result.scaling_info
+                    logger.info(
+                        f"Layout preprocessing applied: mode={mode.value}, "
+                        f"config={preprocessing_result.config_used}, "
+                        f"metrics={preprocessing_result.quality_metrics}, "
+                        f"scaled={scaling_info.was_scaled if scaling_info else False}"
                    )
-
-                    if preprocessing_result.was_processed:
-                        preprocessed_image = preprocessed_pil
+                    if scaling_info and scaling_info.was_scaled:
                        logger.info(
-                            f"Layout preprocessing applied: mode={mode.value}, "
-                            f"config={preprocessing_result.config_used}, "
-                            f"metrics={preprocessing_result.quality_metrics}"
+                            f"Image scaled for layout detection: "
+                            f"{scaling_info.original_size} -> {scaling_info.scaled_size} "
+                            f"(scale_factor={scaling_info.scale_factor:.3f} for bbox restoration)"
                        )
-                    else:
-                        logger.info(f"No preprocessing needed (mode={mode.value})")
+                else:
+                    logger.info(f"No preprocessing needed (mode={mode.value})")

-                except Exception as preprocess_error:
-                    logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
-                    preprocessed_image = None
+            except Exception as preprocess_error:
+                logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
+                preprocessed_image = None
+                preprocessing_result = None

            # Try enhanced processing first
            try:
                from app.services.pp_structure_enhanced import PPStructureEnhanced

                enhanced_processor = PPStructureEnhanced(structure_engine)
+
+                # Get scaling info for bbox coordinate restoration
+                scaling_info = preprocessing_result.scaling_info if preprocessing_result else None
+
                result = enhanced_processor.analyze_with_full_structure(
-                    image_path, output_dir, current_page, preprocessed_image=preprocessed_image
+                    image_path, output_dir, current_page,
+                    preprocessed_image=preprocessed_image,
+                    scaling_info=scaling_info
                )

                if result.get('has_parsing_res_list'):