feat: enhance layout preprocessing and add unify-image-scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-28 09:23:19 +08:00
Parent: 86bbea6fbf
Commit: dda9621e17
17 changed files with 826 additions and 104 deletions

View File

@@ -90,19 +90,27 @@ class Settings(BaseSettings):
enable_formula_recognition: bool = Field(default=True) # Math formula recognition
enable_table_recognition: bool = Field(default=True) # Table structure recognition
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
enable_region_detection: bool = Field(default=True) # Region detection for better table structure
enable_text_recognition: bool = Field(default=True) # General text recognition
# PP-StructureV3 Preprocessing (Stage 1)
use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation
use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos
use_textline_orientation: bool = Field(default=True) # Detect textline orientation
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
layout_unclip_ratio: float = Field(default=1.2) # Smaller unclip to preserve element boundaries
text_det_thresh: float = Field(default=0.2) # More sensitive text detection
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
# Layout Detection Parameters (Stage 3)
# NOTE: Testing showed that PaddleX defaults work better for table detection.
# Previously we used aggressive low thresholds (0.2) which caused table detection failures.
# Now using None to let PaddleX use its optimized defaults.
layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default
layout_merge_mode: Optional[str] = Field(default=None) # None = use PaddleX default
layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Text Detection Parameters
text_det_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
text_det_box_thresh: Optional[float] = Field(default=None) # None = use PaddleX default
text_det_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default
# Layout Detection Model Configuration (Stage 3)
# Available models:
@@ -136,6 +144,23 @@ class Settings(BaseSettings):
description="Table structure model for borderless tables. SLANeXt_wireless recommended."
)
# Table Classification Model - determines if table is wired or wireless
table_classification_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_table_cls",
description="Model to classify table type (wired vs wireless). Enables automatic model selection."
)
# Table Cell Detection Models - detect individual cells within tables
# These are crucial for accurate cell boundary detection in complex tables
wired_table_cells_detection_model_name: Optional[str] = Field(
default="RT-DETR-L_wired_table_cell_det",
description="Cell detection model for bordered tables. RT-DETR-L provides best accuracy."
)
wireless_table_cells_detection_model_name: Optional[str] = Field(
default="RT-DETR-L_wireless_table_cell_det",
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
)
# Formula Recognition Model Configuration (Stage 4)
# Available models:
# - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
@@ -146,6 +171,37 @@ class Settings(BaseSettings):
description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
)
# Chart Recognition Model Configuration
chart_recognition_model_name: Optional[str] = Field(
default="PP-Chart2Table",
description="Chart to table recognition model."
)
# Text Detection and Recognition Model Configuration
# PP-OCRv5_server provides best accuracy for document OCR
text_detection_model_name: Optional[str] = Field(
default="PP-OCRv5_server_det",
description="Text detection model. PP-OCRv5_server_det recommended for documents."
)
text_recognition_model_name: Optional[str] = Field(
default="PP-OCRv5_server_rec",
description="Text recognition model. PP-OCRv5_server_rec recommended for documents."
)
# Document Preprocessing Model Configuration (Stage 1)
doc_orientation_classify_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_doc_ori",
description="Document orientation classification model for auto-rotation."
)
doc_unwarping_model_name: Optional[str] = Field(
default="UVDoc",
description="Document unwarping model for correcting perspective distortion."
)
textline_orientation_model_name: Optional[str] = Field(
default="PP-LCNet_x1_0_textline_ori",
description="Textline orientation model for detecting text direction."
)
# ===== Layout Preprocessing Configuration =====
# Image preprocessing to enhance layout detection for documents with faint lines/borders
# Preprocessing only affects layout detection input; original image is preserved for extraction
@@ -179,6 +235,31 @@ class Settings(BaseSettings):
description="Contrast below this triggers binarization in auto mode"
)
# Layout image scaling for better table detection
# Automatic bidirectional scaling for layout detection
# PDF conversion now uses 150 DPI (~1240x1754 for A4), which falls within the optimal range
# Scaling acts as a safety net for:
# - Very large images (>2000px): Downscale to target
# - Very small images (<1200px): Upscale to target
# - 150 DPI A4 (1240x1754): No scaling needed (already optimal)
layout_image_scaling_enabled: bool = Field(
default=True,
description="Enable automatic bidirectional scaling for layout detection. "
"Images outside optimal range are scaled to target dimension."
)
layout_image_scaling_max_dimension: int = Field(
default=2000,
description="Max dimension (pixels) before downscaling. Images larger than this will be scaled down."
)
layout_image_scaling_min_dimension: int = Field(
default=1200,
description="Min dimension (pixels) before upscaling. Images smaller than this will be scaled up."
)
layout_image_scaling_target_dimension: int = Field(
default=1600,
description="Target dimension (pixels) for scaling. Optimal size for PP-Structure layout detection."
)
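# Illustration (not part of this commit): since Settings is a pydantic BaseSettings,
# the scaling fields above can typically be overridden via environment variables,
# assuming the default case-insensitive field-name mapping and no env_prefix:
#   LAYOUT_IMAGE_SCALING_ENABLED=true
#   LAYOUT_IMAGE_SCALING_MAX_DIMENSION=2000
#   LAYOUT_IMAGE_SCALING_MIN_DIMENSION=1200
#   LAYOUT_IMAGE_SCALING_TARGET_DIMENSION=1600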
# ===== Gap Filling Configuration =====
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track

View File

@@ -54,11 +54,15 @@ class PreprocessingContrastEnum(str, Enum):
- NONE: No contrast enhancement
- HISTOGRAM: Standard histogram equalization
- CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended)
- CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended for most cases)
- DOCUMENT: Background normalization + CLAHE (recommended for scanned documents)
Removes uneven illumination before enhancement. Best for scans with
yellowed paper, shadows, or scanner lighting issues.
"""
NONE = "none"
HISTOGRAM = "histogram"
CLAHE = "clahe"
DOCUMENT = "document"
class PreprocessingConfig(BaseModel):

View File

@@ -32,6 +32,15 @@ from app.schemas.task import (
logger = logging.getLogger(__name__)
@dataclass
class ScalingInfo:
"""Information about image scaling applied for layout detection."""
was_scaled: bool
scale_factor: float # Factor to multiply bbox coords to get original size (1.0 / actual_scale)
original_size: Tuple[int, int] # (width, height) of original image
scaled_size: Tuple[int, int] # (width, height) after scaling
@dataclass
class PreprocessingResult:
"""Result of preprocessing operation."""
@@ -39,6 +48,7 @@ class PreprocessingResult:
config_used: PreprocessingConfig
quality_metrics: ImageQualityMetrics
was_processed: bool
scaling_info: Optional[ScalingInfo] = None # Info about any scaling applied
class LayoutPreprocessingService:
@@ -60,10 +70,23 @@ class LayoutPreprocessingService:
self.edge_threshold = settings.layout_preprocessing_edge_threshold
self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
# Image scaling settings for layout detection (bidirectional)
self.scaling_enabled = settings.layout_image_scaling_enabled
self.scaling_max_dimension = settings.layout_image_scaling_max_dimension
self.scaling_min_dimension = settings.layout_image_scaling_min_dimension
self.scaling_target_dimension = settings.layout_image_scaling_target_dimension
# CLAHE parameters
self.clahe_clip_limit = 2.0
self.clahe_tile_grid_size = (8, 8)
# Document-specific CLAHE parameters (larger tiles for documents)
self.document_clahe_clip_limit = 3.0
self.document_clahe_tile_grid_size = (16, 16)
# Background normalization parameters for scanned documents
self.background_kernel_size = 51 # Morphological kernel size
# Sharpening kernel (unsharp mask style)
self.sharpen_kernel = np.array([
[0, -1, 0],
@@ -74,7 +97,9 @@ class LayoutPreprocessingService:
logger.info(
f"LayoutPreprocessingService initialized with thresholds: "
f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
f"binarize={self.binarize_threshold}"
f"binarize={self.binarize_threshold}, "
f"scaling={'enabled' if self.scaling_enabled else 'disabled'} "
f"(min={self.scaling_min_dimension}, max={self.scaling_max_dimension}, target={self.scaling_target_dimension})"
)
def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
@@ -106,6 +131,180 @@ class LayoutPreprocessingService:
edge_strength=round(edge_strength, 2)
)
def _normalize_background(self, gray: np.ndarray) -> np.ndarray:
"""
Normalize image background to remove uneven illumination.
This is particularly effective for scanned documents where scanner
lighting may be uneven, or where paper has yellowed/stained areas.
Method:
1. Estimate background using morphological closing (fills in text/details)
2. Divide original by background estimate
3. Rescale to full 0-255 range
Args:
gray: Grayscale image (L channel or grayscale)
Returns:
Normalized grayscale image with uniform background
"""
# Create structuring element for morphological operations
kernel_size = self.background_kernel_size
# Ensure kernel size is odd
if kernel_size % 2 == 0:
kernel_size += 1
kernel = cv2.getStructuringElement(
cv2.MORPH_ELLIPSE,
(kernel_size, kernel_size)
)
# Morphological closing estimates the background
# (dilate then erode - fills in dark features like text)
background = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)
# Apply Gaussian blur to smooth the background estimate
background = cv2.GaussianBlur(background, (kernel_size, kernel_size), 0)
# Avoid division by zero
background = np.maximum(background, 1).astype(np.float32)
# Normalize: divide by background and rescale to 0-255
# This removes uneven illumination while preserving text/content
normalized = (gray.astype(np.float32) / background) * 255.0
# Clip and convert back to uint8
normalized = np.clip(normalized, 0, 255).astype(np.uint8)
logger.debug(
f"Background normalization applied: kernel={kernel_size}, "
f"background range=[{background.min():.0f}, {background.max():.0f}]"
)
return normalized
def scale_for_layout_detection(
self,
image: np.ndarray,
force_scale: bool = False
) -> Tuple[np.ndarray, ScalingInfo]:
"""
Apply bidirectional scaling for optimal layout detection.
PP-Structure's layout detection model (RT-DETR based) works best with images
around 1600px on the longest side. Both too-large and too-small images
reduce detection accuracy:
- Too large (>2000px): Model's receptive field cannot capture entire structures
- Too small (<1200px): Insufficient detail for accurate detection
Scaling behavior:
- max_dim > max_dimension (2000): Scale DOWN to target (1600)
- max_dim < min_dimension (1200): Scale UP to target (1600)
- min_dimension <= max_dim <= max_dimension: No scaling (optimal range)
Args:
image: Input image (BGR)
force_scale: Force scaling to target even if in optimal range
Returns:
Tuple of (scaled_image, ScalingInfo)
ScalingInfo.scale_factor is the multiplier to convert scaled bbox
coordinates back to original image coordinates.
"""
h, w = image.shape[:2]
original_size = (w, h)
max_dim = max(h, w)
# Determine if scaling is needed and direction
should_downscale = self.scaling_enabled and max_dim > self.scaling_max_dimension
should_upscale = self.scaling_enabled and max_dim < self.scaling_min_dimension
should_scale = should_downscale or should_upscale or force_scale
if not should_scale:
return image, ScalingInfo(
was_scaled=False,
scale_factor=1.0,
original_size=original_size,
scaled_size=original_size
)
# Calculate scale factor to reach target dimension
actual_scale = self.scaling_target_dimension / max_dim
new_w = int(w * actual_scale)
new_h = int(h * actual_scale)
# Choose interpolation method based on scale direction
if actual_scale < 1.0:
# Downscaling: INTER_AREA is best for shrinking (anti-aliasing)
interpolation = cv2.INTER_AREA
direction = "DOWN"
else:
# Upscaling: INTER_CUBIC provides smooth enlargement
interpolation = cv2.INTER_CUBIC
direction = "UP"
scaled_image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)
# scale_factor is the inverse - used to scale bbox coords back to original
scale_factor = 1.0 / actual_scale
logger.info(
f"Scaled {direction} for layout detection: {w}x{h} -> {new_w}x{new_h} "
f"(scale_factor={scale_factor:.3f} to restore original coords)"
)
return scaled_image, ScalingInfo(
was_scaled=True,
scale_factor=scale_factor,
original_size=original_size,
scaled_size=(new_w, new_h)
)
@staticmethod
def scale_bbox_to_original(
bbox: Tuple[float, float, float, float],
scale_factor: float
) -> Tuple[float, float, float, float]:
"""
Scale a bounding box from scaled coordinates back to original image coordinates.
Args:
bbox: Bounding box as (x1, y1, x2, y2) in scaled image coordinates
scale_factor: Factor to multiply (from ScalingInfo.scale_factor)
Returns:
Bounding box in original image coordinates
"""
x1, y1, x2, y2 = bbox
return (
x1 * scale_factor,
y1 * scale_factor,
x2 * scale_factor,
y2 * scale_factor
)
@staticmethod
def scale_bboxes_to_original(
bboxes: list,
scale_factor: float
) -> list:
"""
Scale multiple bounding boxes from scaled coordinates to original.
Args:
bboxes: List of bounding boxes, each as (x1, y1, x2, y2)
scale_factor: Factor to multiply (from ScalingInfo.scale_factor)
Returns:
List of bounding boxes in original image coordinates
"""
return [
LayoutPreprocessingService.scale_bbox_to_original(bbox, scale_factor)
for bbox in bboxes
]
def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
"""
Determine optimal preprocessing config based on image quality.
@@ -203,6 +402,18 @@ class LayoutPreprocessingService:
tileGridSize=self.clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_channel)
elif method == PreprocessingContrastEnum.DOCUMENT:
# Document-specific enhancement for scanned documents
# Step 1: Background normalization to remove uneven illumination
l_normalized = self._normalize_background(l_channel)
# Step 2: CLAHE with larger tiles optimized for documents
clip_limit = self.document_clahe_clip_limit * strength
clahe = cv2.createCLAHE(
clipLimit=clip_limit,
tileGridSize=self.document_clahe_tile_grid_size
)
l_enhanced = clahe.apply(l_normalized)
else:
return image
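A minimal usage sketch for the new DOCUMENT contrast path. Import paths and the get_layout_preprocessing_service accessor are taken from elsewhere in this diff; the MANUAL enum member is assumed alongside the AUTO/DISABLED members shown:

from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
)
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
# Force background normalization + document-tuned CLAHE on a scanned page
config = PreprocessingConfig(
    contrast=PreprocessingContrastEnum.DOCUMENT,
    sharpen=True,
    binarize=False,  # binarization is generally not recommended
)
result = service.preprocess("scan.png", mode=PreprocessingModeEnum.MANUAL, config=config)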
@@ -277,15 +488,29 @@ class LayoutPreprocessingService:
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
config: Optional[PreprocessingConfig] = None,
apply_scaling: bool = True
) -> PreprocessingResult:
"""
Preprocess image for layout detection.
The preprocessing pipeline:
1. Load image from path/PIL if needed
2. Analyze image quality (on original image for accurate metrics)
3. Scale images outside the optimal size range (bidirectional, see scale_for_layout_detection)
4. Apply contrast enhancement if needed
5. Apply sharpening if needed
6. Apply binarization if requested (not recommended)
IMPORTANT: When scaling is applied, all bounding boxes from layout detection
must be scaled back to original coordinates using ScalingInfo.scale_factor.
The original image should be used for element extraction (cropping).
Args:
image: Input image (numpy array, PIL Image, or path)
mode: Preprocessing mode (auto, manual, disabled)
config: Manual configuration (required if mode='manual')
apply_scaling: Whether to apply automatic bidirectional scaling (default True)
Returns:
PreprocessingResult with preprocessed image and metadata
@@ -299,21 +524,37 @@ class LayoutPreprocessingService:
# Convert PIL to OpenCV format (BGR)
image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
# Analyze quality
# Analyze quality on ORIGINAL image (before scaling) for accurate metrics
metrics = self.analyze_image_quality(image)
logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
# Apply scaling for layout detection (even if preprocessing is disabled)
if apply_scaling:
scaled_image, scaling_info = self.scale_for_layout_detection(image)
else:
h, w = image.shape[:2]
scaled_image = image
scaling_info = ScalingInfo(
was_scaled=False,
scale_factor=1.0,
original_size=(w, h),
scaled_size=(w, h)
)
# Determine configuration
if mode == PreprocessingModeEnum.DISABLED:
# Even when preprocessing is disabled, we still return scaled image
# for better layout detection. Original image is preserved for cropping.
return PreprocessingResult(
image=image,
image=scaled_image,
config_used=PreprocessingConfig(
contrast=PreprocessingContrastEnum.NONE,
sharpen=False,
binarize=False
),
quality_metrics=metrics,
was_processed=False
was_processed=scaling_info.was_scaled, # True if scaling was applied
scaling_info=scaling_info
)
if mode == PreprocessingModeEnum.AUTO:
@@ -323,9 +564,9 @@ class LayoutPreprocessingService:
# Manual mode but no config provided, use defaults
config = PreprocessingConfig()
# Apply preprocessing pipeline
processed = image.copy()
was_processed = False
# Apply preprocessing pipeline on SCALED image
processed = scaled_image.copy()
was_processed = scaling_info.was_scaled # Start with True if already scaled
# Step 1: Contrast enhancement
if config.contrast != PreprocessingContrastEnum.NONE:
@@ -353,29 +594,37 @@ class LayoutPreprocessingService:
image=processed,
config_used=config,
quality_metrics=metrics,
was_processed=was_processed
was_processed=was_processed,
scaling_info=scaling_info
)
def preprocess_to_pil(
self,
image: Union[np.ndarray, Image.Image, str, Path],
mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
config: Optional[PreprocessingConfig] = None
config: Optional[PreprocessingConfig] = None,
apply_scaling: bool = True
) -> Tuple[Image.Image, PreprocessingResult]:
"""
Preprocess image and return as PIL Image.
Convenience method for integration with PP-Structure which accepts PIL images.
IMPORTANT: When result.scaling_info.was_scaled is True, all bounding boxes
from PP-Structure must be scaled back to original coordinates using:
scaled_bbox = (x1 * scale_factor, y1 * scale_factor, x2 * scale_factor, y2 * scale_factor)
where scale_factor = result.scaling_info.scale_factor
Args:
image: Input image
mode: Preprocessing mode
config: Manual configuration
apply_scaling: Whether to apply automatic bidirectional scaling (default True)
Returns:
Tuple of (PIL Image, PreprocessingResult)
Tuple of (PIL Image for PP-Structure, PreprocessingResult with scaling info)
"""
result = self.preprocess(image, mode, config)
result = self.preprocess(image, mode, config, apply_scaling=apply_scaling)
# Convert BGR to RGB for PIL
rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
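A sketch of the intended round trip (names are from this diff; the module location of the accessor is assumed and the detection output is hypothetical):

from app.services.layout_preprocessing_service import (
    LayoutPreprocessingService,
    get_layout_preprocessing_service,  # accessor used in the OCR service changes below
)

preprocessing = get_layout_preprocessing_service()
pil_image, result = preprocessing.preprocess_to_pil("page_001.png")

# ... run PP-Structure on pil_image; suppose it yields bboxes in scaled coordinates ...
detected_bboxes = [(120.0, 80.0, 640.0, 420.0)]  # hypothetical detection output

info = result.scaling_info
if info is not None and info.was_scaled:
    # Map detections back to original coordinates before cropping the original image
    detected_bboxes = LayoutPreprocessingService.scale_bboxes_to_original(
        detected_bboxes, info.scale_factor
    )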

View File

@@ -502,6 +502,8 @@ class OCRService:
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
use_seal = settings.enable_seal_recognition
use_region = settings.enable_region_detection
layout_threshold = settings.layout_detection_threshold
layout_nms = settings.layout_nms_threshold
layout_merge = settings.layout_merge_mode
@@ -530,17 +532,32 @@ class OCRService:
# Table and formula model configuration (Stage 4)
wired_table_model = settings.wired_table_model_name
wireless_table_model = settings.wireless_table_model_name
table_cls_model = settings.table_classification_model_name
wired_cell_det_model = settings.wired_table_cells_detection_model_name
wireless_cell_det_model = settings.wireless_table_cells_detection_model_name
formula_model = settings.formula_recognition_model_name
chart_model = settings.chart_recognition_model_name
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
# Text detection/recognition model configuration
text_det_model = settings.text_detection_model_name
text_rec_model = settings.text_recognition_model_name
# Document preprocessing model configuration (Stage 1)
doc_ori_model = settings.doc_orientation_classify_model_name
doc_unwarp_model = settings.doc_unwarping_model_name
textline_ori_model = settings.textline_orientation_model_name
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}, seal={use_seal}, region={use_region}")
logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
logger.info(f"Table structure models: wired={wired_table_model}, wireless={wireless_table_model}")
logger.info(f"Table cell detection: cls={table_cls_model}, wired_det={wired_cell_det_model}, wireless_det={wireless_cell_det_model}")
logger.info(f"Formula model: {formula_model}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
# Build PPStructureV3 kwargs
# Only include parameters that are not None (let PaddleX use defaults for None values)
pp_kwargs = {
# Preprocessing (Stage 1)
'use_doc_orientation_classify': use_orientation,
@@ -550,17 +567,29 @@ class OCRService:
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
# Layout detection parameters
'layout_threshold': layout_threshold,
'layout_nms': layout_nms,
'layout_unclip_ratio': layout_unclip,
'layout_merge_bboxes_mode': layout_merge,
# Text detection parameters
'text_det_thresh': text_thresh,
'text_det_box_thresh': text_box_thresh,
'text_det_unclip_ratio': text_unclip,
'use_seal_recognition': use_seal,
'use_region_detection': use_region,
}
# Add layout detection parameters only if explicitly configured
# (None = use PaddleX optimized defaults, which work better for table detection)
if layout_threshold is not None:
pp_kwargs['layout_threshold'] = layout_threshold
if layout_nms is not None:
pp_kwargs['layout_nms'] = layout_nms
if layout_unclip is not None:
pp_kwargs['layout_unclip_ratio'] = layout_unclip
if layout_merge is not None:
pp_kwargs['layout_merge_bboxes_mode'] = layout_merge
# Add text detection parameters only if explicitly configured
if text_thresh is not None:
pp_kwargs['text_det_thresh'] = text_thresh
if text_box_thresh is not None:
pp_kwargs['text_det_box_thresh'] = text_box_thresh
if text_unclip is not None:
pp_kwargs['text_det_unclip_ratio'] = text_unclip
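# Illustration (not part of this commit): the if-chains here implement
# "pass only explicitly configured values". An equivalent compact form would be:
#   optional = {'layout_threshold': layout_threshold, 'layout_nms': layout_nms,
#               'layout_unclip_ratio': layout_unclip, 'layout_merge_bboxes_mode': layout_merge,
#               'text_det_thresh': text_thresh, 'text_det_box_thresh': text_box_thresh,
#               'text_det_unclip_ratio': text_unclip}
#   pp_kwargs.update({k: v for k, v in optional.items() if v is not None})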
# Add layout model configuration if specified (Stage 3)
if layout_model_name:
pp_kwargs['layout_detection_model_name'] = layout_model_name
@@ -575,10 +604,38 @@ class OCRService:
if wireless_table_model:
pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
# Add table classification model (determines wired vs wireless automatically)
if table_cls_model:
pp_kwargs['table_classification_model_name'] = table_cls_model
# Add table cell detection models (crucial for accurate cell boundary detection)
if wired_cell_det_model:
pp_kwargs['wired_table_cells_detection_model_name'] = wired_cell_det_model
if wireless_cell_det_model:
pp_kwargs['wireless_table_cells_detection_model_name'] = wireless_cell_det_model
# Add formula recognition model configuration (Stage 4)
if formula_model:
pp_kwargs['formula_recognition_model_name'] = formula_model
# Add chart recognition model configuration
if chart_model:
pp_kwargs['chart_recognition_model_name'] = chart_model
# Add text detection/recognition model configuration
if text_det_model:
pp_kwargs['text_detection_model_name'] = text_det_model
if text_rec_model:
pp_kwargs['text_recognition_model_name'] = text_rec_model
# Add document preprocessing model configuration (Stage 1)
if doc_ori_model:
pp_kwargs['doc_orientation_classify_model_name'] = doc_ori_model
if doc_unwarp_model:
pp_kwargs['doc_unwarping_model_name'] = doc_unwarp_model
if textline_ori_model:
pp_kwargs['textline_orientation_model_name'] = textline_ori_model
self.structure_engine = PPStructureV3(**pp_kwargs)
# Track model loading for cache management
@@ -599,40 +656,63 @@ class OCRService:
# Switch to CPU device globally
paddle.set_device('cpu')
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
layout_threshold = settings.layout_detection_threshold
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
wired_table_model = settings.wired_table_model_name
wireless_table_model = settings.wireless_table_model_name
formula_model = settings.formula_recognition_model_name
# Build CPU fallback kwargs
# Build CPU fallback kwargs (same logic as GPU mode)
cpu_kwargs = {
'use_doc_orientation_classify': settings.use_doc_orientation_classify,
'use_doc_unwarping': settings.use_doc_unwarping,
'use_textline_orientation': settings.use_textline_orientation,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
'use_table_recognition': settings.enable_table_recognition,
'use_formula_recognition': settings.enable_formula_recognition,
'use_chart_recognition': settings.enable_chart_recognition,
'use_seal_recognition': settings.enable_seal_recognition,
'use_region_detection': settings.enable_region_detection,
}
if layout_model_name:
cpu_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
if wired_table_model:
cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
if wireless_table_model:
cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
if formula_model:
cpu_kwargs['formula_recognition_model_name'] = formula_model
# Add layout detection parameters only if explicitly configured
if settings.layout_detection_threshold is not None:
cpu_kwargs['layout_threshold'] = settings.layout_detection_threshold
# Add layout model configuration
if settings.layout_detection_model_name:
cpu_kwargs['layout_detection_model_name'] = settings.layout_detection_model_name
if settings.layout_detection_model_dir:
cpu_kwargs['layout_detection_model_dir'] = settings.layout_detection_model_dir
# Add table structure model configuration
if settings.wired_table_model_name:
cpu_kwargs['wired_table_structure_recognition_model_name'] = settings.wired_table_model_name
if settings.wireless_table_model_name:
cpu_kwargs['wireless_table_structure_recognition_model_name'] = settings.wireless_table_model_name
if settings.table_classification_model_name:
cpu_kwargs['table_classification_model_name'] = settings.table_classification_model_name
if settings.wired_table_cells_detection_model_name:
cpu_kwargs['wired_table_cells_detection_model_name'] = settings.wired_table_cells_detection_model_name
if settings.wireless_table_cells_detection_model_name:
cpu_kwargs['wireless_table_cells_detection_model_name'] = settings.wireless_table_cells_detection_model_name
# Add formula and chart recognition model configuration
if settings.formula_recognition_model_name:
cpu_kwargs['formula_recognition_model_name'] = settings.formula_recognition_model_name
if settings.chart_recognition_model_name:
cpu_kwargs['chart_recognition_model_name'] = settings.chart_recognition_model_name
# Add text detection/recognition model configuration
if settings.text_detection_model_name:
cpu_kwargs['text_detection_model_name'] = settings.text_detection_model_name
if settings.text_recognition_model_name:
cpu_kwargs['text_recognition_model_name'] = settings.text_recognition_model_name
# Add document preprocessing model configuration
if settings.doc_orientation_classify_model_name:
cpu_kwargs['doc_orientation_classify_model_name'] = settings.doc_orientation_classify_model_name
if settings.doc_unwarping_model_name:
cpu_kwargs['doc_unwarping_model_name'] = settings.doc_unwarping_model_name
if settings.textline_orientation_model_name:
cpu_kwargs['textline_orientation_model_name'] = settings.textline_orientation_model_name
self.structure_engine = PPStructureV3(**cpu_kwargs)
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
else:
raise
@@ -840,10 +920,14 @@ class OCRService:
logger.info(f"Converting PDF {pdf_path.name} to images")
# Convert PDF to images (300 DPI for good quality)
# Convert PDF to images
# Use 150 DPI - testing showed this produces optimal results for PP-Structure:
# - 150 DPI produces ~1240x1754 for A4, which is ideal for layout detection
# - 300 DPI produces ~2480x3508, which requires scaling down and degrades quality
# - Table line detection works better at 150 DPI without scaling artifacts
images = convert_from_path(
str(pdf_path),
dpi=300,
dpi=150,
fmt='png'
)
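# Sanity check for the DPI choice above (A4 is 8.27 x 11.69 inches):
#   150 DPI -> round(8.27 * 150) x round(11.69 * 150) = 1240 x 1754 (inside the 1200-2000px optimal range)
#   300 DPI -> ~2480 x 3508 (would be downscaled toward the 1600px target, degrading fine table lines)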
@@ -1295,44 +1379,63 @@ class OCRService:
structure_engine = self._ensure_structure_engine(layout_model)
# Apply image preprocessing for layout detection
# Preprocessing enhances faint lines/borders to improve table detection
# Original image is preserved for element extraction
# Preprocessing includes:
# 1. Automatic bidirectional scaling into the optimal size range for better table detection
# 2. Optional contrast/sharpen enhancement for faint lines/borders
# Original image is preserved for element extraction (cropping uses original coords)
preprocessed_image = None
preprocessing_result = None
# Determine preprocessing mode (default from config if not specified)
mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
if mode != PreprocessingModeEnum.DISABLED:
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
# Always call preprocessing service (even when DISABLED) because:
# - Scaling is applied regardless of mode for better layout detection
# - When DISABLED, only scaling is applied, no contrast/sharpen/binarize
try:
preprocessing_service = get_layout_preprocessing_service()
preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
image_path,
mode=mode,
config=preprocessing_config
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
scaling_info = preprocessing_result.scaling_info
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}, "
f"scaled={scaling_info.was_scaled if scaling_info else False}"
)
if preprocessing_result.was_processed:
preprocessed_image = preprocessed_pil
if scaling_info and scaling_info.was_scaled:
logger.info(
f"Layout preprocessing applied: mode={mode.value}, "
f"config={preprocessing_result.config_used}, "
f"metrics={preprocessing_result.quality_metrics}"
f"Image scaled for layout detection: "
f"{scaling_info.original_size} -> {scaling_info.scaled_size} "
f"(scale_factor={scaling_info.scale_factor:.3f} for bbox restoration)"
)
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
else:
logger.info(f"No preprocessing needed (mode={mode.value})")
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
except Exception as preprocess_error:
logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
preprocessed_image = None
preprocessing_result = None
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
# Get scaling info for bbox coordinate restoration
scaling_info = preprocessing_result.scaling_info if preprocessing_result else None
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page, preprocessed_image=preprocessed_image
image_path, output_dir, current_page,
preprocessed_image=preprocessed_image,
scaling_info=scaling_info
)
if result.get('has_parsing_res_list'):

View File

@@ -7,10 +7,14 @@ This module provides enhanced PP-StructureV3 processing that extracts all
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from typing import Dict, List, Optional, Tuple, Any, TYPE_CHECKING
import json
import gc
# Import ScalingInfo for type checking (avoid circular imports at runtime)
if TYPE_CHECKING:
from app.services.layout_preprocessing_service import ScalingInfo
# Optional torch import for additional GPU memory management
try:
import torch
@@ -81,7 +85,8 @@ class PPStructureEnhanced:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -93,10 +98,13 @@ class PPStructureEnhanced:
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
for layout detection, all bbox coordinates will be scaled back
to original image coordinates for proper cropping.
Returns:
Dictionary with complete structure information including:
- elements: List of all detected elements with types and bbox
- elements: List of all detected elements with types and bbox (in original coords)
- reading_order: Reading order indices
- images: Extracted images with metadata
- tables: Extracted tables with structure
@@ -184,7 +192,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path
parsing_res_list, current_page, output_dir, image_path, scaling_info
)
all_elements.extend(elements)
@@ -247,13 +255,15 @@ class PPStructureEnhanced:
parsing_res_list: List[Dict],
current_page: int,
output_dir: Optional[Path],
source_image_path: Optional[Path] = None
source_image_path: Optional[Path] = None,
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
Args:
parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
scaling_info: Scaling information for bbox coordinate restoration
@@ -285,11 +295,28 @@ class PPStructureEnhanced:
# Ensure bbox has 4 values
if len(layout_bbox) >= 4:
bbox = layout_bbox[:4] # [x1, y1, x2, y2]
bbox = list(layout_bbox[:4]) # [x1, y1, x2, y2]
else:
bbox = [0, 0, 0, 0] # Default if bbox missing
logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")
# Scale bbox back to original image coordinates if image was scaled
# This is critical for proper cropping from original high-resolution image
if scaling_info and scaling_info.was_scaled and bbox != [0, 0, 0, 0]:
scale_factor = scaling_info.scale_factor
bbox = [
bbox[0] * scale_factor, # x1
bbox[1] * scale_factor, # y1
bbox[2] * scale_factor, # x2
bbox[3] * scale_factor # y2
]
if idx == 0: # Log only for first element to avoid spam
logger.info(
f"Scaled bbox to original coords: "
f"{[round(x, 1) for x in layout_bbox[:4]]} -> {[round(x, 1) for x in bbox]} "
f"(factor={scale_factor:.3f})"
)
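# Illustration: with numpy, the per-coordinate scaling above collapses to
#   bbox = (np.asarray(layout_bbox[:4], dtype=float) * scale_factor).tolist()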
# Extract content (check multiple possible keys)
content = (
item.get('content', '') or