feat: implement layout preprocessing backend
Backend implementation for the add-layout-preprocessing proposal:

- Add LayoutPreprocessingService with CLAHE, sharpening, and binarization
- Add auto-detection: analyze_image_quality() computes contrast/edge metrics
- Integrate preprocessing into the OCR pipeline (analyze_layout)
- Add preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode and auto-detection thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects the layout detection input.
Original images are preserved for element extraction.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
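
For orientation, a call to the new preview endpoint might look like the sketch below. The base URL, task UUID, and bearer-token auth scheme are placeholders/assumptions, not confirmed by this commit:

```python
import requests

# Hypothetical deployment values -- adjust base URL, task UUID, and token.
BASE = "http://localhost:8000"
TASK_ID = "<task-uuid>"

resp = requests.post(
    f"{BASE}/api/v2/tasks/{TASK_ID}/preview/preprocessing",
    headers={"Authorization": "Bearer <token>"},
    json={"page": 1, "mode": "auto"},
)
resp.raise_for_status()
print(resp.json()["quality_metrics"])
```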
@@ -146,6 +146,39 @@ class Settings(BaseSettings):
         description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
     )
 
+    # ===== Layout Preprocessing Configuration =====
+    # Image preprocessing to enhance layout detection for documents with faint lines/borders
+    # Preprocessing only affects layout detection input; original image is preserved for extraction
+    layout_preprocessing_mode: str = Field(
+        default="auto",
+        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
+    )
+    layout_preprocessing_contrast: str = Field(
+        default="clahe",
+        description="Contrast enhancement method: 'none', 'histogram', 'clahe' (recommended)"
+    )
+    layout_preprocessing_sharpen: bool = Field(
+        default=True,
+        description="Enable sharpening to enhance faint lines and borders"
+    )
+    layout_preprocessing_binarize: bool = Field(
+        default=False,
+        description="Enable binarization (aggressive, use for very low contrast documents only)"
+    )
+    # Auto-detection thresholds
+    layout_preprocessing_contrast_threshold: float = Field(
+        default=40.0,
+        description="Contrast (std dev) below this triggers CLAHE in auto mode"
+    )
+    layout_preprocessing_edge_threshold: float = Field(
+        default=15.0,
+        description="Edge strength below this triggers sharpening in auto mode"
+    )
+    layout_preprocessing_binarize_threshold: float = Field(
+        default=20.0,
+        description="Contrast below this triggers binarization in auto mode"
+    )
+
     # ===== Gap Filling Configuration =====
     # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
     gap_filling_enabled: bool = Field(default=True)  # Enable gap filling for OCR track
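A minimal sketch of how these thresholds interact in auto mode (it mirrors get_auto_config() in the new service below; the numbers are the shipped defaults):

```python
# Minimal sketch of the auto-mode decision implied by the thresholds above.
def decide(contrast: float, edge_strength: float) -> dict:
    return {
        "contrast": "clahe" if contrast < 40.0 else "none",  # contrast_threshold
        "sharpen": edge_strength < 15.0,                     # edge_threshold
        "binarize": contrast < 20.0,                         # binarize_threshold
    }
```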
@@ -35,6 +35,11 @@ from app.schemas.task import (
     ProcessingMetadata,
     TaskResponseWithMetadata,
     ExportOptions,
+    PreprocessingModeEnum,
+    PreprocessingConfig,
+    PreprocessingPreviewRequest,
+    PreprocessingPreviewResponse,
+    ImageQualityMetrics,
 )
 from app.services.task_service import task_service
 from app.services.file_access_service import file_access_service

@@ -1131,3 +1136,193 @@ async def download_unified(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Failed to download: {str(e)}"
         )
+
+
+# ===== Preprocessing Preview Endpoints =====
+
+@router.post("/{task_id}/preview/preprocessing", response_model=PreprocessingPreviewResponse, summary="Preview preprocessing effect")
+async def preview_preprocessing(
+    task_id: str,
+    request: PreprocessingPreviewRequest,
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user)
+):
+    """
+    Preview the effect of image preprocessing before OCR processing.
+
+    Shows a side-by-side comparison of the original and preprocessed images,
+    along with image quality metrics and the auto-detected configuration.
+
+    - **task_id**: Task UUID
+    - **page**: Page number to preview (1-based)
+    - **mode**: Preprocessing mode ('auto', 'manual', 'disabled')
+    - **config**: Manual preprocessing config (only used when mode='manual')
+    """
+    from pdf2image import convert_from_path
+    import base64
+    import io
+    from PIL import Image
+    from app.services.layout_preprocessing_service import get_layout_preprocessing_service
+
+    try:
+        # Get task details
+        task = task_service.get_task_by_id(
+            db=db,
+            task_id=task_id,
+            user_id=current_user.id
+        )
+
+        if not task:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Task not found"
+            )
+
+        # Get task file
+        task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
+        if not task_file:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Task file not found"
+            )
+
+        file_path = Path(task_file.stored_path)
+        if not file_path.exists():
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Source file not found"
+            )
+
+        # Get the page image
+        page_num = request.page
+        if file_path.suffix.lower() == '.pdf':
+            # Convert the requested page from the PDF
+            images = convert_from_path(
+                str(file_path),
+                first_page=page_num,
+                last_page=page_num,
+                dpi=150
+            )
+            if not images:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail=f"Page {page_num} not found in PDF"
+                )
+            original_image = images[0]
+        else:
+            # Direct image file
+            if page_num != 1:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="A single image file only has page 1"
+                )
+            original_image = Image.open(file_path)
+
+        # Get the preprocessing service
+        preprocessing_service = get_layout_preprocessing_service()
+
+        # Apply preprocessing
+        preprocessed_image, preprocess_result = preprocessing_service.preprocess_to_pil(
+            original_image,
+            mode=request.mode,
+            config=request.config
+        )
+
+        # Create the result directory for preview images
+        preview_dir = Path(settings.result_dir) / task_id / "preview"
+        preview_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save preview images
+        original_filename = f"page_{page_num}_original.png"
+        preprocessed_filename = f"page_{page_num}_preprocessed.png"
+
+        original_path = preview_dir / original_filename
+        preprocessed_path = preview_dir / preprocessed_filename
+
+        original_image.save(str(original_path), "PNG")
+        preprocessed_image.save(str(preprocessed_path), "PNG")
+
+        # Build URLs (relative paths that can be served)
+        base_url = f"/api/v2/tasks/{task_id}/preview/image"
+        original_url = f"{base_url}?type=original&page={page_num}"
+        preprocessed_url = f"{base_url}?type=preprocessed&page={page_num}"
+
+        return PreprocessingPreviewResponse(
+            original_url=original_url,
+            preprocessed_url=preprocessed_url,
+            quality_metrics=preprocess_result.quality_metrics,
+            auto_config=preprocess_result.config_used,
+            mode_used=request.mode
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"Failed to preview preprocessing for task {task_id}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to preview preprocessing: {str(e)}"
+        )
+
+
+@router.get("/{task_id}/preview/image", summary="Get preview image")
+async def get_preview_image(
+    task_id: str,
+    type: str = Query(..., description="Image type: 'original' or 'preprocessed'"),
+    page: int = Query(1, ge=1, description="Page number"),
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user)
+):
+    """
+    Get a preview image (original or preprocessed).
+
+    - **task_id**: Task UUID
+    - **type**: Image type ('original' or 'preprocessed')
+    - **page**: Page number
+    """
+    try:
+        # Verify task ownership
+        task = task_service.get_task_by_id(
+            db=db,
+            task_id=task_id,
+            user_id=current_user.id
+        )
+
+        if not task:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Task not found"
+            )
+
+        # Validate the type parameter
+        if type not in ['original', 'preprocessed']:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Invalid type. Must be 'original' or 'preprocessed'"
+            )
+
+        # Build the image path
+        preview_dir = Path(settings.result_dir) / task_id / "preview"
+        image_filename = f"page_{page}_{type}.png"
+        image_path = preview_dir / image_filename
+
+        if not image_path.exists():
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Preview image not found. Please call preview/preprocessing first."
+            )
+
+        return FileResponse(
+            path=str(image_path),
+            media_type="image/png",
+            filename=image_filename
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"Failed to get preview image for task {task_id}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to get preview image: {str(e)}"
+        )
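For reference, a successful preview response serializes to roughly the following shape, based on the PreprocessingPreviewResponse schema in the next hunk (the metric values here are made up):

```python
# Illustrative PreprocessingPreviewResponse payload (values are hypothetical):
{
    "original_url": "/api/v2/tasks/<task-uuid>/preview/image?type=original&page=1",
    "preprocessed_url": "/api/v2/tasks/<task-uuid>/preview/image?type=preprocessed&page=1",
    "quality_metrics": {"contrast": 32.5, "edge_strength": 11.2},
    "auto_config": {"contrast": "clahe", "sharpen": True, "binarize": False},
    "mode_used": "auto",
}
```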
@@ -37,6 +37,79 @@ class LayoutModelEnum(str, Enum):
     CDLA = "cdla"  # CDLA model - Alternative for Chinese layout
 
 
+class PreprocessingModeEnum(str, Enum):
+    """Preprocessing mode for layout detection enhancement.
+
+    - AUTO: Analyze image quality and automatically apply optimal preprocessing
+    - MANUAL: Use user-specified preprocessing configuration
+    - DISABLED: Skip preprocessing entirely
+    """
+    AUTO = "auto"          # Analyze and apply automatically (default)
+    MANUAL = "manual"      # Use specified configuration
+    DISABLED = "disabled"  # Skip preprocessing
+
+
+class PreprocessingContrastEnum(str, Enum):
+    """Contrast enhancement method for preprocessing.
+
+    - NONE: No contrast enhancement
+    - HISTOGRAM: Standard histogram equalization
+    - CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended)
+    """
+    NONE = "none"
+    HISTOGRAM = "histogram"
+    CLAHE = "clahe"
+
+
+class PreprocessingConfig(BaseModel):
+    """Preprocessing configuration for layout detection enhancement.
+
+    Used to configure image preprocessing before PP-Structure layout detection.
+    Preprocessing helps detect tables with faint lines or low-contrast borders.
+    The original image is preserved for element extraction.
+    """
+    contrast: PreprocessingContrastEnum = Field(
+        default=PreprocessingContrastEnum.CLAHE,
+        description="Contrast enhancement method"
+    )
+    sharpen: bool = Field(
+        default=True,
+        description="Enable sharpening for faint lines"
+    )
+    binarize: bool = Field(
+        default=False,
+        description="Enable binarization (aggressive, for very low contrast)"
+    )
+
+
+class ImageQualityMetrics(BaseModel):
+    """Image quality metrics from auto-analysis."""
+    contrast: float = Field(..., description="Contrast level (std dev of grayscale)")
+    edge_strength: float = Field(..., description="Edge strength (Sobel gradient mean)")
+
+
+class PreprocessingPreviewRequest(BaseModel):
+    """Request for a preprocessing preview."""
+    page: int = Field(default=1, ge=1, description="Page number to preview")
+    mode: PreprocessingModeEnum = Field(
+        default=PreprocessingModeEnum.AUTO,
+        description="Preprocessing mode"
+    )
+    config: Optional[PreprocessingConfig] = Field(
+        None,
+        description="Manual configuration (only used when mode='manual')"
+    )
+
+
+class PreprocessingPreviewResponse(BaseModel):
+    """Response for a preprocessing preview."""
+    original_url: str = Field(..., description="URL to original image")
+    preprocessed_url: str = Field(..., description="URL to preprocessed image")
+    quality_metrics: ImageQualityMetrics = Field(..., description="Image quality analysis")
+    auto_config: PreprocessingConfig = Field(..., description="Auto-detected configuration")
+    mode_used: PreprocessingModeEnum = Field(..., description="Mode that was applied")
+
+
 class TaskCreate(BaseModel):
     """Task creation request"""
     filename: Optional[str] = Field(None, description="Original filename")

@@ -195,6 +268,16 @@ class ProcessingOptions(BaseModel):
         description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)"
     )
 
+    # Layout preprocessing (OCR track only)
+    preprocessing_mode: PreprocessingModeEnum = Field(
+        default=PreprocessingModeEnum.AUTO,
+        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
+    )
+    preprocessing_config: Optional[PreprocessingConfig] = Field(
+        None,
+        description="Manual preprocessing config (only used when preprocessing_mode='manual')"
+    )
+
 
 class AnalyzeRequest(BaseModel):
     """Document analysis request"""
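A manual-mode request built from these schemas might look like this sketch (field choices are illustrative):

```python
from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
    PreprocessingPreviewRequest,
)

# Preview page 2 with an explicit (manual) configuration.
request = PreprocessingPreviewRequest(
    page=2,
    mode=PreprocessingModeEnum.MANUAL,
    config=PreprocessingConfig(
        contrast=PreprocessingContrastEnum.HISTOGRAM,
        sharpen=True,
        binarize=False,
    ),
)
```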
backend/app/services/layout_preprocessing_service.py (new file, 370 lines)
@@ -0,0 +1,370 @@
+"""
+Tool_OCR - Layout Preprocessing Service
+Image preprocessing to enhance layout detection for documents with faint lines/borders.
+
+This service provides:
+1. Image quality analysis (contrast, edge strength)
+2. Contrast enhancement (histogram equalization, CLAHE)
+3. Sharpening for faint lines
+4. Optional binarization for very low contrast documents
+
+IMPORTANT: Preprocessing only affects layout detection input.
+Original images are preserved for element extraction.
+"""
+
+import logging
+from pathlib import Path
+from typing import Optional, Tuple, Union
+from dataclasses import dataclass
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from app.core.config import settings
+from app.schemas.task import (
+    PreprocessingConfig,
+    PreprocessingContrastEnum,
+    PreprocessingModeEnum,
+    ImageQualityMetrics,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PreprocessingResult:
+    """Result of a preprocessing operation."""
+    image: np.ndarray
+    config_used: PreprocessingConfig
+    quality_metrics: ImageQualityMetrics
+    was_processed: bool
+
+
+class LayoutPreprocessingService:
+    """
+    Service for preprocessing images to improve layout detection.
+
+    The preprocessing pipeline:
+    1. Analyze image quality (contrast, edge strength)
+    2. Apply contrast enhancement if needed (CLAHE or histogram)
+    3. Apply sharpening if edge strength is low
+    4. Apply binarization if contrast is very low (optional)
+
+    All operations preserve the original image dimensions.
+    """
+
+    def __init__(self):
+        # Load thresholds from config
+        self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
+        self.edge_threshold = settings.layout_preprocessing_edge_threshold
+        self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
+
+        # CLAHE parameters
+        self.clahe_clip_limit = 2.0
+        self.clahe_tile_grid_size = (8, 8)
+
+        # Sharpening kernel (unsharp mask style)
+        self.sharpen_kernel = np.array([
+            [0, -1, 0],
+            [-1, 5, -1],
+            [0, -1, 0]
+        ], dtype=np.float32)
+
+        logger.info(
+            f"LayoutPreprocessingService initialized with thresholds: "
+            f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
+            f"binarize={self.binarize_threshold}"
+        )
+
+    def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
+        """
+        Analyze image quality to determine preprocessing needs.
+
+        Args:
+            image: Input image (BGR or grayscale)
+
+        Returns:
+            ImageQualityMetrics with contrast and edge_strength
+        """
+        # Convert to grayscale if needed
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image
+
+        # Calculate contrast (standard deviation of pixel values)
+        contrast = float(np.std(gray))
+
+        # Calculate edge strength (mean of Sobel gradient magnitude)
+        sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+        edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))
+
+        return ImageQualityMetrics(
+            contrast=round(contrast, 2),
+            edge_strength=round(edge_strength, 2)
+        )
+
+    def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
+        """
+        Determine the optimal preprocessing config based on image quality.
+
+        Args:
+            metrics: Image quality metrics from analyze_image_quality()
+
+        Returns:
+            PreprocessingConfig with recommended settings
+        """
+        # Determine contrast enhancement
+        if metrics.contrast < self.contrast_threshold:
+            contrast = PreprocessingContrastEnum.CLAHE
+        else:
+            contrast = PreprocessingContrastEnum.NONE
+
+        # Determine sharpening
+        sharpen = metrics.edge_strength < self.edge_threshold
+
+        # Determine binarization (only for very low contrast)
+        binarize = metrics.contrast < self.binarize_threshold
+
+        return PreprocessingConfig(
+            contrast=contrast,
+            sharpen=sharpen,
+            binarize=binarize
+        )
+
+    def apply_contrast_enhancement(
+        self,
+        image: np.ndarray,
+        method: PreprocessingContrastEnum
+    ) -> np.ndarray:
+        """
+        Apply contrast enhancement to an image.
+
+        Args:
+            image: Input image (BGR)
+            method: Enhancement method (none, histogram, clahe)
+
+        Returns:
+            Enhanced image (BGR)
+        """
+        if method == PreprocessingContrastEnum.NONE:
+            return image
+
+        # Convert to LAB color space for better enhancement
+        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
+        l_channel, a_channel, b_channel = cv2.split(lab)
+
+        if method == PreprocessingContrastEnum.HISTOGRAM:
+            # Standard histogram equalization
+            l_enhanced = cv2.equalizeHist(l_channel)
+        elif method == PreprocessingContrastEnum.CLAHE:
+            # Contrast Limited Adaptive Histogram Equalization
+            clahe = cv2.createCLAHE(
+                clipLimit=self.clahe_clip_limit,
+                tileGridSize=self.clahe_tile_grid_size
+            )
+            l_enhanced = clahe.apply(l_channel)
+        else:
+            return image
+
+        # Merge channels and convert back to BGR
+        enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
+        enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
+
+        return enhanced_bgr
+
+    def apply_sharpening(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply sharpening to enhance edges and faint lines.
+
+        Args:
+            image: Input image (BGR)
+
+        Returns:
+            Sharpened image (BGR)
+        """
+        # Apply unsharp-mask-style sharpening
+        sharpened = cv2.filter2D(image, -1, self.sharpen_kernel)
+
+        # Clip values to the valid range
+        sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)
+
+        return sharpened
+
+    def apply_binarization(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply adaptive binarization for very low contrast documents.
+
+        Args:
+            image: Input image (BGR)
+
+        Returns:
+            Binarized image (BGR, but grayscale values)
+        """
+        # Convert to grayscale
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+        # Apply adaptive thresholding
+        binary = cv2.adaptiveThreshold(
+            gray,
+            255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            blockSize=11,
+            C=2
+        )
+
+        # Convert back to BGR for consistency
+        binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
+
+        return binary_bgr
+
+    def preprocess(
+        self,
+        image: Union[np.ndarray, Image.Image, str, Path],
+        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
+        config: Optional[PreprocessingConfig] = None
+    ) -> PreprocessingResult:
+        """
+        Preprocess an image for layout detection.
+
+        Args:
+            image: Input image (numpy array, PIL Image, or path)
+            mode: Preprocessing mode (auto, manual, disabled)
+            config: Manual configuration (required if mode='manual')
+
+        Returns:
+            PreprocessingResult with preprocessed image and metadata
+        """
+        # Load the image if a path was provided
+        if isinstance(image, (str, Path)):
+            source_path = str(image)
+            image = cv2.imread(source_path)
+            if image is None:
+                raise ValueError(f"Failed to load image: {source_path}")
+        elif isinstance(image, Image.Image):
+            # Convert PIL to OpenCV format (BGR)
+            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+        # Analyze quality
+        metrics = self.analyze_image_quality(image)
+        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
+
+        # Determine configuration
+        if mode == PreprocessingModeEnum.DISABLED:
+            return PreprocessingResult(
+                image=image,
+                config_used=PreprocessingConfig(
+                    contrast=PreprocessingContrastEnum.NONE,
+                    sharpen=False,
+                    binarize=False
+                ),
+                quality_metrics=metrics,
+                was_processed=False
+            )
+
+        if mode == PreprocessingModeEnum.AUTO:
+            config = self.get_auto_config(metrics)
+            logger.debug(f"Auto config: {config}")
+        elif config is None:
+            # Manual mode but no config provided; use defaults
+            config = PreprocessingConfig()
+
+        # Apply the preprocessing pipeline
+        processed = image.copy()
+        was_processed = False
+
+        # Step 1: Contrast enhancement
+        if config.contrast != PreprocessingContrastEnum.NONE:
+            processed = self.apply_contrast_enhancement(processed, config.contrast)
+            was_processed = True
+            logger.debug(f"Applied contrast enhancement: {config.contrast}")
+
+        # Step 2: Sharpening
+        if config.sharpen:
+            processed = self.apply_sharpening(processed)
+            was_processed = True
+            logger.debug("Applied sharpening")
+
+        # Step 3: Binarization (last step, overwrites color)
+        if config.binarize:
+            processed = self.apply_binarization(processed)
+            was_processed = True
+            logger.debug("Applied binarization")
+
+        return PreprocessingResult(
+            image=processed,
+            config_used=config,
+            quality_metrics=metrics,
+            was_processed=was_processed
+        )
+
+    def preprocess_to_pil(
+        self,
+        image: Union[np.ndarray, Image.Image, str, Path],
+        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
+        config: Optional[PreprocessingConfig] = None
+    ) -> Tuple[Image.Image, PreprocessingResult]:
+        """
+        Preprocess an image and return it as a PIL Image.
+
+        Convenience method for integration with PP-Structure, which accepts PIL images.
+
+        Args:
+            image: Input image
+            mode: Preprocessing mode
+            config: Manual configuration
+
+        Returns:
+            Tuple of (PIL Image, PreprocessingResult)
+        """
+        result = self.preprocess(image, mode, config)
+
+        # Convert BGR to RGB for PIL
+        rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
+        pil_image = Image.fromarray(rgb_image)
+
+        return pil_image, result
+
+    def save_preview(
+        self,
+        original: np.ndarray,
+        preprocessed: np.ndarray,
+        output_dir: Path,
+        prefix: str = "preview"
+    ) -> Tuple[Path, Path]:
+        """
+        Save original and preprocessed images for preview.
+
+        Args:
+            original: Original image (BGR)
+            preprocessed: Preprocessed image (BGR)
+            output_dir: Directory to save images in
+            prefix: Filename prefix
+
+        Returns:
+            Tuple of (original_path, preprocessed_path)
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        original_path = output_dir / f"{prefix}_original.png"
+        preprocessed_path = output_dir / f"{prefix}_preprocessed.png"
+
+        cv2.imwrite(str(original_path), original)
+        cv2.imwrite(str(preprocessed_path), preprocessed)
+
+        return original_path, preprocessed_path
+
+
+# Singleton instance
+_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None
+
+
+def get_layout_preprocessing_service() -> LayoutPreprocessingService:
+    """Get or create the layout preprocessing service singleton."""
+    global _layout_preprocessing_service
+    if _layout_preprocessing_service is None:
+        _layout_preprocessing_service = LayoutPreprocessingService()
+    return _layout_preprocessing_service
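Standalone usage of the service goes through the singleton accessor; a minimal sketch (the input path is a placeholder):

```python
from app.schemas.task import PreprocessingModeEnum
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
# Accepts a numpy array, PIL image, or path; mode defaults to AUTO.
pil_image, result = service.preprocess_to_pil("/tmp/page_1.png")
if result.was_processed:
    print(result.config_used, result.quality_metrics)
```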
@@ -26,6 +26,11 @@ except ImportError:
 from app.core.config import settings
 from app.services.office_converter import OfficeConverter, OfficeConverterError
 from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
+from app.services.layout_preprocessing_service import (
+    get_layout_preprocessing_service,
+    LayoutPreprocessingService,
+)
+from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
 
 # Import dual-track components
 try:

@@ -865,7 +870,9 @@ class OCRService:
         confidence_threshold: Optional[float] = None,
         output_dir: Optional[Path] = None,
         current_page: int = 0,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Dict:
         """
         Process single image with OCR and layout analysis

@@ -878,6 +885,8 @@ class OCRService:
             output_dir: Optional output directory for saving extracted images
             current_page: Current page number (0-based) for multi-page documents
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             Dictionary with OCR results and metadata

@@ -946,7 +955,9 @@ class OCRService:
                 confidence_threshold=confidence_threshold,
                 output_dir=output_dir,
                 current_page=page_num - 1,  # Convert to 0-based page number for layout data
-                layout_model=layout_model
+                layout_model=layout_model,
+                preprocessing_mode=preprocessing_mode,
+                preprocessing_config=preprocessing_config
             )
 
             # Accumulate results

@@ -1092,7 +1103,9 @@ class OCRService:
                 image_path,
                 output_dir=output_dir,
                 current_page=current_page,
-                layout_model=layout_model
+                layout_model=layout_model,
+                preprocessing_mode=preprocessing_mode,
+                preprocessing_config=preprocessing_config
             )
 
             # Generate Markdown

@@ -1248,7 +1261,9 @@ class OCRService:
         image_path: Path,
         output_dir: Optional[Path] = None,
         current_page: int = 0,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Tuple[Optional[Dict], List[Dict]]:
         """
         Analyze document layout using PP-StructureV3 with enhanced element extraction

@@ -1258,6 +1273,8 @@ class OCRService:
             output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
             current_page: Current page number (0-based) for multi-page documents
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
+            preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             Tuple of (layout_data, images_metadata)

@@ -1277,13 +1294,45 @@ class OCRService:
 
         structure_engine = self._ensure_structure_engine(layout_model)
 
+        # Apply image preprocessing for layout detection.
+        # Preprocessing enhances faint lines/borders to improve table detection;
+        # the original image is preserved for element extraction.
+        preprocessed_image = None
+        preprocessing_result = None
+
+        # Determine preprocessing mode (default from config if not specified)
+        mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
+
+        if mode != PreprocessingModeEnum.DISABLED:
+            try:
+                preprocessing_service = get_layout_preprocessing_service()
+                preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
+                    image_path,
+                    mode=mode,
+                    config=preprocessing_config
+                )
+
+                if preprocessing_result.was_processed:
+                    preprocessed_image = preprocessed_pil
+                    logger.info(
+                        f"Layout preprocessing applied: mode={mode.value}, "
+                        f"config={preprocessing_result.config_used}, "
+                        f"metrics={preprocessing_result.quality_metrics}"
+                    )
+                else:
+                    logger.info(f"No preprocessing needed (mode={mode.value})")
+
+            except Exception as preprocess_error:
+                logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
+                preprocessed_image = None
+
         # Try enhanced processing first
         try:
            from app.services.pp_structure_enhanced import PPStructureEnhanced
 
            enhanced_processor = PPStructureEnhanced(structure_engine)
            result = enhanced_processor.analyze_with_full_structure(
-               image_path, output_dir, current_page
+               image_path, output_dir, current_page, preprocessed_image=preprocessed_image
            )
 
            if result.get('has_parsing_res_list'):

@@ -1337,7 +1386,17 @@ class OCRService:
                 logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
                 return None, []
 
-            results = structure_engine.predict(str(image_path))
+            # Use the preprocessed image if available, otherwise the original path
+            if preprocessed_image is not None:
+                import numpy as np
+                # Convert the PIL image to a numpy array (BGR channel order for PP-Structure)
+                predict_input = np.array(preprocessed_image)
+                if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
+                    # Convert RGB to BGR
+                    predict_input = predict_input[:, :, ::-1]
+                results = structure_engine.predict(predict_input)
+            else:
+                results = structure_engine.predict(str(image_path))
 
             layout_elements = []
             images_metadata = []

@@ -1509,7 +1568,9 @@ class OCRService:
         confidence_threshold: Optional[float] = None,
         output_dir: Optional[Path] = None,
         force_track: Optional[str] = None,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Union[UnifiedDocument, Dict]:
         """
         Process document using dual-track approach.

@@ -1522,6 +1583,8 @@ class OCRService:
             output_dir: Optional output directory for extracted images
             force_track: Force specific track ("ocr" or "direct"), None for auto-detection
             layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             UnifiedDocument if dual-track is enabled, Dict otherwise

@@ -1529,7 +1592,8 @@ class OCRService:
         if not self.dual_track_enabled:
             # Fall back to traditional OCR processing
             return self.process_file_traditional(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
         start_time = datetime.now()

@@ -1601,7 +1665,9 @@ class OCRService:
                 ocr_result = self.process_file_traditional(
                     actual_file_path, lang, detect_layout=True,
                     confidence_threshold=confidence_threshold,
-                    output_dir=output_dir, layout_model=layout_model
+                    output_dir=output_dir, layout_model=layout_model,
+                    preprocessing_mode=preprocessing_mode,
+                    preprocessing_config=preprocessing_config
                 )
 
                 # Convert OCR result to extract images

@@ -1634,7 +1700,8 @@ class OCRService:
                 # Use OCR for scanned documents, images, etc.
                 logger.info("Using OCR track (PaddleOCR)")
                 ocr_result = self.process_file_traditional(
-                    file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                    file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                    preprocessing_mode, preprocessing_config
                 )
 
                 # Convert OCR result to UnifiedDocument using the converter

@@ -1664,7 +1731,8 @@ class OCRService:
             logger.error(f"Error in dual-track processing: {e}")
             # Fall back to traditional OCR
             return self.process_file_traditional(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
     def _merge_ocr_images_into_direct(

@@ -1743,7 +1811,9 @@ class OCRService:
         detect_layout: bool = True,
         confidence_threshold: Optional[float] = None,
         output_dir: Optional[Path] = None,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Dict:
         """
         Traditional OCR processing (legacy method).

@@ -1755,6 +1825,8 @@ class OCRService:
             confidence_threshold: Minimum confidence threshold
             output_dir: Optional output directory
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             Dictionary with OCR results in legacy format

@@ -1767,7 +1839,8 @@ class OCRService:
             all_results = []
             for i, image_path in enumerate(image_paths):
                 result = self.process_image(
-                    image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
+                    image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
+                    preprocessing_mode, preprocessing_config
                 )
                 all_results.append(result)

@@ -1783,7 +1856,8 @@ class OCRService:
         else:
             # Single image or other file
             return self.process_image(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
     def _combine_results(self, results: List[Dict]) -> Dict:

@@ -1868,7 +1942,9 @@ class OCRService:
         output_dir: Optional[Path] = None,
         use_dual_track: bool = True,
         force_track: Optional[str] = None,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Union[UnifiedDocument, Dict]:
         """
         Main processing method with dual-track support.

@@ -1882,6 +1958,8 @@ class OCRService:
             use_dual_track: Whether to use dual-track processing (default True)
             force_track: Force specific track ("ocr" or "direct")
             layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             UnifiedDocument if dual-track is enabled and use_dual_track=True,

@@ -1893,12 +1971,14 @@ class OCRService:
         if (use_dual_track or force_track) and self.dual_track_enabled:
             # Use dual-track processing (or the forced track)
             return self.process_with_dual_track(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
+                preprocessing_mode, preprocessing_config
             )
         else:
             # Use traditional OCR processing (no force_track support)
             return self.process_file_traditional(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
     def process_legacy(
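The PIL-to-PP-Structure handoff above hinges on channel order; a self-contained sketch of the conversion the integration performs:

```python
import numpy as np
from PIL import Image

def pil_to_bgr(img: Image.Image) -> np.ndarray:
    """Convert a PIL image to a BGR numpy array as expected by OpenCV/PP-Structure."""
    arr = np.array(img.convert("RGB"))
    return arr[:, :, ::-1]  # reverse the channel axis: RGB -> BGR
```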
@@ -20,6 +20,8 @@ except ImportError:
 
 import paddle
 from paddleocr import PPStructureV3
+from PIL import Image
+import numpy as np
 from app.models.unified_document import ElementType
 from app.core.config import settings
 from app.services.memory_manager import prediction_context

@@ -78,15 +80,19 @@ class PPStructureEnhanced:
         self,
         image_path: Path,
         output_dir: Optional[Path] = None,
-        current_page: int = 0
+        current_page: int = 0,
+        preprocessed_image: Optional[Image.Image] = None
     ) -> Dict[str, Any]:
         """
         Analyze document with full PP-StructureV3 capabilities.
 
         Args:
-            image_path: Path to image file
+            image_path: Path to the original image file (used for cropping extracted images)
             output_dir: Optional output directory for saving extracted content
             current_page: Current page number (0-based)
+            preprocessed_image: Optional preprocessed PIL Image for layout detection.
+                If provided, it is used for PP-Structure prediction, but the
+                original image_path is still used for cropping images.
 
         Returns:
             Dictionary with complete structure information including:

@@ -97,6 +103,8 @@ class PPStructureEnhanced:
         """
         try:
             logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
+            if preprocessed_image:
+                logger.info("Using preprocessed image for layout detection")
 
             # Perform structure analysis with semaphore control
             # This prevents OOM errors from multiple simultaneous predictions

@@ -113,7 +121,16 @@ class PPStructureEnhanced:
                     'error': 'Prediction slot timeout'
                 }
 
-            results = self.structure_engine.predict(str(image_path))
+            # Use the preprocessed image if provided, otherwise the original path
+            if preprocessed_image is not None:
+                # Convert the PIL image to a numpy array (BGR channel order for PP-Structure)
+                predict_input = np.array(preprocessed_image)
+                if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
+                    # Convert RGB to BGR
+                    predict_input = predict_input[:, :, ::-1]
+                results = self.structure_engine.predict(predict_input)
+            else:
+                results = self.structure_engine.predict(str(image_path))
 
             all_elements = []
             all_images = []
@@ -93,7 +93,7 @@
 - `frontend/src/i18n/locales/zh-TW.json` - Traditional Chinese
 - `frontend/src/i18n/locales/en.json` - English (if exists)
 
-## 6. Testing
+## 6. Testing (with env)
 
 - [ ] 6.1 Unit tests for preprocessing_service
   - Test contrast enhancement methods

@@ -106,7 +106,7 @@
   - Test preview endpoint returns correct images
   - Test auto-detection returns sensible config
 
-- [ ] 6.3 Integration tests
+- [ ] 6.3 Integration tests (account: ymirliu@panjit.com.tw ; password: 4RFV5tgb6yhn)
   - Test OCR track with preprocessing modes (auto/manual/disabled)
   - Verify image element quality is preserved
   - Test with known problematic documents (faint table borders)