diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 699788e..fac10a7 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -146,6 +146,39 @@ class Settings(BaseSettings): description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support." ) + # ===== Layout Preprocessing Configuration ===== + # Image preprocessing to enhance layout detection for documents with faint lines/borders + # Preprocessing only affects layout detection input; original image is preserved for extraction + layout_preprocessing_mode: str = Field( + default="auto", + description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'" + ) + layout_preprocessing_contrast: str = Field( + default="clahe", + description="Contrast enhancement method: 'none', 'histogram', 'clahe' (recommended)" + ) + layout_preprocessing_sharpen: bool = Field( + default=True, + description="Enable sharpening to enhance faint lines and borders" + ) + layout_preprocessing_binarize: bool = Field( + default=False, + description="Enable binarization (aggressive, use for very low contrast documents only)" + ) + # Auto-detection thresholds + layout_preprocessing_contrast_threshold: float = Field( + default=40.0, + description="Contrast (std dev) below this triggers CLAHE in auto mode" + ) + layout_preprocessing_edge_threshold: float = Field( + default=15.0, + description="Edge strength below this triggers sharpening in auto mode" + ) + layout_preprocessing_binarize_threshold: float = Field( + default=20.0, + description="Contrast below this triggers binarization in auto mode" + ) + # ===== Gap Filling Configuration ===== # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track diff --git a/backend/app/routers/tasks.py b/backend/app/routers/tasks.py index 6ed2672..3504f16 100644 --- a/backend/app/routers/tasks.py +++ b/backend/app/routers/tasks.py @@ -35,6 +35,11 @@ from app.schemas.task import ( ProcessingMetadata, TaskResponseWithMetadata, ExportOptions, + PreprocessingModeEnum, + PreprocessingConfig, + PreprocessingPreviewRequest, + PreprocessingPreviewResponse, + ImageQualityMetrics, ) from app.services.task_service import task_service from app.services.file_access_service import file_access_service @@ -1131,3 +1136,193 @@ async def download_unified( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to download: {str(e)}" ) + + +# ===== Preprocessing Preview Endpoints ===== + +@router.post("/{task_id}/preview/preprocessing", response_model=PreprocessingPreviewResponse, summary="Preview preprocessing effect") +async def preview_preprocessing( + task_id: str, + request: PreprocessingPreviewRequest, + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user) +): + """ + Preview the effect of image preprocessing before OCR processing. + + Shows side-by-side comparison of original and preprocessed images, + along with image quality metrics and auto-detected configuration. 
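+
+    Preview images are written under the task's result directory and served
+    by the companion GET /preview/image endpoint.
+
+    Example request body (manual mode; values are illustrative):
+        {"page": 1, "mode": "manual",
+         "config": {"contrast": "clahe", "sharpen": true, "binarize": false}}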
+ + - **task_id**: Task UUID + - **page**: Page number to preview (1-based) + - **mode**: Preprocessing mode ('auto', 'manual', 'disabled') + - **config**: Manual preprocessing config (only used when mode='manual') + """ + from pdf2image import convert_from_path + import base64 + import io + from PIL import Image + from app.services.layout_preprocessing_service import get_layout_preprocessing_service + + try: + # Get task details + task = task_service.get_task_by_id( + db=db, + task_id=task_id, + user_id=current_user.id + ) + + if not task: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Task not found" + ) + + # Get task file + task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first() + if not task_file: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Task file not found" + ) + + file_path = Path(task_file.stored_path) + if not file_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Source file not found" + ) + + # Get the page image + page_num = request.page + if file_path.suffix.lower() == '.pdf': + # Convert specific page from PDF + images = convert_from_path( + str(file_path), + first_page=page_num, + last_page=page_num, + dpi=150 + ) + if not images: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Page {page_num} not found in PDF" + ) + original_image = images[0] + else: + # Direct image file + if page_num != 1: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Single image file only has page 1" + ) + original_image = Image.open(file_path) + + # Get preprocessing service + preprocessing_service = get_layout_preprocessing_service() + + # Apply preprocessing + preprocessed_image, preprocess_result = preprocessing_service.preprocess_to_pil( + original_image, + mode=request.mode, + config=request.config + ) + + # Create result directory for preview images + preview_dir = Path(settings.result_dir) / task_id / "preview" + preview_dir.mkdir(parents=True, exist_ok=True) + + # Save preview images + original_filename = f"page_{page_num}_original.png" + preprocessed_filename = f"page_{page_num}_preprocessed.png" + + original_path = preview_dir / original_filename + preprocessed_path = preview_dir / preprocessed_filename + + original_image.save(str(original_path), "PNG") + preprocessed_image.save(str(preprocessed_path), "PNG") + + # Build URLs (relative paths that can be served) + base_url = f"/api/v2/tasks/{task_id}/preview/image" + original_url = f"{base_url}?type=original&page={page_num}" + preprocessed_url = f"{base_url}?type=preprocessed&page={page_num}" + + return PreprocessingPreviewResponse( + original_url=original_url, + preprocessed_url=preprocessed_url, + quality_metrics=preprocess_result.quality_metrics, + auto_config=preprocess_result.config_used, + mode_used=request.mode + ) + + except HTTPException: + raise + except Exception as e: + logger.exception(f"Failed to preview preprocessing for task {task_id}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to preview preprocessing: {str(e)}" + ) + + +@router.get("/{task_id}/preview/image", summary="Get preview image") +async def get_preview_image( + task_id: str, + type: str = Query(..., description="Image type: 'original' or 'preprocessed'"), + page: int = Query(1, ge=1, description="Page number"), + db: Session = Depends(get_db), + current_user: User = Depends(get_current_user) +): + """ + Get a preview image (original or 
preprocessed). + + - **task_id**: Task UUID + - **type**: Image type ('original' or 'preprocessed') + - **page**: Page number + """ + try: + # Verify task ownership + task = task_service.get_task_by_id( + db=db, + task_id=task_id, + user_id=current_user.id + ) + + if not task: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Task not found" + ) + + # Validate type parameter + if type not in ['original', 'preprocessed']: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Invalid type. Must be 'original' or 'preprocessed'" + ) + + # Build image path + preview_dir = Path(settings.result_dir) / task_id / "preview" + image_filename = f"page_{page}_{type}.png" + image_path = preview_dir / image_filename + + if not image_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Preview image not found. Please call preview/preprocessing first." + ) + + return FileResponse( + path=str(image_path), + media_type="image/png", + filename=image_filename + ) + + except HTTPException: + raise + except Exception as e: + logger.exception(f"Failed to get preview image for task {task_id}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to get preview image: {str(e)}" + ) diff --git a/backend/app/schemas/task.py b/backend/app/schemas/task.py index 75be647..f50500a 100644 --- a/backend/app/schemas/task.py +++ b/backend/app/schemas/task.py @@ -37,6 +37,79 @@ class LayoutModelEnum(str, Enum): CDLA = "cdla" # CDLA model - Alternative for Chinese layout +class PreprocessingModeEnum(str, Enum): + """Preprocessing mode for layout detection enhancement. + + - AUTO: Analyze image quality and automatically apply optimal preprocessing + - MANUAL: Use user-specified preprocessing configuration + - DISABLED: Skip preprocessing entirely + """ + AUTO = "auto" # Analyze and apply automatically (default) + MANUAL = "manual" # Use specified configuration + DISABLED = "disabled" # Skip preprocessing + + +class PreprocessingContrastEnum(str, Enum): + """Contrast enhancement method for preprocessing. + + - NONE: No contrast enhancement + - HISTOGRAM: Standard histogram equalization + - CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended) + """ + NONE = "none" + HISTOGRAM = "histogram" + CLAHE = "clahe" + + +class PreprocessingConfig(BaseModel): + """Preprocessing configuration for layout detection enhancement. + + Used to configure image preprocessing before PP-Structure layout detection. + Preprocessing helps detect tables with faint lines or low contrast borders. + Original image is preserved for element extraction. 
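+
+    Example (illustrative):
+        config = PreprocessingConfig(
+            contrast=PreprocessingContrastEnum.CLAHE,
+            sharpen=True,
+            binarize=False,
+        )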
+ """ + contrast: PreprocessingContrastEnum = Field( + default=PreprocessingContrastEnum.CLAHE, + description="Contrast enhancement method" + ) + sharpen: bool = Field( + default=True, + description="Enable sharpening for faint lines" + ) + binarize: bool = Field( + default=False, + description="Enable binarization (aggressive, for very low contrast)" + ) + + +class ImageQualityMetrics(BaseModel): + """Image quality metrics from auto-analysis.""" + contrast: float = Field(..., description="Contrast level (std dev of grayscale)") + edge_strength: float = Field(..., description="Edge strength (Sobel gradient mean)") + + +class PreprocessingPreviewRequest(BaseModel): + """Request for preprocessing preview.""" + page: int = Field(default=1, ge=1, description="Page number to preview") + mode: PreprocessingModeEnum = Field( + default=PreprocessingModeEnum.AUTO, + description="Preprocessing mode" + ) + config: Optional[PreprocessingConfig] = Field( + None, + description="Manual configuration (only used when mode='manual')" + ) + + +class PreprocessingPreviewResponse(BaseModel): + """Response for preprocessing preview.""" + original_url: str = Field(..., description="URL to original image") + preprocessed_url: str = Field(..., description="URL to preprocessed image") + quality_metrics: ImageQualityMetrics = Field(..., description="Image quality analysis") + auto_config: PreprocessingConfig = Field(..., description="Auto-detected configuration") + mode_used: PreprocessingModeEnum = Field(..., description="Mode that was applied") + + class TaskCreate(BaseModel): """Task creation request""" filename: Optional[str] = Field(None, description="Original filename") @@ -195,6 +268,16 @@ class ProcessingOptions(BaseModel): description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)" ) + # Layout preprocessing (OCR track only) + preprocessing_mode: PreprocessingModeEnum = Field( + default=PreprocessingModeEnum.AUTO, + description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'" + ) + preprocessing_config: Optional[PreprocessingConfig] = Field( + None, + description="Manual preprocessing config (only used when preprocessing_mode='manual')" + ) + class AnalyzeRequest(BaseModel): """Document analysis request""" diff --git a/backend/app/services/layout_preprocessing_service.py b/backend/app/services/layout_preprocessing_service.py new file mode 100644 index 0000000..05a918a --- /dev/null +++ b/backend/app/services/layout_preprocessing_service.py @@ -0,0 +1,370 @@ +""" +Tool_OCR - Layout Preprocessing Service +Image preprocessing to enhance layout detection for documents with faint lines/borders. + +This service provides: +1. Image quality analysis (contrast, edge strength) +2. Contrast enhancement (histogram equalization, CLAHE) +3. Sharpening for faint lines +4. Optional binarization for very low contrast documents + +IMPORTANT: Preprocessing only affects layout detection input. +Original images are preserved for element extraction. 
+""" + +import logging +from pathlib import Path +from typing import Optional, Tuple, Union +from dataclasses import dataclass + +import cv2 +import numpy as np +from PIL import Image + +from app.core.config import settings +from app.schemas.task import ( + PreprocessingConfig, + PreprocessingContrastEnum, + PreprocessingModeEnum, + ImageQualityMetrics, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class PreprocessingResult: + """Result of preprocessing operation.""" + image: np.ndarray + config_used: PreprocessingConfig + quality_metrics: ImageQualityMetrics + was_processed: bool + + +class LayoutPreprocessingService: + """ + Service for preprocessing images to improve layout detection. + + The preprocessing pipeline: + 1. Analyze image quality (contrast, edge strength) + 2. Apply contrast enhancement if needed (CLAHE or histogram) + 3. Apply sharpening if edge strength is low + 4. Apply binarization if contrast is very low (optional) + + All operations preserve the original color image dimensions. + """ + + def __init__(self): + # Load thresholds from config + self.contrast_threshold = settings.layout_preprocessing_contrast_threshold + self.edge_threshold = settings.layout_preprocessing_edge_threshold + self.binarize_threshold = settings.layout_preprocessing_binarize_threshold + + # CLAHE parameters + self.clahe_clip_limit = 2.0 + self.clahe_tile_grid_size = (8, 8) + + # Sharpening kernel (unsharp mask style) + self.sharpen_kernel = np.array([ + [0, -1, 0], + [-1, 5, -1], + [0, -1, 0] + ], dtype=np.float32) + + logger.info( + f"LayoutPreprocessingService initialized with thresholds: " + f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, " + f"binarize={self.binarize_threshold}" + ) + + def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics: + """ + Analyze image quality to determine preprocessing needs. + + Args: + image: Input image (BGR or grayscale) + + Returns: + ImageQualityMetrics with contrast and edge_strength + """ + # Convert to grayscale if needed + if len(image.shape) == 3: + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + gray = image + + # Calculate contrast (standard deviation of pixel values) + contrast = float(np.std(gray)) + + # Calculate edge strength (mean of Sobel gradient magnitude) + sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3) + sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3) + edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2))) + + return ImageQualityMetrics( + contrast=round(contrast, 2), + edge_strength=round(edge_strength, 2) + ) + + def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig: + """ + Determine optimal preprocessing config based on image quality. + + Args: + metrics: Image quality metrics from analyze_image_quality() + + Returns: + PreprocessingConfig with recommended settings + """ + # Determine contrast enhancement + if metrics.contrast < self.contrast_threshold: + contrast = PreprocessingContrastEnum.CLAHE + else: + contrast = PreprocessingContrastEnum.NONE + + # Determine sharpening + sharpen = metrics.edge_strength < self.edge_threshold + + # Determine binarization (only for very low contrast) + binarize = metrics.contrast < self.binarize_threshold + + return PreprocessingConfig( + contrast=contrast, + sharpen=sharpen, + binarize=binarize + ) + + def apply_contrast_enhancement( + self, + image: np.ndarray, + method: PreprocessingContrastEnum + ) -> np.ndarray: + """ + Apply contrast enhancement to image. 
+ + Args: + image: Input image (BGR) + method: Enhancement method (none, histogram, clahe) + + Returns: + Enhanced image (BGR) + """ + if method == PreprocessingContrastEnum.NONE: + return image + + # Convert to LAB color space for better enhancement + lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) + l_channel, a_channel, b_channel = cv2.split(lab) + + if method == PreprocessingContrastEnum.HISTOGRAM: + # Standard histogram equalization + l_enhanced = cv2.equalizeHist(l_channel) + elif method == PreprocessingContrastEnum.CLAHE: + # Contrast Limited Adaptive Histogram Equalization + clahe = cv2.createCLAHE( + clipLimit=self.clahe_clip_limit, + tileGridSize=self.clahe_tile_grid_size + ) + l_enhanced = clahe.apply(l_channel) + else: + return image + + # Merge channels and convert back to BGR + enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel]) + enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR) + + return enhanced_bgr + + def apply_sharpening(self, image: np.ndarray) -> np.ndarray: + """ + Apply sharpening to enhance edges and faint lines. + + Args: + image: Input image (BGR) + + Returns: + Sharpened image (BGR) + """ + # Apply unsharp mask style sharpening + sharpened = cv2.filter2D(image, -1, self.sharpen_kernel) + + # Clip values to valid range + sharpened = np.clip(sharpened, 0, 255).astype(np.uint8) + + return sharpened + + def apply_binarization(self, image: np.ndarray) -> np.ndarray: + """ + Apply adaptive binarization for very low contrast documents. + + Args: + image: Input image (BGR) + + Returns: + Binarized image (BGR, but grayscale values) + """ + # Convert to grayscale + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # Apply adaptive thresholding + binary = cv2.adaptiveThreshold( + gray, + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blockSize=11, + C=2 + ) + + # Convert back to BGR for consistency + binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR) + + return binary_bgr + + def preprocess( + self, + image: Union[np.ndarray, Image.Image, str, Path], + mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO, + config: Optional[PreprocessingConfig] = None + ) -> PreprocessingResult: + """ + Preprocess image for layout detection. 
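+
+        Pipeline (as implemented below): analyze quality; in auto mode derive
+        the config from the configured thresholds; then apply
+        contrast -> sharpen -> binarize, skipping disabled steps.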
+
+        Args:
+            image: Input image (numpy array, PIL Image, or path)
+            mode: Preprocessing mode (auto, manual, disabled)
+            config: Manual configuration (required if mode='manual')
+
+        Returns:
+            PreprocessingResult with preprocessed image and metadata
+        """
+        # Load image if path provided (keep the path for error reporting,
+        # since `image` is rebound to the decoded array)
+        if isinstance(image, (str, Path)):
+            source_path = str(image)
+            image = cv2.imread(source_path)
+            if image is None:
+                raise ValueError(f"Failed to load image: {source_path}")
+        elif isinstance(image, Image.Image):
+            # Convert PIL to OpenCV format (BGR); normalize to RGB first so
+            # RGBA/palette/grayscale inputs convert cleanly
+            image = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
+
+        # Analyze quality
+        metrics = self.analyze_image_quality(image)
+        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
+
+        # Determine configuration
+        if mode == PreprocessingModeEnum.DISABLED:
+            return PreprocessingResult(
+                image=image,
+                config_used=PreprocessingConfig(
+                    contrast=PreprocessingContrastEnum.NONE,
+                    sharpen=False,
+                    binarize=False
+                ),
+                quality_metrics=metrics,
+                was_processed=False
+            )
+
+        if mode == PreprocessingModeEnum.AUTO:
+            config = self.get_auto_config(metrics)
+            logger.debug(f"Auto config: {config}")
+        elif config is None:
+            # Manual mode but no config provided; fall back to defaults
+            config = PreprocessingConfig()
+
+        # Apply preprocessing pipeline
+        processed = image.copy()
+        was_processed = False
+
+        # Step 1: Contrast enhancement
+        if config.contrast != PreprocessingContrastEnum.NONE:
+            processed = self.apply_contrast_enhancement(processed, config.contrast)
+            was_processed = True
+            logger.debug(f"Applied contrast enhancement: {config.contrast}")
+
+        # Step 2: Sharpening
+        if config.sharpen:
+            processed = self.apply_sharpening(processed)
+            was_processed = True
+            logger.debug("Applied sharpening")
+
+        # Step 3: Binarization (last step, overwrites color)
+        if config.binarize:
+            processed = self.apply_binarization(processed)
+            was_processed = True
+            logger.debug("Applied binarization")
+
+        return PreprocessingResult(
+            image=processed,
+            config_used=config,
+            quality_metrics=metrics,
+            was_processed=was_processed
+        )
+
+    def preprocess_to_pil(
+        self,
+        image: Union[np.ndarray, Image.Image, str, Path],
+        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
+        config: Optional[PreprocessingConfig] = None
+    ) -> Tuple[Image.Image, PreprocessingResult]:
+        """
+        Preprocess image and return it as a PIL Image.
+
+        Convenience method for integration with PP-Structure, which accepts PIL images.
+
+        Args:
+            image: Input image
+            mode: Preprocessing mode
+            config: Manual configuration
+
+        Returns:
+            Tuple of (PIL Image, PreprocessingResult)
+        """
+        result = self.preprocess(image, mode, config)
+
+        # Convert BGR to RGB for PIL
+        rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
+        pil_image = Image.fromarray(rgb_image)
+
+        return pil_image, result
+
+    def save_preview(
+        self,
+        original: np.ndarray,
+        preprocessed: np.ndarray,
+        output_dir: Path,
+        prefix: str = "preview"
+    ) -> Tuple[Path, Path]:
+        """
+        Save original and preprocessed images for preview.
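+
+        Files are written as "{prefix}_original.png" and
+        "{prefix}_preprocessed.png" inside output_dir (created if missing).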
+ + Args: + original: Original image (BGR) + preprocessed: Preprocessed image (BGR) + output_dir: Directory to save images + prefix: Filename prefix + + Returns: + Tuple of (original_path, preprocessed_path) + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + original_path = output_dir / f"{prefix}_original.png" + preprocessed_path = output_dir / f"{prefix}_preprocessed.png" + + cv2.imwrite(str(original_path), original) + cv2.imwrite(str(preprocessed_path), preprocessed) + + return original_path, preprocessed_path + + +# Singleton instance +_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None + + +def get_layout_preprocessing_service() -> LayoutPreprocessingService: + """Get or create the layout preprocessing service singleton.""" + global _layout_preprocessing_service + if _layout_preprocessing_service is None: + _layout_preprocessing_service = LayoutPreprocessingService() + return _layout_preprocessing_service diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py index 2f0dabc..20ec0ad 100644 --- a/backend/app/services/ocr_service.py +++ b/backend/app/services/ocr_service.py @@ -26,6 +26,11 @@ except ImportError: from app.core.config import settings from app.services.office_converter import OfficeConverter, OfficeConverterError from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context +from app.services.layout_preprocessing_service import ( + get_layout_preprocessing_service, + LayoutPreprocessingService, +) +from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig # Import dual-track components try: @@ -865,7 +870,9 @@ class OCRService: confidence_threshold: Optional[float] = None, output_dir: Optional[Path] = None, current_page: int = 0, - layout_model: Optional[str] = None + layout_model: Optional[str] = None, + preprocessing_mode: Optional[PreprocessingModeEnum] = None, + preprocessing_config: Optional[PreprocessingConfig] = None ) -> Dict: """ Process single image with OCR and layout analysis @@ -878,6 +885,8 @@ class OCRService: output_dir: Optional output directory for saving extracted images current_page: Current page number (0-based) for multi-page documents layout_model: Layout detection model ('chinese', 'default', 'cdla') + preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled') + preprocessing_config: Manual preprocessing config (used when mode='manual') Returns: Dictionary with OCR results and metadata @@ -946,7 +955,9 @@ class OCRService: confidence_threshold=confidence_threshold, output_dir=output_dir, current_page=page_num - 1, # Convert to 0-based page number for layout data - layout_model=layout_model + layout_model=layout_model, + preprocessing_mode=preprocessing_mode, + preprocessing_config=preprocessing_config ) # Accumulate results @@ -1092,7 +1103,9 @@ class OCRService: image_path, output_dir=output_dir, current_page=current_page, - layout_model=layout_model + layout_model=layout_model, + preprocessing_mode=preprocessing_mode, + preprocessing_config=preprocessing_config ) # Generate Markdown @@ -1248,7 +1261,9 @@ class OCRService: image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0, - layout_model: Optional[str] = None + layout_model: Optional[str] = None, + preprocessing_mode: Optional[PreprocessingModeEnum] = None, + preprocessing_config: Optional[PreprocessingConfig] = None ) -> Tuple[Optional[Dict], List[Dict]]: """ Analyze document layout using PP-StructureV3 with 
enhanced element extraction @@ -1258,6 +1273,8 @@ class OCRService: output_dir: Optional output directory for saving extracted images (defaults to image_path.parent) current_page: Current page number (0-based) for multi-page documents layout_model: Layout detection model ('chinese', 'default', 'cdla') + preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled') + preprocessing_config: Manual preprocessing config (used when mode='manual') Returns: Tuple of (layout_data, images_metadata) @@ -1277,13 +1294,45 @@ class OCRService: structure_engine = self._ensure_structure_engine(layout_model) + # Apply image preprocessing for layout detection + # Preprocessing enhances faint lines/borders to improve table detection + # Original image is preserved for element extraction + preprocessed_image = None + preprocessing_result = None + + # Determine preprocessing mode (default from config if not specified) + mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode) + + if mode != PreprocessingModeEnum.DISABLED: + try: + preprocessing_service = get_layout_preprocessing_service() + preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil( + image_path, + mode=mode, + config=preprocessing_config + ) + + if preprocessing_result.was_processed: + preprocessed_image = preprocessed_pil + logger.info( + f"Layout preprocessing applied: mode={mode.value}, " + f"config={preprocessing_result.config_used}, " + f"metrics={preprocessing_result.quality_metrics}" + ) + else: + logger.info(f"No preprocessing needed (mode={mode.value})") + + except Exception as preprocess_error: + logger.warning(f"Preprocessing failed, using original image: {preprocess_error}") + preprocessed_image = None + # Try enhanced processing first try: from app.services.pp_structure_enhanced import PPStructureEnhanced enhanced_processor = PPStructureEnhanced(structure_engine) result = enhanced_processor.analyze_with_full_structure( - image_path, output_dir, current_page + image_path, output_dir, current_page, preprocessed_image=preprocessed_image ) if result.get('has_parsing_res_list'): @@ -1337,7 +1386,17 @@ class OCRService: logger.error("Failed to acquire prediction slot (timeout), returning empty layout") return None, [] - results = structure_engine.predict(str(image_path)) + # Use preprocessed image if available, otherwise original path + if preprocessed_image is not None: + import numpy as np + # Convert PIL to numpy array (BGR format for PP-Structure) + predict_input = np.array(preprocessed_image) + if len(predict_input.shape) == 3 and predict_input.shape[2] == 3: + # Convert RGB to BGR + predict_input = predict_input[:, :, ::-1] + results = structure_engine.predict(predict_input) + else: + results = structure_engine.predict(str(image_path)) layout_elements = [] images_metadata = [] @@ -1509,7 +1568,9 @@ class OCRService: confidence_threshold: Optional[float] = None, output_dir: Optional[Path] = None, force_track: Optional[str] = None, - layout_model: Optional[str] = None + layout_model: Optional[str] = None, + preprocessing_mode: Optional[PreprocessingModeEnum] = None, + preprocessing_config: Optional[PreprocessingConfig] = None ) -> Union[UnifiedDocument, Dict]: """ Process document using dual-track approach. 
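+
+        preprocessing_mode / preprocessing_config are forwarded to every
+        OCR-track call below, including the image-extraction OCR pass that
+        the direct track uses for embedded images.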
@@ -1522,6 +1583,8 @@ class OCRService: output_dir: Optional output directory for extracted images force_track: Force specific track ("ocr" or "direct"), None for auto-detection layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only) + preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled') + preprocessing_config: Manual preprocessing config (used when mode='manual') Returns: UnifiedDocument if dual-track is enabled, Dict otherwise @@ -1529,7 +1592,8 @@ class OCRService: if not self.dual_track_enabled: # Fallback to traditional OCR processing return self.process_file_traditional( - file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model + file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model, + preprocessing_mode, preprocessing_config ) start_time = datetime.now() @@ -1601,7 +1665,9 @@ class OCRService: ocr_result = self.process_file_traditional( actual_file_path, lang, detect_layout=True, confidence_threshold=confidence_threshold, - output_dir=output_dir, layout_model=layout_model + output_dir=output_dir, layout_model=layout_model, + preprocessing_mode=preprocessing_mode, + preprocessing_config=preprocessing_config ) # Convert OCR result to extract images @@ -1634,7 +1700,8 @@ class OCRService: # Use OCR for scanned documents, images, etc. logger.info("Using OCR track (PaddleOCR)") ocr_result = self.process_file_traditional( - file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model + file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model, + preprocessing_mode, preprocessing_config ) # Convert OCR result to UnifiedDocument using the converter @@ -1664,7 +1731,8 @@ class OCRService: logger.error(f"Error in dual-track processing: {e}") # Fallback to traditional OCR return self.process_file_traditional( - file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model + file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model, + preprocessing_mode, preprocessing_config ) def _merge_ocr_images_into_direct( @@ -1743,7 +1811,9 @@ class OCRService: detect_layout: bool = True, confidence_threshold: Optional[float] = None, output_dir: Optional[Path] = None, - layout_model: Optional[str] = None + layout_model: Optional[str] = None, + preprocessing_mode: Optional[PreprocessingModeEnum] = None, + preprocessing_config: Optional[PreprocessingConfig] = None ) -> Dict: """ Traditional OCR processing (legacy method). 
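+
+        PDF pages are processed one at a time via process_image(), so the
+        preprocessing arguments apply per page.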
@@ -1755,6 +1825,8 @@ class OCRService: confidence_threshold: Minimum confidence threshold output_dir: Optional output directory layout_model: Layout detection model ('chinese', 'default', 'cdla') + preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled') + preprocessing_config: Manual preprocessing config (used when mode='manual') Returns: Dictionary with OCR results in legacy format @@ -1767,7 +1839,8 @@ class OCRService: all_results = [] for i, image_path in enumerate(image_paths): result = self.process_image( - image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model + image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model, + preprocessing_mode, preprocessing_config ) all_results.append(result) @@ -1783,7 +1856,8 @@ class OCRService: else: # Single image or other file return self.process_image( - file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model + file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model, + preprocessing_mode, preprocessing_config ) def _combine_results(self, results: List[Dict]) -> Dict: @@ -1868,7 +1942,9 @@ class OCRService: output_dir: Optional[Path] = None, use_dual_track: bool = True, force_track: Optional[str] = None, - layout_model: Optional[str] = None + layout_model: Optional[str] = None, + preprocessing_mode: Optional[PreprocessingModeEnum] = None, + preprocessing_config: Optional[PreprocessingConfig] = None ) -> Union[UnifiedDocument, Dict]: """ Main processing method with dual-track support. @@ -1882,6 +1958,8 @@ class OCRService: use_dual_track: Whether to use dual-track processing (default True) force_track: Force specific track ("ocr" or "direct") layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only) + preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled') + preprocessing_config: Manual preprocessing config (used when mode='manual') Returns: UnifiedDocument if dual-track is enabled and use_dual_track=True, @@ -1893,12 +1971,14 @@ class OCRService: if (use_dual_track or force_track) and self.dual_track_enabled: # Use dual-track processing (or forced track) return self.process_with_dual_track( - file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model + file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model, + preprocessing_mode, preprocessing_config ) else: # Use traditional OCR processing (no force_track support) return self.process_file_traditional( - file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model + file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model, + preprocessing_mode, preprocessing_config ) def process_legacy( diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py index 7833e7a..703e884 100644 --- a/backend/app/services/pp_structure_enhanced.py +++ b/backend/app/services/pp_structure_enhanced.py @@ -20,6 +20,8 @@ except ImportError: import paddle from paddleocr import PPStructureV3 +from PIL import Image +import numpy as np from app.models.unified_document import ElementType from app.core.config import settings from app.services.memory_manager import prediction_context @@ -78,15 +80,19 @@ class PPStructureEnhanced: self, image_path: Path, output_dir: Optional[Path] = None, - current_page: int = 0 + current_page: int = 0, + preprocessed_image: Optional[Image.Image] = None ) -> 
Dict[str, Any]:
         """
         Analyze document with full PP-StructureV3 capabilities.

         Args:
-            image_path: Path to image file
+            image_path: Path to original image file (used for cropping extracted images)
             output_dir: Optional output directory for saving extracted content
             current_page: Current page number (0-based)
+            preprocessed_image: Optional preprocessed PIL Image for layout detection.
+                                If provided, it is used for PP-Structure prediction,
+                                but the original image_path is still used for cropping images.

         Returns:
             Dictionary with complete structure information including:
@@ -97,6 +103,8 @@
         """
         try:
             logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
+            if preprocessed_image:
+                logger.info("Using preprocessed image for layout detection")

             # Perform structure analysis with semaphore control
             # This prevents OOM errors from multiple simultaneous predictions
@@ -113,7 +121,16 @@
                     'error': 'Prediction slot timeout'
                 }

-            results = self.structure_engine.predict(str(image_path))
+            # Use preprocessed image if provided, otherwise use original path
+            if preprocessed_image is not None:
+                # Convert PIL to numpy array (BGR format for PP-Structure)
+                predict_input = np.array(preprocessed_image)
+                if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
+                    # Convert RGB to BGR
+                    predict_input = predict_input[:, :, ::-1]
+                results = self.structure_engine.predict(predict_input)
+            else:
+                results = self.structure_engine.predict(str(image_path))

             all_elements = []
             all_images = []
diff --git a/openspec/changes/add-layout-preprocessing/tasks.md b/openspec/changes/add-layout-preprocessing/tasks.md
index 5ba7a63..274d165 100644
--- a/openspec/changes/add-layout-preprocessing/tasks.md
+++ b/openspec/changes/add-layout-preprocessing/tasks.md
@@ -93,7 +93,7 @@
   - `frontend/src/i18n/locales/zh-TW.json` - Traditional Chinese
   - `frontend/src/i18n/locales/en.json` - English (if exists)

-## 6. Testing
+## 6. Testing (requires configured test environment)

 - [ ] 6.1 Unit tests for preprocessing_service
   - Test contrast enhancement methods
@@ -106,7 +106,7 @@
   - Test preview endpoint returns correct images
   - Test auto-detection returns sensible config

-- [ ] 6.3 Integration tests
+- [ ] 6.3 Integration tests (account: ymirliu@panjit.com.tw; password: 4RFV5tgb6yhn)
   - Test OCR track with preprocessing modes (auto/manual/disabled)
   - Verify image element quality is preserved
   - Test with known problematic documents (faint table borders)
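+
+  A minimal sketch for 6.1 (illustrative; assumes pytest, the default
+  thresholds, and that app settings are importable in the test environment):
+
+  ```python
+  import numpy as np
+  from app.schemas.task import PreprocessingModeEnum
+  from app.services.layout_preprocessing_service import get_layout_preprocessing_service
+
+  def test_flat_image_triggers_full_pipeline():
+      # A flat gray image has ~zero contrast and edge strength, so with the
+      # default thresholds (40 / 15 / 20) auto mode should enable CLAHE,
+      # sharpening, and binarization.
+      img = np.full((200, 200, 3), 128, dtype=np.uint8)
+      result = get_layout_preprocessing_service().preprocess(
+          img, mode=PreprocessingModeEnum.AUTO
+      )
+      assert result.config_used.contrast.value == "clahe"
+      assert result.config_used.sharpen is True
+      assert result.config_used.binarize is True
+      assert result.was_processed is True
+  ```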