feat: implement layout preprocessing backend
Backend implementation for the add-layout-preprocessing proposal:

- Add LayoutPreprocessingService with CLAHE, sharpening, and binarization
- Add auto-detection: analyze_image_quality() computes contrast/edge metrics
- Integrate preprocessing into the OCR pipeline (analyze_layout)
- Add preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode and auto-detection thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects the layout detection input.
Original images are preserved for element extraction.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
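
For orientation, a call to the new preview endpoint might look like the sketch below. The base URL, task UUID, and bearer-token auth scheme are placeholders/assumptions, not confirmed by this commit:

```python
import requests

# Hypothetical deployment values -- adjust base URL, task UUID, and token.
BASE = "http://localhost:8000"
TASK_ID = "<task-uuid>"

resp = requests.post(
    f"{BASE}/api/v2/tasks/{TASK_ID}/preview/preprocessing",
    headers={"Authorization": "Bearer <token>"},
    json={"page": 1, "mode": "auto"},
)
resp.raise_for_status()
print(resp.json()["quality_metrics"])
```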
@@ -146,6 +146,39 @@ class Settings(BaseSettings):
         description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
     )
 
+    # ===== Layout Preprocessing Configuration =====
+    # Image preprocessing to enhance layout detection for documents with faint lines/borders
+    # Preprocessing only affects layout detection input; original image is preserved for extraction
+    layout_preprocessing_mode: str = Field(
+        default="auto",
+        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
+    )
+    layout_preprocessing_contrast: str = Field(
+        default="clahe",
+        description="Contrast enhancement method: 'none', 'histogram', 'clahe' (recommended)"
+    )
+    layout_preprocessing_sharpen: bool = Field(
+        default=True,
+        description="Enable sharpening to enhance faint lines and borders"
+    )
+    layout_preprocessing_binarize: bool = Field(
+        default=False,
+        description="Enable binarization (aggressive, use for very low contrast documents only)"
+    )
+    # Auto-detection thresholds
+    layout_preprocessing_contrast_threshold: float = Field(
+        default=40.0,
+        description="Contrast (std dev) below this triggers CLAHE in auto mode"
+    )
+    layout_preprocessing_edge_threshold: float = Field(
+        default=15.0,
+        description="Edge strength below this triggers sharpening in auto mode"
+    )
+    layout_preprocessing_binarize_threshold: float = Field(
+        default=20.0,
+        description="Contrast below this triggers binarization in auto mode"
+    )
+
     # ===== Gap Filling Configuration =====
     # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
     gap_filling_enabled: bool = Field(default=True)  # Enable gap filling for OCR track
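A minimal sketch of how these thresholds interact in auto mode (it mirrors get_auto_config() in the new service below; the numbers are the shipped defaults):

```python
# Minimal sketch of the auto-mode decision implied by the thresholds above.
def decide(contrast: float, edge_strength: float) -> dict:
    return {
        "contrast": "clahe" if contrast < 40.0 else "none",  # contrast_threshold
        "sharpen": edge_strength < 15.0,                     # edge_threshold
        "binarize": contrast < 20.0,                         # binarize_threshold
    }
```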
@@ -35,6 +35,11 @@ from app.schemas.task import (
     ProcessingMetadata,
     TaskResponseWithMetadata,
     ExportOptions,
+    PreprocessingModeEnum,
+    PreprocessingConfig,
+    PreprocessingPreviewRequest,
+    PreprocessingPreviewResponse,
+    ImageQualityMetrics,
 )
 from app.services.task_service import task_service
 from app.services.file_access_service import file_access_service

@@ -1131,3 +1136,193 @@ async def download_unified(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Failed to download: {str(e)}"
         )
+
+
+# ===== Preprocessing Preview Endpoints =====
+
+@router.post("/{task_id}/preview/preprocessing", response_model=PreprocessingPreviewResponse, summary="Preview preprocessing effect")
+async def preview_preprocessing(
+    task_id: str,
+    request: PreprocessingPreviewRequest,
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user)
+):
+    """
+    Preview the effect of image preprocessing before OCR processing.
+
+    Shows a side-by-side comparison of the original and preprocessed images,
+    along with image quality metrics and the auto-detected configuration.
+
+    - **task_id**: Task UUID
+    - **page**: Page number to preview (1-based)
+    - **mode**: Preprocessing mode ('auto', 'manual', 'disabled')
+    - **config**: Manual preprocessing config (only used when mode='manual')
+    """
+    from pdf2image import convert_from_path
+    import base64
+    import io
+    from PIL import Image
+    from app.services.layout_preprocessing_service import get_layout_preprocessing_service
+
+    try:
+        # Get task details
+        task = task_service.get_task_by_id(
+            db=db,
+            task_id=task_id,
+            user_id=current_user.id
+        )
+
+        if not task:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Task not found"
+            )
+
+        # Get task file
+        task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
+        if not task_file:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Task file not found"
+            )
+
+        file_path = Path(task_file.stored_path)
+        if not file_path.exists():
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Source file not found"
+            )
+
+        # Get the page image
+        page_num = request.page
+        if file_path.suffix.lower() == '.pdf':
+            # Convert the requested page from the PDF
+            images = convert_from_path(
+                str(file_path),
+                first_page=page_num,
+                last_page=page_num,
+                dpi=150
+            )
+            if not images:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail=f"Page {page_num} not found in PDF"
+                )
+            original_image = images[0]
+        else:
+            # Direct image file
+            if page_num != 1:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="A single image file only has page 1"
+                )
+            original_image = Image.open(file_path)
+
+        # Get the preprocessing service
+        preprocessing_service = get_layout_preprocessing_service()
+
+        # Apply preprocessing
+        preprocessed_image, preprocess_result = preprocessing_service.preprocess_to_pil(
+            original_image,
+            mode=request.mode,
+            config=request.config
+        )
+
+        # Create the result directory for preview images
+        preview_dir = Path(settings.result_dir) / task_id / "preview"
+        preview_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save preview images
+        original_filename = f"page_{page_num}_original.png"
+        preprocessed_filename = f"page_{page_num}_preprocessed.png"
+
+        original_path = preview_dir / original_filename
+        preprocessed_path = preview_dir / preprocessed_filename
+
+        original_image.save(str(original_path), "PNG")
+        preprocessed_image.save(str(preprocessed_path), "PNG")
+
+        # Build URLs (relative paths that can be served)
+        base_url = f"/api/v2/tasks/{task_id}/preview/image"
+        original_url = f"{base_url}?type=original&page={page_num}"
+        preprocessed_url = f"{base_url}?type=preprocessed&page={page_num}"
+
+        return PreprocessingPreviewResponse(
+            original_url=original_url,
+            preprocessed_url=preprocessed_url,
+            quality_metrics=preprocess_result.quality_metrics,
+            auto_config=preprocess_result.config_used,
+            mode_used=request.mode
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"Failed to preview preprocessing for task {task_id}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to preview preprocessing: {str(e)}"
+        )
+
+
+@router.get("/{task_id}/preview/image", summary="Get preview image")
+async def get_preview_image(
+    task_id: str,
+    type: str = Query(..., description="Image type: 'original' or 'preprocessed'"),
+    page: int = Query(1, ge=1, description="Page number"),
+    db: Session = Depends(get_db),
+    current_user: User = Depends(get_current_user)
+):
+    """
+    Get a preview image (original or preprocessed).
+
+    - **task_id**: Task UUID
+    - **type**: Image type ('original' or 'preprocessed')
+    - **page**: Page number
+    """
+    try:
+        # Verify task ownership
+        task = task_service.get_task_by_id(
+            db=db,
+            task_id=task_id,
+            user_id=current_user.id
+        )
+
+        if not task:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Task not found"
+            )
+
+        # Validate the type parameter
+        if type not in ['original', 'preprocessed']:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Invalid type. Must be 'original' or 'preprocessed'"
+            )
+
+        # Build the image path
+        preview_dir = Path(settings.result_dir) / task_id / "preview"
+        image_filename = f"page_{page}_{type}.png"
+        image_path = preview_dir / image_filename
+
+        if not image_path.exists():
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Preview image not found. Please call preview/preprocessing first."
+            )
+
+        return FileResponse(
+            path=str(image_path),
+            media_type="image/png",
+            filename=image_filename
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"Failed to get preview image for task {task_id}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to get preview image: {str(e)}"
+        )
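For reference, a successful preview response serializes to roughly the following shape, based on the PreprocessingPreviewResponse schema in the next hunk (the metric values here are made up):

```python
# Illustrative PreprocessingPreviewResponse payload (values are hypothetical):
{
    "original_url": "/api/v2/tasks/<task-uuid>/preview/image?type=original&page=1",
    "preprocessed_url": "/api/v2/tasks/<task-uuid>/preview/image?type=preprocessed&page=1",
    "quality_metrics": {"contrast": 32.5, "edge_strength": 11.2},
    "auto_config": {"contrast": "clahe", "sharpen": True, "binarize": False},
    "mode_used": "auto",
}
```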
@@ -37,6 +37,79 @@ class LayoutModelEnum(str, Enum):
     CDLA = "cdla"  # CDLA model - Alternative for Chinese layout
 
 
+class PreprocessingModeEnum(str, Enum):
+    """Preprocessing mode for layout detection enhancement.
+
+    - AUTO: Analyze image quality and automatically apply optimal preprocessing
+    - MANUAL: Use user-specified preprocessing configuration
+    - DISABLED: Skip preprocessing entirely
+    """
+    AUTO = "auto"          # Analyze and apply automatically (default)
+    MANUAL = "manual"      # Use specified configuration
+    DISABLED = "disabled"  # Skip preprocessing
+
+
+class PreprocessingContrastEnum(str, Enum):
+    """Contrast enhancement method for preprocessing.
+
+    - NONE: No contrast enhancement
+    - HISTOGRAM: Standard histogram equalization
+    - CLAHE: Contrast Limited Adaptive Histogram Equalization (recommended)
+    """
+    NONE = "none"
+    HISTOGRAM = "histogram"
+    CLAHE = "clahe"
+
+
+class PreprocessingConfig(BaseModel):
+    """Preprocessing configuration for layout detection enhancement.
+
+    Used to configure image preprocessing before PP-Structure layout detection.
+    Preprocessing helps detect tables with faint lines or low-contrast borders.
+    The original image is preserved for element extraction.
+    """
+    contrast: PreprocessingContrastEnum = Field(
+        default=PreprocessingContrastEnum.CLAHE,
+        description="Contrast enhancement method"
+    )
+    sharpen: bool = Field(
+        default=True,
+        description="Enable sharpening for faint lines"
+    )
+    binarize: bool = Field(
+        default=False,
+        description="Enable binarization (aggressive, for very low contrast)"
+    )
+
+
+class ImageQualityMetrics(BaseModel):
+    """Image quality metrics from auto-analysis."""
+    contrast: float = Field(..., description="Contrast level (std dev of grayscale)")
+    edge_strength: float = Field(..., description="Edge strength (Sobel gradient mean)")
+
+
+class PreprocessingPreviewRequest(BaseModel):
+    """Request for a preprocessing preview."""
+    page: int = Field(default=1, ge=1, description="Page number to preview")
+    mode: PreprocessingModeEnum = Field(
+        default=PreprocessingModeEnum.AUTO,
+        description="Preprocessing mode"
+    )
+    config: Optional[PreprocessingConfig] = Field(
+        None,
+        description="Manual configuration (only used when mode='manual')"
+    )
+
+
+class PreprocessingPreviewResponse(BaseModel):
+    """Response for a preprocessing preview."""
+    original_url: str = Field(..., description="URL to original image")
+    preprocessed_url: str = Field(..., description="URL to preprocessed image")
+    quality_metrics: ImageQualityMetrics = Field(..., description="Image quality analysis")
+    auto_config: PreprocessingConfig = Field(..., description="Auto-detected configuration")
+    mode_used: PreprocessingModeEnum = Field(..., description="Mode that was applied")
+
+
 class TaskCreate(BaseModel):
     """Task creation request"""
     filename: Optional[str] = Field(None, description="Original filename")

@@ -195,6 +268,16 @@ class ProcessingOptions(BaseModel):
         description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)"
     )
 
+    # Layout preprocessing (OCR track only)
+    preprocessing_mode: PreprocessingModeEnum = Field(
+        default=PreprocessingModeEnum.AUTO,
+        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
+    )
+    preprocessing_config: Optional[PreprocessingConfig] = Field(
+        None,
+        description="Manual preprocessing config (only used when preprocessing_mode='manual')"
+    )
+
 
 class AnalyzeRequest(BaseModel):
     """Document analysis request"""
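A manual-mode request built from these schemas might look like this sketch (field choices are illustrative):

```python
from app.schemas.task import (
    PreprocessingConfig,
    PreprocessingContrastEnum,
    PreprocessingModeEnum,
    PreprocessingPreviewRequest,
)

# Preview page 2 with an explicit (manual) configuration.
request = PreprocessingPreviewRequest(
    page=2,
    mode=PreprocessingModeEnum.MANUAL,
    config=PreprocessingConfig(
        contrast=PreprocessingContrastEnum.HISTOGRAM,
        sharpen=True,
        binarize=False,
    ),
)
```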
backend/app/services/layout_preprocessing_service.py (new file, 370 lines)
@@ -0,0 +1,370 @@
+"""
+Tool_OCR - Layout Preprocessing Service
+Image preprocessing to enhance layout detection for documents with faint lines/borders.
+
+This service provides:
+1. Image quality analysis (contrast, edge strength)
+2. Contrast enhancement (histogram equalization, CLAHE)
+3. Sharpening for faint lines
+4. Optional binarization for very low contrast documents
+
+IMPORTANT: Preprocessing only affects layout detection input.
+Original images are preserved for element extraction.
+"""
+
+import logging
+from pathlib import Path
+from typing import Optional, Tuple, Union
+from dataclasses import dataclass
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from app.core.config import settings
+from app.schemas.task import (
+    PreprocessingConfig,
+    PreprocessingContrastEnum,
+    PreprocessingModeEnum,
+    ImageQualityMetrics,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PreprocessingResult:
+    """Result of a preprocessing operation."""
+    image: np.ndarray
+    config_used: PreprocessingConfig
+    quality_metrics: ImageQualityMetrics
+    was_processed: bool
+
+
+class LayoutPreprocessingService:
+    """
+    Service for preprocessing images to improve layout detection.
+
+    The preprocessing pipeline:
+    1. Analyze image quality (contrast, edge strength)
+    2. Apply contrast enhancement if needed (CLAHE or histogram)
+    3. Apply sharpening if edge strength is low
+    4. Apply binarization if contrast is very low (optional)
+
+    All operations preserve the original image dimensions.
+    """
+
+    def __init__(self):
+        # Load thresholds from config
+        self.contrast_threshold = settings.layout_preprocessing_contrast_threshold
+        self.edge_threshold = settings.layout_preprocessing_edge_threshold
+        self.binarize_threshold = settings.layout_preprocessing_binarize_threshold
+
+        # CLAHE parameters
+        self.clahe_clip_limit = 2.0
+        self.clahe_tile_grid_size = (8, 8)
+
+        # Sharpening kernel (unsharp mask style)
+        self.sharpen_kernel = np.array([
+            [0, -1, 0],
+            [-1, 5, -1],
+            [0, -1, 0]
+        ], dtype=np.float32)
+
+        logger.info(
+            f"LayoutPreprocessingService initialized with thresholds: "
+            f"contrast={self.contrast_threshold}, edge={self.edge_threshold}, "
+            f"binarize={self.binarize_threshold}"
+        )
+
+    def analyze_image_quality(self, image: np.ndarray) -> ImageQualityMetrics:
+        """
+        Analyze image quality to determine preprocessing needs.
+
+        Args:
+            image: Input image (BGR or grayscale)
+
+        Returns:
+            ImageQualityMetrics with contrast and edge_strength
+        """
+        # Convert to grayscale if needed
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image
+
+        # Calculate contrast (standard deviation of pixel values)
+        contrast = float(np.std(gray))
+
+        # Calculate edge strength (mean of Sobel gradient magnitude)
+        sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+        edge_strength = float(np.mean(np.sqrt(sobel_x**2 + sobel_y**2)))
+
+        return ImageQualityMetrics(
+            contrast=round(contrast, 2),
+            edge_strength=round(edge_strength, 2)
+        )
+
+    def get_auto_config(self, metrics: ImageQualityMetrics) -> PreprocessingConfig:
+        """
+        Determine the optimal preprocessing config based on image quality.
+
+        Args:
+            metrics: Image quality metrics from analyze_image_quality()
+
+        Returns:
+            PreprocessingConfig with recommended settings
+        """
+        # Determine contrast enhancement
+        if metrics.contrast < self.contrast_threshold:
+            contrast = PreprocessingContrastEnum.CLAHE
+        else:
+            contrast = PreprocessingContrastEnum.NONE
+
+        # Determine sharpening
+        sharpen = metrics.edge_strength < self.edge_threshold
+
+        # Determine binarization (only for very low contrast)
+        binarize = metrics.contrast < self.binarize_threshold
+
+        return PreprocessingConfig(
+            contrast=contrast,
+            sharpen=sharpen,
+            binarize=binarize
+        )
+
+    def apply_contrast_enhancement(
+        self,
+        image: np.ndarray,
+        method: PreprocessingContrastEnum
+    ) -> np.ndarray:
+        """
+        Apply contrast enhancement to an image.
+
+        Args:
+            image: Input image (BGR)
+            method: Enhancement method (none, histogram, clahe)
+
+        Returns:
+            Enhanced image (BGR)
+        """
+        if method == PreprocessingContrastEnum.NONE:
+            return image
+
+        # Convert to LAB color space for better enhancement
+        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
+        l_channel, a_channel, b_channel = cv2.split(lab)
+
+        if method == PreprocessingContrastEnum.HISTOGRAM:
+            # Standard histogram equalization
+            l_enhanced = cv2.equalizeHist(l_channel)
+        elif method == PreprocessingContrastEnum.CLAHE:
+            # Contrast Limited Adaptive Histogram Equalization
+            clahe = cv2.createCLAHE(
+                clipLimit=self.clahe_clip_limit,
+                tileGridSize=self.clahe_tile_grid_size
+            )
+            l_enhanced = clahe.apply(l_channel)
+        else:
+            return image
+
+        # Merge channels and convert back to BGR
+        enhanced_lab = cv2.merge([l_enhanced, a_channel, b_channel])
+        enhanced_bgr = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
+
+        return enhanced_bgr
+
+    def apply_sharpening(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply sharpening to enhance edges and faint lines.
+
+        Args:
+            image: Input image (BGR)
+
+        Returns:
+            Sharpened image (BGR)
+        """
+        # Apply unsharp-mask-style sharpening
+        sharpened = cv2.filter2D(image, -1, self.sharpen_kernel)
+
+        # Clip values to the valid range
+        sharpened = np.clip(sharpened, 0, 255).astype(np.uint8)
+
+        return sharpened
+
+    def apply_binarization(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply adaptive binarization for very low contrast documents.
+
+        Args:
+            image: Input image (BGR)
+
+        Returns:
+            Binarized image (BGR, but grayscale values)
+        """
+        # Convert to grayscale
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+        # Apply adaptive thresholding
+        binary = cv2.adaptiveThreshold(
+            gray,
+            255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            blockSize=11,
+            C=2
+        )
+
+        # Convert back to BGR for consistency
+        binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
+
+        return binary_bgr
+
+    def preprocess(
+        self,
+        image: Union[np.ndarray, Image.Image, str, Path],
+        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
+        config: Optional[PreprocessingConfig] = None
+    ) -> PreprocessingResult:
+        """
+        Preprocess an image for layout detection.
+
+        Args:
+            image: Input image (numpy array, PIL Image, or path)
+            mode: Preprocessing mode (auto, manual, disabled)
+            config: Manual configuration (required if mode='manual')
+
+        Returns:
+            PreprocessingResult with preprocessed image and metadata
+        """
+        # Load the image if a path was provided
+        if isinstance(image, (str, Path)):
+            source_path = str(image)
+            image = cv2.imread(source_path)
+            if image is None:
+                raise ValueError(f"Failed to load image: {source_path}")
+        elif isinstance(image, Image.Image):
+            # Convert PIL to OpenCV format (BGR)
+            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+        # Analyze quality
+        metrics = self.analyze_image_quality(image)
+        logger.debug(f"Image quality: contrast={metrics.contrast}, edge_strength={metrics.edge_strength}")
+
+        # Determine configuration
+        if mode == PreprocessingModeEnum.DISABLED:
+            return PreprocessingResult(
+                image=image,
+                config_used=PreprocessingConfig(
+                    contrast=PreprocessingContrastEnum.NONE,
+                    sharpen=False,
+                    binarize=False
+                ),
+                quality_metrics=metrics,
+                was_processed=False
+            )
+
+        if mode == PreprocessingModeEnum.AUTO:
+            config = self.get_auto_config(metrics)
+            logger.debug(f"Auto config: {config}")
+        elif config is None:
+            # Manual mode but no config provided; use defaults
+            config = PreprocessingConfig()
+
+        # Apply the preprocessing pipeline
+        processed = image.copy()
+        was_processed = False
+
+        # Step 1: Contrast enhancement
+        if config.contrast != PreprocessingContrastEnum.NONE:
+            processed = self.apply_contrast_enhancement(processed, config.contrast)
+            was_processed = True
+            logger.debug(f"Applied contrast enhancement: {config.contrast}")
+
+        # Step 2: Sharpening
+        if config.sharpen:
+            processed = self.apply_sharpening(processed)
+            was_processed = True
+            logger.debug("Applied sharpening")
+
+        # Step 3: Binarization (last step, overwrites color)
+        if config.binarize:
+            processed = self.apply_binarization(processed)
+            was_processed = True
+            logger.debug("Applied binarization")
+
+        return PreprocessingResult(
+            image=processed,
+            config_used=config,
+            quality_metrics=metrics,
+            was_processed=was_processed
+        )
+
+    def preprocess_to_pil(
+        self,
+        image: Union[np.ndarray, Image.Image, str, Path],
+        mode: PreprocessingModeEnum = PreprocessingModeEnum.AUTO,
+        config: Optional[PreprocessingConfig] = None
+    ) -> Tuple[Image.Image, PreprocessingResult]:
+        """
+        Preprocess an image and return it as a PIL Image.
+
+        Convenience method for integration with PP-Structure, which accepts PIL images.
+
+        Args:
+            image: Input image
+            mode: Preprocessing mode
+            config: Manual configuration
+
+        Returns:
+            Tuple of (PIL Image, PreprocessingResult)
+        """
+        result = self.preprocess(image, mode, config)
+
+        # Convert BGR to RGB for PIL
+        rgb_image = cv2.cvtColor(result.image, cv2.COLOR_BGR2RGB)
+        pil_image = Image.fromarray(rgb_image)
+
+        return pil_image, result
+
+    def save_preview(
+        self,
+        original: np.ndarray,
+        preprocessed: np.ndarray,
+        output_dir: Path,
+        prefix: str = "preview"
+    ) -> Tuple[Path, Path]:
+        """
+        Save original and preprocessed images for preview.
+
+        Args:
+            original: Original image (BGR)
+            preprocessed: Preprocessed image (BGR)
+            output_dir: Directory to save images in
+            prefix: Filename prefix
+
+        Returns:
+            Tuple of (original_path, preprocessed_path)
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        original_path = output_dir / f"{prefix}_original.png"
+        preprocessed_path = output_dir / f"{prefix}_preprocessed.png"
+
+        cv2.imwrite(str(original_path), original)
+        cv2.imwrite(str(preprocessed_path), preprocessed)
+
+        return original_path, preprocessed_path
+
+
+# Singleton instance
+_layout_preprocessing_service: Optional[LayoutPreprocessingService] = None
+
+
+def get_layout_preprocessing_service() -> LayoutPreprocessingService:
+    """Get or create the layout preprocessing service singleton."""
+    global _layout_preprocessing_service
+    if _layout_preprocessing_service is None:
+        _layout_preprocessing_service = LayoutPreprocessingService()
+    return _layout_preprocessing_service
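Standalone usage of the service goes through the singleton accessor; a minimal sketch (the input path is a placeholder):

```python
from app.schemas.task import PreprocessingModeEnum
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()
# Accepts a numpy array, PIL image, or path; mode defaults to AUTO.
pil_image, result = service.preprocess_to_pil("/tmp/page_1.png")
if result.was_processed:
    print(result.config_used, result.quality_metrics)
```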
@@ -26,6 +26,11 @@ except ImportError:
 from app.core.config import settings
 from app.services.office_converter import OfficeConverter, OfficeConverterError
 from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
+from app.services.layout_preprocessing_service import (
+    get_layout_preprocessing_service,
+    LayoutPreprocessingService,
+)
+from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
 
 # Import dual-track components
 try:

@@ -865,7 +870,9 @@ class OCRService:
         confidence_threshold: Optional[float] = None,
         output_dir: Optional[Path] = None,
         current_page: int = 0,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Dict:
         """
         Process single image with OCR and layout analysis

@@ -878,6 +885,8 @@ class OCRService:
             output_dir: Optional output directory for saving extracted images
             current_page: Current page number (0-based) for multi-page documents
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             Dictionary with OCR results and metadata

@@ -946,7 +955,9 @@ class OCRService:
                 confidence_threshold=confidence_threshold,
                 output_dir=output_dir,
                 current_page=page_num - 1,  # Convert to 0-based page number for layout data
-                layout_model=layout_model
+                layout_model=layout_model,
+                preprocessing_mode=preprocessing_mode,
+                preprocessing_config=preprocessing_config
             )
 
             # Accumulate results

@@ -1092,7 +1103,9 @@ class OCRService:
                 image_path,
                 output_dir=output_dir,
                 current_page=current_page,
-                layout_model=layout_model
+                layout_model=layout_model,
+                preprocessing_mode=preprocessing_mode,
+                preprocessing_config=preprocessing_config
             )
 
             # Generate Markdown

@@ -1248,7 +1261,9 @@ class OCRService:
         image_path: Path,
         output_dir: Optional[Path] = None,
         current_page: int = 0,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Tuple[Optional[Dict], List[Dict]]:
         """
         Analyze document layout using PP-StructureV3 with enhanced element extraction

@@ -1258,6 +1273,8 @@ class OCRService:
             output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
             current_page: Current page number (0-based) for multi-page documents
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
+            preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             Tuple of (layout_data, images_metadata)

@@ -1277,13 +1294,45 @@ class OCRService:
 
         structure_engine = self._ensure_structure_engine(layout_model)
 
+        # Apply image preprocessing for layout detection.
+        # Preprocessing enhances faint lines/borders to improve table detection;
+        # the original image is preserved for element extraction.
+        preprocessed_image = None
+        preprocessing_result = None
+
+        # Determine preprocessing mode (default from config if not specified)
+        mode = preprocessing_mode or PreprocessingModeEnum(settings.layout_preprocessing_mode)
+
+        if mode != PreprocessingModeEnum.DISABLED:
+            try:
+                preprocessing_service = get_layout_preprocessing_service()
+                preprocessed_pil, preprocessing_result = preprocessing_service.preprocess_to_pil(
+                    image_path,
+                    mode=mode,
+                    config=preprocessing_config
+                )
+
+                if preprocessing_result.was_processed:
+                    preprocessed_image = preprocessed_pil
+                    logger.info(
+                        f"Layout preprocessing applied: mode={mode.value}, "
+                        f"config={preprocessing_result.config_used}, "
+                        f"metrics={preprocessing_result.quality_metrics}"
+                    )
+                else:
+                    logger.info(f"No preprocessing needed (mode={mode.value})")
+
+            except Exception as preprocess_error:
+                logger.warning(f"Preprocessing failed, using original image: {preprocess_error}")
+                preprocessed_image = None
+
         # Try enhanced processing first
         try:
            from app.services.pp_structure_enhanced import PPStructureEnhanced
 
            enhanced_processor = PPStructureEnhanced(structure_engine)
            result = enhanced_processor.analyze_with_full_structure(
-               image_path, output_dir, current_page
+               image_path, output_dir, current_page, preprocessed_image=preprocessed_image
            )
 
            if result.get('has_parsing_res_list'):

@@ -1337,7 +1386,17 @@ class OCRService:
                 logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
                 return None, []
 
-            results = structure_engine.predict(str(image_path))
+            # Use the preprocessed image if available, otherwise the original path
+            if preprocessed_image is not None:
+                import numpy as np
+                # Convert the PIL image to a numpy array (BGR channel order for PP-Structure)
+                predict_input = np.array(preprocessed_image)
+                if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
+                    # Convert RGB to BGR
+                    predict_input = predict_input[:, :, ::-1]
+                results = structure_engine.predict(predict_input)
+            else:
+                results = structure_engine.predict(str(image_path))
 
             layout_elements = []
             images_metadata = []

@@ -1509,7 +1568,9 @@ class OCRService:
         confidence_threshold: Optional[float] = None,
         output_dir: Optional[Path] = None,
         force_track: Optional[str] = None,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Union[UnifiedDocument, Dict]:
         """
         Process document using dual-track approach.

@@ -1522,6 +1583,8 @@ class OCRService:
             output_dir: Optional output directory for extracted images
             force_track: Force specific track ("ocr" or "direct"), None for auto-detection
             layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             UnifiedDocument if dual-track is enabled, Dict otherwise

@@ -1529,7 +1592,8 @@ class OCRService:
         if not self.dual_track_enabled:
             # Fall back to traditional OCR processing
             return self.process_file_traditional(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
         start_time = datetime.now()

@@ -1601,7 +1665,9 @@ class OCRService:
                 ocr_result = self.process_file_traditional(
                     actual_file_path, lang, detect_layout=True,
                     confidence_threshold=confidence_threshold,
-                    output_dir=output_dir, layout_model=layout_model
+                    output_dir=output_dir, layout_model=layout_model,
+                    preprocessing_mode=preprocessing_mode,
+                    preprocessing_config=preprocessing_config
                 )
 
                 # Convert OCR result to extract images

@@ -1634,7 +1700,8 @@ class OCRService:
                 # Use OCR for scanned documents, images, etc.
                 logger.info("Using OCR track (PaddleOCR)")
                 ocr_result = self.process_file_traditional(
-                    file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                    file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                    preprocessing_mode, preprocessing_config
                 )
 
                 # Convert OCR result to UnifiedDocument using the converter

@@ -1664,7 +1731,8 @@ class OCRService:
             logger.error(f"Error in dual-track processing: {e}")
             # Fall back to traditional OCR
             return self.process_file_traditional(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
     def _merge_ocr_images_into_direct(

@@ -1743,7 +1811,9 @@ class OCRService:
         detect_layout: bool = True,
         confidence_threshold: Optional[float] = None,
         output_dir: Optional[Path] = None,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Dict:
         """
         Traditional OCR processing (legacy method).

@@ -1755,6 +1825,8 @@ class OCRService:
             confidence_threshold: Minimum confidence threshold
             output_dir: Optional output directory
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             Dictionary with OCR results in legacy format

@@ -1767,7 +1839,8 @@ class OCRService:
             all_results = []
             for i, image_path in enumerate(image_paths):
                 result = self.process_image(
-                    image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
+                    image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
+                    preprocessing_mode, preprocessing_config
                 )
                 all_results.append(result)

@@ -1783,7 +1856,8 @@ class OCRService:
         else:
             # Single image or other file
             return self.process_image(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
     def _combine_results(self, results: List[Dict]) -> Dict:

@@ -1868,7 +1942,9 @@ class OCRService:
         output_dir: Optional[Path] = None,
         use_dual_track: bool = True,
         force_track: Optional[str] = None,
-        layout_model: Optional[str] = None
+        layout_model: Optional[str] = None,
+        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
+        preprocessing_config: Optional[PreprocessingConfig] = None
     ) -> Union[UnifiedDocument, Dict]:
         """
         Main processing method with dual-track support.

@@ -1882,6 +1958,8 @@ class OCRService:
             use_dual_track: Whether to use dual-track processing (default True)
             force_track: Force specific track ("ocr" or "direct")
             layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
+            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
+            preprocessing_config: Manual preprocessing config (used when mode='manual')
 
         Returns:
             UnifiedDocument if dual-track is enabled and use_dual_track=True,

@@ -1893,12 +1971,14 @@ class OCRService:
         if (use_dual_track or force_track) and self.dual_track_enabled:
             # Use dual-track processing (or the forced track)
             return self.process_with_dual_track(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
+                preprocessing_mode, preprocessing_config
             )
         else:
             # Use traditional OCR processing (no force_track support)
             return self.process_file_traditional(
-                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
+                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
+                preprocessing_mode, preprocessing_config
             )
 
     def process_legacy(
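The PIL-to-PP-Structure handoff above hinges on channel order; a self-contained sketch of the conversion the integration performs:

```python
import numpy as np
from PIL import Image

def pil_to_bgr(img: Image.Image) -> np.ndarray:
    """Convert a PIL image to a BGR numpy array as expected by OpenCV/PP-Structure."""
    arr = np.array(img.convert("RGB"))
    return arr[:, :, ::-1]  # reverse the channel axis: RGB -> BGR
```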
@@ -20,6 +20,8 @@ except ImportError:
 
 import paddle
 from paddleocr import PPStructureV3
+from PIL import Image
+import numpy as np
 from app.models.unified_document import ElementType
 from app.core.config import settings
 from app.services.memory_manager import prediction_context

@@ -78,15 +80,19 @@ class PPStructureEnhanced:
         self,
         image_path: Path,
         output_dir: Optional[Path] = None,
-        current_page: int = 0
+        current_page: int = 0,
+        preprocessed_image: Optional[Image.Image] = None
     ) -> Dict[str, Any]:
         """
         Analyze document with full PP-StructureV3 capabilities.
 
         Args:
-            image_path: Path to image file
+            image_path: Path to the original image file (used for cropping extracted images)
             output_dir: Optional output directory for saving extracted content
             current_page: Current page number (0-based)
+            preprocessed_image: Optional preprocessed PIL Image for layout detection.
+                If provided, it is used for PP-Structure prediction, but the
+                original image_path is still used for cropping images.
 
         Returns:
             Dictionary with complete structure information including:

@@ -97,6 +103,8 @@ class PPStructureEnhanced:
         """
         try:
             logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
+            if preprocessed_image:
+                logger.info("Using preprocessed image for layout detection")
 
             # Perform structure analysis with semaphore control
             # This prevents OOM errors from multiple simultaneous predictions

@@ -113,7 +121,16 @@ class PPStructureEnhanced:
                     'error': 'Prediction slot timeout'
                 }
 
-            results = self.structure_engine.predict(str(image_path))
+            # Use the preprocessed image if provided, otherwise the original path
+            if preprocessed_image is not None:
+                # Convert the PIL image to a numpy array (BGR channel order for PP-Structure)
+                predict_input = np.array(preprocessed_image)
+                if len(predict_input.shape) == 3 and predict_input.shape[2] == 3:
+                    # Convert RGB to BGR
+                    predict_input = predict_input[:, :, ::-1]
+                results = self.structure_engine.predict(predict_input)
+            else:
+                results = self.structure_engine.predict(str(image_path))
 
             all_elements = []
             all_images = []
@@ -93,7 +93,7 @@
 - `frontend/src/i18n/locales/zh-TW.json` - Traditional Chinese
 - `frontend/src/i18n/locales/en.json` - English (if exists)
 
-## 6. Testing
+## 6. Testing (with env)
 
 - [ ] 6.1 Unit tests for preprocessing_service
   - Test contrast enhancement methods

@@ -106,7 +106,7 @@
   - Test preview endpoint returns correct images
   - Test auto-detection returns sensible config
 
-- [ ] 6.3 Integration tests
+- [ ] 6.3 Integration tests (account: ymirliu@panjit.com.tw ; password: 4RFV5tgb6yhn)
   - Test OCR track with preprocessing modes (auto/manual/disabled)
   - Verify image element quality is preserved
   - Test with known problematic documents (faint table borders)