"""
Tool_OCR - Task Management Schemas

Pydantic request/response models and enumerations for task management,
uploads, preprocessing previews and dual-track processing.
"""

from datetime import datetime
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel, Field

from app.schemas.base import UTCDatetimeBaseModel


# ===== Enumerations =====

class TaskStatusEnum(str, Enum):
    """Status values a task can carry."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class ProcessingTrackEnum(str, Enum):
    """Processing tracks available in dual-track processing."""

    # PaddleOCR PP-StructureV3 for scanned documents
    OCR = "ocr"
    # PyMuPDF for editable PDFs
    DIRECT = "direct"
    # Mixed processing
    HYBRID = "hybrid"
    # Auto-detect best track
    AUTO = "auto"


class LayoutModelEnum(str, Enum):
    """Selectable layout detection models for the OCR track.

    Each model targets a different kind of document:
    - CHINESE: PP-DocLayout_plus-L (83.2% mAP), optimized for complex Chinese documents
    - DEFAULT: PubLayNet-based (~94% mAP), optimized for English academic papers
    - CDLA: CDLA model (~86% mAP), specialized Chinese document layout analysis
    """

    # PP-DocLayout_plus-L - best for Chinese documents (recommended)
    CHINESE = "chinese"
    # PubLayNet-based - best for English documents
    DEFAULT = "default"
    # CDLA model - alternative for Chinese layout
    CDLA = "cdla"


class PreprocessingModeEnum(str, Enum):
    """How preprocessing for layout detection enhancement is chosen.

    - AUTO: Analyze image quality and automatically apply optimal preprocessing
    - MANUAL: Use user-specified preprocessing configuration
    - DISABLED: Skip preprocessing entirely
    """

    # Analyze and apply automatically (default)
    AUTO = "auto"
    # Use specified configuration
    MANUAL = "manual"
    # Skip preprocessing
    DISABLED = "disabled"


class PreprocessingContrastEnum(str, Enum):
    """Contrast enhancement methods available to preprocessing.

    - NONE: No contrast enhancement
    - HISTOGRAM: Standard histogram equalization
    - CLAHE: Contrast Limited Adaptive Histogram Equalization
      (recommended for most cases)
    - DOCUMENT: Background normalization + CLAHE (recommended for scanned
      documents). Removes uneven illumination before enhancement. Best for
      scans with yellowed paper, shadow, or scanner lighting issues.
    """

    NONE = "none"
    HISTOGRAM = "histogram"
    CLAHE = "clahe"
    DOCUMENT = "document"


# ===== Preprocessing Schemas =====

class PreprocessingConfig(BaseModel):
    """Image preprocessing settings for layout detection enhancement.

    Configures the preprocessing applied before PP-Structure layout
    detection. Preprocessing helps detect tables with faint lines or low
    contrast borders. The original image is preserved for element extraction.
    """

    contrast: PreprocessingContrastEnum = Field(
        default=PreprocessingContrastEnum.CLAHE,
        description="Contrast enhancement method",
    )
    contrast_strength: float = Field(
        default=1.0,
        ge=0.5,
        le=3.0,
        description="Contrast enhancement strength (0.5=subtle, 1.0=normal, 2.0=strong, 3.0=maximum)",
    )
    sharpen: bool = Field(
        default=True,
        description="Enable sharpening for faint lines",
    )
    sharpen_strength: float = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
        description="Sharpening strength (0.5=subtle, 1.0=normal, 1.5=strong, 2.0=maximum)",
    )
    binarize: bool = Field(
        default=False,
        description="Enable binarization (aggressive, for very low contrast). Not recommended for most documents.",
    )
    remove_scan_artifacts: bool = Field(
        default=True,
        description="Remove horizontal scan line artifacts. Recommended for scanned documents to prevent misdetection of scanner light bar lines as table borders.",
    )


class ImageQualityMetrics(BaseModel):
    """Quality measurements produced by the automatic image analysis."""

    contrast: float = Field(..., description="Contrast level (std dev of grayscale)")
    edge_strength: float = Field(..., description="Edge strength (Sobel gradient mean)")


class PreprocessingPreviewRequest(BaseModel):
    """Request body for a preprocessing preview."""

    page: int = Field(default=1, ge=1, description="Page number to preview")
    mode: PreprocessingModeEnum = Field(
        default=PreprocessingModeEnum.AUTO,
        description="Preprocessing mode",
    )
    config: Optional[PreprocessingConfig] = Field(
        default=None,
        description="Manual configuration (only used when mode='manual')",
    )


class PreprocessingPreviewResponse(BaseModel):
    """Response body for a preprocessing preview."""

    original_url: str = Field(..., description="URL to original image")
    preprocessed_url: str = Field(..., description="URL to preprocessed image")
    quality_metrics: ImageQualityMetrics = Field(..., description="Image quality analysis")
    auto_config: PreprocessingConfig = Field(..., description="Auto-detected configuration")
    mode_used: PreprocessingModeEnum = Field(..., description="Mode that was applied")


# ===== Task Schemas =====

class TaskCreate(BaseModel):
    """Payload for creating a task."""

    filename: Optional[str] = Field(default=None, description="Original filename")
    file_type: Optional[str] = Field(default=None, description="File MIME type")


class TaskUpdate(BaseModel):
    """Payload for updating a task; every field is optional."""

    status: Optional[TaskStatusEnum] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None
    result_json_path: Optional[str] = None
    result_markdown_path: Optional[str] = None
    result_pdf_path: Optional[str] = None


class TaskFileResponse(UTCDatetimeBaseModel):
    """Response schema describing a file attached to a task."""

    id: int
    original_name: Optional[str] = None
    stored_path: Optional[str] = None
    file_size: Optional[int] = None
    mime_type: Optional[str] = None
    file_hash: Optional[str] = None
    created_at: datetime


class TaskResponse(UTCDatetimeBaseModel):
    """Response schema for a single task."""

    id: int
    user_id: int
    task_id: str
    filename: Optional[str] = None
    file_type: Optional[str] = None
    status: TaskStatusEnum
    result_json_path: Optional[str] = None
    result_markdown_path: Optional[str] = None
    result_pdf_path: Optional[str] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None
    created_at: datetime
    updated_at: datetime
    completed_at: Optional[datetime] = None
    file_deleted: bool = False


class TaskDetailResponse(TaskResponse):
    """Task response extended with its files and track details."""

    # A literal list default is kept on purpose; pydantic copies mutable
    # field defaults for each instance rather than sharing one list.
    files: List[TaskFileResponse] = []
    # Dual-track processing field (extracted from result metadata)
    processing_track: Optional[ProcessingTrackEnum] = None
    # Visualization availability (OCR track only)
    has_visualization: bool = False


class TaskListResponse(BaseModel):
    """One page of tasks together with pagination details."""

    tasks: List[TaskResponse]
    total: int
    page: int
    page_size: int
    has_more: bool


class TaskStatsResponse(BaseModel):
    """Per-status task counts for a user."""

    total: int
    pending: int
    processing: int
    completed: int
    failed: int


class TaskHistoryQuery(BaseModel):
    """Filter, paging and ordering parameters for a task history query."""

    status: Optional[TaskStatusEnum] = None
    filename: Optional[str] = None
    date_from: Optional[datetime] = None
    date_to: Optional[datetime] = None
    page: int = Field(default=1, ge=1)
    page_size: int = Field(default=50, ge=1, le=100)
    order_by: str = Field(default="created_at")
    order_desc: bool = Field(default=True)


# ===== Upload Schemas =====

class UploadFileInfo(BaseModel):
    """Basic information about an uploaded file."""

    filename: str
    file_size: int
    file_type: str


class UploadResponse(BaseModel):
    """Response returned after a file upload."""

    task_id: str = Field(..., description="Created task ID")
    filename: str = Field(..., description="Original filename")
    file_size: int = Field(..., description="File size in bytes")
    file_type: str = Field(..., description="File MIME type")
    status: TaskStatusEnum = Field(..., description="Initial task status")


# ===== Dual-Track Processing Schemas =====

class PPStructureV3Params(BaseModel):
    """Fine-tuning parameters for PP-StructureV3 on the OCR track.

    DEPRECATED: This class is deprecated and will be removed in a future
    version. Use the `layout_model` parameter in ProcessingOptions instead.
    """

    layout_detection_threshold: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Layout block detection score threshold (lower=more blocks, higher=high confidence only)",
    )
    layout_nms_threshold: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Layout NMS IoU threshold (lower=aggressive overlap removal, higher=allow more overlap)",
    )
    layout_merge_bboxes_mode: Optional[str] = Field(
        default=None,
        pattern=r"^(union|large|small)$",
        description="Bbox merging strategy: 'small'=conservative, 'large'=aggressive, 'union'=middle",
    )
    layout_unclip_ratio: Optional[float] = Field(
        default=None,
        gt=0,
        description="Layout bbox expansion ratio (larger=looser boxes, smaller=tighter boxes)",
    )
    text_det_thresh: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Text detection score threshold (lower=detect more small/low-contrast text, higher=cleaner)",
    )
    text_det_box_thresh: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Text box candidate threshold (lower=more text boxes, higher=fewer false positives)",
    )
    text_det_unclip_ratio: Optional[float] = Field(
        default=None,
        gt=0,
        description="Text box expansion ratio (larger=looser boxes, smaller=tighter boxes)",
    )


class ProcessingOptions(BaseModel):
    """Options controlling dual-track OCR processing."""

    use_dual_track: bool = Field(default=True, description="Enable dual-track processing")
    force_track: Optional[ProcessingTrackEnum] = Field(
        default=None,
        description="Force specific track (ocr/direct)",
    )
    language: str = Field(default="ch", description="OCR language code")
    include_layout: bool = Field(default=True, description="Include layout analysis")
    include_images: bool = Field(default=True, description="Extract and save images")
    confidence_threshold: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="OCR confidence threshold",
    )

    # Layout model selection (OCR track only)
    layout_model: Optional[LayoutModelEnum] = Field(
        default=LayoutModelEnum.CHINESE,
        description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)",
    )

    # Layout preprocessing (OCR track only)
    preprocessing_mode: PreprocessingModeEnum = Field(
        default=PreprocessingModeEnum.AUTO,
        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'",
    )
    preprocessing_config: Optional[PreprocessingConfig] = Field(
        default=None,
        description="Manual preprocessing config (only used when preprocessing_mode='manual')",
    )


class AnalyzeRequest(BaseModel):
    """Request body for document analysis."""

    use_dual_track: bool = Field(default=True, description="Enable dual-track processing")
    force_track: Optional[ProcessingTrackEnum] = Field(
        default=None,
        description="Force specific track",
    )
    language: str = Field(default="ch", description="OCR language")
    include_layout: bool = Field(default=True, description="Include layout analysis")


class DocumentAnalysisResponse(BaseModel):
    """Result of document type analysis, including the recommended track."""

    task_id: str
    filename: str
    recommended_track: ProcessingTrackEnum
    confidence: float = Field(..., ge=0, le=1, description="Detection confidence")
    reason: str = Field(..., description="Reason for recommendation")
    document_info: dict = Field(default_factory=dict, description="Document metadata")
    is_editable: bool = Field(..., description="Whether document has extractable text")
    text_coverage: Optional[float] = Field(
        default=None,
        description="Percentage of text coverage",
    )
    page_count: Optional[int] = Field(default=None, description="Number of pages")


class ProcessingMetadata(BaseModel):
    """Metadata about a processing run, included in responses."""

    processing_track: ProcessingTrackEnum
    processing_time_seconds: float
    language: str
    page_count: int
    total_elements: int
    total_text_regions: int
    total_tables: int
    total_images: int
    average_confidence: Optional[float] = None
    unified_format: bool = True


class TaskResponseWithMetadata(TaskResponse):
    """Task response extended with optional processing metadata."""

    processing_track: Optional[ProcessingTrackEnum] = None
    processing_metadata: Optional[ProcessingMetadata] = None


class ExportOptions(BaseModel):
    """Options controlling the export format and its contents."""

    format: str = Field(
        default="json",
        description="Export format: json, markdown, pdf, unified",
    )
    include_metadata: bool = Field(default=True, description="Include processing metadata")
    include_statistics: bool = Field(default=True, description="Include document statistics")
    legacy_format: bool = Field(
        default=False,
        description="Use legacy JSON format for compatibility",
    )