Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
524 lines
19 KiB
Python
524 lines
19 KiB
Python
"""
|
|
Tool_OCR - Task Management Schemas
|
|
"""
|
|
|
|
from datetime import datetime
from enum import Enum
from typing import Optional, List

from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
|
class TaskStatusEnum(str, Enum):
    """Lifecycle states of a task.

    Members subclass ``str``, so each one compares equal to its
    lowercase string value.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
|
|
class ProcessingTrackEnum(str, Enum):
    """Available tracks of the dual-track processing pipeline."""

    # PaddleOCR PP-StructureV3, for scanned documents
    OCR = "ocr"
    # PyMuPDF, for editable PDFs
    DIRECT = "direct"
    # Mixed processing
    HYBRID = "hybrid"
    # Auto-detect the best track
    AUTO = "auto"
|
|
|
|
|
|
class LayoutModelEnum(str, Enum):
    """Choice of layout detection model used by the OCR track.

    Each model targets a different kind of document:

    - CHINESE: PP-DocLayout_plus-L (83.2% mAP), tuned for complex
      Chinese documents
    - DEFAULT: PubLayNet-based (~94% mAP), tuned for English academic
      papers
    - CDLA: CDLA model (~86% mAP), specialized Chinese document layout
      analysis
    """

    # PP-DocLayout_plus-L - best for Chinese documents (recommended)
    CHINESE = "chinese"
    # PubLayNet-based - best for English documents
    DEFAULT = "default"
    # CDLA model - alternative for Chinese layout
    CDLA = "cdla"
|
|
|
|
|
|
class PreprocessingModeEnum(str, Enum):
    """How image preprocessing for layout detection is selected.

    - AUTO: analyze image quality and apply the optimal preprocessing
      automatically
    - MANUAL: apply the user-specified preprocessing configuration
    - DISABLED: skip preprocessing entirely
    """

    # Analyze and apply automatically (default)
    AUTO = "auto"
    # Use the specified configuration
    MANUAL = "manual"
    # Skip preprocessing
    DISABLED = "disabled"
|
|
|
|
|
|
class PreprocessingContrastEnum(str, Enum):
    """Contrast enhancement methods available during preprocessing.

    - NONE: no contrast enhancement
    - HISTOGRAM: standard histogram equalization
    - CLAHE: Contrast Limited Adaptive Histogram Equalization
      (recommended for most cases)
    - DOCUMENT: background normalization followed by CLAHE
      (recommended for scanned documents). Uneven illumination is
      removed before enhancement, which suits scans with yellowed
      paper, shadow, or scanner lighting issues.
    """

    NONE = "none"
    HISTOGRAM = "histogram"
    CLAHE = "clahe"
    DOCUMENT = "document"
|
|
|
|
|
|
class OCRPresetEnum(str, Enum):
    """Named OCR presets, one per common document type.

    Every preset stands for an optimized PP-Structure configuration:

    - TEXT_HEAVY: reports, articles, manuals (table recognition disabled)
    - DATASHEET: technical datasheets, TDS (conservative table parsing)
    - TABLE_HEAVY: financial reports, spreadsheets (full table recognition)
    - FORM: applications, surveys (conservative table parsing)
    - MIXED: general documents (classification only)
    - CUSTOM: user-defined settings (supplied through ocr_config)
    """

    # Reports, articles, manuals
    TEXT_HEAVY = "text_heavy"
    # Technical datasheets, TDS
    DATASHEET = "datasheet"
    # Financial reports, spreadsheets
    TABLE_HEAVY = "table_heavy"
    # Applications, surveys
    FORM = "form"
    # General documents
    MIXED = "mixed"
    # User-defined settings
    CUSTOM = "custom"
|
|
|
|
|
|
class TableParsingModeEnum(str, Enum):
    """Levels of aggressiveness for table parsing.

    - FULL: full table recognition with cell segmentation (aggressive)
    - CONSERVATIVE: wireless tables disabled to prevent cell explosion
    - CLASSIFICATION_ONLY: table regions are only classified, with no
      cell segmentation
    - DISABLED: table recognition completely disabled
    """

    FULL = "full"
    CONSERVATIVE = "conservative"
    CLASSIFICATION_ONLY = "classification_only"
    DISABLED = "disabled"
|
|
|
|
|
|
class OCRConfig(BaseModel):
    """Fine-grained PP-Structure settings for OCR processing.

    Intended for use together with ocr_preset=CUSTOM, or to override
    individual values of another preset.
    """

    # --- Table processing ---
    table_parsing_mode: TableParsingModeEnum = Field(
        TableParsingModeEnum.CONSERVATIVE,
        description="Table parsing mode: full, conservative, classification_only, disabled",
    )
    enable_wired_table: bool = Field(
        True,
        description="Enable wired (bordered) table detection",
    )
    enable_wireless_table: bool = Field(
        False,
        description="Enable wireless (borderless) table detection. Can cause cell explosion.",
    )

    # --- Layout detection (both thresholds are limited to [0.0, 1.0]) ---
    layout_threshold: Optional[float] = Field(
        None,
        ge=0.0,
        le=1.0,
        description="Layout detection threshold. Higher = stricter. None uses default.",
    )
    layout_nms_threshold: Optional[float] = Field(
        None,
        ge=0.0,
        le=1.0,
        description="Layout NMS threshold. None uses default.",
    )

    # --- Preprocessing ---
    use_doc_orientation_classify: bool = Field(
        True,
        description="Auto-detect and correct document rotation",
    )
    use_doc_unwarping: bool = Field(
        False,
        description="Correct document warping. Can cause distortion.",
    )
    use_textline_orientation: bool = Field(
        True,
        description="Detect textline orientation",
    )

    # --- Recognition modules ---
    enable_chart_recognition: bool = Field(
        True,
        description="Enable chart/diagram recognition",
    )
    enable_formula_recognition: bool = Field(
        True,
        description="Enable math formula recognition",
    )
    enable_seal_recognition: bool = Field(
        False,
        description="Enable seal/stamp recognition",
    )
    enable_region_detection: bool = Field(
        True,
        description="Enable region detection for better structure",
    )
|
|
|
|
|
|
# Lookup table: preset -> ready-made OCRConfig.
# OCRPresetEnum.CUSTOM has no entry on purpose; it uses the
# user-provided config directly.
OCR_PRESET_CONFIGS = {
    # Table, chart and formula handling are all switched off.
    OCRPresetEnum.TEXT_HEAVY: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.DISABLED,
        enable_wired_table=False,
        enable_wireless_table=False,
        enable_chart_recognition=False,
        enable_formula_recognition=False,
    ),
    # Wired tables only, conservative parsing.
    OCRPresetEnum.DATASHEET: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.CONSERVATIVE,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
    # Wired and wireless tables, full parsing.
    OCRPresetEnum.TABLE_HEAVY: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.FULL,
        enable_wired_table=True,
        enable_wireless_table=True,
    ),
    # Wired tables only, conservative parsing.
    OCRPresetEnum.FORM: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.CONSERVATIVE,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
    # Wired tables only, classification without cell segmentation.
    OCRPresetEnum.MIXED: OCRConfig(
        table_parsing_mode=TableParsingModeEnum.CLASSIFICATION_ONLY,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
}
|
|
|
|
|
|
class PreprocessingConfig(BaseModel):
    """Image preprocessing settings applied before layout detection.

    Configures the preprocessing that runs ahead of PP-Structure layout
    detection, which helps detect tables with faint lines or low
    contrast borders. The original image is preserved for element
    extraction.
    """

    contrast: PreprocessingContrastEnum = Field(
        PreprocessingContrastEnum.CLAHE,
        description="Contrast enhancement method",
    )
    # Limited to the range [0.5, 3.0].
    contrast_strength: float = Field(
        1.0,
        ge=0.5,
        le=3.0,
        description="Contrast enhancement strength (0.5=subtle, 1.0=normal, 2.0=strong, 3.0=maximum)",
    )
    sharpen: bool = Field(
        True,
        description="Enable sharpening for faint lines",
    )
    # Limited to the range [0.5, 2.0].
    sharpen_strength: float = Field(
        1.0,
        ge=0.5,
        le=2.0,
        description="Sharpening strength (0.5=subtle, 1.0=normal, 1.5=strong, 2.0=maximum)",
    )
    binarize: bool = Field(
        False,
        description="Enable binarization (aggressive, for very low contrast). Not recommended for most documents.",
    )
    remove_scan_artifacts: bool = Field(
        True,
        description="Remove horizontal scan line artifacts. Recommended for scanned documents to prevent misdetection of scanner light bar lines as table borders.",
    )
|
|
|
|
|
|
class TableDetectionConfig(BaseModel):
    """Switches for the PP-StructureV3 table detection modes.

    PP-StructureV3 uses specialized models for different table types:

    - Wired (bordered): tables with visible cell borders/grid lines
    - Wireless (borderless): tables without visible borders, relying
      on alignment
    - Region detection: detects table-like regions for better cell
      structure

    Several modes may be enabled at once for comprehensive detection.
    """

    enable_wired_table: bool = Field(
        True,
        description="Enable wired (bordered) table detection. Best for tables with visible grid lines.",
    )
    enable_wireless_table: bool = Field(
        True,
        description="Enable wireless (borderless) table detection. Best for tables without visible borders.",
    )
    enable_region_detection: bool = Field(
        True,
        description="Enable region detection for better table structure inference.",
    )
|
|
|
|
|
|
class ImageQualityMetrics(BaseModel):
    """Quality measurements produced by the automatic image analysis.

    Both fields are required.
    """

    contrast: float = Field(
        ...,
        description="Contrast level (std dev of grayscale)",
    )
    edge_strength: float = Field(
        ...,
        description="Edge strength (Sobel gradient mean)",
    )
|
|
|
|
|
|
class PreprocessingPreviewRequest(BaseModel):
    """Request body for a preprocessing preview."""

    # Must be at least 1.
    page: int = Field(1, ge=1, description="Page number to preview")
    mode: PreprocessingModeEnum = Field(
        PreprocessingModeEnum.AUTO,
        description="Preprocessing mode",
    )
    config: Optional[PreprocessingConfig] = Field(
        default=None,
        description="Manual configuration (only used when mode='manual')",
    )
|
|
|
|
|
|
class PreprocessingPreviewResponse(BaseModel):
    """Response body for a preprocessing preview.

    All fields are required.
    """

    original_url: str = Field(
        ...,
        description="URL to original image",
    )
    preprocessed_url: str = Field(
        ...,
        description="URL to preprocessed image",
    )
    quality_metrics: ImageQualityMetrics = Field(
        ...,
        description="Image quality analysis",
    )
    auto_config: PreprocessingConfig = Field(
        ...,
        description="Auto-detected configuration",
    )
    mode_used: PreprocessingModeEnum = Field(
        ...,
        description="Mode that was applied",
    )
|
|
|
|
|
|
class TaskCreate(BaseModel):
    """Request body for creating a task.

    Both fields are optional and default to ``None``.
    """

    filename: Optional[str] = Field(
        default=None,
        description="Original filename",
    )
    file_type: Optional[str] = Field(
        default=None,
        description="File MIME type",
    )
|
|
|
|
|
|
class TaskUpdate(BaseModel):
    """Request body for updating a task.

    Every field is optional and defaults to ``None``.
    """

    # Status and failure details
    status: Optional[TaskStatusEnum] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None

    # Paths of the generated result files
    result_json_path: Optional[str] = None
    result_markdown_path: Optional[str] = None
    result_pdf_path: Optional[str] = None
|
|
|
|
|
|
class TaskFileResponse(BaseModel):
    """Response schema describing one file attached to a task.

    ``id`` and ``created_at`` are required; every other field is
    optional and defaults to ``None``.
    """

    # Pydantic v2 configuration. ``from_attributes=True`` lets the model
    # be validated from objects by reading their attributes. It replaces
    # the nested ``class Config``, which is deprecated in Pydantic v2.
    model_config = ConfigDict(from_attributes=True)

    id: int
    original_name: Optional[str] = None
    stored_path: Optional[str] = None
    file_size: Optional[int] = None
    mime_type: Optional[str] = None
    file_hash: Optional[str] = None
    created_at: datetime
|
|
|
|
|
|
class TaskResponse(BaseModel):
    """Response schema for a task.

    ``id``, ``user_id``, ``task_id``, ``status``, ``created_at`` and
    ``updated_at`` are required. The remaining fields are optional and
    default to ``None``, except ``file_deleted`` which defaults to
    ``False``.
    """

    # Pydantic v2 configuration. ``from_attributes=True`` lets the model
    # be validated from objects by reading their attributes. It replaces
    # the nested ``class Config``, which is deprecated in Pydantic v2.
    # Subclasses inherit this configuration.
    model_config = ConfigDict(from_attributes=True)

    id: int
    user_id: int
    task_id: str
    filename: Optional[str] = None
    file_type: Optional[str] = None
    status: TaskStatusEnum
    result_json_path: Optional[str] = None
    result_markdown_path: Optional[str] = None
    result_pdf_path: Optional[str] = None
    error_message: Optional[str] = None
    processing_time_ms: Optional[int] = None
    created_at: datetime
    updated_at: datetime
    completed_at: Optional[datetime] = None
    file_deleted: bool = False
|
|
|
|
|
|
class TaskDetailResponse(TaskResponse):
    """Task response extended with the task's files."""

    # Files attached to the task; empty list when none are given.
    files: List[TaskFileResponse] = []

    # Dual-track processing field (extracted from result metadata).
    processing_track: Optional[ProcessingTrackEnum] = None
|
|
|
|
|
|
class TaskListResponse(BaseModel):
    """One page of a paginated task listing.

    All fields are required.
    """

    # Tasks contained in this page
    tasks: List[TaskResponse]

    # Pagination bookkeeping
    total: int
    page: int
    page_size: int
    has_more: bool
|
|
|
|
|
|
class TaskStatsResponse(BaseModel):
    """Task statistics for a user.

    All fields are required integers.
    """

    total: int

    # One counter per TaskStatusEnum value
    pending: int
    processing: int
    completed: int
    failed: int
|
|
|
|
|
|
class TaskHistoryQuery(BaseModel):
    """Query parameters for browsing task history."""

    # Filters; each one defaults to None.
    status: Optional[TaskStatusEnum] = None
    filename: Optional[str] = None
    date_from: Optional[datetime] = None
    date_to: Optional[datetime] = None

    # Pagination: page must be >= 1, page_size must be within 1..100.
    page: int = Field(1, ge=1)
    page_size: int = Field(50, ge=1, le=100)

    # Ordering
    order_by: str = Field("created_at")
    order_desc: bool = Field(True)
|
|
|
|
|
|
class UploadFileInfo(BaseModel):
    """Information about an uploaded file.

    All fields are required.
    """

    filename: str
    file_size: int
    file_type: str
|
|
|
|
|
|
class UploadResponse(BaseModel):
    """Response returned for a file upload.

    All fields are required.
    """

    task_id: str = Field(
        ...,
        description="Created task ID",
    )
    filename: str = Field(
        ...,
        description="Original filename",
    )
    file_size: int = Field(
        ...,
        description="File size in bytes",
    )
    file_type: str = Field(
        ...,
        description="File MIME type",
    )
    status: TaskStatusEnum = Field(
        ...,
        description="Initial task status",
    )
|
|
|
|
|
|
# ===== Dual-Track Processing Schemas =====
|
|
|
|
class PPStructureV3Params(BaseModel):
    """Fine-tuning parameters of PP-StructureV3 for the OCR track.

    DEPRECATED: This class is deprecated and will be removed in a future
    version. Use the `layout_model` parameter in ProcessingOptions instead.

    Every field is optional and defaults to ``None``.
    """

    # --- Layout parameters ---
    layout_detection_threshold: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Layout block detection score threshold (lower=more blocks, higher=high confidence only)",
    )
    layout_nms_threshold: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Layout NMS IoU threshold (lower=aggressive overlap removal, higher=allow more overlap)",
    )
    # Only "union", "large" or "small" match the pattern.
    layout_merge_bboxes_mode: Optional[str] = Field(
        default=None,
        pattern="^(union|large|small)$",
        description="Bbox merging strategy: 'small'=conservative, 'large'=aggressive, 'union'=middle",
    )
    layout_unclip_ratio: Optional[float] = Field(
        default=None,
        gt=0,
        description="Layout bbox expansion ratio (larger=looser boxes, smaller=tighter boxes)",
    )

    # --- Text detection parameters ---
    text_det_thresh: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Text detection score threshold (lower=detect more small/low-contrast text, higher=cleaner)",
    )
    text_det_box_thresh: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Text box candidate threshold (lower=more text boxes, higher=fewer false positives)",
    )
    text_det_unclip_ratio: Optional[float] = Field(
        default=None,
        gt=0,
        description="Text box expansion ratio (larger=looser boxes, smaller=tighter boxes)",
    )
|
|
|
|
|
|
class ProcessingOptions(BaseModel):
    """Options that control dual-track OCR processing."""

    # --- General options ---
    use_dual_track: bool = Field(
        True,
        description="Enable dual-track processing",
    )
    force_track: Optional[ProcessingTrackEnum] = Field(
        default=None,
        description="Force specific track (ocr/direct)",
    )
    language: str = Field(
        "ch",
        description="OCR language code",
    )
    include_layout: bool = Field(
        True,
        description="Include layout analysis",
    )
    include_images: bool = Field(
        True,
        description="Extract and save images",
    )
    confidence_threshold: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="OCR confidence threshold",
    )

    # --- Layout model selection (OCR track only) ---
    layout_model: Optional[LayoutModelEnum] = Field(
        LayoutModelEnum.CHINESE,
        description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)",
    )

    # --- Layout preprocessing (OCR track only) ---
    preprocessing_mode: PreprocessingModeEnum = Field(
        PreprocessingModeEnum.AUTO,
        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'",
    )
    preprocessing_config: Optional[PreprocessingConfig] = Field(
        default=None,
        description="Manual preprocessing config (only used when preprocessing_mode='manual')",
    )

    # --- Table detection configuration (OCR track only) ---
    table_detection: Optional[TableDetectionConfig] = Field(
        default=None,
        description="Table detection config. If None, all table detection modes are enabled.",
    )

    # --- OCR processing preset (OCR track only) ---
    # Pick a preset for an optimized configuration, or CUSTOM together
    # with ocr_config for fine-tuning.
    ocr_preset: Optional[OCRPresetEnum] = Field(
        OCRPresetEnum.DATASHEET,
        description="OCR processing preset: text_heavy, datasheet, table_heavy, form, mixed, custom",
    )
    ocr_config: Optional[OCRConfig] = Field(
        default=None,
        description="Custom OCR config. Used when ocr_preset=custom or to override preset values.",
    )
|
|
|
|
|
|
class AnalyzeRequest(BaseModel):
    """Request body for document analysis."""

    use_dual_track: bool = Field(
        True,
        description="Enable dual-track processing",
    )
    force_track: Optional[ProcessingTrackEnum] = Field(
        default=None,
        description="Force specific track",
    )
    language: str = Field(
        "ch",
        description="OCR language",
    )
    include_layout: bool = Field(
        True,
        description="Include layout analysis",
    )
|
|
|
|
|
|
class DocumentAnalysisResponse(BaseModel):
    """Response body of a document type analysis."""

    # Identification of the analyzed document
    task_id: str
    filename: str

    # Track recommendation
    recommended_track: ProcessingTrackEnum
    # Limited to the range [0, 1].
    confidence: float = Field(
        ...,
        ge=0,
        le=1,
        description="Detection confidence",
    )
    reason: str = Field(
        ...,
        description="Reason for recommendation",
    )

    # Document details; document_info falls back to a fresh empty dict.
    document_info: dict = Field(
        default_factory=dict,
        description="Document metadata",
    )
    is_editable: bool = Field(
        ...,
        description="Whether document has extractable text",
    )
    text_coverage: Optional[float] = Field(
        default=None,
        description="Percentage of text coverage",
    )
    page_count: Optional[int] = Field(
        default=None,
        description="Number of pages",
    )
|
|
|
|
|
|
class ProcessingMetadata(BaseModel):
    """Metadata about a processing run, included in responses."""

    # Run description
    processing_track: ProcessingTrackEnum
    processing_time_seconds: float
    language: str
    page_count: int

    # Element counters
    total_elements: int
    total_text_regions: int
    total_tables: int
    total_images: int

    # Optional; defaults to None.
    average_confidence: Optional[float] = None
    # Defaults to True.
    unified_format: bool = True
|
|
|
|
|
|
class TaskResponseWithMetadata(TaskResponse):
    """Task response extended with processing metadata.

    Both added fields are optional and default to ``None``.
    """

    processing_track: Optional[ProcessingTrackEnum] = None
    processing_metadata: Optional[ProcessingMetadata] = None
|
|
|
|
|
|
class ExportOptions(BaseModel):
    """Options for exporting results."""

    # Free-form string; this schema does not restrict it to the formats
    # listed in the description.
    format: str = Field(
        "json",
        description="Export format: json, markdown, pdf, unified",
    )
    include_metadata: bool = Field(
        True,
        description="Include processing metadata",
    )
    include_statistics: bool = Field(
        True,
        description="Include document statistics",
    )
    legacy_format: bool = Field(
        False,
        description="Use legacy JSON format for compatibility",
    )
|