Files
OCR/backend/app/core/config.py
egg ea0dd7456c feat: implement layout preprocessing backend
Backend implementation for add-layout-preprocessing proposal:
- Add LayoutPreprocessingService with CLAHE, sharpen, binarize
- Add auto-detection: analyze_image_quality() for contrast/edge metrics
- Integrate preprocessing into OCR pipeline (analyze_layout)
- Add Preview API: POST /api/v2/tasks/{id}/preview/preprocessing
- Add config options: layout_preprocessing_mode, thresholds
- Add schemas: PreprocessingConfig, PreprocessingPreviewResponse

Preprocessing only affects layout detection input.
Original images preserved for element extraction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 15:17:20 +08:00

317 lines
15 KiB
Python

"""
Tool_OCR - Configuration Management
Loads environment variables and provides centralized configuration
"""
from pathlib import Path
from typing import List, Optional
from urllib.parse import quote

from pydantic import Field
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Each field below may be overridden by an environment variable with the
    same name (matching is case-insensitive) or by an entry in the ``.env``
    file configured in the nested ``Config`` class. Values derived from
    several fields are exposed as read-only properties.
    """

    # ===== Database Configuration =====
    mysql_host: str = Field(default="mysql.theaken.com")
    mysql_port: int = Field(default=33306)
    mysql_user: str = Field(default="A060")
    mysql_password: str = Field(default="")
    mysql_database: str = Field(default="db_A060")

    @property
    def database_url(self) -> str:
        """Construct the SQLAlchemy database URL.

        The user name and password are percent-encoded so that reserved
        characters such as ``@``, ``:``, ``/`` or ``#`` in a credential cannot
        corrupt the URL structure. ``safe=""`` also encodes ``/`` and emits
        ``%20`` / ``%2B`` for space and ``+``, so the value round-trips
        whether the consumer decodes with ``unquote`` or ``unquote_plus``.
        """
        user = quote(self.mysql_user, safe="")
        password = quote(self.mysql_password, safe="")
        return (
            f"mysql+pymysql://{user}:{password}"
            f"@{self.mysql_host}:{self.mysql_port}/{self.mysql_database}"
        )

    # ===== Application Configuration =====
    backend_port: int = Field(default=12010)
    frontend_port: int = Field(default=12011)
    # NOTE(review): placeholder default; presumably meant to be overridden
    # through the environment in any real deployment - confirm.
    secret_key: str = Field(default="your-secret-key-change-this")
    algorithm: str = Field(default="HS256")
    access_token_expire_minutes: int = Field(default=1440)  # 24 hours

    # ===== External Authentication Configuration =====
    external_auth_api_url: str = Field(default="https://pj-auth-api.vercel.app")
    external_auth_endpoint: str = Field(default="/api/auth/login")
    external_auth_timeout: int = Field(default=30)
    token_refresh_buffer: int = Field(default=300)  # Refresh tokens 5 minutes before expiry

    @property
    def external_auth_full_url(self) -> str:
        """Construct the full external authentication URL.

        Exactly one ``/`` is placed between the base URL and the endpoint,
        regardless of whether either side was configured with one. An empty
        endpoint yields the base URL without a trailing slash.
        """
        base = self.external_auth_api_url.rstrip("/")
        endpoint = self.external_auth_endpoint.lstrip("/")
        return f"{base}/{endpoint}" if endpoint else base

    # ===== Task Management Configuration =====
    database_table_prefix: str = Field(default="tool_ocr_")
    enable_task_history: bool = Field(default=True)
    task_retention_days: int = Field(default=30)
    max_tasks_per_user: int = Field(default=1000)

    # ===== OCR Configuration =====
    paddleocr_model_dir: str = Field(default="./models/paddleocr")
    ocr_languages: str = Field(default="ch,en,japan,korean")
    ocr_confidence_threshold: float = Field(default=0.5)
    max_ocr_workers: int = Field(default=4)

    @property
    def ocr_languages_list(self) -> List[str]:
        """Get OCR languages as a list, ignoring blank entries."""
        # Blank items (e.g. from a trailing comma) are dropped so callers
        # never receive an empty language code.
        return [lang.strip() for lang in self.ocr_languages.split(",") if lang.strip()]

    # ===== GPU Acceleration Configuration =====
    # Basic GPU settings
    force_cpu_mode: bool = Field(default=False)
    gpu_memory_fraction: float = Field(default=0.7)  # Optimized for RTX 4060 8GB
    gpu_device_id: int = Field(default=0)

    # Memory management for RTX 4060 8GB
    gpu_memory_limit_mb: int = Field(default=6144)  # 6GB max for models (leave 2GB buffer)
    gpu_memory_reserve_mb: int = Field(default=512)  # Reserve for CUDA overhead
    enable_memory_optimization: bool = Field(default=True)

    # Model loading and caching
    enable_lazy_model_loading: bool = Field(default=True)  # Load models on demand
    enable_model_cache: bool = Field(default=True)
    model_cache_limit_mb: int = Field(default=4096)  # Max 4GB for cached models
    auto_unload_unused_models: bool = Field(default=True)  # Unload unused language models
    model_idle_timeout_seconds: int = Field(default=300)  # Unload after 5 min idle

    # Batch processing configuration
    enable_batch_processing: bool = Field(default=True)
    inference_batch_size: int = Field(default=1)  # Conservative for 8GB VRAM
    max_concurrent_pages: int = Field(default=2)  # Process 2 pages concurrently

    # PP-StructureV3 optimization
    enable_chart_recognition: bool = Field(default=True)  # Chart/diagram recognition
    enable_formula_recognition: bool = Field(default=True)  # Math formula recognition
    enable_table_recognition: bool = Field(default=True)  # Table structure recognition
    enable_seal_recognition: bool = Field(default=True)  # Seal/stamp recognition
    enable_text_recognition: bool = Field(default=True)  # General text recognition

    # PP-StructureV3 Preprocessing (Stage 1)
    use_doc_orientation_classify: bool = Field(default=True)  # Auto-detect and correct document rotation
    use_doc_unwarping: bool = Field(default=True)  # Correct document warping from photos
    use_textline_orientation: bool = Field(default=True)  # Detect textline orientation
    layout_detection_threshold: float = Field(default=0.2)  # Lower threshold for more sensitive detection
    layout_nms_threshold: float = Field(default=0.2)  # Lower NMS to preserve more individual elements
    layout_merge_mode: str = Field(default="small")  # Use 'small' to minimize bbox merging
    layout_unclip_ratio: float = Field(default=1.2)  # Smaller unclip to preserve element boundaries
    text_det_thresh: float = Field(default=0.2)  # More sensitive text detection
    text_det_box_thresh: float = Field(default=0.3)  # Lower box threshold for better detection
    text_det_unclip_ratio: float = Field(default=1.2)  # Smaller unclip for tighter text boxes

    # Layout Detection Model Configuration (Stage 3)
    # Available models:
    # - None (default): Use PP-StructureV3's built-in model (PubLayNet-based)
    # - "PP-DocLayout_plus-L": Best for Chinese docs (83.2% mAP, 20 categories) - complex layouts
    # - "PP-DocLayout-L": High accuracy (90.4% mAP, 23 categories) - general purpose
    # - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout
    layout_detection_model_name: Optional[str] = Field(
        default="PP-DocLayout_plus-L",
        description="Layout detection model name. PP-DocLayout_plus-L recommended for complex Chinese documents."
    )
    layout_detection_model_dir: Optional[str] = Field(
        default=None,
        description="Custom layout detection model directory. If None, downloads official model."
    )

    # Table Structure Recognition Model Configuration (Stage 4)
    # PP-StructureV3 uses separate models for wired (bordered) and wireless (borderless) tables
    # Both models should be configured for comprehensive table detection
    # Available models:
    # - "SLANeXt_wired": Best for wired/bordered tables (69.65% accuracy, 351MB)
    # - "SLANeXt_wireless": Best for wireless/borderless tables (69.65% accuracy, 351MB)
    # - "SLANet": Legacy model (59.52% accuracy, 6.9MB)
    # - "SLANet_plus": Improved legacy (63.69% accuracy, 6.9MB)
    wired_table_model_name: Optional[str] = Field(
        default="SLANeXt_wired",
        description="Table structure model for bordered tables. SLANeXt_wired recommended."
    )
    wireless_table_model_name: Optional[str] = Field(
        default="SLANeXt_wireless",
        description="Table structure model for borderless tables. SLANeXt_wireless recommended."
    )

    # Formula Recognition Model Configuration (Stage 4)
    # Available models:
    # - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
    # - "PP-FormulaNet-L": Good for English formulas (90.36% English BLEU)
    # - "PP-FormulaNet-S": Fast inference (87% English BLEU)
    formula_recognition_model_name: Optional[str] = Field(
        default="PP-FormulaNet_plus-L",
        description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
    )

    # ===== Layout Preprocessing Configuration =====
    # Image preprocessing to enhance layout detection for documents with faint lines/borders
    # Preprocessing only affects layout detection input; original image is preserved for extraction
    layout_preprocessing_mode: str = Field(
        default="auto",
        description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'"
    )
    layout_preprocessing_contrast: str = Field(
        default="clahe",
        description="Contrast enhancement method: 'none', 'histogram', 'clahe' (recommended)"
    )
    layout_preprocessing_sharpen: bool = Field(
        default=True,
        description="Enable sharpening to enhance faint lines and borders"
    )
    layout_preprocessing_binarize: bool = Field(
        default=False,
        description="Enable binarization (aggressive, use for very low contrast documents only)"
    )

    # Auto-detection thresholds
    layout_preprocessing_contrast_threshold: float = Field(
        default=40.0,
        description="Contrast (std dev) below this triggers CLAHE in auto mode"
    )
    layout_preprocessing_edge_threshold: float = Field(
        default=15.0,
        description="Edge strength below this triggers sharpening in auto mode"
    )
    layout_preprocessing_binarize_threshold: float = Field(
        default=20.0,
        description="Contrast below this triggers binarization in auto mode"
    )

    # ===== Gap Filling Configuration =====
    # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
    gap_filling_enabled: bool = Field(default=True)  # Enable gap filling for OCR track
    gap_filling_coverage_threshold: float = Field(default=0.7)  # Activate when coverage < 70%
    gap_filling_iou_threshold: float = Field(default=0.15)  # IoU threshold for coverage detection
    gap_filling_confidence_threshold: float = Field(default=0.3)  # Min confidence for raw OCR regions
    gap_filling_dedup_iou_threshold: float = Field(default=0.5)  # IoU threshold for deduplication

    # ===== Debug Configuration =====
    # Enable debug outputs for PP-StructureV3 analysis
    pp_structure_debug_enabled: bool = Field(default=True)  # Save debug files for PP-StructureV3
    pp_structure_debug_visualization: bool = Field(default=True)  # Generate visualization images

    # Performance tuning
    use_fp16_inference: bool = Field(default=False)  # Half-precision (if supported)
    enable_cudnn_benchmark: bool = Field(default=True)  # Optimize convolution algorithms
    num_threads: int = Field(default=4)  # CPU threads for preprocessing

    # ===== Enhanced Memory Management Configuration =====
    # Memory thresholds (as ratio of total GPU memory)
    memory_warning_threshold: float = Field(default=0.80)  # 80% - start warning
    memory_critical_threshold: float = Field(default=0.95)  # 95% - throttle operations
    memory_emergency_threshold: float = Field(default=0.98)  # 98% - emergency cleanup

    # Memory monitoring
    memory_check_interval_seconds: int = Field(default=30)  # Background check interval
    enable_memory_alerts: bool = Field(default=True)  # Enable memory alerts

    # Model lifecycle management
    enable_model_lifecycle_management: bool = Field(default=True)  # Use ModelManager
    pp_structure_idle_timeout_seconds: int = Field(default=300)  # Unload PP-Structure after idle
    structure_model_memory_mb: int = Field(default=2000)  # Estimated memory for PP-StructureV3
    ocr_model_memory_mb: int = Field(default=500)  # Estimated memory per OCR language model

    # Service pool configuration
    enable_service_pool: bool = Field(default=True)  # Use OCRServicePool
    max_services_per_device: int = Field(default=1)  # Max OCRService per GPU
    max_total_services: int = Field(default=2)  # Max total OCRService instances
    service_acquire_timeout_seconds: float = Field(default=300.0)  # Timeout for acquiring service
    max_queue_size: int = Field(default=50)  # Max pending tasks per device

    # Concurrency control
    max_concurrent_predictions: int = Field(default=2)  # Max concurrent PP-StructureV3 predictions
    enable_cpu_fallback: bool = Field(default=True)  # Fall back to CPU when GPU memory low

    # Emergency recovery
    enable_emergency_cleanup: bool = Field(default=True)  # Auto-cleanup on memory pressure
    enable_worker_restart: bool = Field(default=False)  # Restart workers on OOM (requires supervisor)

    # ===== File Upload Configuration =====
    max_upload_size: int = Field(default=52428800)  # 50MB
    allowed_extensions: str = Field(default="png,jpg,jpeg,pdf,bmp,tiff,doc,docx,ppt,pptx")
    upload_dir: str = Field(default="./uploads")
    temp_dir: str = Field(default="./uploads/temp")
    processed_dir: str = Field(default="./uploads/processed")
    images_dir: str = Field(default="./uploads/images")

    @property
    def allowed_extensions_list(self) -> List[str]:
        """Get allowed extensions as a list, ignoring blank entries."""
        # A blank entry (e.g. from a trailing comma) would otherwise appear
        # as an empty-string extension in the result.
        return [ext.strip() for ext in self.allowed_extensions.split(",") if ext.strip()]

    # ===== Export Configuration =====
    storage_dir: str = Field(default="./storage")
    markdown_dir: str = Field(default="./storage/markdown")
    json_dir: str = Field(default="./storage/json")
    exports_dir: str = Field(default="./storage/exports")
    result_dir: str = Field(default="./storage/results")

    # ===== PDF Generation Configuration =====
    pandoc_path: str = Field(default="/opt/homebrew/bin/pandoc")
    font_dir: str = Field(default="/System/Library/Fonts")
    pdf_page_size: str = Field(default="A4")
    pdf_margin_top: int = Field(default=20)
    pdf_margin_bottom: int = Field(default=20)
    pdf_margin_left: int = Field(default=20)
    pdf_margin_right: int = Field(default=20)

    # ===== Layout-Preserving PDF Configuration =====
    chinese_font_path: str = Field(default="./backend/fonts/NotoSansSC-Regular.ttf")
    pdf_font_size_base: int = Field(default=12)
    pdf_enable_bbox_debug: bool = Field(default=False)  # Draw bounding boxes for debugging

    # ===== Translation Configuration (Reserved) =====
    enable_translation: bool = Field(default=False)
    translation_engine: str = Field(default="offline")
    argostranslate_models_dir: str = Field(default="./models/argostranslate")

    # ===== Background Tasks Configuration =====
    task_queue_type: str = Field(default="memory")
    redis_url: str = Field(default="redis://localhost:6379/0")

    # ===== CORS Configuration =====
    cors_origins: str = Field(default="http://localhost:12011,http://127.0.0.1:12011")

    @property
    def cors_origins_list(self) -> List[str]:
        """Get CORS origins as a list, ignoring blank entries."""
        return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]

    # ===== Logging Configuration =====
    log_level: str = Field(default="INFO")
    log_file: str = Field(default="./logs/app.log")

    class Config:
        # Look for .env in project root (one level up from backend/):
        # config.py -> core/ -> app/ -> backend/ -> project root.
        env_file = str(Path(__file__).resolve().parent.parent.parent.parent / ".env")
        env_file_encoding = "utf-8"
        case_sensitive = False

    def ensure_directories(self) -> None:
        """Create all necessary directories if they don't exist.

        Covers the upload, storage and model directories plus the parent
        directory of the log file. The offline translation model directory
        is included only when translation is enabled with the "offline"
        engine. Missing parents are created and existing directories are
        left untouched.
        """
        dirs = [
            self.upload_dir,
            self.temp_dir,
            self.processed_dir,
            self.images_dir,
            self.storage_dir,
            self.markdown_dir,
            self.json_dir,
            self.exports_dir,
            self.result_dir,
            self.paddleocr_model_dir,
            Path(self.log_file).parent,
        ]
        if self.enable_translation and self.translation_engine == "offline":
            dirs.append(self.argostranslate_models_dir)
        for dir_path in dirs:
            Path(dir_path).mkdir(parents=True, exist_ok=True)
# Global settings instance: built once when this module is imported, reading
# values from the environment and the .env file configured in Settings.Config.
settings = Settings()