""" Tool_OCR - Configuration Management Loads environment variables and provides centralized configuration """ from typing import List, Optional from pydantic_settings import BaseSettings from pydantic import Field, model_validator from pathlib import Path # Anchor all default paths to the backend directory to avoid scattering runtime folders BACKEND_ROOT = Path(__file__).resolve().parent.parent.parent PROJECT_ROOT = BACKEND_ROOT.parent class Settings(BaseSettings): """Application settings loaded from environment variables""" # ===== Database Configuration ===== mysql_host: str = Field(default="mysql.theaken.com") mysql_port: int = Field(default=33306) mysql_user: str = Field(default="A060") mysql_password: str = Field(default="") mysql_database: str = Field(default="db_A060") @property def database_url(self) -> str: """Construct SQLAlchemy database URL""" return ( f"mysql+pymysql://{self.mysql_user}:{self.mysql_password}" f"@{self.mysql_host}:{self.mysql_port}/{self.mysql_database}" ) # ===== Application Configuration ===== backend_port: int = Field(default=8000) frontend_port: int = Field(default=5173) secret_key: str = Field(default="your-secret-key-change-this") algorithm: str = Field(default="HS256") access_token_expire_minutes: int = Field(default=1440) # 24 hours # ===== External Authentication Configuration ===== external_auth_api_url: str = Field(default="https://pj-auth-api.vercel.app") external_auth_endpoint: str = Field(default="/api/auth/login") external_auth_timeout: int = Field(default=30) token_refresh_buffer: int = Field(default=300) # Refresh tokens 5 minutes before expiry @property def external_auth_full_url(self) -> str: """Construct full external authentication URL""" return f"{self.external_auth_api_url.rstrip('/')}{self.external_auth_endpoint}" # ===== Task Management Configuration ===== database_table_prefix: str = Field(default="tool_ocr_") enable_task_history: bool = Field(default=True) task_retention_days: int = Field(default=30) max_tasks_per_user: int = Field(default=1000) # ===== OCR Configuration ===== # Note: PaddleOCR models are stored in ~/.paddleocr/ and ~/.paddlex/ by default ocr_languages: str = Field(default="ch,en,japan,korean") ocr_confidence_threshold: float = Field(default=0.5) max_ocr_workers: int = Field(default=4) @property def ocr_languages_list(self) -> List[str]: """Get OCR languages as list""" return [lang.strip() for lang in self.ocr_languages.split(",")] # ===== GPU Acceleration Configuration ===== # Basic GPU settings force_cpu_mode: bool = Field(default=False) gpu_memory_fraction: float = Field(default=0.7) # Optimized for RTX 4060 8GB gpu_device_id: int = Field(default=0) # Memory management for RTX 4060 8GB gpu_memory_limit_mb: int = Field(default=6144) # 6GB max for models (leave 2GB buffer) gpu_memory_reserve_mb: int = Field(default=512) # Reserve for CUDA overhead enable_memory_optimization: bool = Field(default=True) # Model loading and caching enable_lazy_model_loading: bool = Field(default=True) # Load models on demand enable_model_cache: bool = Field(default=True) model_cache_limit_mb: int = Field(default=4096) # Max 4GB for cached models auto_unload_unused_models: bool = Field(default=True) # Unload unused language models model_idle_timeout_seconds: int = Field(default=300) # Unload after 5 min idle # Batch processing configuration enable_batch_processing: bool = Field(default=True) inference_batch_size: int = Field(default=1) # Conservative for 8GB VRAM max_concurrent_pages: int = Field(default=2) # Process 2 pages concurrently # PP-StructureV3 optimization # Strategy: Use raw OCR positioning (simple-text-positioning) instead of table structure reconstruction # - Layout Detection: ON (detect regions) # - General OCR: ON (text recognition) # - Table Recognition: OFF (no cell/structure parsing - use raw OCR bbox instead) # - Seal/Formula/Chart: ON (specialized recognition) enable_chart_recognition: bool = Field(default=True) # Chart/diagram recognition enable_formula_recognition: bool = Field(default=True) # Math formula recognition enable_table_recognition: bool = Field(default=False) # Table structure recognition - DISABLED (use raw OCR) enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition enable_region_detection: bool = Field(default=True) # Region detection for layout enable_text_recognition: bool = Field(default=True) # General text recognition # Table Parsing Mode - Controls how aggressively tables are parsed # This is the KEY setting to prevent "cell explosion" on datasheet-type documents # Options: # - "full": Full table recognition with cell segmentation (aggressive, may over-detect) # - "conservative": Conservative models + disable wireless tables + higher layout threshold # - "classification_only": Only classify table regions, no cell segmentation (recommended for datasheets) # - "disabled": Completely disable table recognition (safest for text-heavy documents) table_parsing_mode: str = Field( default="conservative", description="Table parsing mode: 'full', 'conservative', 'classification_only', 'disabled'" ) # Layout threshold for table detection (higher = stricter, less false positives) # WARNING: This affects ALL layout detection, not just tables. Use with caution. # Default None uses PaddleX default. Only set this if you understand the impact. table_layout_threshold: Optional[float] = Field( default=None, description="Layout threshold for ALL element detection. Higher values = fewer elements detected." ) # Table Column Alignment Correction (Header-Anchor Algorithm) # Corrects PP-Structure's column assignment errors using header row X-coordinates as reference table_column_correction_enabled: bool = Field( default=True, description="Enable header-anchor column correction for table cells" ) table_column_correction_threshold: float = Field( default=0.5, description="Minimum X-overlap ratio (0-1) to trigger column correction" ) # Vertical Text Fragment Merging # Detects and merges narrow vertical text blocks that were split by OCR vertical_fragment_merge_enabled: bool = Field( default=True, description="Enable vertical text fragment merging for Chinese vertical text" ) vertical_fragment_aspect_ratio: float = Field( default=0.3, description="Max width/height ratio to consider as vertical text (lower = narrower)" ) # Simple Text Positioning Mode (OCR Track) # When enabled, bypasses complex table structure reconstruction and renders # raw OCR text directly at detected positions with rotation correction. # This is more reliable for documents where PP-Structure fails to parse tables correctly. simple_text_positioning_enabled: bool = Field( default=True, description="Use simple text positioning instead of complex table reconstruction for OCR track" ) simple_text_positioning_debug: bool = Field( default=False, description="Enable debug logging for simple text positioning" ) # PP-StructureV3 Preprocessing (Stage 1) # NOTE: doc_orientation_classify ENABLED - detects and corrects document orientation # for scanned PDFs where content orientation differs from PDF page metadata. # When rotation is detected (90°/270°), page dimensions are swapped accordingly. use_doc_orientation_classify: bool = Field(default=True) # Enabled: auto-detect and correct page orientation use_doc_unwarping: bool = Field(default=False) # Disabled: can cause document distortion/skewing use_textline_orientation: bool = Field(default=True) # Detect textline orientation # Layout Detection Parameters (Stage 3) # NOTE: Testing showed that PaddleX defaults work better for table detection. # Previously we used aggressive low thresholds (0.2) which caused table detection failures. # Now using None to let PaddleX use its optimized defaults. layout_detection_threshold: Optional[float] = Field(default=None) # None = use PaddleX default layout_nms_threshold: Optional[float] = Field(default=None) # None = use PaddleX default # layout_merge_bboxes_mode options: # - "large": Keep larger box when overlap (default) # - "small": Keep smaller box when overlap # - "union": Keep all boxes (preserve overlapping tables/images) # Using "union" to prevent tables from being merged together layout_merge_mode: Optional[str] = Field( default="union", description="How to handle overlapping detection boxes. 'union' preserves all detected regions." ) layout_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default # Text Detection Parameters text_det_thresh: Optional[float] = Field(default=None) # None = use PaddleX default text_det_box_thresh: Optional[float] = Field(default=None) # None = use PaddleX default text_det_unclip_ratio: Optional[float] = Field(default=None) # None = use PaddleX default # Layout Detection Model Configuration (Stage 3) # Available models: # - None (default): Use PP-StructureV3's built-in model (PubLayNet-based) # - "PP-DocLayout_plus-L": Best for Chinese docs (83.2% mAP, 20 categories) - complex layouts # - "PP-DocLayout-L": High accuracy (90.4% mAP, 23 categories) - general purpose # - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout layout_detection_model_name: Optional[str] = Field( default="PP-DocLayout_plus-L", description="Layout detection model name. PP-DocLayout_plus-L recommended for complex Chinese documents." ) layout_detection_model_dir: Optional[str] = Field( default=None, description="Custom layout detection model directory. If None, downloads official model." ) # Table Structure Recognition Model Configuration (Stage 4) # PP-StructureV3 uses separate models for wired (bordered) and wireless (borderless) tables # Both models should be configured for comprehensive table detection # Available models: # - "SLANeXt_wired": Best for wired/bordered tables (69.65% accuracy, 351MB) # - "SLANeXt_wireless": Best for wireless/borderless tables (69.65% accuracy, 351MB) # - "SLANet": Legacy model (59.52% accuracy, 6.9MB) # - "SLANet_plus": Improved legacy (63.69% accuracy, 6.9MB) wired_table_model_name: Optional[str] = Field( default="SLANeXt_wired", description="Table structure model for bordered tables. SLANeXt_wired recommended." ) wireless_table_model_name: Optional[str] = Field( default="SLANeXt_wireless", description="Table structure model for borderless tables. SLANeXt_wireless recommended." ) # Table Classification Model - determines if table is wired or wireless table_classification_model_name: Optional[str] = Field( default="PP-LCNet_x1_0_table_cls", description="Model to classify table type (wired vs wireless). Enables automatic model selection." ) # Table Cell Detection Models - detect individual cells within tables # These are crucial for accurate cell boundary detection in complex tables wired_table_cells_detection_model_name: Optional[str] = Field( default="RT-DETR-L_wired_table_cell_det", description="Cell detection model for bordered tables. RT-DETR-L provides best accuracy." ) wireless_table_cells_detection_model_name: Optional[str] = Field( default="RT-DETR-L_wireless_table_cell_det", description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy." ) # Note: Table cell boxes are now extracted from table_res_list returned by PPStructureV3 # No additional model calls needed - PPStructureV3 provides cell_box_list in table_res_list # Formula Recognition Model Configuration (Stage 4) # Available models: # - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU) # - "PP-FormulaNet-L": Good for English formulas (90.36% English BLEU) # - "PP-FormulaNet-S": Fast inference (87% English BLEU) formula_recognition_model_name: Optional[str] = Field( default="PP-FormulaNet_plus-L", description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support." ) # Chart Recognition Model Configuration chart_recognition_model_name: Optional[str] = Field( default="PP-Chart2Table", description="Chart to table recognition model." ) # Text Detection and Recognition Model Configuration # PP-OCRv5_server provides best accuracy for document OCR text_detection_model_name: Optional[str] = Field( default="PP-OCRv5_server_det", description="Text detection model. PP-OCRv5_server_det recommended for documents." ) text_recognition_model_name: Optional[str] = Field( default="PP-OCRv5_server_rec", description="Text recognition model. PP-OCRv5_server_rec recommended for documents." ) # Document Preprocessing Model Configuration (Stage 1) doc_orientation_classify_model_name: Optional[str] = Field( default="PP-LCNet_x1_0_doc_ori", description="Document orientation classification model for auto-rotation." ) doc_unwarping_model_name: Optional[str] = Field( default="UVDoc", description="Document unwarping model for correcting perspective distortion." ) textline_orientation_model_name: Optional[str] = Field( default="PP-LCNet_x1_0_textline_ori", description="Textline orientation model for detecting text direction." ) # ===== Layout Preprocessing Configuration ===== # Image preprocessing to enhance layout detection for documents with faint lines/borders # Preprocessing only affects layout detection input; original image is preserved for extraction layout_preprocessing_mode: str = Field( default="auto", description="Preprocessing mode: 'auto' (analyze and apply), 'manual' (use config), 'disabled'" ) layout_preprocessing_contrast: str = Field( default="clahe", description="Contrast enhancement method: 'none', 'histogram', 'clahe' (recommended)" ) layout_preprocessing_sharpen: bool = Field( default=True, description="Enable sharpening to enhance faint lines and borders" ) layout_preprocessing_binarize: bool = Field( default=False, description="Enable binarization (aggressive, use for very low contrast documents only)" ) # Auto-detection thresholds layout_preprocessing_contrast_threshold: float = Field( default=40.0, description="Contrast (std dev) below this triggers CLAHE in auto mode" ) layout_preprocessing_edge_threshold: float = Field( default=15.0, description="Edge strength below this triggers sharpening in auto mode" ) layout_preprocessing_binarize_threshold: float = Field( default=20.0, description="Contrast below this triggers binarization in auto mode" ) # Layout image scaling for better table detection # Automatic bidirectional scaling for layout detection # PDF conversion now uses 150 DPI (~1240x1754 for A4), which falls within optimal range # Scaling acts as a safety net for: # - Very large images (>2000px): Downscale to target # - Very small images (<1200px): Upscale to target # - 150 DPI A4 (1240x1754): No scaling needed (already optimal) layout_image_scaling_enabled: bool = Field( default=True, description="Enable automatic bidirectional scaling for layout detection. " "Images outside optimal range are scaled to target dimension." ) layout_image_scaling_max_dimension: int = Field( default=2000, description="Max dimension (pixels) before downscaling. Images larger than this will be scaled down." ) layout_image_scaling_min_dimension: int = Field( default=1200, description="Min dimension (pixels) before upscaling. Images smaller than this will be scaled up." ) layout_image_scaling_target_dimension: int = Field( default=1600, description="Target dimension (pixels) for scaling. Optimal size for PP-Structure layout detection." ) # ===== Gap Filling Configuration ===== # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete # Uses IoA (Intersection over Area) instead of IoU for better coverage detection gap_filling_enabled: bool = Field(default=False) # Enable gap filling for OCR track gap_filling_coverage_threshold: float = Field(default=0.7) # Activate when coverage < 70% gap_filling_confidence_threshold: float = Field(default=0.3) # Min confidence for raw OCR regions # IoA (Intersection over Area) thresholds - different thresholds per element type # IoA = intersection_area / ocr_box_area (measures how much of OCR box is inside layout region) gap_filling_ioa_threshold_text: float = Field( default=0.6, description="IoA threshold for TEXT/TITLE elements. Tolerates boundary errors." ) gap_filling_ioa_threshold_table: float = Field( default=0.1, description="IoA threshold for TABLE elements. Strict to prevent duplicate table content." ) gap_filling_ioa_threshold_figure: float = Field( default=0.8, description="IoA threshold for FIGURE/IMAGE elements. Preserves text inside figures." ) gap_filling_dedup_ioa_threshold: float = Field( default=0.5, description="IoA threshold for deduplication against existing TEXT elements." ) gap_filling_shrink_pixels: int = Field( default=1, description="Shrink OCR bbox inward by this many pixels to reduce edge duplicates." ) # Use PP-StructureV3's internal OCR (overall_ocr_res) instead of separate Raw OCR gap_filling_use_overall_ocr: bool = Field( default=True, description="Use PP-StructureV3's internal OCR results instead of separate inference." ) # ===== Debug Configuration ===== # Enable debug outputs for PP-StructureV3 analysis pp_structure_debug_enabled: bool = Field(default=True) # Save debug files for PP-StructureV3 pp_structure_debug_visualization: bool = Field(default=True) # Generate visualization images # Performance tuning use_fp16_inference: bool = Field(default=False) # Half-precision (if supported) enable_cudnn_benchmark: bool = Field(default=True) # Optimize convolution algorithms num_threads: int = Field(default=4) # CPU threads for preprocessing # ===== Enhanced Memory Management Configuration ===== # Memory thresholds (as ratio of total GPU memory) memory_warning_threshold: float = Field(default=0.80) # 80% - start warning memory_critical_threshold: float = Field(default=0.95) # 95% - throttle operations memory_emergency_threshold: float = Field(default=0.98) # 98% - emergency cleanup # Memory monitoring memory_check_interval_seconds: int = Field(default=30) # Background check interval enable_memory_alerts: bool = Field(default=True) # Enable memory alerts # Model lifecycle management enable_model_lifecycle_management: bool = Field(default=True) # Use ModelManager pp_structure_idle_timeout_seconds: int = Field(default=300) # Unload PP-Structure after idle structure_model_memory_mb: int = Field(default=2000) # Estimated memory for PP-StructureV3 ocr_model_memory_mb: int = Field(default=500) # Estimated memory per OCR language model # Service pool configuration enable_service_pool: bool = Field(default=True) # Use OCRServicePool max_services_per_device: int = Field(default=1) # Max OCRService per GPU max_total_services: int = Field(default=2) # Max total OCRService instances service_acquire_timeout_seconds: float = Field(default=300.0) # Timeout for acquiring service max_queue_size: int = Field(default=50) # Max pending tasks per device # Concurrency control max_concurrent_predictions: int = Field(default=2) # Max concurrent PP-StructureV3 predictions enable_cpu_fallback: bool = Field(default=True) # Fall back to CPU when GPU memory low # Emergency recovery enable_emergency_cleanup: bool = Field(default=True) # Auto-cleanup on memory pressure enable_worker_restart: bool = Field(default=False) # Restart workers on OOM (requires supervisor) # ===== File Upload Configuration ===== max_upload_size: int = Field(default=52428800) # 50MB allowed_extensions: str = Field(default="png,jpg,jpeg,pdf,bmp,tiff,doc,docx,ppt,pptx") upload_dir: str = Field(default=str(BACKEND_ROOT / "uploads")) temp_dir: str = Field(default=str(BACKEND_ROOT / "uploads" / "temp")) processed_dir: str = Field(default=str(BACKEND_ROOT / "uploads" / "processed")) images_dir: str = Field(default=str(BACKEND_ROOT / "uploads" / "images")) @property def allowed_extensions_list(self) -> List[str]: """Get allowed extensions as list""" return [ext.strip() for ext in self.allowed_extensions.split(",")] # ===== Export Configuration ===== storage_dir: str = Field(default=str(BACKEND_ROOT / "storage")) markdown_dir: str = Field(default=str(BACKEND_ROOT / "storage" / "markdown")) json_dir: str = Field(default=str(BACKEND_ROOT / "storage" / "json")) exports_dir: str = Field(default=str(BACKEND_ROOT / "storage" / "exports")) result_dir: str = Field(default=str(BACKEND_ROOT / "storage" / "results")) # ===== PDF Generation Configuration ===== pandoc_path: str = Field(default="/opt/homebrew/bin/pandoc") font_dir: str = Field(default="/System/Library/Fonts") pdf_page_size: str = Field(default="A4") pdf_margin_top: int = Field(default=20) pdf_margin_bottom: int = Field(default=20) pdf_margin_left: int = Field(default=20) pdf_margin_right: int = Field(default=20) # ===== Layout-Preserving PDF Configuration ===== chinese_font_path: str = Field(default=str(BACKEND_ROOT / "fonts" / "NotoSansSC-Regular.ttf")) pdf_font_size_base: int = Field(default=12) pdf_enable_bbox_debug: bool = Field(default=False) # Draw bounding boxes for debugging # ===== Translation Configuration (DIFY API) ===== enable_translation: bool = Field(default=True) dify_base_url: str = Field(default="https://dify.theaken.com/v1") dify_api_key: str = Field(default="") # Required: set in .env.local dify_timeout: float = Field(default=120.0) # seconds dify_max_retries: int = Field(default=3) dify_max_batch_chars: int = Field(default=5000) # Max characters per batch dify_max_batch_items: int = Field(default=20) # Max items per batch # Translation cost calculation (USD per 1M tokens) - FALLBACK only # Dify API returns actual price (total_price), this is only used as fallback # when actual price is not available translation_cost_per_million_tokens: float = Field( default=3.0, description="Fallback cost per 1M tokens when Dify doesn't return actual price" ) # ===== Background Tasks Configuration ===== task_queue_type: str = Field(default="memory") redis_url: str = Field(default="redis://localhost:6379/0") # ===== CORS Configuration ===== cors_origins: str = Field(default="http://localhost:5173,http://127.0.0.1:5173") @property def cors_origins_list(self) -> List[str]: """Get CORS origins as list""" return [origin.strip() for origin in self.cors_origins.split(",")] # ===== Logging Configuration ===== log_level: str = Field(default="INFO") log_file: str = Field(default=str(BACKEND_ROOT / "logs" / "app.log")) @model_validator(mode="after") def _normalize_paths(self): """Resolve all runtime paths to backend-rooted absolutes""" path_fields = [ "upload_dir", "temp_dir", "processed_dir", "images_dir", "storage_dir", "markdown_dir", "json_dir", "exports_dir", "result_dir", "log_file", "chinese_font_path", ] for field in path_fields: value = getattr(self, field) if value: setattr(self, field, str(self._resolve_path(str(value)))) return self class Config: # Look for .env files in project root (one level up from backend/) # .env.local has higher priority and overrides .env env_file = ( str(PROJECT_ROOT / ".env"), str(PROJECT_ROOT / ".env.local"), ) env_file_encoding = "utf-8" case_sensitive = False # Ignore extra environment variables not defined in Settings # This allows backwards compatibility with old .env files (e.g., Docker) extra = "ignore" def _resolve_path(self, path_value: str) -> Path: """ Convert relative paths to backend-rooted absolute paths. This keeps runtime artifacts contained under backend/ even when the app is launched from different working directories. """ path = Path(path_value) return path if path.is_absolute() else BACKEND_ROOT / path def ensure_directories(self): """Create all necessary directories if they don't exist""" dirs = [ self.upload_dir, self.temp_dir, self.processed_dir, self.images_dir, self.storage_dir, self.markdown_dir, self.json_dir, self.exports_dir, self.result_dir, Path(self.log_file).parent, ] for dir_path in dirs: self._resolve_path(str(dir_path)).mkdir(parents=True, exist_ok=True) # Global settings instance settings = Settings()