# File: DashBoard/.env.example

# ============================================================
# MES Dashboard Environment Configuration
# ============================================================
# Copy this file to .env and fill in your actual values:
# cp .env.example .env
# nano .env
# ============================================================
# ============================================================
# Database Configuration (REQUIRED)
# ============================================================
# Oracle Database connection settings
DB_HOST=your_database_host
DB_PORT=1521
DB_SERVICE=your_service_name
DB_USER=your_username
DB_PASSWORD=your_password
# Database Pool Settings (optional, has defaults)
# Adjust based on expected load
# NOTE: comments are kept on their own lines. Some env-file loaders do not
# support trailing "# ..." comments and would read them as part of the value.
# Pool size. Default: 5 (dev: 2, prod: 10)
DB_POOL_SIZE=5
# Pool max overflow. Default: 10 (dev: 3, prod: 20)
DB_MAX_OVERFLOW=10
# Seconds to wait when pool is exhausted
DB_POOL_TIMEOUT=30
# Recycle connection after N seconds
DB_POOL_RECYCLE=1800
# Connection establishment tuning.
# NOTE(review): units are not stated in this file; presumably seconds for the
# timeout and retry delay - confirm against the code that reads them.
DB_TCP_CONNECT_TIMEOUT=10
DB_CONNECT_RETRY_COUNT=1
DB_CONNECT_RETRY_DELAY=1.0
# Call timeout in milliseconds (55000 ms = 55 s). Must stay below worker timeout
DB_CALL_TIMEOUT_MS=55000
# DB_SLOW_* settings: call timeout in milliseconds (300000 ms = 300 s),
# a concurrency cap, and a separately sized pool.
# NOTE(review): presumably these apply only to the slow-query path and the pool
# keys mirror the DB_POOL_* semantics above - confirm.
DB_SLOW_CALL_TIMEOUT_MS=300000
DB_SLOW_MAX_CONCURRENT=5
DB_SLOW_POOL_ENABLED=true
DB_SLOW_POOL_SIZE=2
DB_SLOW_POOL_MAX_OVERFLOW=1
DB_SLOW_POOL_TIMEOUT=30
DB_SLOW_POOL_RECYCLE=1800
# ============================================================
# Flask Configuration
# ============================================================
# Environment mode: development | production | testing
# If omitted, runtime defaults to production (fail-safe)
FLASK_ENV=development
# Debug mode: 0 for production, 1 for development
FLASK_DEBUG=0
# Session Security (REQUIRED for production!)
# Generate with: python -c "import secrets; print(secrets.token_hex(32))"
# The value below is only a placeholder; replace it with a generated key.
SECRET_KEY=your-secret-key-change-in-production
# Session timeout in seconds (default: 28800 = 8 hours)
SESSION_LIFETIME=28800
# JSON request payload upper bound in bytes (default: 262144 = 256KB)
MAX_JSON_BODY_BYTES=262144
# Route input-budget guardrails
# NOTE(review): presumably RESOURCE_DETAIL_DEFAULT_LIMIT must not exceed
# RESOURCE_DETAIL_MAX_LIMIT - confirm against the route validation code.
QUERY_TOOL_MAX_CONTAINER_IDS=200
RESOURCE_DETAIL_DEFAULT_LIMIT=500
RESOURCE_DETAIL_MAX_LIMIT=500
# Trust boundary for forwarded headers (safe default: false)
# Direct-exposure deployment (no reverse proxy): keep this false
TRUST_PROXY_HEADERS=false
# Required when TRUST_PROXY_HEADERS=true. Supports comma-separated IP/CIDR entries.
# Example: TRUSTED_PROXY_IPS=127.0.0.1,10.0.0.0/24
# Left empty here because TRUST_PROXY_HEADERS is false in this template.
TRUSTED_PROXY_IPS=
# CSP opt-in compatibility flag (default false = safer)
CSP_ALLOW_UNSAFE_EVAL=false
# ============================================================
# Authentication Configuration
# ============================================================
# LDAP API endpoint for user authentication
LDAP_API_URL=https://your-ldap-api-endpoint.example.com
# Admin email addresses (comma-separated for multiple)
ADMIN_EMAILS=admin@example.com
# Local Authentication (for development/testing)
# When enabled, uses local credentials instead of LDAP
# Set LOCAL_AUTH_ENABLED=true to bypass LDAP authentication
# Production safety: when FLASK_ENV=production, local auth is forcibly disabled
LOCAL_AUTH_ENABLED=false
# Local credentials; left blank in this template.
# NOTE(review): presumably both must be set when LOCAL_AUTH_ENABLED=true - confirm.
LOCAL_AUTH_USERNAME=
LOCAL_AUTH_PASSWORD=
# ============================================================
# Gunicorn Configuration
# ============================================================
# Server bind address and port
GUNICORN_BIND=0.0.0.0:8080
# Number of worker processes
# Recommend: 2 for <= 8GB RAM (trace queries consume 2-3 GB peak per worker)
# Recommend: 4 for >= 16GB RAM
GUNICORN_WORKERS=2
# Threads per worker
GUNICORN_THREADS=4
# Worker timeout (seconds): should stay above DB/query-tool slow paths
# NOTE(review): DB_SLOW_CALL_TIMEOUT_MS in this file is 300000 (300 s), which is
# larger than the 130 s set here - confirm whether slow calls are expected to be
# bounded by this worker timeout.
GUNICORN_TIMEOUT=130
# Graceful shutdown timeout for worker reloads (seconds)
GUNICORN_GRACEFUL_TIMEOUT=60
# Worker recycle policy (set 0 to disable)
# NOTE(review): presumably a worker is recycled after MAX_REQUESTS requests plus a
# random offset up to MAX_REQUESTS_JITTER - confirm against the Gunicorn config.
GUNICORN_MAX_REQUESTS=5000
GUNICORN_MAX_REQUESTS_JITTER=500
# ============================================================
# Redis Configuration (for WIP cache)
# ============================================================
# Redis connection URL
REDIS_URL=redis://localhost:6379/0
# Enable/disable Redis cache (set to false to fallback to Oracle)
REDIS_ENABLED=true
# Redis key prefix (to separate from other applications)
REDIS_KEY_PREFIX=mes_wip
# Redis memory guardrail (prevent unbounded RAM growth)
# Example: 512mb / 1gb / 2gb. Set 0 to disable limit (NOT recommended).
REDIS_MAXMEMORY=512mb
# Eviction policy when maxmemory is reached
# Recommended: allkeys-lru (general cache), volatile-lru (TTL keys only)
REDIS_MAXMEMORY_POLICY=allkeys-lru
# Redis persistence (physical storage)
REDIS_PERSISTENCE_ENABLED=true
REDIS_APPENDONLY=yes
REDIS_APPENDFSYNC=everysec
# NOTE(review): presumably "<seconds> <changes>" pairs in the format of the Redis
# `save` directive - confirm against the script that applies it.
# NOTE: this value is unquoted and contains spaces. A shell that sources this
# file (`. .env`) will not assign the whole string; quote the value in that case.
REDIS_SAVE=900 1 300 10 60 10000
# Startup cleanup: remove stale keys that accidentally have no TTL
REDIS_TTL_CLEANUP_ON_START=true
# Comma-separated key patterns targeted by the startup cleanup.
REDIS_TTL_CLEANUP_PATTERNS=batch:*,reject_dataset:*,hold_dataset:*,resource_dataset:*,job_query:*
# Cache check interval in seconds (default: 600 = 10 minutes)
CACHE_CHECK_INTERVAL=600
# Optional explicit TTL for WIP Redis keys (seconds)
# If unset/0, TTL defaults to 3 * CACHE_CHECK_INTERVAL
# Example: CACHE_CHECK_INTERVAL=600 -> default TTL=1800
WIP_CACHE_TTL_SECONDS=1800
# ============================================================
# Resource Cache Configuration
# ============================================================
# Enable/disable Resource cache (DW_MES_RESOURCE)
# When disabled, queries will fallback to Oracle directly
RESOURCE_CACHE_ENABLED=true
# Resource cache sync interval in seconds (default: 14400 = 4 hours)
# The cache will check for updates at this interval using MAX(LASTCHANGEDATE)
RESOURCE_SYNC_INTERVAL=14400
# Resource index version check interval in seconds (default: 5)
RESOURCE_INDEX_VERSION_CHECK_INTERVAL=5
# Realtime equipment status cache toggle and sync interval
# NOTE(review): the sync interval unit is not stated; presumably seconds
# (300 = 5 minutes), like the other intervals in this section - confirm.
REALTIME_EQUIPMENT_CACHE_ENABLED=true
EQUIPMENT_STATUS_SYNC_INTERVAL=300
# Filter cache SQL view overrides
# Values are schema-qualified view names.
FILTER_CACHE_WIP_VIEW=DWH.DW_MES_LOT_V
FILTER_CACHE_SPEC_WORKCENTER_VIEW=DWH.DW_MES_SPEC_WORKCENTER_V
# ============================================================
# Circuit Breaker Configuration
# ============================================================
# Enable/disable circuit breaker for database protection
CIRCUIT_BREAKER_ENABLED=true
# Minimum failures before circuit can open
CIRCUIT_BREAKER_FAILURE_THRESHOLD=5
# Failure rate threshold (0.0 - 1.0)
CIRCUIT_BREAKER_FAILURE_RATE=0.5
# Seconds to wait in OPEN state before trying HALF_OPEN
CIRCUIT_BREAKER_RECOVERY_TIMEOUT=30
# Sliding window size for counting successes/failures
# NOTE(review): presumably a number of recent calls rather than seconds - confirm.
CIRCUIT_BREAKER_WINDOW_SIZE=10
# ============================================================
# Trace Pipeline Configuration
# ============================================================
# Slow query warning threshold (seconds) - logs warning when stage exceeds this
TRACE_SLOW_THRESHOLD_SECONDS=15
# Max parallel workers for events domain fetching (per request)
# Recommend: 2 (each worker x EVENT_FETCHER_MAX_WORKERS = peak slow query slots)
TRACE_EVENTS_MAX_WORKERS=2
# Max parallel workers for EventFetcher batch queries (per domain)
# Recommend: 2 (peak concurrent slow queries = TRACE_EVENTS_MAX_WORKERS x this)
# With the values in this template: 2 x 2 = 4 peak concurrent slow queries.
EVENT_FETCHER_MAX_WORKERS=2
# Max parallel workers for forward pipeline WIP+rejects fetching
FORWARD_PIPELINE_MAX_WORKERS=2
# --- Admission Control (Proposal 1: trace-events-memory-triage) ---
# Max container IDs per synchronous events request.
# Requests exceeding this limit return HTTP 413 (or HTTP 202 when async job queue is enabled).
# Set based on available RAM: 50K CIDs ~= 2-3 GB peak memory per request.
TRACE_EVENTS_CID_LIMIT=50000
# Cursor fetchmany batch size for slow query iterator mode.
# Smaller = less peak memory; larger = fewer Oracle round-trips.
DB_SLOW_FETCHMANY_SIZE=5000
# Domain-level cache skip threshold (CID count).
# When CID count exceeds this, per-domain and route-level cache writes are skipped.
EVENT_FETCHER_CACHE_SKIP_CID_THRESHOLD=10000
# --- Async Job Queue (Proposal 2: trace-async-job-queue) ---
# Enable RQ trace worker for async large query processing
# Set to true and start the worker: rq worker trace-events
TRACE_WORKER_ENABLED=true
# CID threshold for automatic async job routing (requires RQ worker).
# Requests with CID count > threshold are queued instead of processed synchronously.
TRACE_ASYNC_CID_THRESHOLD=20000
# Job result retention time in seconds (default: 3600 = 1 hour)
TRACE_JOB_TTL_SECONDS=3600
# Job execution timeout in seconds (default: 1800 = 30 minutes)
TRACE_JOB_TIMEOUT_SECONDS=1800
# Number of RQ worker processes for trace jobs
TRACE_WORKER_COUNT=1
# RQ queue name for trace jobs
TRACE_WORKER_QUEUE=trace-events
# --- Streaming Response (Proposal 3: trace-streaming-response) ---
# NDJSON stream batch size (records per NDJSON line)
TRACE_STREAM_BATCH_SIZE=5000
# ============================================================
# Performance Metrics Configuration
# ============================================================
# Slow query threshold in seconds (default: 5.0)
# Note: Real-time Oracle views may take 2-5s per query, set threshold accordingly
SLOW_QUERY_THRESHOLD=5.0
# In-memory query metrics sliding window size
# NOTE(review): presumably the number of most recent query samples kept - confirm.
METRICS_WINDOW_SIZE=1000
# ============================================================
# SQLite Log Store Configuration
# ============================================================
# Enable/disable SQLite log store for admin dashboard
LOG_STORE_ENABLED=true
# SQLite database path
# NOTE(review): relative path; presumably resolved against the process working
# directory - confirm.
LOG_SQLITE_PATH=logs/admin_logs.sqlite
# Log retention period in days (default: 7)
LOG_SQLITE_RETENTION_DAYS=7
# Maximum log rows (default: 100000)
LOG_SQLITE_MAX_ROWS=100000
# ============================================================
# Worker Watchdog Configuration
# ============================================================
# Runtime directory for restart flag/pid/state files
# The three file paths below are all located under this directory in this template.
WATCHDOG_RUNTIME_DIR=./tmp
# Path to restart flag file (watchdog monitors this file)
WATCHDOG_RESTART_FLAG=./tmp/mes_dashboard_restart.flag
# Gunicorn PID file path (must match start script / systemd config)
WATCHDOG_PID_FILE=./tmp/gunicorn.pid
# Path to restart state file (stores last restart info)
WATCHDOG_STATE_FILE=./tmp/mes_dashboard_restart_state.json
# Max entries persisted in restart history (bounded to avoid state growth)
WATCHDOG_RESTART_HISTORY_MAX=50
# Cooldown period between restart requests in seconds (default: 60)
WORKER_RESTART_COOLDOWN=60
# Watchdog loop check interval in seconds
WATCHDOG_CHECK_INTERVAL=5
# ============================================================
# Portal Shell / Full Modernization Flags
# ============================================================
# Frontend build mode: auto | always | never
# - auto: build only when dist is missing/stale
# - always: force build on every start/restart
# - never: skip build entirely
# This template ships with `always`, so every start/restart triggers a build.
FRONTEND_BUILD_MODE=always
# Fail startup when frontend build fails (recommended in production)
FRONTEND_BUILD_FAIL_ON_ERROR=true
# Shell-first SPA navigation
# Local default: false (avoid coupling first boot to shell-first cutover behavior)
# Production recommendation: set true after shell cutover verification is complete
PORTAL_SPA_ENABLED=false
# Fail-fast readiness gate for in-scope dist assets
# Local default: false (allow startup when dist artifacts are intentionally absent)
# Production recommendation: set true to enforce release readiness at boot
MODERNIZATION_ENFORCE_ASSET_READINESS=false
# Retire in-scope runtime fallback behavior
# Local default: false (retain compatibility fallback while developing)
# Production recommendation: set true only after all in-scope dist assets are verified
MODERNIZATION_RETIRE_IN_SCOPE_RUNTIME_FALLBACK=false
# Runtime contract strict validation toggle
# Local default: false (avoid strict conda/runtime checks on onboarding machines)
# Production recommendation: set true to fail fast on contract drift
RUNTIME_CONTRACT_ENFORCE=false
# Health endpoint memo cache TTL in seconds
HEALTH_MEMO_TTL_SECONDS=5
# Reject history options API cache TTL in seconds (default: 14400 = 4 hours)
REJECT_HISTORY_OPTIONS_CACHE_TTL_SECONDS=14400
# ============================================================
# Reject History Batch/Spill Guardrails
# ============================================================
# Batch chunking controls (for long-range reject queries)
# NOTE(review): presumably GRAIN_DAYS is the date span of one chunk and PARALLEL
# is the number of chunks queried concurrently - confirm.
REJECT_ENGINE_GRAIN_DAYS=10
REJECT_ENGINE_PARALLEL=2
REJECT_ENGINE_MAX_ROWS_PER_CHUNK=50000
REJECT_ENGINE_MAX_TOTAL_ROWS=300000
# Large result spill controls
REJECT_ENGINE_SPILL_ENABLED=true
REJECT_ENGINE_MAX_RESULT_MB=64
# NOTE(review): relative path; presumably resolved against the process working
# directory - confirm.
QUERY_SPOOL_DIR=tmp/query_spool
# 21600 seconds = 6 hours
REJECT_ENGINE_SPOOL_TTL_SECONDS=21600
# 2147483648 bytes = 2 GiB
REJECT_ENGINE_SPOOL_MAX_BYTES=2147483648
# NOTE(review): presumably a fraction of REJECT_ENGINE_SPOOL_MAX_BYTES at which a
# warning is raised - confirm.
REJECT_ENGINE_SPOOL_WARN_RATIO=0.85
# 300 seconds = 5 minutes
REJECT_ENGINE_SPOOL_CLEANUP_INTERVAL_SECONDS=300
# 600 seconds = 10 minutes
REJECT_ENGINE_SPOOL_ORPHAN_GRACE_SECONDS=600
# Batch query engine thresholds
BATCH_QUERY_TIME_THRESHOLD_DAYS=60
BATCH_QUERY_ID_THRESHOLD=1000
BATCH_CHUNK_MAX_MEMORY_MB=256
# ============================================================
# Runtime Resilience Diagnostics Thresholds
# ============================================================
# Alert window for sustained degraded state (seconds)
RESILIENCE_DEGRADED_ALERT_SECONDS=300
# Pool saturation warning / critical levels
# NOTE(review): presumably fractions of pool capacity in the range 0.0 - 1.0 - confirm.
RESILIENCE_POOL_SATURATION_WARNING=0.90
RESILIENCE_POOL_SATURATION_CRITICAL=1.0
# Restart churn threshold: N restarts within window triggers churn warning
# With the values below: 3 restarts within 600 seconds (10 minutes).
RESILIENCE_RESTART_CHURN_WINDOW_SECONDS=600
RESILIENCE_RESTART_CHURN_THRESHOLD=3
# ============================================================
# CORS Configuration
# ============================================================
# Comma-separated list of allowed origins for CORS
# Example: https://example.com,https://app.example.com
# Set to * for development (not recommended for production)
# Left empty in this template.
# NOTE(review): presumably an empty value allows no cross-origin requests - confirm.
CORS_ALLOWED_ORIGINS=