DashBoard/.env.example

# ============================================================
# MES Dashboard Environment Configuration
# ============================================================
# Copy this file to .env and fill in your actual values:
#   cp .env.example .env
#   nano .env
# ============================================================

# ============================================================
# Database Configuration (REQUIRED)
# ============================================================
# Oracle Database connection settings
DB_HOST=your_database_host
DB_PORT=1521
DB_SERVICE=your_service_name
DB_USER=your_username
DB_PASSWORD=your_password

# Database Pool Settings (optional, has defaults)
# Adjust based on expected load
DB_POOL_SIZE=5          # Default: 5 (dev: 2, prod: 10)
DB_MAX_OVERFLOW=10      # Default: 10 (dev: 3, prod: 20)
DB_POOL_TIMEOUT=30      # Seconds to wait when pool is exhausted
DB_POOL_RECYCLE=1800    # Recycle connection after N seconds
DB_TCP_CONNECT_TIMEOUT=10
DB_CONNECT_RETRY_COUNT=1
DB_CONNECT_RETRY_DELAY=1.0
DB_CALL_TIMEOUT_MS=55000  # Must stay below worker timeout
DB_SLOW_CALL_TIMEOUT_MS=300000
DB_SLOW_MAX_CONCURRENT=5
DB_SLOW_POOL_ENABLED=true
DB_SLOW_POOL_SIZE=2
DB_SLOW_POOL_MAX_OVERFLOW=1
DB_SLOW_POOL_TIMEOUT=30
DB_SLOW_POOL_RECYCLE=1800

# ============================================================
# Flask Configuration
# ============================================================
# Environment mode: development | production | testing
# If omitted, runtime defaults to production (fail-safe)
FLASK_ENV=development

# Debug mode: 0 for production, 1 for development
FLASK_DEBUG=0

# Session Security (REQUIRED for production!)
# Generate with: python -c "import secrets; print(secrets.token_hex(32))"
SECRET_KEY=your-secret-key-change-in-production

# Session timeout in seconds (default: 28800 = 8 hours)
SESSION_LIFETIME=28800

# JSON request payload upper bound in bytes (default: 262144 = 256KB)
MAX_JSON_BODY_BYTES=262144

# Route input-budget guardrails
QUERY_TOOL_MAX_CONTAINER_IDS=200
RESOURCE_DETAIL_DEFAULT_LIMIT=500
RESOURCE_DETAIL_MAX_LIMIT=500

# Trust boundary for forwarded headers (safe default: false)
# Direct-exposure deployment (no reverse proxy): keep this false
TRUST_PROXY_HEADERS=false
# Required when TRUST_PROXY_HEADERS=true. Supports comma-separated IP/CIDR entries.
# Example: TRUSTED_PROXY_IPS=127.0.0.1,10.0.0.0/24
TRUSTED_PROXY_IPS=

# CSP opt-in compatibility flag (default false = safer)
CSP_ALLOW_UNSAFE_EVAL=false

# ============================================================
# Authentication Configuration
# ============================================================
# LDAP API endpoint for user authentication
LDAP_API_URL=https://your-ldap-api-endpoint.example.com

# Admin email addresses (comma-separated for multiple)
ADMIN_EMAILS=admin@example.com

# Local Authentication (for development/testing)
# When enabled, uses local credentials instead of LDAP
# Set LOCAL_AUTH_ENABLED=true to bypass LDAP authentication
# Production safety: when FLASK_ENV=production, local auth is forcibly disabled
LOCAL_AUTH_ENABLED=false
LOCAL_AUTH_USERNAME=
LOCAL_AUTH_PASSWORD=

# ============================================================
# Gunicorn Configuration
# ============================================================
# Server bind address and port
GUNICORN_BIND=0.0.0.0:8080

# Number of worker processes
# Recommend: 2 for ≤ 8GB RAM (trace queries consume 2-3 GB peak per worker)
# Recommend: 4 for ≥ 16GB RAM
GUNICORN_WORKERS=2

# Threads per worker
GUNICORN_THREADS=4

# Worker timeout (seconds): should stay above DB/query-tool slow paths
GUNICORN_TIMEOUT=130

# Graceful shutdown timeout for worker reloads (seconds)
GUNICORN_GRACEFUL_TIMEOUT=60

# Worker recycle policy (set 0 to disable)
GUNICORN_MAX_REQUESTS=5000
GUNICORN_MAX_REQUESTS_JITTER=500

# ============================================================
# Redis Configuration (for WIP cache)
# ============================================================
# Redis connection URL
REDIS_URL=redis://localhost:6379/0

# Enable/disable Redis cache (set to false to fallback to Oracle)
REDIS_ENABLED=true

# Redis key prefix (to separate from other applications)
REDIS_KEY_PREFIX=mes_wip

# Redis memory guardrail (prevent unbounded RAM growth)
# Example: 512mb / 1gb / 2gb. Set 0 to disable limit (NOT recommended).
REDIS_MAXMEMORY=512mb

# Eviction policy when maxmemory is reached
# Recommended: allkeys-lru (general cache), volatile-lru (TTL keys only)
REDIS_MAXMEMORY_POLICY=allkeys-lru

# Redis persistence (physical storage)
REDIS_PERSISTENCE_ENABLED=true
REDIS_APPENDONLY=yes
REDIS_APPENDFSYNC=everysec
REDIS_SAVE=900 1 300 10 60 10000

# Startup cleanup: remove stale keys that accidentally have no TTL
REDIS_TTL_CLEANUP_ON_START=true
REDIS_TTL_CLEANUP_PATTERNS=batch:*,reject_dataset:*,hold_dataset:*,resource_dataset:*,job_query:*

# Cache check interval in seconds (default: 600 = 10 minutes)
CACHE_CHECK_INTERVAL=600

# Optional explicit TTL for WIP Redis keys (seconds)
# If unset/0, TTL defaults to 3 * CACHE_CHECK_INTERVAL
# Example: CACHE_CHECK_INTERVAL=600 -> default TTL=1800
WIP_CACHE_TTL_SECONDS=1800

# ============================================================
# Resource Cache Configuration
# ============================================================
# Enable/disable Resource cache (DW_MES_RESOURCE)
# When disabled, queries will fallback to Oracle directly
RESOURCE_CACHE_ENABLED=true

# Resource cache sync interval in seconds (default: 14400 = 4 hours)
# The cache will check for updates at this interval using MAX(LASTCHANGEDATE)
RESOURCE_SYNC_INTERVAL=14400

# Resource index version check interval in seconds (default: 5)
RESOURCE_INDEX_VERSION_CHECK_INTERVAL=5

# Realtime equipment status cache toggle and sync interval
REALTIME_EQUIPMENT_CACHE_ENABLED=true
EQUIPMENT_STATUS_SYNC_INTERVAL=300

# Filter cache SQL view overrides
FILTER_CACHE_WIP_VIEW=DWH.DW_MES_LOT_V
FILTER_CACHE_SPEC_WORKCENTER_VIEW=DWH.DW_MES_SPEC_WORKCENTER_V

# ============================================================
# Circuit Breaker Configuration
# ============================================================
# Enable/disable circuit breaker for database protection
CIRCUIT_BREAKER_ENABLED=true

# Minimum failures before circuit can open
CIRCUIT_BREAKER_FAILURE_THRESHOLD=5

# Failure rate threshold (0.0 - 1.0)
CIRCUIT_BREAKER_FAILURE_RATE=0.5

# Seconds to wait in OPEN state before trying HALF_OPEN
CIRCUIT_BREAKER_RECOVERY_TIMEOUT=30

# Sliding window size for counting successes/failures
CIRCUIT_BREAKER_WINDOW_SIZE=10

# ============================================================
# Trace Pipeline Configuration
# ============================================================
# Slow query warning threshold (seconds) — logs warning when stage exceeds this
TRACE_SLOW_THRESHOLD_SECONDS=15

# Max parallel workers for events domain fetching (per request)
# Recommend: 2 (each worker × EVENT_FETCHER_MAX_WORKERS = peak slow query slots)
TRACE_EVENTS_MAX_WORKERS=2

# Max parallel workers for EventFetcher batch queries (per domain)
# Recommend: 2 (peak concurrent slow queries = TRACE_EVENTS_MAX_WORKERS × this)
EVENT_FETCHER_MAX_WORKERS=2

# Max parallel workers for forward pipeline WIP+rejects fetching
FORWARD_PIPELINE_MAX_WORKERS=2

# --- Admission Control (提案 1: trace-events-memory-triage) ---
# Max container IDs per synchronous events request.
# Requests exceeding this limit return HTTP 413 (or HTTP 202 when async job queue is enabled).
# Set based on available RAM: 50K CIDs ≈ 2-3 GB peak memory per request.
TRACE_EVENTS_CID_LIMIT=50000

# Cursor fetchmany batch size for slow query iterator mode.
# Smaller = less peak memory; larger = fewer Oracle round-trips.
DB_SLOW_FETCHMANY_SIZE=5000

# Domain-level cache skip threshold (CID count).
# When CID count exceeds this, per-domain and route-level cache writes are skipped.
EVENT_FETCHER_CACHE_SKIP_CID_THRESHOLD=10000

# --- Async Job Queue (提案 2: trace-async-job-queue) ---
# Enable RQ trace worker for async large query processing
# Set to true and start the worker: rq worker trace-events
TRACE_WORKER_ENABLED=true

# CID threshold for automatic async job routing (requires RQ worker).
# Requests with CID count > threshold are queued instead of processed synchronously.
TRACE_ASYNC_CID_THRESHOLD=20000

# Job result retention time in seconds (default: 3600 = 1 hour)
TRACE_JOB_TTL_SECONDS=3600

# Job execution timeout in seconds (default: 1800 = 30 minutes)
TRACE_JOB_TIMEOUT_SECONDS=1800

# Number of RQ worker processes for trace jobs
TRACE_WORKER_COUNT=1

# RQ queue name for trace jobs
TRACE_WORKER_QUEUE=trace-events

# --- Streaming Response (提案 3: trace-streaming-response) ---
# NDJSON stream batch size (records per NDJSON line)
TRACE_STREAM_BATCH_SIZE=5000

# ============================================================
# Performance Metrics Configuration
# ============================================================
# Slow query threshold in seconds (default: 5.0)
# Note: Real-time Oracle views may take 2-5s per query, set threshold accordingly
SLOW_QUERY_THRESHOLD=5.0

# In-memory query metrics sliding window size
METRICS_WINDOW_SIZE=1000

# ============================================================
# SQLite Log Store Configuration
# ============================================================
# Enable/disable SQLite log store for admin dashboard
LOG_STORE_ENABLED=true

# SQLite database path
LOG_SQLITE_PATH=logs/admin_logs.sqlite

# Log retention period in days (default: 7)
LOG_SQLITE_RETENTION_DAYS=7

# Maximum log rows (default: 100000)
LOG_SQLITE_MAX_ROWS=100000

# ============================================================
# Worker Watchdog Configuration
# ============================================================
# Runtime directory for restart flag/pid/state files
WATCHDOG_RUNTIME_DIR=./tmp

# Path to restart flag file (watchdog monitors this file)
WATCHDOG_RESTART_FLAG=./tmp/mes_dashboard_restart.flag

# Gunicorn PID file path (must match start script / systemd config)
WATCHDOG_PID_FILE=./tmp/gunicorn.pid

# Path to restart state file (stores last restart info)
WATCHDOG_STATE_FILE=./tmp/mes_dashboard_restart_state.json

# Max entries persisted in restart history (bounded to avoid state growth)
WATCHDOG_RESTART_HISTORY_MAX=50

# Cooldown period between restart requests in seconds (default: 60)
WORKER_RESTART_COOLDOWN=60

# Watchdog loop check interval in seconds
WATCHDOG_CHECK_INTERVAL=5

# ============================================================
# Portal Shell / Full Modernization Flags
# ============================================================
# Frontend build mode: auto | always | never
# - auto: build only when dist is missing/stale
# - always: force build on every start/restart
# - never: skip build entirely
FRONTEND_BUILD_MODE=always

# Fail startup when frontend build fails (recommended in production)
FRONTEND_BUILD_FAIL_ON_ERROR=true

# Shell-first SPA navigation
# Local default: false (avoid coupling first boot to shell-first cutover behavior)
# Production recommendation: set true after shell cutover verification is complete
PORTAL_SPA_ENABLED=false

# Fail-fast readiness gate for in-scope dist assets
# Local default: false (allow startup when dist artifacts are intentionally absent)
# Production recommendation: set true to enforce release readiness at boot
MODERNIZATION_ENFORCE_ASSET_READINESS=false

# Retire in-scope runtime fallback behavior
# Local default: false (retain compatibility fallback while developing)
# Production recommendation: set true only after all in-scope dist assets are verified
MODERNIZATION_RETIRE_IN_SCOPE_RUNTIME_FALLBACK=false

# Runtime contract strict validation toggle
# Local default: false (avoid strict conda/runtime checks on onboarding machines)
# Production recommendation: set true to fail fast on contract drift
RUNTIME_CONTRACT_ENFORCE=false

# Health endpoint memo cache TTL in seconds
HEALTH_MEMO_TTL_SECONDS=5

# Reject history options API cache TTL in seconds (default: 14400 = 4 hours)
REJECT_HISTORY_OPTIONS_CACHE_TTL_SECONDS=14400

# ============================================================
# Reject History Batch/Spill Guardrails
# ============================================================
# Batch chunking controls (for long-range reject queries)
REJECT_ENGINE_GRAIN_DAYS=10
REJECT_ENGINE_PARALLEL=2
REJECT_ENGINE_MAX_ROWS_PER_CHUNK=50000
REJECT_ENGINE_MAX_TOTAL_ROWS=300000

# Large result spill controls
REJECT_ENGINE_SPILL_ENABLED=true
REJECT_ENGINE_MAX_RESULT_MB=64
QUERY_SPOOL_DIR=tmp/query_spool
REJECT_ENGINE_SPOOL_TTL_SECONDS=21600
REJECT_ENGINE_SPOOL_MAX_BYTES=2147483648
REJECT_ENGINE_SPOOL_WARN_RATIO=0.85
REJECT_ENGINE_SPOOL_CLEANUP_INTERVAL_SECONDS=300
REJECT_ENGINE_SPOOL_ORPHAN_GRACE_SECONDS=600

# Batch query engine thresholds
BATCH_QUERY_TIME_THRESHOLD_DAYS=60
BATCH_QUERY_ID_THRESHOLD=1000
BATCH_CHUNK_MAX_MEMORY_MB=256

# ============================================================
# Runtime Resilience Diagnostics Thresholds
# ============================================================
# Alert window for sustained degraded state (seconds)
RESILIENCE_DEGRADED_ALERT_SECONDS=300

# Pool saturation warning / critical levels
RESILIENCE_POOL_SATURATION_WARNING=0.90
RESILIENCE_POOL_SATURATION_CRITICAL=1.0

# Restart churn threshold: N restarts within window triggers churn warning
RESILIENCE_RESTART_CHURN_WINDOW_SECONDS=600
RESILIENCE_RESTART_CHURN_THRESHOLD=3

# ============================================================
# CORS Configuration
# ============================================================
# Comma-separated list of allowed origins for CORS
# Example: https://example.com,https://app.example.com
# Set to * for development (not recommended for production)
CORS_ALLOWED_ORIGINS=