Chunk failures in BatchQueryEngine were silently discarded — `has_partial_failure` was tracked in Redis but never surfaced to the API response or frontend. Users could see incomplete data without any warning. This commit closes the gap end-to-end: Backend: - Track failed chunk time ranges (`failed_ranges`) in batch engine progress metadata - Add single retry for transient Oracle errors (timeout, connection) in `_execute_single_chunk` - Read `get_batch_progress()` after merge but before `redis_clear_batch()` cleanup - Inject `has_partial_failure`, `failed_chunk_count`, `failed_ranges` into API response meta - Persist partial failure flag to independent Redis key with TTL aligned to data storage layer - Add shared container-resolution policy module with wildcard/expansion guardrails - Refactor reason filter from single-value to multi-select (`reason` → `reasons`) Frontend: - Add client-side date range validation (730-day limit) before API submission - Display amber warning banner on partial failure with specific failed date ranges - Support generic fallback message for container-mode queries without date ranges - Update FilterPanel to support multi-select reason chips Specs & tests: - Create batch-query-resilience spec; update reject-history-api and reject-history-page specs - Add 7 new tests for retry, memory guard, failed ranges, partial failure propagation, TTL - Cross-service regression verified (hold, resource, job, msd — 411 tests pass) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# ============================================================
# MES Dashboard Environment Configuration
# ============================================================
# Copy this file to .env and fill in your actual values:
# cp .env.example .env
# nano .env
# ============================================================

# ============================================================
# Database Configuration (REQUIRED)
# ============================================================
# Oracle Database connection settings
DB_HOST=your_database_host
DB_PORT=1521
DB_SERVICE=your_service_name
DB_USER=your_username
DB_PASSWORD=your_password

# Database Pool Settings (optional, has defaults)
# Adjust based on expected load
DB_POOL_SIZE=5 # Default: 5 (dev: 2, prod: 10)
DB_MAX_OVERFLOW=10 # Default: 10 (dev: 3, prod: 20)
DB_POOL_TIMEOUT=30 # Seconds to wait when pool is exhausted
DB_POOL_RECYCLE=1800 # Recycle connection after N seconds
DB_TCP_CONNECT_TIMEOUT=10
DB_CONNECT_RETRY_COUNT=1
DB_CONNECT_RETRY_DELAY=1.0
DB_CALL_TIMEOUT_MS=55000 # Must stay below worker timeout
DB_SLOW_CALL_TIMEOUT_MS=300000
DB_SLOW_MAX_CONCURRENT=5
DB_SLOW_POOL_ENABLED=true
DB_SLOW_POOL_SIZE=2
DB_SLOW_POOL_MAX_OVERFLOW=1
DB_SLOW_POOL_TIMEOUT=30
DB_SLOW_POOL_RECYCLE=1800

# ============================================================
# Flask Configuration
# ============================================================
# Environment mode: development | production | testing
# If omitted, runtime defaults to production (fail-safe)
FLASK_ENV=development

# Debug mode: 0 for production, 1 for development
FLASK_DEBUG=0

# Session Security (REQUIRED for production!)
# Generate with: python -c "import secrets; print(secrets.token_hex(32))"
SECRET_KEY=your-secret-key-change-in-production

# Session timeout in seconds (default: 28800 = 8 hours)
SESSION_LIFETIME=28800

# JSON request payload upper bound in bytes (default: 262144 = 256KB)
MAX_JSON_BODY_BYTES=262144

# Route input-budget guardrails
QUERY_TOOL_MAX_CONTAINER_IDS=200
RESOURCE_DETAIL_DEFAULT_LIMIT=500
RESOURCE_DETAIL_MAX_LIMIT=500

# Shared container-resolution guardrails
# 0 = disable raw input count cap (recommended: rely on expansion limits instead)
CONTAINER_RESOLVE_INPUT_MAX_VALUES=0
# Wildcard pattern must include this many literal-prefix chars before %/_ (e.g., GA%)
CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN=4
# Per-token expansion guard (avoid one wildcard exploding into too many container IDs)
CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN=2000
# Total resolved container-ID guard for a single resolve request
CONTAINER_RESOLVE_MAX_CONTAINER_IDS=30000

# Trust boundary for forwarded headers (safe default: false)
# Direct-exposure deployment (no reverse proxy): keep this false
TRUST_PROXY_HEADERS=false
# Required when TRUST_PROXY_HEADERS=true. Supports comma-separated IP/CIDR entries.
# Example: TRUSTED_PROXY_IPS=127.0.0.1,10.0.0.0/24
TRUSTED_PROXY_IPS=

# CSP opt-in compatibility flag (default false = safer)
CSP_ALLOW_UNSAFE_EVAL=false

# ============================================================
# Authentication Configuration
# ============================================================
# LDAP API endpoint for user authentication
LDAP_API_URL=https://your-ldap-api-endpoint.example.com

# Admin email addresses (comma-separated for multiple)
ADMIN_EMAILS=admin@example.com

# Local Authentication (for development/testing)
# When enabled, uses local credentials instead of LDAP
# Set LOCAL_AUTH_ENABLED=true to bypass LDAP authentication
# Production safety: when FLASK_ENV=production, local auth is forcibly disabled
LOCAL_AUTH_ENABLED=false
LOCAL_AUTH_USERNAME=
LOCAL_AUTH_PASSWORD=

# ============================================================
# Gunicorn Configuration
# ============================================================
# Server bind address and port
GUNICORN_BIND=0.0.0.0:8080

# Number of worker processes
# Recommend: 2 for ≤ 8GB RAM (trace queries consume 2-3 GB peak per worker)
# Recommend: 4 for ≥ 16GB RAM
GUNICORN_WORKERS=2

# Threads per worker
GUNICORN_THREADS=4

# Worker timeout (seconds): should stay above DB/query-tool slow paths
GUNICORN_TIMEOUT=360

# Graceful shutdown timeout for worker reloads (seconds)
GUNICORN_GRACEFUL_TIMEOUT=300

# Worker recycle policy (set 0 to disable)
GUNICORN_MAX_REQUESTS=1200
GUNICORN_MAX_REQUESTS_JITTER=300

# ============================================================
# Redis Configuration (for WIP cache)
# ============================================================
# Redis connection URL
REDIS_URL=redis://localhost:6379/0

# Enable/disable Redis cache (set to false to fallback to Oracle)
REDIS_ENABLED=true

# Redis key prefix (to separate from other applications)
REDIS_KEY_PREFIX=mes_wip

# Redis memory guardrail (prevent unbounded RAM growth)
# Example: 512mb / 1gb / 2gb. Set 0 to disable limit (NOT recommended).
REDIS_MAXMEMORY=512mb

# Eviction policy when maxmemory is reached
# Recommended: allkeys-lru (general cache), volatile-lru (TTL keys only)
REDIS_MAXMEMORY_POLICY=allkeys-lru

# Redis persistence (physical storage)
REDIS_PERSISTENCE_ENABLED=true
REDIS_APPENDONLY=yes
REDIS_APPENDFSYNC=everysec
REDIS_SAVE=900 1 300 10 60 10000

# Startup cleanup: remove stale keys that accidentally have no TTL
REDIS_TTL_CLEANUP_ON_START=true
REDIS_TTL_CLEANUP_PATTERNS=batch:*,reject_dataset:*,hold_dataset:*,resource_dataset:*,job_query:*

# Cache check interval in seconds (default: 600 = 10 minutes)
CACHE_CHECK_INTERVAL=600

# Optional explicit TTL for WIP Redis keys (seconds)
# If unset/0, TTL defaults to 3 * CACHE_CHECK_INTERVAL
# Example: CACHE_CHECK_INTERVAL=600 -> default TTL=1800
WIP_CACHE_TTL_SECONDS=1800

# ============================================================
# Resource Cache Configuration
# ============================================================
# Enable/disable Resource cache (DW_MES_RESOURCE)
# When disabled, queries will fallback to Oracle directly
RESOURCE_CACHE_ENABLED=true

# Resource cache sync interval in seconds (default: 14400 = 4 hours)
# The cache will check for updates at this interval using MAX(LASTCHANGEDATE)
RESOURCE_SYNC_INTERVAL=14400

# Resource index version check interval in seconds (default: 5)
RESOURCE_INDEX_VERSION_CHECK_INTERVAL=5

# Realtime equipment status cache toggle and sync interval
REALTIME_EQUIPMENT_CACHE_ENABLED=true
EQUIPMENT_STATUS_SYNC_INTERVAL=300

# Filter cache SQL view overrides
FILTER_CACHE_WIP_VIEW=DWH.DW_MES_LOT_V
FILTER_CACHE_SPEC_WORKCENTER_VIEW=DWH.DW_MES_SPEC_WORKCENTER_V

# ============================================================
# Circuit Breaker Configuration
# ============================================================
# Enable/disable circuit breaker for database protection
CIRCUIT_BREAKER_ENABLED=true

# Minimum failures before circuit can open
CIRCUIT_BREAKER_FAILURE_THRESHOLD=5

# Failure rate threshold (0.0 - 1.0)
CIRCUIT_BREAKER_FAILURE_RATE=0.5

# Seconds to wait in OPEN state before trying HALF_OPEN
CIRCUIT_BREAKER_RECOVERY_TIMEOUT=30

# Sliding window size for counting successes/failures
CIRCUIT_BREAKER_WINDOW_SIZE=10

# ============================================================
# Trace Pipeline Configuration
# ============================================================
# Slow query warning threshold (seconds) — logs warning when stage exceeds this
TRACE_SLOW_THRESHOLD_SECONDS=15

# Max parallel workers for events domain fetching (per request)
# Recommend: 2 (each worker × EVENT_FETCHER_MAX_WORKERS = peak slow query slots)
TRACE_EVENTS_MAX_WORKERS=2

# Max parallel workers for EventFetcher batch queries (per domain)
# Recommend: 2 (peak concurrent slow queries = TRACE_EVENTS_MAX_WORKERS × this)
EVENT_FETCHER_MAX_WORKERS=2
# false = any failed batch raises error (avoid silent partial data)
EVENT_FETCHER_ALLOW_PARTIAL_RESULTS=false

# Max parallel workers for forward pipeline WIP+rejects fetching
FORWARD_PIPELINE_MAX_WORKERS=2

# --- Admission Control (Proposal 1: trace-events-memory-triage) ---
# Max container IDs per synchronous events request.
# Requests exceeding this limit return HTTP 413 (or HTTP 202 when async job queue is enabled).
# Set based on available RAM: 50K CIDs ≈ 2-3 GB peak memory per request.
TRACE_EVENTS_CID_LIMIT=50000

# Cursor fetchmany batch size for slow query iterator mode.
# Smaller = less peak memory; larger = fewer Oracle round-trips.
DB_SLOW_FETCHMANY_SIZE=5000

# Domain-level cache skip threshold (CID count).
# When CID count exceeds this, per-domain and route-level cache writes are skipped.
EVENT_FETCHER_CACHE_SKIP_CID_THRESHOLD=10000

# --- Async Job Queue (Proposal 2: trace-async-job-queue) ---
# Enable RQ trace worker for async large query processing
# Set to true and start the worker: rq worker trace-events
TRACE_WORKER_ENABLED=true

# CID threshold for automatic async job routing (requires RQ worker).
# Requests with CID count > threshold are queued instead of processed synchronously.
TRACE_ASYNC_CID_THRESHOLD=20000

# Job result retention time in seconds (default: 3600 = 1 hour)
TRACE_JOB_TTL_SECONDS=3600

# Job execution timeout in seconds (default: 1800 = 30 minutes)
TRACE_JOB_TIMEOUT_SECONDS=1800

# Number of RQ worker processes for trace jobs
TRACE_WORKER_COUNT=1

# RQ queue name for trace jobs
TRACE_WORKER_QUEUE=trace-events

# --- Streaming Response (Proposal 3: trace-streaming-response) ---
# NDJSON stream batch size (records per NDJSON line)
TRACE_STREAM_BATCH_SIZE=5000

# ============================================================
# Performance Metrics Configuration
# ============================================================
# Slow query threshold in seconds (default: 5.0)
# Note: Real-time Oracle views may take 2-5s per query, set threshold accordingly
SLOW_QUERY_THRESHOLD=5.0

# In-memory query metrics sliding window size
METRICS_WINDOW_SIZE=1000

# ============================================================
# SQLite Log Store Configuration
# ============================================================
# Enable/disable SQLite log store for admin dashboard
LOG_STORE_ENABLED=true

# SQLite database path
LOG_SQLITE_PATH=logs/admin_logs.sqlite

# Log retention period in days (default: 7)
LOG_SQLITE_RETENTION_DAYS=7

# Maximum log rows (default: 100000)
LOG_SQLITE_MAX_ROWS=100000

# ============================================================
# Worker Watchdog Configuration
# ============================================================
# Runtime directory for restart flag/pid/state files
WATCHDOG_RUNTIME_DIR=./tmp

# Path to restart flag file (watchdog monitors this file)
WATCHDOG_RESTART_FLAG=./tmp/mes_dashboard_restart.flag

# Gunicorn PID file path (must match start script / systemd config)
WATCHDOG_PID_FILE=./tmp/gunicorn.pid

# Path to restart state file (stores last restart info)
WATCHDOG_STATE_FILE=./tmp/mes_dashboard_restart_state.json

# Max entries persisted in restart history (bounded to avoid state growth)
WATCHDOG_RESTART_HISTORY_MAX=50

# Cooldown period between restart requests in seconds (default: 60)
WORKER_RESTART_COOLDOWN=60

# Watchdog loop check interval in seconds
WATCHDOG_CHECK_INTERVAL=5

# ============================================================
# Portal Shell / Full Modernization Flags
# ============================================================
# Frontend build mode: auto | always | never
# - auto: build only when dist is missing/stale
# - always: force build on every start/restart
# - never: skip build entirely
FRONTEND_BUILD_MODE=always

# Fail startup when frontend build fails (recommended in production)
FRONTEND_BUILD_FAIL_ON_ERROR=true

# Shell-first SPA navigation
# Local default: false (avoid coupling first boot to shell-first cutover behavior)
# Production recommendation: set true after shell cutover verification is complete
PORTAL_SPA_ENABLED=false

# Fail-fast readiness gate for in-scope dist assets
# Local default: false (allow startup when dist artifacts are intentionally absent)
# Production recommendation: set true to enforce release readiness at boot
MODERNIZATION_ENFORCE_ASSET_READINESS=false

# Retire in-scope runtime fallback behavior
# Local default: false (retain compatibility fallback while developing)
# Production recommendation: set true only after all in-scope dist assets are verified
MODERNIZATION_RETIRE_IN_SCOPE_RUNTIME_FALLBACK=false

# Runtime contract strict validation toggle
# Local default: false (avoid strict conda/runtime checks on onboarding machines)
# Production recommendation: set true to fail fast on contract drift
RUNTIME_CONTRACT_ENFORCE=false

# Health endpoint memo cache TTL in seconds
HEALTH_MEMO_TTL_SECONDS=5

# Reject history options API cache TTL in seconds (default: 14400 = 4 hours)
REJECT_HISTORY_OPTIONS_CACHE_TTL_SECONDS=14400

# ============================================================
# Reject History Batch/Spill Guardrails
# ============================================================
# Batch chunking controls (for long-range reject queries)
REJECT_ENGINE_GRAIN_DAYS=10
REJECT_ENGINE_PARALLEL=2
REJECT_ENGINE_MAX_ROWS_PER_CHUNK=50000
REJECT_ENGINE_MAX_TOTAL_ROWS=300000

# Large result spill controls
REJECT_ENGINE_SPILL_ENABLED=true
REJECT_ENGINE_MAX_RESULT_MB=64
QUERY_SPOOL_DIR=tmp/query_spool
REJECT_ENGINE_SPOOL_TTL_SECONDS=21600
REJECT_ENGINE_SPOOL_MAX_BYTES=2147483648
REJECT_ENGINE_SPOOL_WARN_RATIO=0.85
REJECT_ENGINE_SPOOL_CLEANUP_INTERVAL_SECONDS=300
REJECT_ENGINE_SPOOL_ORPHAN_GRACE_SECONDS=600

# Batch query engine thresholds
BATCH_QUERY_TIME_THRESHOLD_DAYS=10
BATCH_QUERY_ID_THRESHOLD=1000
BATCH_CHUNK_MAX_MEMORY_MB=256

# ============================================================
# Runtime Resilience Diagnostics Thresholds
# ============================================================
# Alert window for sustained degraded state (seconds)
RESILIENCE_DEGRADED_ALERT_SECONDS=300

# Pool saturation warning / critical levels
RESILIENCE_POOL_SATURATION_WARNING=0.90
RESILIENCE_POOL_SATURATION_CRITICAL=1.0

# Restart churn threshold: N restarts within window triggers churn warning
RESILIENCE_RESTART_CHURN_WINDOW_SECONDS=600
RESILIENCE_RESTART_CHURN_THRESHOLD=3

# ============================================================
# CORS Configuration
# ============================================================
# Comma-separated list of allowed origins for CORS
# Example: https://example.com,https://app.example.com
# Set to * for development (not recommended for production)
CORS_ALLOWED_ORIGINS=