diff --git a/.env.example b/.env.example index 0541199..60c0bd3 100644 --- a/.env.example +++ b/.env.example @@ -55,6 +55,7 @@ ADMIN_EMAILS=admin@example.com # Local Authentication (for development/testing) # When enabled, uses local credentials instead of LDAP # Set LOCAL_AUTH_ENABLED=true to bypass LDAP authentication +# Production safety: when FLASK_ENV=production, local auth is forcibly disabled LOCAL_AUTH_ENABLED=false LOCAL_AUTH_USERNAME= LOCAL_AUTH_PASSWORD= @@ -86,6 +87,11 @@ REDIS_KEY_PREFIX=mes_wip # Cache check interval in seconds (default: 600 = 10 minutes) CACHE_CHECK_INTERVAL=600 +# Optional explicit TTL for WIP Redis keys (seconds) +# If unset/0, TTL defaults to 3 * CACHE_CHECK_INTERVAL +# Example: CACHE_CHECK_INTERVAL=600 -> default TTL=1800 +WIP_CACHE_TTL_SECONDS=1800 + # ============================================================ # Resource Cache Configuration # ============================================================ diff --git a/README.md b/README.md index 3c67de7..693baa7 100644 --- a/README.md +++ b/README.md @@ -146,8 +146,8 @@ nano .env ### 日常操作 ```bash -# 啟動服務(背景執行) -./scripts/start_server.sh start +# 啟動服務(背景執行,含 Gunicorn + worker_watchdog) +./scripts/start_server.sh start # 停止服務 ./scripts/start_server.sh stop @@ -158,8 +158,9 @@ nano .env # 查看狀態 ./scripts/start_server.sh status -# 查看日誌 -./scripts/start_server.sh logs follow +# 查看日誌(含 watchdog) +./scripts/start_server.sh logs follow +./scripts/start_server.sh logs watchdog ``` 訪問網址: **http://localhost:8080** (可在 .env 中配置) @@ -330,17 +331,16 @@ RESOURCE_STATUS_RATE_LIMIT_WINDOW_SECONDS=60 ### Conda + systemd 服務配置 -建議在生產環境使用同一份 conda runtime contract 啟動 App 與 Watchdog: +建議在生產環境使用同一份 `.env`(`/opt/mes-dashboard/.env`)啟動 App 與 Watchdog,與開發環境一致: ```bash # 1. 複製 systemd 服務檔案 sudo cp deploy/mes-dashboard.service /etc/systemd/system/ sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/ -# 2. 準備環境設定檔 -sudo mkdir -p /etc/mes-dashboard -sudo cp deploy/mes-dashboard.env.example /etc/mes-dashboard/mes-dashboard.env -sudo cp .env /etc/mes-dashboard/mes-dashboard.env +# 2. 確認 /opt/mes-dashboard/.env 權限(供 www-data 讀取) +sudo chown root:www-data /opt/mes-dashboard/.env +sudo chmod 640 /opt/mes-dashboard/.env # 3. 重新載入 systemd sudo systemctl daemon-reload @@ -615,8 +615,7 @@ DashBoard_vite/ ├── deploy/ # 部署設定 │ ├── mes-dashboard.service # Gunicorn systemd 服務 (Conda) │ ├── mes-dashboard-watchdog.service # Watchdog systemd 服務 (Conda) -│ └── mes-dashboard.env.example # Runtime contract 環境範本 -├── tests/ # 測試 +├── tests/ # 測試 ├── data/ # 資料檔案 ├── logs/ # 日誌 ├── docs/ # 文檔 diff --git a/README.mdj b/README.mdj index c11cf35..0332781 100644 --- a/README.mdj +++ b/README.mdj @@ -9,6 +9,8 @@ - 快取:Redis + process-level cache + indexed selection telemetry - 資料:Oracle(QueuePool) - 運維:watchdog + admin worker restart API + guarded-mode policy +- 環境設定:開發與正式環境統一使用專案根目錄同一份 `.env` +- 啟動腳本:`./scripts/start_server.sh start` 會同時啟動 Gunicorn 與 `worker_watchdog.py` ## 2. 既有設計原則(保留) diff --git a/deploy/mes-dashboard-watchdog.service b/deploy/mes-dashboard-watchdog.service index fe31b58..502e915 100644 --- a/deploy/mes-dashboard-watchdog.service +++ b/deploy/mes-dashboard-watchdog.service @@ -9,27 +9,13 @@ Type=simple User=www-data Group=www-data WorkingDirectory=/opt/mes-dashboard -EnvironmentFile=-/etc/mes-dashboard/mes-dashboard.env +EnvironmentFile=-/opt/mes-dashboard/.env Environment="PYTHONPATH=/opt/mes-dashboard/src" -Environment="CONDA_BIN=/opt/miniconda3/bin/conda" -Environment="CONDA_ENV_NAME=mes-dashboard" -Environment="WATCHDOG_RUNTIME_DIR=/run/mes-dashboard" -Environment="WATCHDOG_RESTART_FLAG=/run/mes-dashboard/mes_dashboard_restart.flag" -Environment="WATCHDOG_PID_FILE=/run/mes-dashboard/gunicorn.pid" -Environment="WATCHDOG_STATE_FILE=/var/lib/mes-dashboard/restart_state.json" -Environment="WATCHDOG_CHECK_INTERVAL=5" -Environment="RUNTIME_CONTRACT_VERSION=2026.02-p2" -Environment="RUNTIME_CONTRACT_ENFORCE=true" -Environment="WORKER_RESTART_COOLDOWN=60" -Environment="WORKER_RESTART_RETRY_BUDGET=3" -Environment="WORKER_RESTART_WINDOW_SECONDS=600" -Environment="WORKER_RESTART_CHURN_THRESHOLD=3" -Environment="WORKER_GUARDED_MODE_ENABLED=true" RuntimeDirectory=mes-dashboard StateDirectory=mes-dashboard -ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN}" run --no-capture-output -n "${CONDA_ENV_NAME}" python scripts/worker_watchdog.py' +ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN:-/opt/miniconda3/bin/conda}" run --no-capture-output -n "${CONDA_ENV_NAME:-mes-dashboard}" python scripts/worker_watchdog.py' Restart=always RestartSec=5 diff --git a/deploy/mes-dashboard.env.example b/deploy/mes-dashboard.env.example deleted file mode 100644 index 82f6f81..0000000 --- a/deploy/mes-dashboard.env.example +++ /dev/null @@ -1,26 +0,0 @@ -# MES Dashboard runtime contract (version 2026.02-p2) - -# Conda runtime -CONDA_BIN=/opt/miniconda3/bin/conda -CONDA_ENV_NAME=mes-dashboard - -# Single-port serving contract -GUNICORN_BIND=0.0.0.0:8080 - -# Watchdog/runtime paths -WATCHDOG_RUNTIME_DIR=/run/mes-dashboard -WATCHDOG_RESTART_FLAG=/run/mes-dashboard/mes_dashboard_restart.flag -WATCHDOG_PID_FILE=/run/mes-dashboard/gunicorn.pid -WATCHDOG_STATE_FILE=/var/lib/mes-dashboard/restart_state.json -WATCHDOG_CHECK_INTERVAL=5 - -# Runtime contract enforcement -RUNTIME_CONTRACT_VERSION=2026.02-p2 -RUNTIME_CONTRACT_ENFORCE=true - -# Worker recovery policy -WORKER_RESTART_COOLDOWN=60 -WORKER_RESTART_RETRY_BUDGET=3 -WORKER_RESTART_WINDOW_SECONDS=600 -WORKER_RESTART_CHURN_THRESHOLD=3 -WORKER_GUARDED_MODE_ENABLED=true diff --git a/deploy/mes-dashboard.service b/deploy/mes-dashboard.service index 44f6c83..729d0a8 100644 --- a/deploy/mes-dashboard.service +++ b/deploy/mes-dashboard.service @@ -9,28 +9,14 @@ Type=simple User=www-data Group=www-data WorkingDirectory=/opt/mes-dashboard -EnvironmentFile=-/etc/mes-dashboard/mes-dashboard.env +EnvironmentFile=-/opt/mes-dashboard/.env Environment="PYTHONPATH=/opt/mes-dashboard/src" -Environment="CONDA_BIN=/opt/miniconda3/bin/conda" -Environment="CONDA_ENV_NAME=mes-dashboard" -Environment="GUNICORN_BIND=0.0.0.0:8080" -Environment="WATCHDOG_RUNTIME_DIR=/run/mes-dashboard" -Environment="WATCHDOG_RESTART_FLAG=/run/mes-dashboard/mes_dashboard_restart.flag" -Environment="WATCHDOG_PID_FILE=/run/mes-dashboard/gunicorn.pid" -Environment="WATCHDOG_STATE_FILE=/var/lib/mes-dashboard/restart_state.json" -Environment="RUNTIME_CONTRACT_VERSION=2026.02-p2" -Environment="RUNTIME_CONTRACT_ENFORCE=true" -Environment="WORKER_RESTART_COOLDOWN=60" -Environment="WORKER_RESTART_RETRY_BUDGET=3" -Environment="WORKER_RESTART_WINDOW_SECONDS=600" -Environment="WORKER_RESTART_CHURN_THRESHOLD=3" -Environment="WORKER_GUARDED_MODE_ENABLED=true" RuntimeDirectory=mes-dashboard StateDirectory=mes-dashboard PIDFile=/run/mes-dashboard/gunicorn.pid -ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN}" run --no-capture-output -n "${CONDA_ENV_NAME}" gunicorn --config gunicorn.conf.py --pid "${WATCHDOG_PID_FILE}" --capture-output "mes_dashboard:create_app()"' +ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN:-/opt/miniconda3/bin/conda}" run --no-capture-output -n "${CONDA_ENV_NAME:-mes-dashboard}" gunicorn --config gunicorn.conf.py --pid "${WATCHDOG_PID_FILE:-/run/mes-dashboard/gunicorn.pid}" --capture-output "mes_dashboard:create_app()"' KillSignal=SIGTERM TimeoutStopSec=30 diff --git a/docs/migration_gates_and_runbook.md b/docs/migration_gates_and_runbook.md index a63bfdc..d5dba78 100644 --- a/docs/migration_gates_and_runbook.md +++ b/docs/migration_gates_and_runbook.md @@ -62,8 +62,8 @@ A release is cutover-ready only when all gates pass: 5. Conda + systemd rehearsal (recommended before production cutover) - `sudo cp deploy/mes-dashboard.service /etc/systemd/system/` - `sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/` -- `sudo mkdir -p /etc/mes-dashboard && sudo cp deploy/mes-dashboard.env.example /etc/mes-dashboard/mes-dashboard.env` -- merge deployment secrets from `.env` into `/etc/mes-dashboard/mes-dashboard.env` +- ensure deployment uses the same single env file: `/opt/mes-dashboard/.env` +- `sudo chown root:www-data /opt/mes-dashboard/.env && sudo chmod 640 /opt/mes-dashboard/.env` - `sudo systemctl daemon-reload` - `sudo systemctl enable --now mes-dashboard mes-dashboard-watchdog` - call `/admin/api/worker/status` and verify runtime contract paths exist diff --git a/scripts/deploy.sh b/scripts/deploy.sh index fb27b0c..e3179af 100644 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -237,8 +237,9 @@ show_next_steps() { echo " http://localhost:${port:-8080}" echo "" echo "Optional: install conda+systemd services" - echo " sudo mkdir -p /etc/mes-dashboard" - echo " sudo cp .env /etc/mes-dashboard/mes-dashboard.env" + echo " # systemd and local scripts both use the same /opt/mes-dashboard/.env" + echo " sudo chown root:www-data .env" + echo " sudo chmod 640 .env" echo " sudo cp deploy/mes-dashboard.service /etc/systemd/system/" echo " sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/" echo " sudo systemctl daemon-reload" diff --git a/scripts/start_server.sh b/scripts/start_server.sh index 7000561..6f28af2 100755 --- a/scripts/start_server.sh +++ b/scripts/start_server.sh @@ -16,12 +16,15 @@ PID_FILE="${WATCHDOG_PID_FILE:-${PID_FILE_DEFAULT}}" LOG_DIR="${ROOT}/logs" ACCESS_LOG="${LOG_DIR}/access.log" ERROR_LOG="${LOG_DIR}/error.log" +WATCHDOG_LOG="${LOG_DIR}/watchdog.log" STARTUP_LOG="${LOG_DIR}/startup.log" DEFAULT_PORT="${GUNICORN_BIND:-0.0.0.0:8080}" PORT=$(echo "$DEFAULT_PORT" | cut -d: -f2) # Redis configuration REDIS_ENABLED="${REDIS_ENABLED:-true}" +# Worker watchdog configuration +WATCHDOG_ENABLED="${WATCHDOG_ENABLED:-true}" # Colors for output RED='\033[0;31m' @@ -53,13 +56,25 @@ timestamp() { date '+%Y-%m-%d %H:%M:%S' } +is_enabled() { + case "${1:-}" in + 1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]|[Oo][Nn]) + return 0 + ;; + *) + return 1 + ;; + esac +} + resolve_runtime_paths() { WATCHDOG_RUNTIME_DIR="${WATCHDOG_RUNTIME_DIR:-${ROOT}/tmp}" WATCHDOG_RESTART_FLAG="${WATCHDOG_RESTART_FLAG:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart.flag}" WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/gunicorn.pid}" WATCHDOG_STATE_FILE="${WATCHDOG_STATE_FILE:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart_state.json}" + WATCHDOG_PROCESS_PID_FILE="${WATCHDOG_PROCESS_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/worker_watchdog.pid}" PID_FILE="${WATCHDOG_PID_FILE}" - export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE + export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE WATCHDOG_PROCESS_PID_FILE } # Load .env file if exists @@ -396,14 +411,20 @@ rotate_logs() { log_info "Archived error.log -> archive/error_${ts}.log" fi + if [ -f "$WATCHDOG_LOG" ] && [ -s "$WATCHDOG_LOG" ]; then + mv "$WATCHDOG_LOG" "${LOG_DIR}/archive/watchdog_${ts}.log" + log_info "Archived watchdog.log -> archive/watchdog_${ts}.log" + fi + # Clean up old archives (keep last 10) cd "${LOG_DIR}/archive" 2>/dev/null && \ ls -t access_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f && \ - ls -t error_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f + ls -t error_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f && \ + ls -t watchdog_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f cd "$ROOT" # Create fresh log files - touch "$ACCESS_LOG" "$ERROR_LOG" + touch "$ACCESS_LOG" "$ERROR_LOG" "$WATCHDOG_LOG" } get_pid() { @@ -429,6 +450,84 @@ is_running() { get_pid &>/dev/null } +get_watchdog_pid() { + if [ -f "$WATCHDOG_PROCESS_PID_FILE" ]; then + local pid + pid=$(cat "$WATCHDOG_PROCESS_PID_FILE" 2>/dev/null || true) + if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then + echo "$pid" + return 0 + fi + rm -f "$WATCHDOG_PROCESS_PID_FILE" + fi + return 1 +} + +is_watchdog_running() { + get_watchdog_pid &>/dev/null +} + +start_watchdog() { + if ! is_enabled "${WATCHDOG_ENABLED:-true}"; then + log_info "Worker watchdog is disabled (WATCHDOG_ENABLED=${WATCHDOG_ENABLED})" + return 0 + fi + + if is_watchdog_running; then + local pid + pid=$(get_watchdog_pid) + log_success "Worker watchdog already running (PID: ${pid})" + return 0 + fi + + log_info "Starting worker watchdog..." + nohup python scripts/worker_watchdog.py >> "$WATCHDOG_LOG" 2>&1 & + local pid=$! + echo "$pid" > "$WATCHDOG_PROCESS_PID_FILE" + + sleep 1 + if kill -0 "$pid" 2>/dev/null; then + log_success "Worker watchdog started (PID: ${pid})" + return 0 + fi + + rm -f "$WATCHDOG_PROCESS_PID_FILE" + log_error "Failed to start worker watchdog" + return 1 +} + +stop_watchdog() { + if ! is_watchdog_running; then + rm -f "$WATCHDOG_PROCESS_PID_FILE" + return 0 + fi + + local pid + pid=$(get_watchdog_pid) + log_info "Stopping worker watchdog (PID: ${pid})..." + kill -TERM "$pid" 2>/dev/null || true + + local count=0 + while kill -0 "$pid" 2>/dev/null && [ $count -lt 5 ]; do + sleep 1 + count=$((count + 1)) + done + + if kill -0 "$pid" 2>/dev/null; then + kill -9 "$pid" 2>/dev/null || true + sleep 1 + fi + + rm -f "$WATCHDOG_PROCESS_PID_FILE" + if kill -0 "$pid" 2>/dev/null; then + log_error "Failed to stop worker watchdog" + return 1 + fi + + log_success "Worker watchdog stopped" + return 0 +} + do_start() { local foreground=false @@ -442,7 +541,14 @@ do_start() { if is_running; then local pid=$(get_pid) log_warn "Server is already running (PID: ${pid})" - return 1 + if is_enabled "${WATCHDOG_ENABLED:-true}" && ! is_watchdog_running; then + check_conda || return 1 + conda activate "$CONDA_ENV" + export PYTHONPATH="${ROOT}/src:${PYTHONPATH:-}" + cd "$ROOT" + start_watchdog || return 1 + fi + return 0 fi # Run checks @@ -470,6 +576,9 @@ do_start() { echo "[$(timestamp)] Starting server" >> "$STARTUP_LOG" if [ "$foreground" = true ]; then + if is_enabled "${WATCHDOG_ENABLED:-true}"; then + log_info "Foreground mode does not auto-start watchdog (use background start for watchdog)." + fi log_info "Running in foreground mode (Ctrl+C to stop)" exec gunicorn \ --config gunicorn.conf.py \ @@ -495,6 +604,7 @@ do_start() { log_success "Server started successfully (PID: ${pid})" log_info "Access URL: http://localhost:${PORT}" log_info "Logs: ${LOG_DIR}/" + start_watchdog || return 1 echo "[$(timestamp)] Server started (PID: ${pid})" >> "$STARTUP_LOG" else log_error "Failed to start server" @@ -509,48 +619,54 @@ do_stop() { load_env resolve_runtime_paths - if ! is_running; then - log_warn "Server is not running" - return 0 - fi - - local pid=$(get_pid) - log_info "Stopping server (PID: ${pid})..." - - # Find all gunicorn processes (master + workers) - local all_pids=$(pgrep -f "gunicorn.*mes_dashboard" 2>/dev/null | tr '\n' ' ') - - # Graceful shutdown with SIGTERM - kill -TERM "$pid" 2>/dev/null - - # Wait for graceful shutdown (max 10 seconds) - local count=0 - while kill -0 "$pid" 2>/dev/null && [ $count -lt 10 ]; do - sleep 1 - count=$((count + 1)) - echo -n "." - done - echo "" - - # Force kill if still running (including orphaned workers) - if kill -0 "$pid" 2>/dev/null || [ -n "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then - log_warn "Graceful shutdown timeout, forcing..." - # Kill all gunicorn processes related to mes_dashboard - pkill -9 -f "gunicorn.*mes_dashboard" 2>/dev/null - sleep 1 - fi - - # Cleanup PID file - rm -f "$PID_FILE" - - # Verify all processes are stopped - if [ -z "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then - log_success "Server stopped" - echo "[$(timestamp)] Server stopped (PID: ${pid})" >> "$STARTUP_LOG" + local server_running=false + local pid="" + if is_running; then + server_running=true + pid=$(get_pid) + log_info "Stopping server (PID: ${pid})..." else - log_error "Failed to stop server" - return 1 + log_warn "Server is not running" fi + + if [ "$server_running" = true ]; then + # Find all gunicorn processes (master + workers) + local all_pids=$(pgrep -f "gunicorn.*mes_dashboard" 2>/dev/null | tr '\n' ' ') + + # Graceful shutdown with SIGTERM + kill -TERM "$pid" 2>/dev/null + + # Wait for graceful shutdown (max 10 seconds) + local count=0 + while kill -0 "$pid" 2>/dev/null && [ $count -lt 10 ]; do + sleep 1 + count=$((count + 1)) + echo -n "." + done + echo "" + + # Force kill if still running (including orphaned workers) + if kill -0 "$pid" 2>/dev/null || [ -n "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then + log_warn "Graceful shutdown timeout, forcing..." + # Kill all gunicorn processes related to mes_dashboard + pkill -9 -f "gunicorn.*mes_dashboard" 2>/dev/null + sleep 1 + fi + + # Cleanup PID file + rm -f "$PID_FILE" + + # Verify all processes are stopped + if [ -z "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then + log_success "Server stopped" + echo "[$(timestamp)] Server stopped (PID: ${pid})" >> "$STARTUP_LOG" + else + log_error "Failed to stop server" + return 1 + fi + fi + + stop_watchdog } do_restart() { @@ -585,6 +701,16 @@ do_status() { # Show Redis status redis_status + if is_enabled "${WATCHDOG_ENABLED:-true}"; then + if is_watchdog_running; then + local watchdog_pid=$(get_watchdog_pid) + echo -e " Watchdog:${GREEN} RUNNING${NC} (PID: ${watchdog_pid})" + else + echo -e " Watchdog:${YELLOW} STOPPED${NC}" + fi + else + echo -e " Watchdog:${YELLOW} DISABLED${NC}" + fi if is_running; then echo "" @@ -635,7 +761,15 @@ do_logs() { ;; follow) log_info "Following logs (Ctrl+C to stop)..." - tail -f "$ACCESS_LOG" "$ERROR_LOG" 2>/dev/null + tail -f "$ACCESS_LOG" "$ERROR_LOG" "$WATCHDOG_LOG" 2>/dev/null + ;; + watchdog) + if [ -f "$WATCHDOG_LOG" ]; then + log_info "Watchdog log (last ${lines} lines):" + tail -n "$lines" "$WATCHDOG_LOG" + else + log_warn "Watchdog log not found" + fi ;; *) log_info "=== Error Log (last 20 lines) ===" @@ -643,6 +777,9 @@ do_logs() { echo "" log_info "=== Access Log (last 20 lines) ===" tail -20 "$ACCESS_LOG" 2>/dev/null || echo "(empty)" + echo "" + log_info "=== Watchdog Log (last 20 lines) ===" + tail -20 "$WATCHDOG_LOG" 2>/dev/null || echo "(empty)" ;; esac } @@ -660,7 +797,7 @@ show_help() { echo " stop Stop the server gracefully" echo " restart Restart the server" echo " status Show server and Redis status" - echo " logs [type] View logs (access|error|follow|all)" + echo " logs [type] View logs (access|error|watchdog|follow|all)" echo " check Run environment checks only" echo " help Show this help message" echo "" @@ -676,6 +813,7 @@ show_help() { echo " GUNICORN_THREADS Threads per worker (default: 4)" echo " REDIS_ENABLED Enable Redis cache (default: true)" echo " REDIS_URL Redis connection URL" + echo " WATCHDOG_ENABLED Enable worker watchdog (default: true)" echo "" } diff --git a/scripts/worker_watchdog.py b/scripts/worker_watchdog.py index bac2c8d..ab302a5 100755 --- a/scripts/worker_watchdog.py +++ b/scripts/worker_watchdog.py @@ -40,6 +40,7 @@ from mes_dashboard.core.runtime_contract import ( # noqa: E402 build_runtime_contract_diagnostics, load_runtime_contract, ) +from mes_dashboard.core.watchdog_logging import attach_sqlite_log_handler # noqa: E402 from mes_dashboard.core.worker_recovery_policy import ( # noqa: E402 decide_restart_request, evaluate_worker_recovery_state, @@ -57,6 +58,7 @@ logging.basicConfig( ] ) logger = logging.getLogger('mes_dashboard.watchdog') +attach_sqlite_log_handler(logger) # ============================================================ # Configuration diff --git a/src/mes_dashboard/core/cache_updater.py b/src/mes_dashboard/core/cache_updater.py index ffa6e01..6a771e3 100644 --- a/src/mes_dashboard/core/cache_updater.py +++ b/src/mes_dashboard/core/cache_updater.py @@ -31,6 +31,7 @@ logger = logging.getLogger('mes_dashboard.cache_updater') CACHE_CHECK_INTERVAL = int(os.getenv('CACHE_CHECK_INTERVAL', '600')) # 10 minutes WIP_VIEW = "DWH.DW_MES_LOT_V" +WIP_CACHE_TTL_SECONDS = int(os.getenv('WIP_CACHE_TTL_SECONDS', '0')) # Resource cache sync interval (default: 4 hours) RESOURCE_SYNC_INTERVAL = int(os.getenv('RESOURCE_SYNC_INTERVAL', '14400')) @@ -236,6 +237,7 @@ class CacheUpdater: staging_key: str | None = None try: + ttl_seconds = self._resolve_cache_ttl_seconds() # Convert DataFrame to JSON # Handle datetime columns df_copy = df.copy() @@ -250,10 +252,10 @@ class CacheUpdater: staging_key = get_key(f"data:staging:{unique_suffix}") pipe = client.pipeline() - pipe.set(staging_key, data_json) + pipe.set(staging_key, data_json, ex=ttl_seconds) pipe.rename(staging_key, get_key("data")) - pipe.set(get_key("meta:sys_date"), sys_date) - pipe.set(get_key("meta:updated_at"), now) + pipe.set(get_key("meta:sys_date"), sys_date, ex=ttl_seconds) + pipe.set(get_key("meta:updated_at"), now, ex=ttl_seconds) pipe.execute() return True @@ -266,6 +268,16 @@ class CacheUpdater: pass return False + def _resolve_cache_ttl_seconds(self) -> int: + """Resolve Redis TTL for WIP snapshot keys. + + Default strategy: 3x sync interval to tolerate temporary sync gaps while + preventing stale data from lingering forever when updater stops. + """ + if WIP_CACHE_TTL_SECONDS > 0: + return WIP_CACHE_TTL_SECONDS + return max(int(self.interval) * 3, 60) + def _check_resource_update(self, force: bool = False) -> bool: """Check and update resource cache if needed. diff --git a/src/mes_dashboard/core/database.py b/src/mes_dashboard/core/database.py index a5db670..b119c02 100644 --- a/src/mes_dashboard/core/database.py +++ b/src/mes_dashboard/core/database.py @@ -670,7 +670,7 @@ def get_table_data( logger.error(f"get_table_data failed - ORA-{ora_code}: {exc}") if connection: connection.close() - return {'error': f'查詢失敗: {str(exc)}'} + return {'error': '查詢服務暫時無法使用'} def get_table_column_metadata(table_name: str) -> Dict[str, Any]: diff --git a/src/mes_dashboard/core/watchdog_logging.py b/src/mes_dashboard/core/watchdog_logging.py new file mode 100644 index 0000000..126420f --- /dev/null +++ b/src/mes_dashboard/core/watchdog_logging.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +"""Logging helpers shared by watchdog runtime.""" + +from __future__ import annotations + +import logging + +_SQLITE_HANDLER_FLAG = "_watchdog_sqlite_handler_registered" + + +def attach_sqlite_log_handler(target_logger: logging.Logger) -> bool: + """Attach SQLite log handler to watchdog logger when enabled. + + Returns: + True if a new handler was attached; otherwise False. + """ + if getattr(target_logger, _SQLITE_HANDLER_FLAG, False): + return False + + try: + from mes_dashboard.core.log_store import LOG_STORE_ENABLED, get_sqlite_log_handler + except Exception as exc: + target_logger.warning("Failed to import SQLite log store: %s", exc) + return False + + if not LOG_STORE_ENABLED: + return False + + try: + sqlite_handler = get_sqlite_log_handler() + sqlite_handler.setLevel(logging.INFO) + target_logger.addHandler(sqlite_handler) + setattr(target_logger, _SQLITE_HANDLER_FLAG, True) + return True + except Exception as exc: + target_logger.warning("Failed to initialize SQLite log handler: %s", exc) + return False diff --git a/src/mes_dashboard/routes/auth_routes.py b/src/mes_dashboard/routes/auth_routes.py index 21a4060..5fe883e 100644 --- a/src/mes_dashboard/routes/auth_routes.py +++ b/src/mes_dashboard/routes/auth_routes.py @@ -3,11 +3,12 @@ from __future__ import annotations -import logging -import time -from collections import defaultdict -from datetime import datetime -from threading import Lock +import logging +import time +from collections import defaultdict +from datetime import datetime +from threading import Lock +from urllib.parse import urlparse from flask import Blueprint, flash, redirect, render_template, request, session, url_for @@ -26,8 +27,25 @@ auth_bp = Blueprint("auth", __name__, url_prefix="/admin") _rate_limit_lock = Lock() _login_attempts: dict = defaultdict(list) # IP -> list of timestamps -RATE_LIMIT_MAX_ATTEMPTS = 5 -RATE_LIMIT_WINDOW_SECONDS = 300 # 5 minutes +RATE_LIMIT_MAX_ATTEMPTS = 5 +RATE_LIMIT_WINDOW_SECONDS = 300 # 5 minutes + + +def _sanitize_next_url(next_url: str | None) -> str: + """Return a safe post-login redirect URL limited to local paths.""" + fallback = url_for("portal_index") + if not next_url: + return fallback + + parsed = urlparse(next_url) + if parsed.scheme or parsed.netloc: + logger.warning("Blocked external next redirect: %s", next_url) + return fallback + + if not next_url.startswith("/") or next_url.startswith("//"): + return fallback + + return next_url def _is_rate_limited(ip: str) -> bool: @@ -103,7 +121,7 @@ def login(): "login_time": datetime.now().isoformat(), } rotate_csrf_token() - next_url = request.args.get("next", url_for("portal_index")) + next_url = _sanitize_next_url(request.args.get("next")) return redirect(next_url) return render_template("login.html", error=error) diff --git a/src/mes_dashboard/routes/job_query_routes.py b/src/mes_dashboard/routes/job_query_routes.py index 981cdb9..313cf2d 100644 --- a/src/mes_dashboard/routes/job_query_routes.py +++ b/src/mes_dashboard/routes/job_query_routes.py @@ -1,13 +1,15 @@ # -*- coding: utf-8 -*- -"""Job Query API routes. +"""Job Query API routes. Contains Flask Blueprint for maintenance job query endpoints: - Job list query by resources - Job transaction history detail - CSV export with full history -""" - -from flask import Blueprint, jsonify, request, Response, render_template +""" + +import logging + +from flask import Blueprint, jsonify, request, Response, render_template from mes_dashboard.services.job_query_service import ( get_jobs_by_resources, @@ -16,8 +18,9 @@ from mes_dashboard.services.job_query_service import ( validate_date_range, ) -# Create Blueprint -job_query_bp = Blueprint('job_query', __name__) +# Create Blueprint +job_query_bp = Blueprint('job_query', __name__) +logger = logging.getLogger('mes_dashboard.job_query_routes') # ============================================================ @@ -65,8 +68,9 @@ def get_resources(): 'total': len(data) }) - except Exception as exc: - return jsonify({'error': f'載入設備資料失敗: {str(exc)}'}), 500 + except Exception as exc: + logger.exception("Failed to load job-query resources: %s", exc) + return jsonify({'error': '服務暫時無法使用'}), 500 @job_query_bp.route('/api/job-query/jobs', methods=['POST']) diff --git a/src/mes_dashboard/routes/resource_routes.py b/src/mes_dashboard/routes/resource_routes.py index 5cf66f6..529f25e 100644 --- a/src/mes_dashboard/routes/resource_routes.py +++ b/src/mes_dashboard/routes/resource_routes.py @@ -5,6 +5,7 @@ Contains Flask Blueprint for resource/equipment-related API endpoints. """ import math +import logging from flask import Blueprint, jsonify, request from mes_dashboard.core.database import ( @@ -14,6 +15,7 @@ from mes_dashboard.core.database import ( ) from mes_dashboard.core.cache import cache_get, cache_set, make_cache_key from mes_dashboard.core.rate_limit import configured_rate_limit +from mes_dashboard.core.response import INTERNAL_ERROR, error_response from mes_dashboard.core.utils import get_days_back, parse_bool_query @@ -112,6 +114,7 @@ from mes_dashboard.config.constants import STATUS_CATEGORIES # Create Blueprint resource_bp = Blueprint('resource', __name__, url_prefix='/api/resource') +logger = logging.getLogger('mes_dashboard.resource_routes') _RESOURCE_DETAIL_RATE_LIMIT = configured_rate_limit( bucket="resource-detail", @@ -253,7 +256,12 @@ def api_resource_status_values(): except Exception as exc: if connection: connection.close() - return jsonify({'success': False, 'error': str(exc)}), 500 + logger.exception("Failed to load resource status values: %s", exc) + return error_response( + INTERNAL_ERROR, + "服務暫時無法使用", + status_code=500, + ) # ============================================================ @@ -301,7 +309,12 @@ def api_resource_status(): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): raise except Exception as exc: - return jsonify({'success': False, 'error': str(exc)}), 500 + logger.exception("Failed to load realtime resource status: %s", exc) + return error_response( + INTERNAL_ERROR, + "服務暫時無法使用", + status_code=500, + ) @resource_bp.route('/status/options') @@ -324,7 +337,12 @@ def api_resource_status_options(): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): raise except Exception as exc: - return jsonify({'success': False, 'error': str(exc)}), 500 + logger.exception("Failed to load realtime resource options: %s", exc) + return error_response( + INTERNAL_ERROR, + "服務暫時無法使用", + status_code=500, + ) @resource_bp.route('/status/summary') @@ -355,7 +373,12 @@ def api_resource_status_summary(): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): raise except Exception as exc: - return jsonify({'success': False, 'error': str(exc)}), 500 + logger.exception("Failed to load realtime resource summary: %s", exc) + return error_response( + INTERNAL_ERROR, + "服務暫時無法使用", + status_code=500, + ) @resource_bp.route('/status/matrix') @@ -384,4 +407,9 @@ def api_resource_status_matrix(): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): raise except Exception as exc: - return jsonify({'success': False, 'error': str(exc)}), 500 + logger.exception("Failed to load realtime resource matrix: %s", exc) + return error_response( + INTERNAL_ERROR, + "服務暫時無法使用", + status_code=500, + ) diff --git a/src/mes_dashboard/services/auth_service.py b/src/mes_dashboard/services/auth_service.py index bf4c866..79dfe03 100644 --- a/src/mes_dashboard/services/auth_service.py +++ b/src/mes_dashboard/services/auth_service.py @@ -18,7 +18,26 @@ LDAP_TIMEOUT = 10 ADMIN_EMAILS = os.environ.get("ADMIN_EMAILS", "").lower().split(",") # Local authentication configuration (for development/testing) -LOCAL_AUTH_ENABLED = os.environ.get("LOCAL_AUTH_ENABLED", "false").lower() in ("true", "1", "yes") +def _resolve_local_auth_enabled( + raw_value: str | None = None, + flask_env: str | None = None, +) -> bool: + """Resolve local auth toggle with production safety guard.""" + requested = (raw_value if raw_value is not None else os.environ.get("LOCAL_AUTH_ENABLED", "false")) + local_auth_requested = str(requested).strip().lower() in ("true", "1", "yes", "on") + + effective_env = (flask_env if flask_env is not None else os.environ.get("FLASK_ENV", "development")) + normalized_env = str(effective_env).strip().lower() + is_production = normalized_env in {"production", "prod"} + + if local_auth_requested and is_production: + logger.error("LOCAL_AUTH_ENABLED is blocked in production environment") + return False + + return local_auth_requested + + +LOCAL_AUTH_ENABLED = _resolve_local_auth_enabled() LOCAL_AUTH_USERNAME = os.environ.get("LOCAL_AUTH_USERNAME", "") LOCAL_AUTH_PASSWORD = os.environ.get("LOCAL_AUTH_PASSWORD", "") diff --git a/src/mes_dashboard/services/excel_query_service.py b/src/mes_dashboard/services/excel_query_service.py index 24fcd22..e56d601 100644 --- a/src/mes_dashboard/services/excel_query_service.py +++ b/src/mes_dashboard/services/excel_query_service.py @@ -5,14 +5,16 @@ Provides Excel parsing, batch query execution, and CSV export functions. Supports large datasets (7000+ rows) by splitting queries into batches. """ -import re -from datetime import datetime -from typing import Any, Dict, List, Tuple +import re +import logging +from datetime import datetime +from typing import Any, Dict, List, Tuple import pandas as pd -from mes_dashboard.core.database import get_db_connection - +from mes_dashboard.core.database import get_db_connection + +logger = logging.getLogger('mes_dashboard.excel_query_service') # Oracle IN clause limit BATCH_SIZE = 1000 @@ -21,7 +23,10 @@ BATCH_SIZE = 1000 LIKE_KEYWORD_LIMIT = 100 # Large table threshold for performance warning (10 million rows) -LARGE_TABLE_THRESHOLD = 10_000_000 +LARGE_TABLE_THRESHOLD = 10_000_000 +PARSE_ERROR_MESSAGE = "Excel 解析失敗,請確認檔案格式" +COLUMN_READ_ERROR_MESSAGE = "讀取欄位失敗,請稍後再試" +QUERY_ERROR_MESSAGE = "查詢服務暫時無法使用" def parse_excel(file_storage) -> Dict[str, Any]: @@ -45,8 +50,9 @@ def parse_excel(file_storage) -> Dict[str, Any]: 'preview': preview, 'total_rows': len(df) } - except Exception as exc: - return {'error': f'Excel 解析失敗: {str(exc)}'} + except Exception as exc: + logger.exception("Excel parse failed: %s", exc) + return {'error': PARSE_ERROR_MESSAGE} def get_column_unique_values(file_storage, column_name: str) -> Dict[str, Any]: @@ -73,8 +79,9 @@ def get_column_unique_values(file_storage, column_name: str) -> Dict[str, Any]: 'values': values_list, 'count': len(values_list) } - except Exception as exc: - return {'error': f'讀取欄位失敗: {str(exc)}'} + except Exception as exc: + logger.exception("Excel column read failed for %s: %s", column_name, exc) + return {'error': COLUMN_READ_ERROR_MESSAGE} def detect_excel_column_type(values: List[str]) -> Dict[str, Any]: @@ -369,10 +376,11 @@ def execute_batch_query( 'batch_count': total_batches } - except Exception as exc: - if connection: - connection.close() - return {'error': f'查詢失敗: {str(exc)}'} + except Exception as exc: + if connection: + connection.close() + logger.exception("Excel batch query failed: %s", exc) + return {'error': QUERY_ERROR_MESSAGE} def execute_advanced_batch_query( @@ -527,10 +535,11 @@ def execute_advanced_batch_query( 'query_type': query_type } - except Exception as exc: - if connection: - connection.close() - return {'error': f'查詢失敗: {str(exc)}'} + except Exception as exc: + if connection: + connection.close() + logger.exception("Excel advanced batch query failed: %s", exc) + return {'error': QUERY_ERROR_MESSAGE} def generate_csv_content(data: List[Dict], columns: List[str]) -> str: diff --git a/src/mes_dashboard/services/job_query_service.py b/src/mes_dashboard/services/job_query_service.py index 3b7a340..7d330b1 100644 --- a/src/mes_dashboard/services/job_query_service.py +++ b/src/mes_dashboard/services/job_query_service.py @@ -13,23 +13,25 @@ Architecture: - Supports batching for large resource lists (Oracle IN clause limit) """ -import csv -import io -import logging -from datetime import datetime -from typing import Dict, List, Any, Optional, Generator - -import pandas as pd - +import csv +import io +import logging +from datetime import datetime +from typing import Dict, List, Any, Optional, Generator, Tuple + +import pandas as pd + from mes_dashboard.core.database import read_sql_df, get_db_connection -from mes_dashboard.sql import SQLLoader +from mes_dashboard.sql import SQLLoader, QueryBuilder from mes_dashboard.config.field_contracts import get_export_headers, get_export_api_keys -logger = logging.getLogger('mes_dashboard.job_query') +logger = logging.getLogger('mes_dashboard.job_query') # Constants -BATCH_SIZE = 1000 # Oracle IN clause limit -MAX_DATE_RANGE_DAYS = 365 +BATCH_SIZE = 1000 # Oracle IN clause limit +MAX_DATE_RANGE_DAYS = 365 +QUERY_ERROR_MESSAGE = "查詢服務暫時無法使用" +EXPORT_ERROR_MESSAGE = "匯出服務暫時無法使用" # ============================================================ @@ -66,55 +68,72 @@ def validate_date_range(start_date: str, end_date: str) -> Optional[str]: # Resource Filter Helpers # ============================================================ -def _build_resource_filter(resource_ids: List[str], max_chunk_size: int = BATCH_SIZE) -> List[str]: - """Build SQL IN clause lists for resource IDs. - - Oracle has a limit of ~1000 items per IN clause, so we chunk if needed. - - Args: - resource_ids: List of resource IDs. - max_chunk_size: Maximum items per IN clause. - - Returns: - List of SQL IN clause strings (e.g., "'ID1', 'ID2', 'ID3'"). - """ - if not resource_ids: - return [] - - # Escape single quotes - escaped_ids = [rid.replace("'", "''") for rid in resource_ids] - - # Chunk into groups - chunks = [] - for i in range(0, len(escaped_ids), max_chunk_size): - chunk = escaped_ids[i:i + max_chunk_size] - chunks.append("'" + "', '".join(chunk) + "'") - - return chunks - - -def _build_resource_filter_sql(resource_ids: List[str], column: str = 'j.RESOURCEID') -> str: - """Build SQL WHERE clause for resource ID filtering. - - Handles chunking for large resource lists. - - Args: - resource_ids: List of resource IDs. - column: Column name to filter on. - - Returns: - SQL condition string (e.g., "j.RESOURCEID IN ('ID1', 'ID2')"). - """ - chunks = _build_resource_filter(resource_ids) - if not chunks: - return "1=0" # No resources = no results - - if len(chunks) == 1: - return f"{column} IN ({chunks[0]})" - - # Multiple chunks need OR - conditions = [f"{column} IN ({chunk})" for chunk in chunks] - return "(" + " OR ".join(conditions) + ")" +def _build_resource_filter( + resource_ids: List[str], max_chunk_size: int = BATCH_SIZE +) -> List[List[str]]: + """Build chunked resource ID lists for Oracle IN clause limits. + + Args: + resource_ids: List of resource IDs. + max_chunk_size: Maximum items per IN clause. + + Returns: + Chunked resource ID values. + """ + normalized_ids: List[str] = [] + for rid in resource_ids: + if rid is None: + continue + text = str(rid).strip() + if text: + normalized_ids.append(text) + + if not normalized_ids: + return [] + + chunks: List[List[str]] = [] + for i in range(0, len(normalized_ids), max_chunk_size): + chunk = normalized_ids[i:i + max_chunk_size] + chunks.append(chunk) + return chunks + + +def _build_resource_filter_sql( + resource_ids: List[str], + column: str = 'j.RESOURCEID', + max_chunk_size: int = BATCH_SIZE, + return_params: bool = False, +) -> str | Tuple[str, Dict[str, Any]]: + """Build parameterized SQL condition for resource ID filtering. + + Uses bind variables via QueryBuilder and chunks values to satisfy Oracle + IN-clause limits. + + Args: + resource_ids: List of resource IDs. + column: Column name to filter on. + max_chunk_size: Maximum items per IN clause. + return_params: If True, return (condition_sql, params). + + Returns: + Condition SQL string, or tuple of condition SQL and parameters. + """ + chunks = _build_resource_filter(resource_ids, max_chunk_size=max_chunk_size) + if not chunks: + result: Tuple[str, Dict[str, Any]] = ("1=0", {}) + return result if return_params else result[0] + + builder = QueryBuilder() + for chunk in chunks: + builder.add_in_condition(column, chunk) + + if len(builder.conditions) == 1: + condition_sql = builder.conditions[0] + else: + condition_sql = "(" + " OR ".join(builder.conditions) + ")" + + result = (condition_sql, builder.params.copy()) + return result if return_params else result[0] # ============================================================ @@ -147,14 +166,20 @@ def get_jobs_by_resources( try: # Build resource filter - resource_filter = _build_resource_filter_sql(resource_ids) + resource_filter, resource_params = _build_resource_filter_sql( + resource_ids, return_params=True + ) # Load SQL template sql = SQLLoader.load("job_query/job_list") sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter) # Execute query - params = {'start_date': start_date, 'end_date': end_date} + params = { + 'start_date': start_date, + 'end_date': end_date, + **resource_params, + } df = read_sql_df(sql, params) # Convert to records @@ -179,9 +204,9 @@ def get_jobs_by_resources( 'resource_count': len(resource_ids) } - except Exception as exc: - logger.error(f"Job query failed: {exc}") - return {'error': f'查詢失敗: {str(exc)}'} + except Exception as exc: + logger.exception("Job query failed: %s", exc) + return {'error': QUERY_ERROR_MESSAGE} def get_job_txn_history(job_id: str) -> Dict[str, Any]: @@ -227,9 +252,9 @@ def get_job_txn_history(job_id: str) -> Dict[str, Any]: 'job_id': job_id } - except Exception as exc: - logger.error(f"Transaction history query failed for job {job_id}: {exc}") - return {'error': f'查詢失敗: {str(exc)}'} + except Exception as exc: + logger.exception("Transaction history query failed for job %s: %s", job_id, exc) + return {'error': QUERY_ERROR_MESSAGE} # ============================================================ @@ -265,14 +290,20 @@ def export_jobs_with_history( try: # Build resource filter - resource_filter = _build_resource_filter_sql(resource_ids) + resource_filter, resource_params = _build_resource_filter_sql( + resource_ids, return_params=True + ) # Load SQL template sql = SQLLoader.load("job_query/job_txn_export") sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter) # Execute query - params = {'start_date': start_date, 'end_date': end_date} + params = { + 'start_date': start_date, + 'end_date': end_date, + **resource_params, + } df = read_sql_df(sql, params) if df is None or len(df) == 0: @@ -321,9 +352,9 @@ def export_jobs_with_history( logger.info(f"CSV export completed: {len(df)} records") - except Exception as exc: - logger.error(f"CSV export failed: {exc}") - yield f"Error: 匯出失敗 - {str(exc)}\n" + except Exception as exc: + logger.exception("CSV export failed: %s", exc) + yield f"Error: {EXPORT_ERROR_MESSAGE}\n" def get_export_data( @@ -351,14 +382,20 @@ def get_export_data( try: # Build resource filter - resource_filter = _build_resource_filter_sql(resource_ids) + resource_filter, resource_params = _build_resource_filter_sql( + resource_ids, return_params=True + ) # Load SQL template sql = SQLLoader.load("job_query/job_txn_export") sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter) # Execute query - params = {'start_date': start_date, 'end_date': end_date} + params = { + 'start_date': start_date, + 'end_date': end_date, + **resource_params, + } df = read_sql_df(sql, params) # Convert to records @@ -381,6 +418,6 @@ def get_export_data( 'total': len(data) } - except Exception as exc: - logger.error(f"Export data query failed: {exc}") - return {'error': f'查詢失敗: {str(exc)}'} + except Exception as exc: + logger.exception("Export data query failed: %s", exc) + return {'error': QUERY_ERROR_MESSAGE} diff --git a/tests/test_auth_integration.py b/tests/test_auth_integration.py index 37b10d0..8ad570d 100644 --- a/tests/test_auth_integration.py +++ b/tests/test_auth_integration.py @@ -102,9 +102,62 @@ class TestLoginRoute: assert response.status_code == 302 # Check session contains admin - with client.session_transaction() as sess: - assert "admin" in sess - assert sess["admin"]["username"] == "92367" + with client.session_transaction() as sess: + assert "admin" in sess + assert sess["admin"]["username"] == "92367" + + @patch('mes_dashboard.services.auth_service.LOCAL_AUTH_ENABLED', False) + @patch('mes_dashboard.routes.auth_routes.is_admin', return_value=True) + @patch('mes_dashboard.services.auth_service.requests.post') + def test_login_blocks_external_next_redirect(self, mock_post, _mock_is_admin, client): + """Should ignore external next URL and redirect to portal.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "success": True, + "user": { + "username": "92367", + "displayName": "Admin User", + "mail": "ymirliu@panjit.com.tw", + "department": "Test Dept", + }, + } + mock_post.return_value = mock_response + + response = client.post( + "/admin/login?next=https://evil.example/phish", + data={"username": "92367", "password": "password123"}, + follow_redirects=False, + ) + + assert response.status_code == 302 + assert "evil.example" not in response.location + assert response.location.endswith("/") + + @patch('mes_dashboard.services.auth_service.LOCAL_AUTH_ENABLED', False) + @patch('mes_dashboard.routes.auth_routes.is_admin', return_value=True) + @patch('mes_dashboard.services.auth_service.requests.post') + def test_login_allows_internal_next_redirect(self, mock_post, _mock_is_admin, client): + """Should keep validated local path in next URL.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "success": True, + "user": { + "username": "92367", + "displayName": "Admin User", + "mail": "ymirliu@panjit.com.tw", + "department": "Test Dept", + }, + } + mock_post.return_value = mock_response + + response = client.post( + "/admin/login?next=/admin/pages", + data={"username": "92367", "password": "password123"}, + follow_redirects=False, + ) + + assert response.status_code == 302 + assert response.location.endswith("/admin/pages") @patch('mes_dashboard.services.auth_service.LOCAL_AUTH_ENABLED', False) @patch('mes_dashboard.services.auth_service.requests.post') diff --git a/tests/test_auth_service.py b/tests/test_auth_service.py index cdc4618..2d7ef08 100644 --- a/tests/test_auth_service.py +++ b/tests/test_auth_service.py @@ -161,6 +161,24 @@ class TestLocalAuthenticate: assert result is None +class TestLocalAuthSafetyGuard: + """Tests for production guard on local auth toggle.""" + + def test_resolve_local_auth_enabled_blocks_production(self): + result = auth_service._resolve_local_auth_enabled( + raw_value="true", + flask_env="production", + ) + assert result is False + + def test_resolve_local_auth_enabled_allows_development(self): + result = auth_service._resolve_local_auth_enabled( + raw_value="true", + flask_env="development", + ) + assert result is True + + class TestIsAdmin: """Tests for is_admin function.""" diff --git a/tests/test_cache_updater.py b/tests/test_cache_updater.py index 6e8a04c..01deaf8 100644 --- a/tests/test_cache_updater.py +++ b/tests/test_cache_updater.py @@ -149,9 +149,9 @@ class TestLoadFullTable: assert result is None -class TestUpdateRedisCache: - """Test Redis cache update logic.""" - +class TestUpdateRedisCache: + """Test Redis cache update logic.""" + def test_update_redis_cache_success(self): """Test _update_redis_cache updates cache correctly.""" import mes_dashboard.core.cache_updater as cu @@ -173,7 +173,10 @@ class TestUpdateRedisCache: assert result is True mock_pipeline.rename.assert_called_once() mock_pipeline.execute.assert_called_once() - + assert mock_pipeline.set.call_count == 3 + for call in mock_pipeline.set.call_args_list: + assert call.kwargs.get("ex") == updater.interval * 3 + def test_update_redis_cache_no_client(self): """Test _update_redis_cache handles no client.""" import mes_dashboard.core.cache_updater as cu @@ -205,6 +208,26 @@ class TestUpdateRedisCache: mock_client.delete.assert_called_once() staged_key = mock_client.delete.call_args.args[0] assert "staging" in staged_key + + def test_update_redis_cache_ttl_override(self): + """Configured TTL override should apply to all Redis keys.""" + import mes_dashboard.core.cache_updater as cu + + mock_client = MagicMock() + mock_pipeline = MagicMock() + mock_client.pipeline.return_value = mock_pipeline + test_df = pd.DataFrame({'LOTID': ['LOT001'], 'QTY': [100]}) + + with patch.object(cu, 'WIP_CACHE_TTL_SECONDS', 42): + with patch.object(cu, 'get_redis_client', return_value=mock_client): + with patch.object(cu, 'get_key', side_effect=lambda k: f'mes_wip:{k}'): + updater = cu.CacheUpdater(interval=600) + result = updater._update_redis_cache(test_df, '2024-01-15 10:30:00') + + assert result is True + assert mock_pipeline.set.call_count == 3 + for call in mock_pipeline.set.call_args_list: + assert call.kwargs.get("ex") == 42 class TestCacheUpdateFlow: diff --git a/tests/test_excel_query_service.py b/tests/test_excel_query_service.py index 0afe7de..2b94b41 100644 --- a/tests/test_excel_query_service.py +++ b/tests/test_excel_query_service.py @@ -4,17 +4,25 @@ Tests the core service functions without database dependencies. """ -import pytest -from mes_dashboard.services.excel_query_service import ( - detect_excel_column_type, - escape_like_pattern, - build_like_condition, - build_date_range_condition, - validate_like_keywords, - sanitize_column_name, - validate_table_name, - LIKE_KEYWORD_LIMIT, -) +import pytest +from unittest.mock import MagicMock, patch +from mes_dashboard.services.excel_query_service import ( + parse_excel, + get_column_unique_values, + execute_batch_query, + execute_advanced_batch_query, + detect_excel_column_type, + escape_like_pattern, + build_like_condition, + build_date_range_condition, + validate_like_keywords, + sanitize_column_name, + validate_table_name, + LIKE_KEYWORD_LIMIT, + PARSE_ERROR_MESSAGE, + COLUMN_READ_ERROR_MESSAGE, + QUERY_ERROR_MESSAGE, +) class TestDetectExcelColumnType: @@ -236,7 +244,7 @@ class TestSanitizeColumnName: assert sanitize_column_name("COL; DROP TABLE--") == 'COLDROPTABLE' -class TestValidateTableName: +class TestValidateTableName: """Tests for validate_table_name function.""" def test_simple_name(self): @@ -256,6 +264,65 @@ class TestValidateTableName: assert validate_table_name('TABLE-NAME') is False assert validate_table_name('TABLE NAME') is False - def test_sql_injection_prevention(self): - """Should reject SQL injection attempts.""" - assert validate_table_name('TABLE; DROP--') is False + def test_sql_injection_prevention(self): + """Should reject SQL injection attempts.""" + assert validate_table_name('TABLE; DROP--') is False + + +class TestErrorLeakageProtection: + """Tests for exception detail masking in excel-query service.""" + + @patch("mes_dashboard.services.excel_query_service.pd.read_excel") + def test_parse_excel_masks_internal_error_details(self, mock_read_excel): + mock_read_excel.side_effect = RuntimeError("openpyxl stack trace detail") + + result = parse_excel(MagicMock()) + + assert result["error"] == PARSE_ERROR_MESSAGE + assert "openpyxl" not in result["error"] + + @patch("mes_dashboard.services.excel_query_service.pd.read_excel") + def test_get_column_unique_values_masks_internal_error_details(self, mock_read_excel): + mock_read_excel.side_effect = RuntimeError("internal parser detail") + + result = get_column_unique_values(MagicMock(), "LOT_ID") + + assert result["error"] == COLUMN_READ_ERROR_MESSAGE + assert "internal parser detail" not in result["error"] + + @patch("mes_dashboard.services.excel_query_service.get_db_connection") + def test_execute_batch_query_masks_internal_error_details(self, mock_get_db): + mock_cursor = MagicMock() + mock_cursor.execute.side_effect = RuntimeError("ORA-00942: table missing") + mock_conn = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_get_db.return_value = mock_conn + + result = execute_batch_query( + table_name="DWH.DW_MES_WIP", + search_column="LOT_ID", + return_columns=["LOT_ID"], + search_values=["LOT001"], + ) + + assert result["error"] == QUERY_ERROR_MESSAGE + assert "ORA-00942" not in result["error"] + + @patch("mes_dashboard.services.excel_query_service.get_db_connection") + def test_execute_advanced_batch_query_masks_internal_error_details(self, mock_get_db): + mock_cursor = MagicMock() + mock_cursor.execute.side_effect = RuntimeError("sensitive sql context") + mock_conn = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_get_db.return_value = mock_conn + + result = execute_advanced_batch_query( + table_name="DWH.DW_MES_WIP", + search_column="LOT_ID", + return_columns=["LOT_ID"], + search_values=["LOT001"], + query_type="in", + ) + + assert result["error"] == QUERY_ERROR_MESSAGE + assert "sensitive sql context" not in result["error"] diff --git a/tests/test_job_query_routes.py b/tests/test_job_query_routes.py index 06eca53..08ff567 100644 --- a/tests/test_job_query_routes.py +++ b/tests/test_job_query_routes.py @@ -74,15 +74,17 @@ class TestGetResources: data = json.loads(response.data) assert 'error' in data - @patch('mes_dashboard.services.resource_cache.get_all_resources') - def test_get_resources_exception(self, mock_get_resources, client): - """Should handle exception gracefully.""" - mock_get_resources.side_effect = Exception('Database error') - - response = client.get('/api/job-query/resources') - assert response.status_code == 500 - data = json.loads(response.data) - assert 'error' in data + @patch('mes_dashboard.services.resource_cache.get_all_resources') + def test_get_resources_exception(self, mock_get_resources, client): + """Should handle exception gracefully.""" + mock_get_resources.side_effect = Exception('ORA-01017 invalid username/password') + + response = client.get('/api/job-query/resources') + assert response.status_code == 500 + data = json.loads(response.data) + assert 'error' in data + assert data['error'] == '服務暫時無法使用' + assert 'ORA-01017' not in data['error'] class TestQueryJobs: diff --git a/tests/test_job_query_service.py b/tests/test_job_query_service.py index 7b64257..951a66b 100644 --- a/tests/test_job_query_service.py +++ b/tests/test_job_query_service.py @@ -4,14 +4,19 @@ Tests the core service functions without database dependencies. """ -import pytest -from mes_dashboard.services.job_query_service import ( - validate_date_range, - _build_resource_filter, - _build_resource_filter_sql, - BATCH_SIZE, - MAX_DATE_RANGE_DAYS, -) +import pytest +from unittest.mock import patch +from mes_dashboard.services.job_query_service import ( + validate_date_range, + _build_resource_filter, + _build_resource_filter_sql, + get_jobs_by_resources, + export_jobs_with_history, + BATCH_SIZE, + MAX_DATE_RANGE_DAYS, + QUERY_ERROR_MESSAGE, + EXPORT_ERROR_MESSAGE, +) class TestValidateDateRange: @@ -77,94 +82,125 @@ class TestValidateDateRange: assert '格式' in result or 'format' in result.lower() -class TestBuildResourceFilter: - """Tests for _build_resource_filter function.""" - - def test_empty_list(self): - """Should return empty list for empty input.""" - result = _build_resource_filter([]) - assert result == [] - - def test_single_id(self): - """Should return single chunk for single ID.""" - result = _build_resource_filter(['RES001']) - assert len(result) == 1 - assert result[0] == "'RES001'" - - def test_multiple_ids(self): - """Should join multiple IDs with comma.""" - result = _build_resource_filter(['RES001', 'RES002', 'RES003']) - assert len(result) == 1 - assert "'RES001'" in result[0] - assert "'RES002'" in result[0] - assert "'RES003'" in result[0] - - def test_chunking(self): - """Should chunk when exceeding batch size.""" - # Create more than BATCH_SIZE IDs - ids = [f'RES{i:05d}' for i in range(BATCH_SIZE + 10)] - result = _build_resource_filter(ids) - assert len(result) == 2 - # First chunk should have BATCH_SIZE items - assert result[0].count("'") == BATCH_SIZE * 2 # 2 quotes per ID - - def test_escape_single_quotes(self): - """Should escape single quotes in IDs.""" - result = _build_resource_filter(["RES'001"]) - assert len(result) == 1 - assert "RES''001" in result[0] # Escaped - - def test_custom_chunk_size(self): - """Should respect custom chunk size.""" - ids = ['RES001', 'RES002', 'RES003', 'RES004', 'RES005'] - result = _build_resource_filter(ids, max_chunk_size=2) +class TestBuildResourceFilter: + """Tests for _build_resource_filter function.""" + + def test_empty_list(self): + """Should return empty list for empty input.""" + result = _build_resource_filter([]) + assert result == [] + + def test_single_id(self): + """Should return single chunk for single ID.""" + result = _build_resource_filter(['RES001']) + assert len(result) == 1 + assert result[0] == ['RES001'] + + def test_multiple_ids(self): + """Should join multiple IDs with comma.""" + result = _build_resource_filter(['RES001', 'RES002', 'RES003']) + assert len(result) == 1 + assert result[0] == ['RES001', 'RES002', 'RES003'] + + def test_chunking(self): + """Should chunk when exceeding batch size.""" + # Create more than BATCH_SIZE IDs + ids = [f'RES{i:05d}' for i in range(BATCH_SIZE + 10)] + result = _build_resource_filter(ids) + assert len(result) == 2 + # First chunk should have BATCH_SIZE items + assert len(result[0]) == BATCH_SIZE + + def test_preserve_id_value_without_sql_interpolation(self): + """Should keep raw value and defer safety to bind variables.""" + result = _build_resource_filter(["RES'001"]) + assert len(result) == 1 + assert result[0] == ["RES'001"] + + def test_custom_chunk_size(self): + """Should respect custom chunk size.""" + ids = ['RES001', 'RES002', 'RES003', 'RES004', 'RES005'] + result = _build_resource_filter(ids, max_chunk_size=2) assert len(result) == 3 # 2+2+1 -class TestBuildResourceFilterSql: - """Tests for _build_resource_filter_sql function.""" - - def test_empty_list(self): - """Should return 1=0 for empty input (no results).""" - result = _build_resource_filter_sql([]) - assert result == "1=0" - - def test_single_id(self): - """Should build simple IN clause for single ID.""" - result = _build_resource_filter_sql(['RES001']) - assert "j.RESOURCEID IN" in result - assert "'RES001'" in result - - def test_multiple_ids(self): - """Should build IN clause with multiple IDs.""" - result = _build_resource_filter_sql(['RES001', 'RES002']) - assert "j.RESOURCEID IN" in result - assert "'RES001'" in result - assert "'RES002'" in result - - def test_custom_column(self): - """Should use custom column name.""" - result = _build_resource_filter_sql(['RES001'], column='r.ID') - assert "r.ID IN" in result +class TestBuildResourceFilterSql: + """Tests for _build_resource_filter_sql function.""" + + def test_empty_list(self): + """Should return 1=0 for empty input (no results).""" + result = _build_resource_filter_sql([]) + assert result == "1=0" + + def test_single_id(self): + """Should build IN clause with bind variable for single ID.""" + result, params = _build_resource_filter_sql(['RES001'], return_params=True) + assert "j.RESOURCEID IN" in result + assert ":p0" in result + assert params["p0"] == "RES001" + assert "RES001" not in result + + def test_multiple_ids(self): + """Should build IN clause with multiple bind variables.""" + result, params = _build_resource_filter_sql(['RES001', 'RES002'], return_params=True) + assert "j.RESOURCEID IN" in result + assert ":p0" in result + assert ":p1" in result + assert params["p0"] == "RES001" + assert params["p1"] == "RES002" + + def test_custom_column(self): + """Should use custom column name.""" + result = _build_resource_filter_sql(['RES001'], column='r.ID') + assert "r.ID IN" in result def test_large_list_uses_or(self): """Should use OR for chunked results.""" # Create more than BATCH_SIZE IDs ids = [f'RES{i:05d}' for i in range(BATCH_SIZE + 10)] - result = _build_resource_filter_sql(ids) - assert " OR " in result - # Should have parentheses wrapping the OR conditions - assert result.startswith("(") - assert result.endswith(")") + result = _build_resource_filter_sql(ids) + assert " OR " in result + # Should have parentheses wrapping the OR conditions + assert result.startswith("(") + assert result.endswith(")") + + def test_sql_injection_payload_stays_in_params(self): + """Injection payload should never be interpolated into SQL text.""" + payload = "RES001' OR '1'='1" + sql, params = _build_resource_filter_sql([payload], return_params=True) + assert payload in params.values() + assert payload not in sql -class TestServiceConstants: +class TestServiceConstants: """Tests for service constants.""" def test_batch_size_is_reasonable(self): """Batch size should be <= 1000 (Oracle limit).""" assert BATCH_SIZE <= 1000 - def test_max_date_range_is_year(self): - """Max date range should be 365 days.""" - assert MAX_DATE_RANGE_DAYS == 365 + def test_max_date_range_is_year(self): + """Max date range should be 365 days.""" + assert MAX_DATE_RANGE_DAYS == 365 + + +class TestErrorLeakageProtection: + """Tests for exception detail masking in job-query service.""" + + @patch("mes_dashboard.services.job_query_service.read_sql_df") + def test_query_error_masks_internal_details(self, mock_read): + mock_read.side_effect = RuntimeError("ORA-00942: table or view does not exist") + + result = get_jobs_by_resources(["RES001"], "2024-01-01", "2024-01-31") + + assert result["error"] == QUERY_ERROR_MESSAGE + assert "ORA-00942" not in result["error"] + + @patch("mes_dashboard.services.job_query_service.read_sql_df") + def test_export_stream_error_masks_internal_details(self, mock_read): + mock_read.side_effect = RuntimeError("sensitive sql context") + + output = "".join(export_jobs_with_history(["RES001"], "2024-01-01", "2024-01-31")) + + assert EXPORT_ERROR_MESSAGE in output + assert "sensitive sql context" not in output diff --git a/tests/test_resource_routes.py b/tests/test_resource_routes.py index d32eace..311b848 100644 --- a/tests/test_resource_routes.py +++ b/tests/test_resource_routes.py @@ -3,6 +3,18 @@ from __future__ import annotations +from unittest.mock import patch + +import mes_dashboard.core.database as db +from mes_dashboard.app import create_app + + +def _client(): + db._ENGINE = None + app = create_app("testing") + app.config["TESTING"] = True + return app.test_client() + def test_clean_nan_values_handles_deep_nesting_without_recursion_error(): from mes_dashboard.routes.resource_routes import _clean_nan_values @@ -30,3 +42,33 @@ def test_clean_nan_values_breaks_cycles_safely(): cleaned = _clean_nan_values(payload) assert cleaned["name"] == "root" assert cleaned["self"] is None + + +@patch( + "mes_dashboard.routes.resource_routes.get_resource_status_summary", + side_effect=RuntimeError("ORA-00942: table or view does not exist"), +) +def test_resource_status_summary_masks_internal_error_details(_mock_summary): + response = _client().get("/api/resource/status/summary") + assert response.status_code == 500 + + payload = response.get_json() + assert payload["success"] is False + assert payload["error"]["code"] == "INTERNAL_ERROR" + assert payload["error"]["message"] == "服務暫時無法使用" + assert "ORA-00942" not in str(payload) + + +@patch( + "mes_dashboard.routes.resource_routes.get_merged_resource_status", + side_effect=RuntimeError("sensitive sql context"), +) +def test_resource_status_masks_internal_error_details(_mock_status): + response = _client().get("/api/resource/status") + assert response.status_code == 500 + + payload = response.get_json() + assert payload["success"] is False + assert payload["error"]["code"] == "INTERNAL_ERROR" + assert payload["error"]["message"] == "服務暫時無法使用" + assert "sensitive sql context" not in str(payload) diff --git a/tests/test_watchdog_logging.py b/tests/test_watchdog_logging.py new file mode 100644 index 0000000..63d1ead --- /dev/null +++ b/tests/test_watchdog_logging.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +"""Unit tests for watchdog logging helpers.""" + +from __future__ import annotations + +import logging +from unittest.mock import patch + +from mes_dashboard.core.watchdog_logging import attach_sqlite_log_handler + + +def _reset_logger(logger: logging.Logger) -> None: + logger.handlers.clear() + if hasattr(logger, "_watchdog_sqlite_handler_registered"): + delattr(logger, "_watchdog_sqlite_handler_registered") + + +def test_attach_sqlite_log_handler_enabled_attaches_once(): + test_logger = logging.getLogger("mes_dashboard.watchdog.test.enabled") + _reset_logger(test_logger) + handler_one = logging.NullHandler() + handler_two = logging.NullHandler() + + with patch("mes_dashboard.core.log_store.LOG_STORE_ENABLED", True), patch( + "mes_dashboard.core.log_store.get_sqlite_log_handler", + side_effect=[handler_one, handler_two], + ) as handler_factory: + first = attach_sqlite_log_handler(test_logger) + second = attach_sqlite_log_handler(test_logger) + + assert first is True + assert second is False + assert handler_factory.call_count == 1 + assert handler_one in test_logger.handlers + assert handler_two not in test_logger.handlers + + _reset_logger(test_logger) + + +def test_attach_sqlite_log_handler_disabled_skips_factory(): + test_logger = logging.getLogger("mes_dashboard.watchdog.test.disabled") + _reset_logger(test_logger) + + with patch("mes_dashboard.core.log_store.LOG_STORE_ENABLED", False), patch( + "mes_dashboard.core.log_store.get_sqlite_log_handler" + ) as handler_factory: + attached = attach_sqlite_log_handler(test_logger) + + assert attached is False + handler_factory.assert_not_called() + assert not test_logger.handlers + + _reset_logger(test_logger) + + +def test_attach_sqlite_log_handler_handles_handler_errors(): + test_logger = logging.getLogger("mes_dashboard.watchdog.test.error") + _reset_logger(test_logger) + + with patch("mes_dashboard.core.log_store.LOG_STORE_ENABLED", True), patch( + "mes_dashboard.core.log_store.get_sqlite_log_handler", + side_effect=RuntimeError("boom"), + ): + attached = attach_sqlite_log_handler(test_logger) + + assert attached is False + assert not test_logger.handlers + + _reset_logger(test_logger)