chore: finalize vite migration hardening and watchdog logging

beabigegg
2026-02-08 22:55:38 +08:00
parent c8e225101e
commit cf194bc3a3
27 changed files with 924 additions and 356 deletions

View File

@@ -55,6 +55,7 @@ ADMIN_EMAILS=admin@example.com
 # Local Authentication (for development/testing)
 # When enabled, uses local credentials instead of LDAP
 # Set LOCAL_AUTH_ENABLED=true to bypass LDAP authentication
+# Production safety: when FLASK_ENV=production, local auth is forcibly disabled
 LOCAL_AUTH_ENABLED=false
 LOCAL_AUTH_USERNAME=
 LOCAL_AUTH_PASSWORD=
@@ -86,6 +87,11 @@ REDIS_KEY_PREFIX=mes_wip
 # Cache check interval in seconds (default: 600 = 10 minutes)
 CACHE_CHECK_INTERVAL=600
+# Optional explicit TTL for WIP Redis keys (seconds)
+# If unset/0, TTL defaults to 3 * CACHE_CHECK_INTERVAL
+# Example: CACHE_CHECK_INTERVAL=600 -> default TTL=1800
+WIP_CACHE_TTL_SECONDS=1800
 # ============================================================
 # Resource Cache Configuration
 # ============================================================
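
The TTL defaulting rule in those comments is simple arithmetic: an explicit `WIP_CACHE_TTL_SECONDS` greater than zero wins, otherwise the TTL is three times the sync interval. A minimal sketch mirroring the `_resolve_cache_ttl_seconds` helper added later in this commit:

```python
import os

def resolve_wip_ttl(interval_seconds: int = 600) -> int:
    """Explicit WIP_CACHE_TTL_SECONDS (>0) wins; otherwise 3x the sync interval, floored at 60s."""
    explicit = int(os.getenv("WIP_CACHE_TTL_SECONDS", "0"))
    if explicit > 0:
        return explicit
    return max(interval_seconds * 3, 60)

# With CACHE_CHECK_INTERVAL=600 and no override: 600 * 3 = 1800 seconds
assert resolve_wip_ttl(600) == 1800
```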

View File

@@ -146,8 +146,8 @@ nano .env
 ### Daily operations

 ```bash
-# Start the service (in the background)
+# Start the service (in the background, with Gunicorn + worker_watchdog)
 ./scripts/start_server.sh start

 # Stop the service
 ./scripts/start_server.sh stop
@@ -158,8 +158,9 @@ nano .env
 # Check status
 ./scripts/start_server.sh status

-# View logs
+# View logs (including watchdog)
 ./scripts/start_server.sh logs follow
+./scripts/start_server.sh logs watchdog
 ```

 Access URL: **http://localhost:8080** (configurable in .env)
@@ -330,17 +331,16 @@ RESOURCE_STATUS_RATE_LIMIT_WINDOW_SECONDS=60
 ### Conda + systemd service configuration

-In production, it is recommended to start the App and Watchdog from the same conda runtime contract:
+In production, it is recommended to start the App and Watchdog from the same `.env` (`/opt/mes-dashboard/.env`) used in development:

 ```bash
 # 1. Copy the systemd service files
 sudo cp deploy/mes-dashboard.service /etc/systemd/system/
 sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/

-# 2. Prepare the environment file
-sudo mkdir -p /etc/mes-dashboard
-sudo cp deploy/mes-dashboard.env.example /etc/mes-dashboard/mes-dashboard.env
-sudo cp .env /etc/mes-dashboard/mes-dashboard.env
+# 2. Verify /opt/mes-dashboard/.env permissions (readable by www-data)
+sudo chown root:www-data /opt/mes-dashboard/.env
+sudo chmod 640 /opt/mes-dashboard/.env

 # 3. Reload systemd
 sudo systemctl daemon-reload
@@ -615,8 +615,7 @@ DashBoard_vite/
 ├── deploy/                              # Deployment config
 │   ├── mes-dashboard.service            # Gunicorn systemd service (Conda)
 │   ├── mes-dashboard-watchdog.service   # Watchdog systemd service (Conda)
-│   └── mes-dashboard.env.example        # Runtime contract environment template
 ├── tests/                               # Tests
 ├── data/                                # Data files
 ├── logs/                                # Logs
 ├── docs/                                # Docs

View File

@@ -9,6 +9,8 @@
 - Cache: Redis + process-level cache + indexed selection telemetry
 - Data: Oracle (QueuePool)
 - Operations: watchdog + admin worker restart API + guarded-mode policy
+- Environment config: development and production share the same `.env` in the project root
+- Startup script: `./scripts/start_server.sh start` launches both Gunicorn and `worker_watchdog.py`

 ## 2. Existing design principles (retained)

View File

@@ -9,27 +9,13 @@ Type=simple
 User=www-data
 Group=www-data
 WorkingDirectory=/opt/mes-dashboard
-EnvironmentFile=-/etc/mes-dashboard/mes-dashboard.env
+EnvironmentFile=-/opt/mes-dashboard/.env
 Environment="PYTHONPATH=/opt/mes-dashboard/src"
-Environment="CONDA_BIN=/opt/miniconda3/bin/conda"
-Environment="CONDA_ENV_NAME=mes-dashboard"
-Environment="WATCHDOG_RUNTIME_DIR=/run/mes-dashboard"
-Environment="WATCHDOG_RESTART_FLAG=/run/mes-dashboard/mes_dashboard_restart.flag"
-Environment="WATCHDOG_PID_FILE=/run/mes-dashboard/gunicorn.pid"
-Environment="WATCHDOG_STATE_FILE=/var/lib/mes-dashboard/restart_state.json"
-Environment="WATCHDOG_CHECK_INTERVAL=5"
-Environment="RUNTIME_CONTRACT_VERSION=2026.02-p2"
-Environment="RUNTIME_CONTRACT_ENFORCE=true"
-Environment="WORKER_RESTART_COOLDOWN=60"
-Environment="WORKER_RESTART_RETRY_BUDGET=3"
-Environment="WORKER_RESTART_WINDOW_SECONDS=600"
-Environment="WORKER_RESTART_CHURN_THRESHOLD=3"
-Environment="WORKER_GUARDED_MODE_ENABLED=true"
 RuntimeDirectory=mes-dashboard
 StateDirectory=mes-dashboard
-ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN}" run --no-capture-output -n "${CONDA_ENV_NAME}" python scripts/worker_watchdog.py'
+ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN:-/opt/miniconda3/bin/conda}" run --no-capture-output -n "${CONDA_ENV_NAME:-mes-dashboard}" python scripts/worker_watchdog.py'
 Restart=always
 RestartSec=5
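
The `${VAR:-default}` expansions in the new `ExecStart` are what allow the per-variable `Environment=` lines above to be deleted: the shell substitutes the built-in default whenever `.env` leaves the variable unset or empty. A hedged sketch of the same fallback semantics in Python (illustration only, not code from this commit):

```python
import os

# "${CONDA_BIN:-/opt/miniconda3/bin/conda}" falls back when the variable is
# unset OR empty, which maps to `or` rather than a plain dict default.
conda_bin = os.environ.get("CONDA_BIN") or "/opt/miniconda3/bin/conda"
conda_env = os.environ.get("CONDA_ENV_NAME") or "mes-dashboard"

print(f'{conda_bin} run --no-capture-output -n {conda_env} python scripts/worker_watchdog.py')
```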

View File

@@ -1,26 +0,0 @@
-# MES Dashboard runtime contract (version 2026.02-p2)
-
-# Conda runtime
-CONDA_BIN=/opt/miniconda3/bin/conda
-CONDA_ENV_NAME=mes-dashboard
-
-# Single-port serving contract
-GUNICORN_BIND=0.0.0.0:8080
-
-# Watchdog/runtime paths
-WATCHDOG_RUNTIME_DIR=/run/mes-dashboard
-WATCHDOG_RESTART_FLAG=/run/mes-dashboard/mes_dashboard_restart.flag
-WATCHDOG_PID_FILE=/run/mes-dashboard/gunicorn.pid
-WATCHDOG_STATE_FILE=/var/lib/mes-dashboard/restart_state.json
-WATCHDOG_CHECK_INTERVAL=5
-
-# Runtime contract enforcement
-RUNTIME_CONTRACT_VERSION=2026.02-p2
-RUNTIME_CONTRACT_ENFORCE=true
-
-# Worker recovery policy
-WORKER_RESTART_COOLDOWN=60
-WORKER_RESTART_RETRY_BUDGET=3
-WORKER_RESTART_WINDOW_SECONDS=600
-WORKER_RESTART_CHURN_THRESHOLD=3
-WORKER_GUARDED_MODE_ENABLED=true

View File

@@ -9,28 +9,14 @@ Type=simple
 User=www-data
 Group=www-data
 WorkingDirectory=/opt/mes-dashboard
-EnvironmentFile=-/etc/mes-dashboard/mes-dashboard.env
+EnvironmentFile=-/opt/mes-dashboard/.env
 Environment="PYTHONPATH=/opt/mes-dashboard/src"
-Environment="CONDA_BIN=/opt/miniconda3/bin/conda"
-Environment="CONDA_ENV_NAME=mes-dashboard"
-Environment="GUNICORN_BIND=0.0.0.0:8080"
-Environment="WATCHDOG_RUNTIME_DIR=/run/mes-dashboard"
-Environment="WATCHDOG_RESTART_FLAG=/run/mes-dashboard/mes_dashboard_restart.flag"
-Environment="WATCHDOG_PID_FILE=/run/mes-dashboard/gunicorn.pid"
-Environment="WATCHDOG_STATE_FILE=/var/lib/mes-dashboard/restart_state.json"
-Environment="RUNTIME_CONTRACT_VERSION=2026.02-p2"
-Environment="RUNTIME_CONTRACT_ENFORCE=true"
-Environment="WORKER_RESTART_COOLDOWN=60"
-Environment="WORKER_RESTART_RETRY_BUDGET=3"
-Environment="WORKER_RESTART_WINDOW_SECONDS=600"
-Environment="WORKER_RESTART_CHURN_THRESHOLD=3"
-Environment="WORKER_GUARDED_MODE_ENABLED=true"
 RuntimeDirectory=mes-dashboard
 StateDirectory=mes-dashboard
 PIDFile=/run/mes-dashboard/gunicorn.pid
-ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN}" run --no-capture-output -n "${CONDA_ENV_NAME}" gunicorn --config gunicorn.conf.py --pid "${WATCHDOG_PID_FILE}" --capture-output "mes_dashboard:create_app()"'
+ExecStart=/usr/bin/env bash -lc 'exec "${CONDA_BIN:-/opt/miniconda3/bin/conda}" run --no-capture-output -n "${CONDA_ENV_NAME:-mes-dashboard}" gunicorn --config gunicorn.conf.py --pid "${WATCHDOG_PID_FILE:-/run/mes-dashboard/gunicorn.pid}" --capture-output "mes_dashboard:create_app()"'
 KillSignal=SIGTERM
 TimeoutStopSec=30

View File

@@ -62,8 +62,8 @@ A release is cutover-ready only when all gates pass:
 5. Conda + systemd rehearsal (recommended before production cutover)
    - `sudo cp deploy/mes-dashboard.service /etc/systemd/system/`
    - `sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/`
-   - `sudo mkdir -p /etc/mes-dashboard && sudo cp deploy/mes-dashboard.env.example /etc/mes-dashboard/mes-dashboard.env`
-   - merge deployment secrets from `.env` into `/etc/mes-dashboard/mes-dashboard.env`
+   - ensure deployment uses the same single env file: `/opt/mes-dashboard/.env`
+   - `sudo chown root:www-data /opt/mes-dashboard/.env && sudo chmod 640 /opt/mes-dashboard/.env`
    - `sudo systemctl daemon-reload`
    - `sudo systemctl enable --now mes-dashboard mes-dashboard-watchdog`
    - call `/admin/api/worker/status` and verify runtime contract paths exist

View File

@@ -237,8 +237,9 @@ show_next_steps() {
echo " http://localhost:${port:-8080}" echo " http://localhost:${port:-8080}"
echo "" echo ""
echo "Optional: install conda+systemd services" echo "Optional: install conda+systemd services"
echo " sudo mkdir -p /etc/mes-dashboard" echo " # systemd and local scripts both use the same /opt/mes-dashboard/.env"
echo " sudo cp .env /etc/mes-dashboard/mes-dashboard.env" echo " sudo chown root:www-data .env"
echo " sudo chmod 640 .env"
echo " sudo cp deploy/mes-dashboard.service /etc/systemd/system/" echo " sudo cp deploy/mes-dashboard.service /etc/systemd/system/"
echo " sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/" echo " sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/"
echo " sudo systemctl daemon-reload" echo " sudo systemctl daemon-reload"

View File

@@ -16,12 +16,15 @@ PID_FILE="${WATCHDOG_PID_FILE:-${PID_FILE_DEFAULT}}"
 LOG_DIR="${ROOT}/logs"
 ACCESS_LOG="${LOG_DIR}/access.log"
 ERROR_LOG="${LOG_DIR}/error.log"
+WATCHDOG_LOG="${LOG_DIR}/watchdog.log"
 STARTUP_LOG="${LOG_DIR}/startup.log"
 DEFAULT_PORT="${GUNICORN_BIND:-0.0.0.0:8080}"
 PORT=$(echo "$DEFAULT_PORT" | cut -d: -f2)

 # Redis configuration
 REDIS_ENABLED="${REDIS_ENABLED:-true}"
+
+# Worker watchdog configuration
+WATCHDOG_ENABLED="${WATCHDOG_ENABLED:-true}"

 # Colors for output
 RED='\033[0;31m'
@@ -53,13 +56,25 @@ timestamp() {
     date '+%Y-%m-%d %H:%M:%S'
 }

+is_enabled() {
+    case "${1:-}" in
+        1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]|[Oo][Nn])
+            return 0
+            ;;
+        *)
+            return 1
+            ;;
+    esac
+}
+
 resolve_runtime_paths() {
     WATCHDOG_RUNTIME_DIR="${WATCHDOG_RUNTIME_DIR:-${ROOT}/tmp}"
     WATCHDOG_RESTART_FLAG="${WATCHDOG_RESTART_FLAG:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart.flag}"
     WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/gunicorn.pid}"
     WATCHDOG_STATE_FILE="${WATCHDOG_STATE_FILE:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart_state.json}"
+    WATCHDOG_PROCESS_PID_FILE="${WATCHDOG_PROCESS_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/worker_watchdog.pid}"
     PID_FILE="${WATCHDOG_PID_FILE}"
-    export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE
+    export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE WATCHDOG_PROCESS_PID_FILE
 }

 # Load .env file if exists
@@ -396,14 +411,20 @@ rotate_logs() {
log_info "Archived error.log -> archive/error_${ts}.log" log_info "Archived error.log -> archive/error_${ts}.log"
fi fi
if [ -f "$WATCHDOG_LOG" ] && [ -s "$WATCHDOG_LOG" ]; then
mv "$WATCHDOG_LOG" "${LOG_DIR}/archive/watchdog_${ts}.log"
log_info "Archived watchdog.log -> archive/watchdog_${ts}.log"
fi
# Clean up old archives (keep last 10) # Clean up old archives (keep last 10)
cd "${LOG_DIR}/archive" 2>/dev/null && \ cd "${LOG_DIR}/archive" 2>/dev/null && \
ls -t access_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f && \ ls -t access_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f && \
ls -t error_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f ls -t error_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f && \
ls -t watchdog_*.log 2>/dev/null | tail -n +11 | xargs -r rm -f
cd "$ROOT" cd "$ROOT"
# Create fresh log files # Create fresh log files
touch "$ACCESS_LOG" "$ERROR_LOG" touch "$ACCESS_LOG" "$ERROR_LOG" "$WATCHDOG_LOG"
} }
get_pid() { get_pid() {
@@ -429,6 +450,84 @@ is_running() {
     get_pid &>/dev/null
 }

+get_watchdog_pid() {
+    if [ -f "$WATCHDOG_PROCESS_PID_FILE" ]; then
+        local pid
+        pid=$(cat "$WATCHDOG_PROCESS_PID_FILE" 2>/dev/null || true)
+        if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+            echo "$pid"
+            return 0
+        fi
+        rm -f "$WATCHDOG_PROCESS_PID_FILE"
+    fi
+    return 1
+}
+
+is_watchdog_running() {
+    get_watchdog_pid &>/dev/null
+}
+
+start_watchdog() {
+    if ! is_enabled "${WATCHDOG_ENABLED:-true}"; then
+        log_info "Worker watchdog is disabled (WATCHDOG_ENABLED=${WATCHDOG_ENABLED})"
+        return 0
+    fi
+
+    if is_watchdog_running; then
+        local pid
+        pid=$(get_watchdog_pid)
+        log_success "Worker watchdog already running (PID: ${pid})"
+        return 0
+    fi
+
+    log_info "Starting worker watchdog..."
+    nohup python scripts/worker_watchdog.py >> "$WATCHDOG_LOG" 2>&1 &
+    local pid=$!
+    echo "$pid" > "$WATCHDOG_PROCESS_PID_FILE"
+
+    sleep 1
+    if kill -0 "$pid" 2>/dev/null; then
+        log_success "Worker watchdog started (PID: ${pid})"
+        return 0
+    fi
+
+    rm -f "$WATCHDOG_PROCESS_PID_FILE"
+    log_error "Failed to start worker watchdog"
+    return 1
+}
+
+stop_watchdog() {
+    if ! is_watchdog_running; then
+        rm -f "$WATCHDOG_PROCESS_PID_FILE"
+        return 0
+    fi
+
+    local pid
+    pid=$(get_watchdog_pid)
+    log_info "Stopping worker watchdog (PID: ${pid})..."
+    kill -TERM "$pid" 2>/dev/null || true
+
+    local count=0
+    while kill -0 "$pid" 2>/dev/null && [ $count -lt 5 ]; do
+        sleep 1
+        count=$((count + 1))
+    done
+
+    if kill -0 "$pid" 2>/dev/null; then
+        kill -9 "$pid" 2>/dev/null || true
+        sleep 1
+    fi
+
+    rm -f "$WATCHDOG_PROCESS_PID_FILE"
+    if kill -0 "$pid" 2>/dev/null; then
+        log_error "Failed to stop worker watchdog"
+        return 1
+    fi
+
+    log_success "Worker watchdog stopped"
+    return 0
+}
 do_start() {
     local foreground=false
@@ -442,7 +541,14 @@ do_start() {
     if is_running; then
         local pid=$(get_pid)
         log_warn "Server is already running (PID: ${pid})"
-        return 1
+        if is_enabled "${WATCHDOG_ENABLED:-true}" && ! is_watchdog_running; then
+            check_conda || return 1
+            conda activate "$CONDA_ENV"
+            export PYTHONPATH="${ROOT}/src:${PYTHONPATH:-}"
+            cd "$ROOT"
+            start_watchdog || return 1
+        fi
+        return 0
     fi
     # Run checks
@@ -470,6 +576,9 @@ do_start() {
     echo "[$(timestamp)] Starting server" >> "$STARTUP_LOG"

     if [ "$foreground" = true ]; then
+        if is_enabled "${WATCHDOG_ENABLED:-true}"; then
+            log_info "Foreground mode does not auto-start watchdog (use background start for watchdog)."
+        fi
         log_info "Running in foreground mode (Ctrl+C to stop)"
         exec gunicorn \
             --config gunicorn.conf.py \
@@ -495,6 +604,7 @@ do_start() {
log_success "Server started successfully (PID: ${pid})" log_success "Server started successfully (PID: ${pid})"
log_info "Access URL: http://localhost:${PORT}" log_info "Access URL: http://localhost:${PORT}"
log_info "Logs: ${LOG_DIR}/" log_info "Logs: ${LOG_DIR}/"
start_watchdog || return 1
echo "[$(timestamp)] Server started (PID: ${pid})" >> "$STARTUP_LOG" echo "[$(timestamp)] Server started (PID: ${pid})" >> "$STARTUP_LOG"
else else
log_error "Failed to start server" log_error "Failed to start server"
@@ -509,48 +619,54 @@ do_stop() {
     load_env
     resolve_runtime_paths

-    if ! is_running; then
-        log_warn "Server is not running"
-        return 0
-    fi
-
-    local pid=$(get_pid)
-    log_info "Stopping server (PID: ${pid})..."
-
-    # Find all gunicorn processes (master + workers)
-    local all_pids=$(pgrep -f "gunicorn.*mes_dashboard" 2>/dev/null | tr '\n' ' ')
-
-    # Graceful shutdown with SIGTERM
-    kill -TERM "$pid" 2>/dev/null
-
-    # Wait for graceful shutdown (max 10 seconds)
-    local count=0
-    while kill -0 "$pid" 2>/dev/null && [ $count -lt 10 ]; do
-        sleep 1
-        count=$((count + 1))
-        echo -n "."
-    done
-    echo ""
-
-    # Force kill if still running (including orphaned workers)
-    if kill -0 "$pid" 2>/dev/null || [ -n "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then
-        log_warn "Graceful shutdown timeout, forcing..."
-        # Kill all gunicorn processes related to mes_dashboard
-        pkill -9 -f "gunicorn.*mes_dashboard" 2>/dev/null
-        sleep 1
-    fi
-
-    # Cleanup PID file
-    rm -f "$PID_FILE"
-
-    # Verify all processes are stopped
-    if [ -z "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then
-        log_success "Server stopped"
-        echo "[$(timestamp)] Server stopped (PID: ${pid})" >> "$STARTUP_LOG"
-    else
-        log_error "Failed to stop server"
-        return 1
-    fi
+    local server_running=false
+    local pid=""
+    if is_running; then
+        server_running=true
+        pid=$(get_pid)
+        log_info "Stopping server (PID: ${pid})..."
+    else
+        log_warn "Server is not running"
+    fi
+
+    if [ "$server_running" = true ]; then
+        # Find all gunicorn processes (master + workers)
+        local all_pids=$(pgrep -f "gunicorn.*mes_dashboard" 2>/dev/null | tr '\n' ' ')
+
+        # Graceful shutdown with SIGTERM
+        kill -TERM "$pid" 2>/dev/null
+
+        # Wait for graceful shutdown (max 10 seconds)
+        local count=0
+        while kill -0 "$pid" 2>/dev/null && [ $count -lt 10 ]; do
+            sleep 1
+            count=$((count + 1))
+            echo -n "."
+        done
+        echo ""
+
+        # Force kill if still running (including orphaned workers)
+        if kill -0 "$pid" 2>/dev/null || [ -n "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then
+            log_warn "Graceful shutdown timeout, forcing..."
+            # Kill all gunicorn processes related to mes_dashboard
+            pkill -9 -f "gunicorn.*mes_dashboard" 2>/dev/null
+            sleep 1
+        fi
+
+        # Cleanup PID file
+        rm -f "$PID_FILE"
+
+        # Verify all processes are stopped
+        if [ -z "$(pgrep -f 'gunicorn.*mes_dashboard' 2>/dev/null)" ]; then
+            log_success "Server stopped"
+            echo "[$(timestamp)] Server stopped (PID: ${pid})" >> "$STARTUP_LOG"
+        else
+            log_error "Failed to stop server"
+            return 1
+        fi
+    fi
+
+    stop_watchdog
 }
 do_restart() {
@@ -585,6 +701,16 @@ do_status() {
     # Show Redis status
     redis_status

+    if is_enabled "${WATCHDOG_ENABLED:-true}"; then
+        if is_watchdog_running; then
+            local watchdog_pid=$(get_watchdog_pid)
+            echo -e " Watchdog:${GREEN} RUNNING${NC} (PID: ${watchdog_pid})"
+        else
+            echo -e " Watchdog:${YELLOW} STOPPED${NC}"
+        fi
+    else
+        echo -e " Watchdog:${YELLOW} DISABLED${NC}"
+    fi

     if is_running; then
         echo ""
@@ -635,7 +761,15 @@ do_logs() {
             ;;
         follow)
             log_info "Following logs (Ctrl+C to stop)..."
-            tail -f "$ACCESS_LOG" "$ERROR_LOG" 2>/dev/null
+            tail -f "$ACCESS_LOG" "$ERROR_LOG" "$WATCHDOG_LOG" 2>/dev/null
+            ;;
+        watchdog)
+            if [ -f "$WATCHDOG_LOG" ]; then
+                log_info "Watchdog log (last ${lines} lines):"
+                tail -n "$lines" "$WATCHDOG_LOG"
+            else
+                log_warn "Watchdog log not found"
+            fi
             ;;
         *)
             log_info "=== Error Log (last 20 lines) ==="
@@ -643,6 +777,9 @@ do_logs() {
             echo ""
             log_info "=== Access Log (last 20 lines) ==="
             tail -20 "$ACCESS_LOG" 2>/dev/null || echo "(empty)"
+            echo ""
+            log_info "=== Watchdog Log (last 20 lines) ==="
+            tail -20 "$WATCHDOG_LOG" 2>/dev/null || echo "(empty)"
             ;;
     esac
 }
@@ -660,7 +797,7 @@ show_help() {
echo " stop Stop the server gracefully" echo " stop Stop the server gracefully"
echo " restart Restart the server" echo " restart Restart the server"
echo " status Show server and Redis status" echo " status Show server and Redis status"
echo " logs [type] View logs (access|error|follow|all)" echo " logs [type] View logs (access|error|watchdog|follow|all)"
echo " check Run environment checks only" echo " check Run environment checks only"
echo " help Show this help message" echo " help Show this help message"
echo "" echo ""
@@ -676,6 +813,7 @@ show_help() {
echo " GUNICORN_THREADS Threads per worker (default: 4)" echo " GUNICORN_THREADS Threads per worker (default: 4)"
echo " REDIS_ENABLED Enable Redis cache (default: true)" echo " REDIS_ENABLED Enable Redis cache (default: true)"
echo " REDIS_URL Redis connection URL" echo " REDIS_URL Redis connection URL"
echo " WATCHDOG_ENABLED Enable worker watchdog (default: true)"
echo "" echo ""
} }

View File

@@ -40,6 +40,7 @@ from mes_dashboard.core.runtime_contract import ( # noqa: E402
     build_runtime_contract_diagnostics,
     load_runtime_contract,
 )
+from mes_dashboard.core.watchdog_logging import attach_sqlite_log_handler  # noqa: E402
 from mes_dashboard.core.worker_recovery_policy import (  # noqa: E402
     decide_restart_request,
     evaluate_worker_recovery_state,
@@ -57,6 +58,7 @@ logging.basicConfig(
     ]
 )
 logger = logging.getLogger('mes_dashboard.watchdog')
+attach_sqlite_log_handler(logger)

 # ============================================================
 # Configuration

View File

@@ -31,6 +31,7 @@ logger = logging.getLogger('mes_dashboard.cache_updater')
 CACHE_CHECK_INTERVAL = int(os.getenv('CACHE_CHECK_INTERVAL', '600'))  # 10 minutes
 WIP_VIEW = "DWH.DW_MES_LOT_V"
+WIP_CACHE_TTL_SECONDS = int(os.getenv('WIP_CACHE_TTL_SECONDS', '0'))

 # Resource cache sync interval (default: 4 hours)
 RESOURCE_SYNC_INTERVAL = int(os.getenv('RESOURCE_SYNC_INTERVAL', '14400'))
@@ -236,6 +237,7 @@ class CacheUpdater:
         staging_key: str | None = None
         try:
+            ttl_seconds = self._resolve_cache_ttl_seconds()
             # Convert DataFrame to JSON
             # Handle datetime columns
             df_copy = df.copy()
@@ -250,10 +252,10 @@ class CacheUpdater:
             staging_key = get_key(f"data:staging:{unique_suffix}")

             pipe = client.pipeline()
-            pipe.set(staging_key, data_json)
+            pipe.set(staging_key, data_json, ex=ttl_seconds)
             pipe.rename(staging_key, get_key("data"))
-            pipe.set(get_key("meta:sys_date"), sys_date)
-            pipe.set(get_key("meta:updated_at"), now)
+            pipe.set(get_key("meta:sys_date"), sys_date, ex=ttl_seconds)
+            pipe.set(get_key("meta:updated_at"), now, ex=ttl_seconds)
             pipe.execute()

             return True
@@ -266,6 +268,16 @@ class CacheUpdater:
                 pass
         return False

+    def _resolve_cache_ttl_seconds(self) -> int:
+        """Resolve Redis TTL for WIP snapshot keys.
+
+        Default strategy: 3x sync interval to tolerate temporary sync gaps while
+        preventing stale data from lingering forever when the updater stops.
+        """
+        if WIP_CACHE_TTL_SECONDS > 0:
+            return WIP_CACHE_TTL_SECONDS
+        return max(int(self.interval) * 3, 60)
+
     def _check_resource_update(self, force: bool = False) -> bool:
         """Check and update resource cache if needed.

View File

@@ -670,7 +670,7 @@ def get_table_data(
logger.error(f"get_table_data failed - ORA-{ora_code}: {exc}") logger.error(f"get_table_data failed - ORA-{ora_code}: {exc}")
if connection: if connection:
connection.close() connection.close()
return {'error': f'查詢失敗: {str(exc)}'} return {'error': '查詢服務暫時無法使用'}
def get_table_column_metadata(table_name: str) -> Dict[str, Any]: def get_table_column_metadata(table_name: str) -> Dict[str, Any]:

View File

@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+"""Logging helpers shared by watchdog runtime."""
+
+from __future__ import annotations
+
+import logging
+
+_SQLITE_HANDLER_FLAG = "_watchdog_sqlite_handler_registered"
+
+
+def attach_sqlite_log_handler(target_logger: logging.Logger) -> bool:
+    """Attach SQLite log handler to watchdog logger when enabled.
+
+    Returns:
+        True if a new handler was attached; otherwise False.
+    """
+    if getattr(target_logger, _SQLITE_HANDLER_FLAG, False):
+        return False
+
+    try:
+        from mes_dashboard.core.log_store import LOG_STORE_ENABLED, get_sqlite_log_handler
+    except Exception as exc:
+        target_logger.warning("Failed to import SQLite log store: %s", exc)
+        return False
+
+    if not LOG_STORE_ENABLED:
+        return False
+
+    try:
+        sqlite_handler = get_sqlite_log_handler()
+        sqlite_handler.setLevel(logging.INFO)
+        target_logger.addHandler(sqlite_handler)
+        setattr(target_logger, _SQLITE_HANDLER_FLAG, True)
+        return True
+    except Exception as exc:
+        target_logger.warning("Failed to initialize SQLite log handler: %s", exc)
+        return False

View File

@@ -3,11 +3,12 @@
 from __future__ import annotations

 import logging
 import time
 from collections import defaultdict
 from datetime import datetime
 from threading import Lock
+from urllib.parse import urlparse

 from flask import Blueprint, flash, redirect, render_template, request, session, url_for
@@ -26,8 +27,25 @@ auth_bp = Blueprint("auth", __name__, url_prefix="/admin")
 _rate_limit_lock = Lock()
 _login_attempts: dict = defaultdict(list)  # IP -> list of timestamps
 RATE_LIMIT_MAX_ATTEMPTS = 5
 RATE_LIMIT_WINDOW_SECONDS = 300  # 5 minutes
+
+
+def _sanitize_next_url(next_url: str | None) -> str:
+    """Return a safe post-login redirect URL limited to local paths."""
+    fallback = url_for("portal_index")
+    if not next_url:
+        return fallback
+
+    parsed = urlparse(next_url)
+    if parsed.scheme or parsed.netloc:
+        logger.warning("Blocked external next redirect: %s", next_url)
+        return fallback
+
+    if not next_url.startswith("/") or next_url.startswith("//"):
+        return fallback
+
+    return next_url


 def _is_rate_limited(ip: str) -> bool:
@@ -103,7 +121,7 @@ def login():
                 "login_time": datetime.now().isoformat(),
             }
             rotate_csrf_token()
-            next_url = request.args.get("next", url_for("portal_index"))
+            next_url = _sanitize_next_url(request.args.get("next"))
             return redirect(next_url)

     return render_template("login.html", error=error)
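
For a quick sense of what `_sanitize_next_url` accepts and rejects, here is a standalone approximation of its checks (the `url_for` fallback is replaced by a boolean, since the real helper runs inside a Flask request context):

```python
from urllib.parse import urlparse


def is_safe_local_path(next_url: str) -> bool:
    """Mirror of the checks in _sanitize_next_url above."""
    parsed = urlparse(next_url)
    if parsed.scheme or parsed.netloc:  # absolute or protocol-relative URL
        return False
    if not next_url.startswith("/") or next_url.startswith("//"):
        return False
    return True


assert is_safe_local_path("/admin/pages") is True
assert is_safe_local_path("https://evil.example/phish") is False  # external host
assert is_safe_local_path("//evil.example/phish") is False        # protocol-relative
assert is_safe_local_path("admin/pages") is False                 # not rooted at "/"
```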

View File

@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
 """Job Query API routes.

 Contains Flask Blueprint for maintenance job query endpoints:
 - Job list query by resources
 - Job transaction history detail
 - CSV export with full history
 """

-from flask import Blueprint, jsonify, request, Response, render_template
+import logging
+
+from flask import Blueprint, jsonify, request, Response, render_template

 from mes_dashboard.services.job_query_service import (
     get_jobs_by_resources,
@@ -16,8 +18,9 @@ from mes_dashboard.services.job_query_service import (
     validate_date_range,
 )

 # Create Blueprint
 job_query_bp = Blueprint('job_query', __name__)
+logger = logging.getLogger('mes_dashboard.job_query_routes')

 # ============================================================
@@ -65,8 +68,9 @@ def get_resources():
             'total': len(data)
         })
     except Exception as exc:
-        return jsonify({'error': f'載入設備資料失敗: {str(exc)}'}), 500
+        logger.exception("Failed to load job-query resources: %s", exc)
+        return jsonify({'error': '服務暫時無法使用'}), 500


 @job_query_bp.route('/api/job-query/jobs', methods=['POST'])

View File

@@ -5,6 +5,7 @@ Contains Flask Blueprint for resource/equipment-related API endpoints.
""" """
import math import math
import logging
from flask import Blueprint, jsonify, request from flask import Blueprint, jsonify, request
from mes_dashboard.core.database import ( from mes_dashboard.core.database import (
@@ -14,6 +15,7 @@ from mes_dashboard.core.database import (
) )
from mes_dashboard.core.cache import cache_get, cache_set, make_cache_key from mes_dashboard.core.cache import cache_get, cache_set, make_cache_key
from mes_dashboard.core.rate_limit import configured_rate_limit from mes_dashboard.core.rate_limit import configured_rate_limit
from mes_dashboard.core.response import INTERNAL_ERROR, error_response
from mes_dashboard.core.utils import get_days_back, parse_bool_query from mes_dashboard.core.utils import get_days_back, parse_bool_query
@@ -112,6 +114,7 @@ from mes_dashboard.config.constants import STATUS_CATEGORIES
# Create Blueprint # Create Blueprint
resource_bp = Blueprint('resource', __name__, url_prefix='/api/resource') resource_bp = Blueprint('resource', __name__, url_prefix='/api/resource')
logger = logging.getLogger('mes_dashboard.resource_routes')
_RESOURCE_DETAIL_RATE_LIMIT = configured_rate_limit( _RESOURCE_DETAIL_RATE_LIMIT = configured_rate_limit(
bucket="resource-detail", bucket="resource-detail",
@@ -253,7 +256,12 @@ def api_resource_status_values():
except Exception as exc: except Exception as exc:
if connection: if connection:
connection.close() connection.close()
return jsonify({'success': False, 'error': str(exc)}), 500 logger.exception("Failed to load resource status values: %s", exc)
return error_response(
INTERNAL_ERROR,
"服務暫時無法使用",
status_code=500,
)
# ============================================================ # ============================================================
@@ -301,7 +309,12 @@ def api_resource_status():
except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError):
raise raise
except Exception as exc: except Exception as exc:
return jsonify({'success': False, 'error': str(exc)}), 500 logger.exception("Failed to load realtime resource status: %s", exc)
return error_response(
INTERNAL_ERROR,
"服務暫時無法使用",
status_code=500,
)
@resource_bp.route('/status/options') @resource_bp.route('/status/options')
@@ -324,7 +337,12 @@ def api_resource_status_options():
except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError):
raise raise
except Exception as exc: except Exception as exc:
return jsonify({'success': False, 'error': str(exc)}), 500 logger.exception("Failed to load realtime resource options: %s", exc)
return error_response(
INTERNAL_ERROR,
"服務暫時無法使用",
status_code=500,
)
@resource_bp.route('/status/summary') @resource_bp.route('/status/summary')
@@ -355,7 +373,12 @@ def api_resource_status_summary():
except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError):
raise raise
except Exception as exc: except Exception as exc:
return jsonify({'success': False, 'error': str(exc)}), 500 logger.exception("Failed to load realtime resource summary: %s", exc)
return error_response(
INTERNAL_ERROR,
"服務暫時無法使用",
status_code=500,
)
@resource_bp.route('/status/matrix') @resource_bp.route('/status/matrix')
@@ -384,4 +407,9 @@ def api_resource_status_matrix():
except (DatabasePoolExhaustedError, DatabaseCircuitOpenError): except (DatabasePoolExhaustedError, DatabaseCircuitOpenError):
raise raise
except Exception as exc: except Exception as exc:
return jsonify({'success': False, 'error': str(exc)}), 500 logger.exception("Failed to load realtime resource matrix: %s", exc)
return error_response(
INTERNAL_ERROR,
"服務暫時無法使用",
status_code=500,
)

View File

@@ -18,7 +18,26 @@ LDAP_TIMEOUT = 10
 ADMIN_EMAILS = os.environ.get("ADMIN_EMAILS", "").lower().split(",")

 # Local authentication configuration (for development/testing)
-LOCAL_AUTH_ENABLED = os.environ.get("LOCAL_AUTH_ENABLED", "false").lower() in ("true", "1", "yes")
+def _resolve_local_auth_enabled(
+    raw_value: str | None = None,
+    flask_env: str | None = None,
+) -> bool:
+    """Resolve local auth toggle with production safety guard."""
+    requested = (raw_value if raw_value is not None else os.environ.get("LOCAL_AUTH_ENABLED", "false"))
+    local_auth_requested = str(requested).strip().lower() in ("true", "1", "yes", "on")
+
+    effective_env = (flask_env if flask_env is not None else os.environ.get("FLASK_ENV", "development"))
+    normalized_env = str(effective_env).strip().lower()
+    is_production = normalized_env in {"production", "prod"}
+
+    if local_auth_requested and is_production:
+        logger.error("LOCAL_AUTH_ENABLED is blocked in production environment")
+        return False
+
+    return local_auth_requested
+
+
+LOCAL_AUTH_ENABLED = _resolve_local_auth_enabled()
 LOCAL_AUTH_USERNAME = os.environ.get("LOCAL_AUTH_USERNAME", "")
 LOCAL_AUTH_PASSWORD = os.environ.get("LOCAL_AUTH_PASSWORD", "")
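
In short, the guard resolves like this (the unit tests later in this commit exercise exactly these two cases):

```python
# Requested via env but blocked because FLASK_ENV says production:
assert _resolve_local_auth_enabled(raw_value="true", flask_env="production") is False
# Allowed outside production:
assert _resolve_local_auth_enabled(raw_value="true", flask_env="development") is True
```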

View File

@@ -5,14 +5,16 @@ Provides Excel parsing, batch query execution, and CSV export functions.
 Supports large datasets (7000+ rows) by splitting queries into batches.
 """

 import re
+import logging
 from datetime import datetime
 from typing import Any, Dict, List, Tuple

 import pandas as pd

 from mes_dashboard.core.database import get_db_connection

+logger = logging.getLogger('mes_dashboard.excel_query_service')

 # Oracle IN clause limit
 BATCH_SIZE = 1000
@@ -21,7 +23,10 @@ BATCH_SIZE = 1000
 LIKE_KEYWORD_LIMIT = 100

 # Large table threshold for performance warning (10 million rows)
 LARGE_TABLE_THRESHOLD = 10_000_000
+
+PARSE_ERROR_MESSAGE = "Excel 解析失敗,請確認檔案格式"
+COLUMN_READ_ERROR_MESSAGE = "讀取欄位失敗,請稍後再試"
+QUERY_ERROR_MESSAGE = "查詢服務暫時無法使用"


 def parse_excel(file_storage) -> Dict[str, Any]:
@@ -45,8 +50,9 @@ def parse_excel(file_storage) -> Dict[str, Any]:
             'preview': preview,
             'total_rows': len(df)
         }
     except Exception as exc:
-        return {'error': f'Excel 解析失敗: {str(exc)}'}
+        logger.exception("Excel parse failed: %s", exc)
+        return {'error': PARSE_ERROR_MESSAGE}


 def get_column_unique_values(file_storage, column_name: str) -> Dict[str, Any]:
@@ -73,8 +79,9 @@ def get_column_unique_values(file_storage, column_name: str) -> Dict[str, Any]:
             'values': values_list,
             'count': len(values_list)
         }
     except Exception as exc:
-        return {'error': f'讀取欄位失敗: {str(exc)}'}
+        logger.exception("Excel column read failed for %s: %s", column_name, exc)
+        return {'error': COLUMN_READ_ERROR_MESSAGE}


 def detect_excel_column_type(values: List[str]) -> Dict[str, Any]:
@@ -369,10 +376,11 @@ def execute_batch_query(
             'batch_count': total_batches
         }
     except Exception as exc:
         if connection:
             connection.close()
-        return {'error': f'查詢失敗: {str(exc)}'}
+        logger.exception("Excel batch query failed: %s", exc)
+        return {'error': QUERY_ERROR_MESSAGE}


 def execute_advanced_batch_query(
@@ -527,10 +535,11 @@ def execute_advanced_batch_query(
             'query_type': query_type
         }
     except Exception as exc:
         if connection:
             connection.close()
-        return {'error': f'查詢失敗: {str(exc)}'}
+        logger.exception("Excel advanced batch query failed: %s", exc)
+        return {'error': QUERY_ERROR_MESSAGE}


 def generate_csv_content(data: List[Dict], columns: List[str]) -> str:

View File

@@ -13,23 +13,25 @@ Architecture:
 - Supports batching for large resource lists (Oracle IN clause limit)
 """

 import csv
 import io
 import logging
 from datetime import datetime
-from typing import Dict, List, Any, Optional, Generator
+from typing import Dict, List, Any, Optional, Generator, Tuple

 import pandas as pd

 from mes_dashboard.core.database import read_sql_df, get_db_connection
-from mes_dashboard.sql import SQLLoader
+from mes_dashboard.sql import SQLLoader, QueryBuilder
 from mes_dashboard.config.field_contracts import get_export_headers, get_export_api_keys

 logger = logging.getLogger('mes_dashboard.job_query')

 # Constants
 BATCH_SIZE = 1000  # Oracle IN clause limit
 MAX_DATE_RANGE_DAYS = 365
+QUERY_ERROR_MESSAGE = "查詢服務暫時無法使用"
+EXPORT_ERROR_MESSAGE = "匯出服務暫時無法使用"

 # ============================================================
@@ -66,55 +68,72 @@ def validate_date_range(start_date: str, end_date: str) -> Optional[str]:
 # Resource Filter Helpers
 # ============================================================
-def _build_resource_filter(resource_ids: List[str], max_chunk_size: int = BATCH_SIZE) -> List[str]:
-    """Build SQL IN clause lists for resource IDs.
-
-    Oracle has a limit of ~1000 items per IN clause, so we chunk if needed.
-
-    Args:
-        resource_ids: List of resource IDs.
-        max_chunk_size: Maximum items per IN clause.
-
-    Returns:
-        List of SQL IN clause strings (e.g., "'ID1', 'ID2', 'ID3'").
-    """
-    if not resource_ids:
-        return []
-
-    # Escape single quotes
-    escaped_ids = [rid.replace("'", "''") for rid in resource_ids]
-
-    # Chunk into groups
-    chunks = []
-    for i in range(0, len(escaped_ids), max_chunk_size):
-        chunk = escaped_ids[i:i + max_chunk_size]
-        chunks.append("'" + "', '".join(chunk) + "'")
-
-    return chunks
-
-
-def _build_resource_filter_sql(resource_ids: List[str], column: str = 'j.RESOURCEID') -> str:
-    """Build SQL WHERE clause for resource ID filtering.
-
-    Handles chunking for large resource lists.
-
-    Args:
-        resource_ids: List of resource IDs.
-        column: Column name to filter on.
-
-    Returns:
-        SQL condition string (e.g., "j.RESOURCEID IN ('ID1', 'ID2')").
-    """
-    chunks = _build_resource_filter(resource_ids)
-    if not chunks:
-        return "1=0"  # No resources = no results
-
-    if len(chunks) == 1:
-        return f"{column} IN ({chunks[0]})"
-
-    # Multiple chunks need OR
-    conditions = [f"{column} IN ({chunk})" for chunk in chunks]
-    return "(" + " OR ".join(conditions) + ")"
+def _build_resource_filter(
+    resource_ids: List[str], max_chunk_size: int = BATCH_SIZE
+) -> List[List[str]]:
+    """Build chunked resource ID lists for Oracle IN clause limits.
+
+    Args:
+        resource_ids: List of resource IDs.
+        max_chunk_size: Maximum items per IN clause.
+
+    Returns:
+        Chunked resource ID values.
+    """
+    normalized_ids: List[str] = []
+    for rid in resource_ids:
+        if rid is None:
+            continue
+        text = str(rid).strip()
+        if text:
+            normalized_ids.append(text)
+
+    if not normalized_ids:
+        return []
+
+    chunks: List[List[str]] = []
+    for i in range(0, len(normalized_ids), max_chunk_size):
+        chunk = normalized_ids[i:i + max_chunk_size]
+        chunks.append(chunk)
+
+    return chunks
+
+
+def _build_resource_filter_sql(
+    resource_ids: List[str],
+    column: str = 'j.RESOURCEID',
+    max_chunk_size: int = BATCH_SIZE,
+    return_params: bool = False,
+) -> str | Tuple[str, Dict[str, Any]]:
+    """Build parameterized SQL condition for resource ID filtering.
+
+    Uses bind variables via QueryBuilder and chunks values to satisfy Oracle
+    IN-clause limits.
+
+    Args:
+        resource_ids: List of resource IDs.
+        column: Column name to filter on.
+        max_chunk_size: Maximum items per IN clause.
+        return_params: If True, return (condition_sql, params).
+
+    Returns:
+        Condition SQL string, or tuple of condition SQL and parameters.
+    """
+    chunks = _build_resource_filter(resource_ids, max_chunk_size=max_chunk_size)
+    if not chunks:
+        result: Tuple[str, Dict[str, Any]] = ("1=0", {})
+        return result if return_params else result[0]
+
+    builder = QueryBuilder()
+    for chunk in chunks:
+        builder.add_in_condition(column, chunk)
+
+    if len(builder.conditions) == 1:
+        condition_sql = builder.conditions[0]
+    else:
+        condition_sql = "(" + " OR ".join(builder.conditions) + ")"
+
+    result = (condition_sql, builder.params.copy())
+    return result if return_params else result[0]
 # ============================================================
@@ -147,14 +166,20 @@ def get_jobs_by_resources(
     try:
         # Build resource filter
-        resource_filter = _build_resource_filter_sql(resource_ids)
+        resource_filter, resource_params = _build_resource_filter_sql(
+            resource_ids, return_params=True
+        )

         # Load SQL template
         sql = SQLLoader.load("job_query/job_list")
         sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter)

         # Execute query
-        params = {'start_date': start_date, 'end_date': end_date}
+        params = {
+            'start_date': start_date,
+            'end_date': end_date,
+            **resource_params,
+        }
         df = read_sql_df(sql, params)

         # Convert to records
@@ -179,9 +204,9 @@ def get_jobs_by_resources(
             'resource_count': len(resource_ids)
         }
     except Exception as exc:
-        logger.error(f"Job query failed: {exc}")
-        return {'error': f'查詢失敗: {str(exc)}'}
+        logger.exception("Job query failed: %s", exc)
+        return {'error': QUERY_ERROR_MESSAGE}


 def get_job_txn_history(job_id: str) -> Dict[str, Any]:
@@ -227,9 +252,9 @@ def get_job_txn_history(job_id: str) -> Dict[str, Any]:
             'job_id': job_id
         }
     except Exception as exc:
-        logger.error(f"Transaction history query failed for job {job_id}: {exc}")
-        return {'error': f'查詢失敗: {str(exc)}'}
+        logger.exception("Transaction history query failed for job %s: %s", job_id, exc)
+        return {'error': QUERY_ERROR_MESSAGE}


 # ============================================================
@@ -265,14 +290,20 @@ def export_jobs_with_history(
     try:
         # Build resource filter
-        resource_filter = _build_resource_filter_sql(resource_ids)
+        resource_filter, resource_params = _build_resource_filter_sql(
+            resource_ids, return_params=True
+        )

         # Load SQL template
         sql = SQLLoader.load("job_query/job_txn_export")
         sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter)

         # Execute query
-        params = {'start_date': start_date, 'end_date': end_date}
+        params = {
+            'start_date': start_date,
+            'end_date': end_date,
+            **resource_params,
+        }
         df = read_sql_df(sql, params)

         if df is None or len(df) == 0:
@@ -321,9 +352,9 @@ def export_jobs_with_history(
         logger.info(f"CSV export completed: {len(df)} records")

     except Exception as exc:
-        logger.error(f"CSV export failed: {exc}")
-        yield f"Error: 匯出失敗 - {str(exc)}\n"
+        logger.exception("CSV export failed: %s", exc)
+        yield f"Error: {EXPORT_ERROR_MESSAGE}\n"


 def get_export_data(
@@ -351,14 +382,20 @@ def get_export_data(
     try:
         # Build resource filter
-        resource_filter = _build_resource_filter_sql(resource_ids)
+        resource_filter, resource_params = _build_resource_filter_sql(
+            resource_ids, return_params=True
+        )

         # Load SQL template
         sql = SQLLoader.load("job_query/job_txn_export")
         sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter)

         # Execute query
-        params = {'start_date': start_date, 'end_date': end_date}
+        params = {
+            'start_date': start_date,
+            'end_date': end_date,
+            **resource_params,
+        }
         df = read_sql_df(sql, params)

         # Convert to records
@@ -381,6 +418,6 @@ def get_export_data(
             'total': len(data)
         }
     except Exception as exc:
-        logger.error(f"Export data query failed: {exc}")
-        return {'error': f'查詢失敗: {str(exc)}'}
+        logger.exception("Export data query failed: %s", exc)
+        return {'error': QUERY_ERROR_MESSAGE}
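
All four call sites in this service now follow the same shape: build the chunked bind-variable condition, splice only the condition text into the SQL template, and pass the values as parameters. A condensed sketch (names taken from the diff above; `QueryBuilder`'s exact bind-name scheme is internal to the project):

```python
from mes_dashboard.core.database import read_sql_df
from mes_dashboard.services.job_query_service import _build_resource_filter_sql
from mes_dashboard.sql import SQLLoader

# Condition text plus bind parameters, chunked to Oracle's IN-clause limit.
resource_filter, resource_params = _build_resource_filter_sql(
    ['RES001', 'RES002'], return_params=True
)

sql = SQLLoader.load("job_query/job_list")
sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter)

# Resource IDs travel as bind variables, never as inline SQL literals.
params = {'start_date': '2024-01-01', 'end_date': '2024-01-31', **resource_params}
df = read_sql_df(sql, params)
```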

View File

@@ -102,9 +102,62 @@ class TestLoginRoute:
         assert response.status_code == 302

         # Check session contains admin
         with client.session_transaction() as sess:
             assert "admin" in sess
             assert sess["admin"]["username"] == "92367"

+    @patch('mes_dashboard.services.auth_service.LOCAL_AUTH_ENABLED', False)
+    @patch('mes_dashboard.routes.auth_routes.is_admin', return_value=True)
+    @patch('mes_dashboard.services.auth_service.requests.post')
+    def test_login_blocks_external_next_redirect(self, mock_post, _mock_is_admin, client):
+        """Should ignore external next URL and redirect to portal."""
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "success": True,
+            "user": {
+                "username": "92367",
+                "displayName": "Admin User",
+                "mail": "ymirliu@panjit.com.tw",
+                "department": "Test Dept",
+            },
+        }
+        mock_post.return_value = mock_response
+
+        response = client.post(
+            "/admin/login?next=https://evil.example/phish",
+            data={"username": "92367", "password": "password123"},
+            follow_redirects=False,
+        )
+
+        assert response.status_code == 302
+        assert "evil.example" not in response.location
+        assert response.location.endswith("/")
+
+    @patch('mes_dashboard.services.auth_service.LOCAL_AUTH_ENABLED', False)
+    @patch('mes_dashboard.routes.auth_routes.is_admin', return_value=True)
+    @patch('mes_dashboard.services.auth_service.requests.post')
+    def test_login_allows_internal_next_redirect(self, mock_post, _mock_is_admin, client):
+        """Should keep validated local path in next URL."""
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "success": True,
+            "user": {
+                "username": "92367",
+                "displayName": "Admin User",
+                "mail": "ymirliu@panjit.com.tw",
+                "department": "Test Dept",
+            },
+        }
+        mock_post.return_value = mock_response
+
+        response = client.post(
+            "/admin/login?next=/admin/pages",
+            data={"username": "92367", "password": "password123"},
+            follow_redirects=False,
+        )
+
+        assert response.status_code == 302
+        assert response.location.endswith("/admin/pages")
+
     @patch('mes_dashboard.services.auth_service.LOCAL_AUTH_ENABLED', False)
     @patch('mes_dashboard.services.auth_service.requests.post')

View File

@@ -161,6 +161,24 @@ class TestLocalAuthenticate:
         assert result is None


+class TestLocalAuthSafetyGuard:
+    """Tests for production guard on local auth toggle."""
+
+    def test_resolve_local_auth_enabled_blocks_production(self):
+        result = auth_service._resolve_local_auth_enabled(
+            raw_value="true",
+            flask_env="production",
+        )
+        assert result is False
+
+    def test_resolve_local_auth_enabled_allows_development(self):
+        result = auth_service._resolve_local_auth_enabled(
+            raw_value="true",
+            flask_env="development",
+        )
+        assert result is True
+
+
 class TestIsAdmin:
     """Tests for is_admin function."""

View File

@@ -149,9 +149,9 @@ class TestLoadFullTable:
         assert result is None


 class TestUpdateRedisCache:
     """Test Redis cache update logic."""

     def test_update_redis_cache_success(self):
         """Test _update_redis_cache updates cache correctly."""
         import mes_dashboard.core.cache_updater as cu
@@ -173,7 +173,10 @@ class TestUpdateRedisCache:
         assert result is True
         mock_pipeline.rename.assert_called_once()
         mock_pipeline.execute.assert_called_once()
+        assert mock_pipeline.set.call_count == 3
+        for call in mock_pipeline.set.call_args_list:
+            assert call.kwargs.get("ex") == updater.interval * 3

     def test_update_redis_cache_no_client(self):
         """Test _update_redis_cache handles no client."""
         import mes_dashboard.core.cache_updater as cu
@@ -205,6 +208,26 @@ class TestUpdateRedisCache:
         mock_client.delete.assert_called_once()
         staged_key = mock_client.delete.call_args.args[0]
         assert "staging" in staged_key

+    def test_update_redis_cache_ttl_override(self):
+        """Configured TTL override should apply to all Redis keys."""
+        import mes_dashboard.core.cache_updater as cu
+
+        mock_client = MagicMock()
+        mock_pipeline = MagicMock()
+        mock_client.pipeline.return_value = mock_pipeline
+        test_df = pd.DataFrame({'LOTID': ['LOT001'], 'QTY': [100]})
+
+        with patch.object(cu, 'WIP_CACHE_TTL_SECONDS', 42):
+            with patch.object(cu, 'get_redis_client', return_value=mock_client):
+                with patch.object(cu, 'get_key', side_effect=lambda k: f'mes_wip:{k}'):
+                    updater = cu.CacheUpdater(interval=600)
+                    result = updater._update_redis_cache(test_df, '2024-01-15 10:30:00')
+
+        assert result is True
+        assert mock_pipeline.set.call_count == 3
+        for call in mock_pipeline.set.call_args_list:
+            assert call.kwargs.get("ex") == 42


 class TestCacheUpdateFlow:

View File

@@ -4,17 +4,25 @@
 Tests the core service functions without database dependencies.
 """

 import pytest

-from mes_dashboard.services.excel_query_service import (
-    detect_excel_column_type,
-    escape_like_pattern,
-    build_like_condition,
-    build_date_range_condition,
-    validate_like_keywords,
-    sanitize_column_name,
-    validate_table_name,
-    LIKE_KEYWORD_LIMIT,
-)
+from unittest.mock import MagicMock, patch
+
+from mes_dashboard.services.excel_query_service import (
+    parse_excel,
+    get_column_unique_values,
+    execute_batch_query,
+    execute_advanced_batch_query,
+    detect_excel_column_type,
+    escape_like_pattern,
+    build_like_condition,
+    build_date_range_condition,
+    validate_like_keywords,
+    sanitize_column_name,
+    validate_table_name,
+    LIKE_KEYWORD_LIMIT,
+    PARSE_ERROR_MESSAGE,
+    COLUMN_READ_ERROR_MESSAGE,
+    QUERY_ERROR_MESSAGE,
+)


 class TestDetectExcelColumnType:
@@ -236,7 +244,7 @@ class TestSanitizeColumnName:
         assert sanitize_column_name("COL; DROP TABLE--") == 'COLDROPTABLE'


 class TestValidateTableName:
     """Tests for validate_table_name function."""

     def test_simple_name(self):
@@ -256,6 +264,65 @@ class TestValidateTableName:
         assert validate_table_name('TABLE-NAME') is False
         assert validate_table_name('TABLE NAME') is False

     def test_sql_injection_prevention(self):
         """Should reject SQL injection attempts."""
         assert validate_table_name('TABLE; DROP--') is False

+
+class TestErrorLeakageProtection:
+    """Tests for exception detail masking in excel-query service."""
+
+    @patch("mes_dashboard.services.excel_query_service.pd.read_excel")
+    def test_parse_excel_masks_internal_error_details(self, mock_read_excel):
+        mock_read_excel.side_effect = RuntimeError("openpyxl stack trace detail")
+
+        result = parse_excel(MagicMock())
+
+        assert result["error"] == PARSE_ERROR_MESSAGE
+        assert "openpyxl" not in result["error"]
+
+    @patch("mes_dashboard.services.excel_query_service.pd.read_excel")
+    def test_get_column_unique_values_masks_internal_error_details(self, mock_read_excel):
+        mock_read_excel.side_effect = RuntimeError("internal parser detail")
+
+        result = get_column_unique_values(MagicMock(), "LOT_ID")
+
+        assert result["error"] == COLUMN_READ_ERROR_MESSAGE
+        assert "internal parser detail" not in result["error"]
+
+    @patch("mes_dashboard.services.excel_query_service.get_db_connection")
+    def test_execute_batch_query_masks_internal_error_details(self, mock_get_db):
+        mock_cursor = MagicMock()
+        mock_cursor.execute.side_effect = RuntimeError("ORA-00942: table missing")
+        mock_conn = MagicMock()
+        mock_conn.cursor.return_value = mock_cursor
+        mock_get_db.return_value = mock_conn
+
+        result = execute_batch_query(
+            table_name="DWH.DW_MES_WIP",
+            search_column="LOT_ID",
+            return_columns=["LOT_ID"],
+            search_values=["LOT001"],
+        )
+
+        assert result["error"] == QUERY_ERROR_MESSAGE
+        assert "ORA-00942" not in result["error"]
+
+    @patch("mes_dashboard.services.excel_query_service.get_db_connection")
+    def test_execute_advanced_batch_query_masks_internal_error_details(self, mock_get_db):
+        mock_cursor = MagicMock()
+        mock_cursor.execute.side_effect = RuntimeError("sensitive sql context")
+        mock_conn = MagicMock()
+        mock_conn.cursor.return_value = mock_cursor
+        mock_get_db.return_value = mock_conn
+
+        result = execute_advanced_batch_query(
+            table_name="DWH.DW_MES_WIP",
+            search_column="LOT_ID",
+            return_columns=["LOT_ID"],
+            search_values=["LOT001"],
+            query_type="in",
+        )
+
+        assert result["error"] == QUERY_ERROR_MESSAGE
+        assert "sensitive sql context" not in result["error"]

View File

@@ -74,15 +74,17 @@ class TestGetResources:
        data = json.loads(response.data)
        assert 'error' in data

    @patch('mes_dashboard.services.resource_cache.get_all_resources')
    def test_get_resources_exception(self, mock_get_resources, client):
        """Should handle exception gracefully."""
        mock_get_resources.side_effect = Exception('ORA-01017 invalid username/password')
        response = client.get('/api/job-query/resources')
        assert response.status_code == 500
        data = json.loads(response.data)
        assert 'error' in data
        assert data['error'] == '服務暫時無法使用'
        assert 'ORA-01017' not in data['error']
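
A handler that satisfies this test would catch everything from the cache layer and respond with the fixed message only, as in this hedged sketch (blueprint wiring is an assumption; the endpoint path and service module mirror the test):

```python
# Hedged sketch only: one way a handler could meet the test above.
from flask import Blueprint, jsonify

from mes_dashboard.services import resource_cache

job_query_bp = Blueprint("job_query", __name__, url_prefix="/api/job-query")


@job_query_bp.get("/resources")
def get_resources():
    try:
        return jsonify(resource_cache.get_all_resources())
    except Exception:
        # Never echo the exception (an ORA-xxxxx code would leak schema
        # details); return the fixed user-facing message instead.
        return jsonify({"error": "服務暫時無法使用"}), 500
```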


class TestQueryJobs:

View File

@@ -4,14 +4,19 @@
Tests the core service functions without database dependencies.
"""
import pytest
from unittest.mock import patch

from mes_dashboard.services.job_query_service import (
    validate_date_range,
    _build_resource_filter,
    _build_resource_filter_sql,
    get_jobs_by_resources,
    export_jobs_with_history,
    BATCH_SIZE,
    MAX_DATE_RANGE_DAYS,
    QUERY_ERROR_MESSAGE,
    EXPORT_ERROR_MESSAGE,
)


class TestValidateDateRange:
@@ -77,94 +82,125 @@ class TestValidateDateRange:
        assert '格式' in result or 'format' in result.lower()


class TestBuildResourceFilter:
    """Tests for _build_resource_filter function."""

    def test_empty_list(self):
        """Should return empty list for empty input."""
        result = _build_resource_filter([])
        assert result == []

    def test_single_id(self):
        """Should return single chunk for single ID."""
        result = _build_resource_filter(['RES001'])
        assert len(result) == 1
        assert result[0] == ['RES001']

    def test_multiple_ids(self):
        """Should keep multiple IDs in a single chunk."""
        result = _build_resource_filter(['RES001', 'RES002', 'RES003'])
        assert len(result) == 1
        assert result[0] == ['RES001', 'RES002', 'RES003']

    def test_chunking(self):
        """Should chunk when exceeding batch size."""
        # Create more than BATCH_SIZE IDs
        ids = [f'RES{i:05d}' for i in range(BATCH_SIZE + 10)]
        result = _build_resource_filter(ids)
        assert len(result) == 2
        # First chunk should have BATCH_SIZE items
        assert len(result[0]) == BATCH_SIZE

    def test_preserve_id_value_without_sql_interpolation(self):
        """Should keep raw value and defer safety to bind variables."""
        result = _build_resource_filter(["RES'001"])
        assert len(result) == 1
        assert result[0] == ["RES'001"]

    def test_custom_chunk_size(self):
        """Should respect custom chunk size."""
        ids = ['RES001', 'RES002', 'RES003', 'RES004', 'RES005']
        result = _build_resource_filter(ids, max_chunk_size=2)
        assert len(result) == 3  # 2+2+1
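
All of the assertions above are satisfied by a chunker that simply slices the raw ID list, with quoting deferred entirely to bind variables. A minimal sketch, assuming the real helper matches the tested contract:

```python
from typing import List

BATCH_SIZE = 1000  # assumed value; the tests only require <= 1000


def _build_resource_filter(
    resource_ids: List[str], max_chunk_size: int = BATCH_SIZE
) -> List[List[str]]:
    """Split raw resource IDs into chunks; no quoting or escaping here."""
    return [
        resource_ids[i:i + max_chunk_size]
        for i in range(0, len(resource_ids), max_chunk_size)
    ]
```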
class TestBuildResourceFilterSql:
    """Tests for _build_resource_filter_sql function."""

    def test_empty_list(self):
        """Should return 1=0 for empty input (no results)."""
        result = _build_resource_filter_sql([])
        assert result == "1=0"

    def test_single_id(self):
        """Should build IN clause with bind variable for single ID."""
        result, params = _build_resource_filter_sql(['RES001'], return_params=True)
        assert "j.RESOURCEID IN" in result
        assert ":p0" in result
        assert params["p0"] == "RES001"
        assert "RES001" not in result

    def test_multiple_ids(self):
        """Should build IN clause with multiple bind variables."""
        result, params = _build_resource_filter_sql(['RES001', 'RES002'], return_params=True)
        assert "j.RESOURCEID IN" in result
        assert ":p0" in result
        assert ":p1" in result
        assert params["p0"] == "RES001"
        assert params["p1"] == "RES002"

    def test_custom_column(self):
        """Should use custom column name."""
        result = _build_resource_filter_sql(['RES001'], column='r.ID')
        assert "r.ID IN" in result

    def test_large_list_uses_or(self):
        """Should use OR for chunked results."""
        # Create more than BATCH_SIZE IDs
        ids = [f'RES{i:05d}' for i in range(BATCH_SIZE + 10)]
        result = _build_resource_filter_sql(ids)
        assert " OR " in result
        # Should have parentheses wrapping the OR conditions
        assert result.startswith("(")
        assert result.endswith(")")

    def test_sql_injection_payload_stays_in_params(self):
        """Injection payload should never be interpolated into SQL text."""
        payload = "RES001' OR '1'='1"
        sql, params = _build_resource_filter_sql([payload], return_params=True)
        assert payload in params.values()
        assert payload not in sql
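
The observable contract fixed by these tests — `1=0` for empty input, `:pN` placeholders, an optional `(sql, params)` tuple via `return_params=True`, and OR-joined chunks past the batch size — can be met by a builder like this sketch (signature details are assumptions):

```python
from typing import Dict, List, Tuple, Union

BATCH_SIZE = 1000  # assumed value; the tests only require <= 1000


def _build_resource_filter_sql(
    resource_ids: List[str],
    column: str = "j.RESOURCEID",
    max_chunk_size: int = BATCH_SIZE,
    return_params: bool = False,
) -> Union[str, Tuple[str, Dict[str, str]]]:
    """Build an Oracle IN filter; raw IDs live only in the bind params."""
    if not resource_ids:
        return ("1=0", {}) if return_params else "1=0"
    # One bind variable per ID, named :p0, :p1, ...
    params = {f"p{i}": rid for i, rid in enumerate(resource_ids)}
    clauses = []
    for start in range(0, len(resource_ids), max_chunk_size):
        end = min(start + max_chunk_size, len(resource_ids))
        placeholders = ", ".join(f":p{i}" for i in range(start, end))
        clauses.append(f"{column} IN ({placeholders})")
    # Multiple chunks are OR-joined and wrapped in parentheses.
    sql = clauses[0] if len(clauses) == 1 else "(" + " OR ".join(clauses) + ")"
    return (sql, params) if return_params else sql
```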
class TestServiceConstants:
    """Tests for service constants."""

    def test_batch_size_is_reasonable(self):
        """Batch size should be <= 1000 (Oracle limit)."""
        assert BATCH_SIZE <= 1000

    def test_max_date_range_is_year(self):
        """Max date range should be 365 days."""
        assert MAX_DATE_RANGE_DAYS == 365


class TestErrorLeakageProtection:
    """Tests for exception detail masking in job-query service."""

    @patch("mes_dashboard.services.job_query_service.read_sql_df")
    def test_query_error_masks_internal_details(self, mock_read):
        mock_read.side_effect = RuntimeError("ORA-00942: table or view does not exist")
        result = get_jobs_by_resources(["RES001"], "2024-01-01", "2024-01-31")
        assert result["error"] == QUERY_ERROR_MESSAGE
        assert "ORA-00942" not in result["error"]

    @patch("mes_dashboard.services.job_query_service.read_sql_df")
    def test_export_stream_error_masks_internal_details(self, mock_read):
        mock_read.side_effect = RuntimeError("sensitive sql context")
        output = "".join(export_jobs_with_history(["RES001"], "2024-01-01", "2024-01-31"))
        assert EXPORT_ERROR_MESSAGE in output
        assert "sensitive sql context" not in output
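
Because the export streams its output, masking has to happen inside the generator itself: a failed query yields the fixed marker rather than raising through the response. A hedged sketch of that contract, with `read_sql_df` stubbed in as the test's patch target and the SQL and message wording purely illustrative:

```python
import logging
from typing import Iterator, List

import pandas as pd

logger = logging.getLogger(__name__)

EXPORT_ERROR_MESSAGE = "匯出失敗,請稍後再試"  # illustrative wording only


def read_sql_df(sql: str, params: dict) -> pd.DataFrame:
    """Stand-in for the service's query helper (the test's patch target)."""
    raise NotImplementedError


def export_jobs_with_history(
    resource_ids: List[str], start_date: str, end_date: str
) -> Iterator[str]:
    """Stream the export as CSV chunks; mask any mid-stream failure."""
    try:
        df = read_sql_df("SELECT 1 FROM DUAL", {})  # placeholder query
        yield df.to_csv(index=False)
    except Exception:
        # Log the real error server-side; the stream carries only the
        # fixed marker, so ORA codes and SQL text never reach the client.
        logger.exception("export_jobs_with_history failed")
        yield EXPORT_ERROR_MESSAGE
```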

View File

@@ -3,6 +3,18 @@
from __future__ import annotations

from unittest.mock import patch

import mes_dashboard.core.database as db
from mes_dashboard.app import create_app


def _client():
    db._ENGINE = None
    app = create_app("testing")
    app.config["TESTING"] = True
    return app.test_client()


def test_clean_nan_values_handles_deep_nesting_without_recursion_error():
    from mes_dashboard.routes.resource_routes import _clean_nan_values
@@ -30,3 +42,33 @@ def test_clean_nan_values_breaks_cycles_safely():
    cleaned = _clean_nan_values(payload)
    assert cleaned["name"] == "root"
    assert cleaned["self"] is None


@patch(
    "mes_dashboard.routes.resource_routes.get_resource_status_summary",
    side_effect=RuntimeError("ORA-00942: table or view does not exist"),
)
def test_resource_status_summary_masks_internal_error_details(_mock_summary):
    response = _client().get("/api/resource/status/summary")
    assert response.status_code == 500
    payload = response.get_json()
    assert payload["success"] is False
    assert payload["error"]["code"] == "INTERNAL_ERROR"
    assert payload["error"]["message"] == "服務暫時無法使用"
    assert "ORA-00942" not in str(payload)


@patch(
    "mes_dashboard.routes.resource_routes.get_merged_resource_status",
    side_effect=RuntimeError("sensitive sql context"),
)
def test_resource_status_masks_internal_error_details(_mock_status):
    response = _client().get("/api/resource/status")
    assert response.status_code == 500
    payload = response.get_json()
    assert payload["success"] is False
    assert payload["error"]["code"] == "INTERNAL_ERROR"
    assert payload["error"]["message"] == "服務暫時無法使用"
    assert "sensitive sql context" not in str(payload)
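
Both route tests assert the same envelope: `success: false` plus a stable `INTERNAL_ERROR` code and the generic message, with the real exception confined to the server log. A sketch of a handler meeting that contract; the blueprint name and the service import path are assumptions, while the patched function name comes from the tests above:

```python
# Hedged sketch of the uniform error envelope the two tests assert.
import logging

from flask import Blueprint, jsonify

# Assumed import path; the tests only show the name bound into the routes module.
from mes_dashboard.services.resource_service import get_resource_status_summary

logger = logging.getLogger(__name__)
resource_bp = Blueprint("resource", __name__, url_prefix="/api/resource")


@resource_bp.get("/status/summary")
def resource_status_summary():
    try:
        return jsonify({"success": True, "data": get_resource_status_summary()})
    except Exception:
        # Stable machine-readable code, generic message; full details
        # go to the server log only.
        logger.exception("resource status summary failed")
        return (
            jsonify(
                {
                    "success": False,
                    "error": {"code": "INTERNAL_ERROR", "message": "服務暫時無法使用"},
                }
            ),
            500,
        )
```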

View File

@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
"""Unit tests for watchdog logging helpers."""
from __future__ import annotations

import logging
from unittest.mock import patch

from mes_dashboard.core.watchdog_logging import attach_sqlite_log_handler


def _reset_logger(logger: logging.Logger) -> None:
    logger.handlers.clear()
    if hasattr(logger, "_watchdog_sqlite_handler_registered"):
        delattr(logger, "_watchdog_sqlite_handler_registered")


def test_attach_sqlite_log_handler_enabled_attaches_once():
    test_logger = logging.getLogger("mes_dashboard.watchdog.test.enabled")
    _reset_logger(test_logger)
    handler_one = logging.NullHandler()
    handler_two = logging.NullHandler()
    with patch("mes_dashboard.core.log_store.LOG_STORE_ENABLED", True), patch(
        "mes_dashboard.core.log_store.get_sqlite_log_handler",
        side_effect=[handler_one, handler_two],
    ) as handler_factory:
        first = attach_sqlite_log_handler(test_logger)
        second = attach_sqlite_log_handler(test_logger)
    assert first is True
    assert second is False
    assert handler_factory.call_count == 1
    assert handler_one in test_logger.handlers
    assert handler_two not in test_logger.handlers
    _reset_logger(test_logger)


def test_attach_sqlite_log_handler_disabled_skips_factory():
    test_logger = logging.getLogger("mes_dashboard.watchdog.test.disabled")
    _reset_logger(test_logger)
    with patch("mes_dashboard.core.log_store.LOG_STORE_ENABLED", False), patch(
        "mes_dashboard.core.log_store.get_sqlite_log_handler"
    ) as handler_factory:
        attached = attach_sqlite_log_handler(test_logger)
    assert attached is False
    handler_factory.assert_not_called()
    assert not test_logger.handlers
    _reset_logger(test_logger)


def test_attach_sqlite_log_handler_handles_handler_errors():
    test_logger = logging.getLogger("mes_dashboard.watchdog.test.error")
    _reset_logger(test_logger)
    with patch("mes_dashboard.core.log_store.LOG_STORE_ENABLED", True), patch(
        "mes_dashboard.core.log_store.get_sqlite_log_handler",
        side_effect=RuntimeError("boom"),
    ):
        attached = attach_sqlite_log_handler(test_logger)
    assert attached is False
    assert not test_logger.handlers
    _reset_logger(test_logger)
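
Taken together, the three tests fully determine the helper's behaviour: attach at most once per logger, do nothing when the log store is disabled, and swallow factory errors so a broken log store can never take the watchdog down. A sketch consistent with all three (the real module may differ in detail; the late import is what lets the patched module attributes take effect):

```python
import logging


def attach_sqlite_log_handler(logger: logging.Logger) -> bool:
    """Attach the shared SQLite log handler at most once per logger.

    Returns True only when a handler was newly attached.
    """
    # Imported at call time so tests can patch the module attributes.
    from mes_dashboard.core import log_store

    if not log_store.LOG_STORE_ENABLED:
        return False
    if getattr(logger, "_watchdog_sqlite_handler_registered", False):
        # Idempotence: the factory runs once per logger at most.
        return False
    try:
        handler = log_store.get_sqlite_log_handler()
    except Exception:
        # Logging must never crash the watchdog; fall back to whatever
        # handlers are already configured.
        return False
    logger.addHandler(handler)
    logger._watchdog_sqlite_handler_registered = True
    return True
```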