From 07ced80fb07417cbb2fed7ebd8dc158fedbd7200 Mon Sep 17 00:00:00 2001 From: egg Date: Thu, 26 Feb 2026 09:48:54 +0800 Subject: [PATCH] feat(admin-perf): full Vue SPA migration + slow-query/memory monitoring gaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove Jinja2 template fallback (1249 lines) — /admin/performance now serves Vue SPA exclusively via send_from_directory. Backend: - Add _SLOW_QUERY_WAITING counter with get_slow_query_waiting_count() - Record slow-path latency in read_sql_df_slow/iter via record_query_latency() - Extend metrics_history schema with slow_query_active, slow_query_waiting, worker_rss_bytes columns + ALTER TABLE migration for existing DBs - Add cleanup_archive_logs() with configurable ARCHIVE_LOG_DIR/KEEP_COUNT - Integrate archive cleanup into MetricsHistoryCollector 50-min cycle Frontend: - Add slow_query_active and slow_query_waiting StatCards to connection pool - Add slow_query_active trend line to pool trend chart - Add Worker memory (RSS MB) trend chart with preprocessing - Update modernization gate check path to frontend style.css Co-Authored-By: Claude Opus 4.6 --- frontend/src/admin-performance/App.vue | 23 +- .../.openspec.yaml | 2 + .../design.md | 81 ++ .../proposal.md | 34 + .../specs/admin-performance-spa/spec.md | 37 + .../specs/archive-log-rotation/spec.md | 30 + .../specs/connection-pool-monitoring/spec.md | 29 + .../specs/metrics-history-trending/spec.md | 54 + .../specs/slow-query-observability/spec.md | 49 + .../specs/worker-memory-tracking/spec.md | 23 + .../tasks.md | 37 + openspec/specs/admin-performance-spa/spec.md | 137 +- openspec/specs/archive-log-rotation/spec.md | 30 + .../specs/connection-pool-monitoring/spec.md | 56 +- .../specs/metrics-history-trending/spec.md | 119 +- .../specs/slow-query-observability/spec.md | 49 + openspec/specs/worker-memory-tracking/spec.md | 23 + scripts/check_full_modernization_gates.py | 2 +- src/mes_dashboard/core/database.py | 33 
+- src/mes_dashboard/core/metrics_history.py | 94 +- src/mes_dashboard/routes/admin_routes.py | 5 +- .../templates/admin/performance.html | 1249 ----------------- 22 files changed, 740 insertions(+), 1456 deletions(-) create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/.openspec.yaml create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/design.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/proposal.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/admin-performance-spa/spec.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/archive-log-rotation/spec.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/connection-pool-monitoring/spec.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/metrics-history-trending/spec.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/slow-query-observability/spec.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/worker-memory-tracking/spec.md create mode 100644 openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/tasks.md create mode 100644 openspec/specs/archive-log-rotation/spec.md create mode 100644 openspec/specs/slow-query-observability/spec.md create mode 100644 openspec/specs/worker-memory-tracking/spec.md delete mode 100644 src/mes_dashboard/templates/admin/performance.html diff --git a/frontend/src/admin-performance/App.vue b/frontend/src/admin-performance/App.vue index 9732407..a7b389f 100644 --- a/frontend/src/admin-performance/App.vue +++ b/frontend/src/admin-performance/App.vue @@ -160,6 +160,8 @@ + + @@ -175,6 +177,15 @@ :series="poolTrendSeries" 
/> + + +

Worker 控制

@@ -450,7 +461,12 @@ async function loadWorkerStatus() { async function loadPerformanceHistory() { try { const res = await apiGet('/admin/api/performance-history', { params: { minutes: 30 } }); - historyData.value = res?.data?.snapshots || []; + const snapshots = res?.data?.snapshots || []; + // Pre-process: convert worker_rss_bytes to worker_rss_mb for trend chart + historyData.value = snapshots.map((s) => ({ + ...s, + worker_rss_mb: s.worker_rss_bytes ? Math.round(s.worker_rss_bytes / 1048576 * 10) / 10 : 0, + })); } catch (e) { console.error('Failed to load performance history:', e); } @@ -460,6 +476,7 @@ async function loadPerformanceHistory() { const poolTrendSeries = [ { name: '飽和度', key: 'pool_saturation', color: '#6366f1' }, { name: '使用中', key: 'pool_checked_out', color: '#f59e0b' }, + { name: '慢查詢執行中', key: 'slow_query_active', color: '#ef4444' }, ]; const latencyTrendSeries = [ @@ -478,6 +495,10 @@ const hitRateTrendSeries = [ { name: 'L2 命中率', key: 'rc_l2_hit_rate', color: '#f59e0b' }, ]; +const memoryTrendSeries = [ + { name: 'RSS (MB)', key: 'worker_rss_mb', color: '#8b5cf6' }, +]; + async function refreshAll() { loading.value = true; try { diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/.openspec.yaml b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/.openspec.yaml new file mode 100644 index 0000000..85ae75c --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-02-26 diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/design.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/design.md new file mode 100644 index 0000000..c2f3680 --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/design.md @@ -0,0 +1,81 @@ +## Context + +2026-02-25 的 server crash 暴露出 pool 
隔離架構變更後的監控盲區。event_fetcher 和 lineage_engine 已遷移到 `read_sql_df_slow`(獨立連線 + semaphore),但 metrics_history 快照只記錄 pool 相關指標,slow query 並行數、排隊數、Worker RSS 完全無歷史紀錄。 + +同時 `/admin/performance` 仍保留 1249 行 Jinja template 作為 Vue SPA fallback,但 SPA 已是唯一使用的版本(build artifact 存在於 `static/dist/admin-performance.html`),兩套 UI 增加維護成本且 Jinja 版功能遠不及 SPA。 + +`logs/archive/` 目錄累積 rotated log 檔案無自動清理,是唯一會無限增長的儲存。 + +## Goals / Non-Goals + +**Goals:** +- 移除 Jinja fallback,統一為 Vue SPA 單一架構 +- 讓 slow query 並行數、排隊數、Worker RSS 成為可觀測的歷史趨勢指標 +- 讓 P50/P95/P99 反映所有查詢路徑(pool + slow path) +- 解決 archive log 無限增長問題 + +**Non-Goals:** +- 不修改 `/admin/pages`(仍為 Jinja template) +- 不新增 async job queue 面板(P1,後續 change 處理) +- 不新增 event cache hit/miss 計數器(P2) +- 不增加即時告警或 webhook 通知機制 + +## Decisions + +### D1:SQLite schema migration 策略 + +**選擇**:啟動時執行 `ALTER TABLE ADD COLUMN IF NOT EXISTS`(容錯 "duplicate column" error) + +**替代方案**:version table + migration script → 過度工程,SQLite 只有 3 天保留,加欄是向後相容的 + +**理由**:新欄位 nullable,舊 row 自動為 NULL,不影響既有查詢。MetricsHistoryStore.initialize() 已在啟動時執行 CREATE TABLE IF NOT EXISTS,加入 ALTER TABLE 語句自然整合。 + +### D2:RSS 記憶體取得方式 + +**選擇**:`resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024`(Python stdlib,Linux 上單位為 KB) + +**替代方案 A**:讀取 `/proc/self/status` VmRSS → 平台相依,解析 overhead +**替代方案 B**:`psutil.Process().memory_info().rss` → 需新增外部依賴 + +**理由**:`resource` 模組為 Python 標準庫,無需額外依賴。`ru_maxrss` 在 Linux 上返回 KB,乘以 1024 轉為 bytes。 + +### D3:Semaphore 排隊計數器實作 + +**選擇**:在 `read_sql_df_slow()` 的 semaphore.acquire() 前後遞增/遞減 `_SLOW_QUERY_WAITING` atomic counter + +**流程**: +``` +_SLOW_QUERY_WAITING += 1 +acquired = semaphore.acquire(timeout=60) +_SLOW_QUERY_WAITING -= 1 +if not acquired: raise RuntimeError +_SLOW_QUERY_ACTIVE += 1 +... execute query ... 
+_SLOW_QUERY_ACTIVE -= 1 +``` + +**理由**:與既有 `_SLOW_QUERY_ACTIVE` 模式一致,使用 threading.Lock 保護。 + +### D4:Archive log cleanup 整合位置 + +**選擇**:整合到 `MetricsHistoryCollector._run()` 的 cleanup cycle(每 ~100 intervals ≈ 50 分鐘) + +**替代方案**:獨立 cron job → 需額外 crontab 配置,不自包含 + +**理由**:已有 daemon thread 定期 cleanup SQLite,加入 archive cleanup 邏輯一致且自包含。 + +### D5:移除 Jinja fallback 的安全性 + +**選擇**:直接移除 fallback,admin_routes.py 改為只 `send_from_directory(dist_dir, "admin-performance.html")` + +**理由**: +- Vue SPA build artifact 已存在(`static/dist/admin-performance.html`,2026-02-26 更新) +- `frontend/package.json` build script 已包含 admin-performance entry +- CI/deploy 流程必包含 `npx vite build` +- 若 build 失敗,`/health/frontend-shell` 已有 asset readiness 檢查可偵測 + +## Risks / Trade-offs + +- **[Risk] Build 失敗時 /admin/performance 返回 404** → 既有 `/health/frontend-shell` 檢查 + deploy script 驗證。移除 fallback 反而讓問題更早暴露。 +- **[Risk] ALTER TABLE 在 SQLite 大表上可能慢** → metrics_history 最多 50K rows,ALTER TABLE 即時完成。 +- **[Trade-off] `ru_maxrss` 是 peak RSS,非 current RSS** → 在 Linux 上 `ru_maxrss` 是 process lifetime 的 max RSS。改用 `/proc/self/status` 的 VmRSS 可取得 current,但需 file I/O。鑑於每 30 秒收集一次且 max RSS 更能反映記憶體壓力,接受此 trade-off。若日後需要 current RSS,可改讀 `/proc/self/status`。 diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/proposal.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/proposal.md new file mode 100644 index 0000000..d58c08f --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/proposal.md @@ -0,0 +1,34 @@ +## Why + +2026-02-25 server crash 暴露出管理員效能監控頁面在 pool 隔離架構變更後的關鍵盲區:slow query 並行數、slow-path 延遲、Worker 記憶體等核心指標既未收集也未顯示,導致 crash 前完全無法觀測系統真實負載。同時,`/admin/performance` 仍保留 1249 行的 Jinja template 作為 fallback,與已完成的 Vue SPA 遷移架構不一致,增加維護成本。 + +## What Changes + +- **移除** Jinja template `templates/admin/performance.html`,`/admin/performance` 路由直接服務 Vue SPA(`static/dist/admin-performance.html`),不再有 fallback 邏輯 +- 
**新增** `slow_query_active`、`slow_query_waiting`、`worker_rss_bytes` 三個欄位到 `metrics_history.sqlite` 快照,含 SQLite schema migration +- **新增** semaphore 排隊計數器(`_SLOW_QUERY_WAITING`),追蹤等待 slow query semaphore 的 thread 數量 +- **修正** `read_sql_df_slow()` 和 `read_sql_df_slow_iter()` 將查詢延遲記錄到 `QueryMetrics`,使 P50/P95/P99 反映所有查詢路徑 +- **新增** Vue SPA 連線池區塊顯示「慢查詢執行中」「慢查詢排隊中」指標 + 連線池趨勢圖加入 slow_query_active 線 + Worker 記憶體趨勢圖 +- **新增** archive log 自動清理機制,整合到既有 `MetricsHistoryCollector` 的 cleanup cycle + +## Capabilities + +### New Capabilities + +- `slow-query-observability`: 追蹤 slow query 並行數、排隊數、延遲,寫入 metrics history 並在前端顯示趨勢 +- `worker-memory-tracking`: 追蹤 Worker RSS 記憶體,寫入 metrics history 並在前端顯示趨勢 +- `archive-log-rotation`: logs/archive/ 目錄的自動清理機制,防止檔案無限增長 + +### Modified Capabilities + +- `admin-performance-spa`: 移除 Jinja template fallback,完全遷移至 Vue SPA,新增 slow query 與記憶體監控面板 +- `metrics-history-trending`: 擴充 snapshot schema 加入 slow_query_active、slow_query_waiting、worker_rss_bytes +- `connection-pool-monitoring`: 新增 semaphore 排隊計數器,slow-path 延遲納入 QueryMetrics + +## Impact + +- **後端**:`core/database.py`(排隊計數器 + latency 記錄)、`core/metrics_history.py`(schema 擴充 + archive cleanup)、`routes/admin_routes.py`(移除 fallback) +- **前端**:`frontend/src/admin-performance/App.vue`(新面板 + 趨勢圖)→ 需 rebuild +- **刪除**:`templates/admin/performance.html`(1249 行) +- **資料**:既有 `metrics_history.sqlite` 需 ALTER TABLE 加欄(向後相容,新欄位 nullable) +- **測試**:既有 `test_performance_integration.py` 已測試 SPA 路徑,無需修改;需新增 schema migration 測試 diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/admin-performance-spa/spec.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/admin-performance-spa/spec.md new file mode 100644 index 0000000..843dfc2 --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/admin-performance-spa/spec.md @@ -0,0 +1,37 @@ +## MODIFIED Requirements + +### Requirement: Vue 3 SPA 
page replaces Jinja2 template +The `/admin/performance` route SHALL serve the Vite-built `admin-performance.html` static file directly. The Jinja2 template fallback SHALL be removed. If the SPA build artifact does not exist, the server SHALL return a standard HTTP error (no fallback rendering). + +#### Scenario: Page loads as Vue SPA +- **WHEN** user navigates to `/admin/performance` +- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file via `send_from_directory` + +#### Scenario: Portal-shell integration +- **WHEN** the portal-shell renders `/admin/performance` +- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe) + +#### Scenario: Build artifact missing +- **WHEN** the SPA build artifact `admin-performance.html` does not exist in `static/dist/` +- **THEN** the server SHALL return an HTTP error (no Jinja2 fallback) + +### Requirement: Connection pool panel +The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, direct connection count, slow_query_active, and slow_query_waiting. + +#### Scenario: Pool under normal load +- **WHEN** pool saturation is below 80% +- **THEN** the GaugeBar SHALL display in a normal color (green/blue) + +#### Scenario: Pool near saturation +- **WHEN** pool saturation exceeds 80% +- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red) + +#### Scenario: Slow query metrics displayed +- **WHEN** `db_pool.status` includes `slow_query_active` and `slow_query_waiting` +- **THEN** the panel SHALL display StatCards for both values + +## REMOVED Requirements + +### Requirement: Jinja2 template fallback for performance page +**Reason**: The Vue SPA is the sole UI. Maintaining a 1249-line Jinja template as fallback adds maintenance burden and feature divergence. +**Migration**: Delete `templates/admin/performance.html`. 
The route handler serves the SPA directly. diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/archive-log-rotation/spec.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/archive-log-rotation/spec.md new file mode 100644 index 0000000..a8bbbeb --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/archive-log-rotation/spec.md @@ -0,0 +1,30 @@ +## ADDED Requirements + +### Requirement: Automatic archive log cleanup +The system SHALL provide a `cleanup_archive_logs()` function in `core/metrics_history.py` that deletes old rotated log files from `logs/archive/`, keeping the most recent N files per log type (access, error, watchdog, rq_worker, startup). + +#### Scenario: Cleanup keeps recent files +- **WHEN** `cleanup_archive_logs()` is called with `keep_per_type=20` and there are 30 access_*.log files +- **THEN** 10 oldest access_*.log files SHALL be deleted, keeping the 20 most recent by modification time + +#### Scenario: No excess files +- **WHEN** `cleanup_archive_logs()` is called and each type has fewer than `keep_per_type` files +- **THEN** no files SHALL be deleted + +#### Scenario: Archive directory missing +- **WHEN** `cleanup_archive_logs()` is called and the archive directory does not exist +- **THEN** the function SHALL return 0 without error + +### Requirement: Archive cleanup integrated into collector cycle +The `MetricsHistoryCollector` SHALL call `cleanup_archive_logs()` alongside the existing SQLite cleanup, running approximately every 50 minutes (every 100 collection intervals). 
+ +#### Scenario: Periodic cleanup executes +- **WHEN** the cleanup counter reaches 100 intervals +- **THEN** both SQLite metrics cleanup and archive log cleanup SHALL execute + +### Requirement: Archive cleanup configuration +The archive log cleanup SHALL be configurable via environment variables: `ARCHIVE_LOG_DIR` (default: `logs/archive`) and `ARCHIVE_LOG_KEEP_COUNT` (default: 20). + +#### Scenario: Custom keep count +- **WHEN** `ARCHIVE_LOG_KEEP_COUNT=10` is set +- **THEN** cleanup SHALL keep only the 10 most recent files per type diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/connection-pool-monitoring/spec.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/connection-pool-monitoring/spec.md new file mode 100644 index 0000000..4902f47 --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/connection-pool-monitoring/spec.md @@ -0,0 +1,29 @@ +## MODIFIED Requirements + +### Requirement: Connection pool status in performance detail +The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation, slow_query_active, slow_query_waiting) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`. 
+ +#### Scenario: Pool status retrieved +- **WHEN** the API is called +- **THEN** `db_pool.status` SHALL contain current pool utilization metrics including `slow_query_active` and `slow_query_waiting`, and `db_pool.config` SHALL contain the pool configuration values + +#### Scenario: Saturation calculation +- **WHEN** the pool has 8 checked_out connections and max_capacity is 30 +- **THEN** saturation SHALL be reported as approximately 26.7% + +#### Scenario: Slow query waiting included +- **WHEN** 2 threads are waiting for the slow query semaphore +- **THEN** `db_pool.status.slow_query_waiting` SHALL be 2 + +## ADDED Requirements + +### Requirement: Slow-path query latency included in QueryMetrics +The `read_sql_df_slow()` and `read_sql_df_slow_iter()` functions SHALL call `record_query_latency()` with the total elapsed time upon completion, ensuring P50/P95/P99 percentiles reflect queries from all paths (pooled and slow/direct). + +#### Scenario: Slow query latency recorded +- **WHEN** `read_sql_df_slow()` completes a query in 8.5 seconds +- **THEN** `record_query_latency(8.5)` SHALL be called and the value SHALL appear in subsequent `get_percentiles()` results + +#### Scenario: Slow iter latency recorded +- **WHEN** `read_sql_df_slow_iter()` completes streaming in 45 seconds +- **THEN** `record_query_latency(45.0)` SHALL be called in the finally block diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/metrics-history-trending/spec.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/metrics-history-trending/spec.md new file mode 100644 index 0000000..5e1c4b6 --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/metrics-history-trending/spec.md @@ -0,0 +1,54 @@ +## MODIFIED Requirements + +### Requirement: SQLite metrics history store +The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists 
metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`. The schema SHALL include columns for `slow_query_active` (INTEGER), `slow_query_waiting` (INTEGER), and `worker_rss_bytes` (INTEGER) in addition to the existing pool, Redis, route cache, and latency columns. + +#### Scenario: Write and query snapshots +- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency/slow_query/memory metrics +- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp, worker PID, and all metric columns + +#### Scenario: Query by time range +- **WHEN** `query_snapshots(minutes=30)` is called +- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending, including the new columns + +#### Scenario: Retention cleanup +- **WHEN** `cleanup()` is called +- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000) + +#### Scenario: Thread safety +- **WHEN** multiple threads write snapshots concurrently +- **THEN** the write lock SHALL serialize writes and prevent database corruption + +#### Scenario: Schema migration for existing databases +- **WHEN** the store initializes on an existing database without the new columns +- **THEN** it SHALL execute ALTER TABLE ADD COLUMN for each missing column, tolerating "duplicate column" errors + +### Requirement: Background metrics collector +The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var). The collector SHALL include `slow_query_active`, `slow_query_waiting`, and `worker_rss_bytes` in each snapshot. 
+ +#### Scenario: Automatic collection +- **WHEN** the collector is started via `start_metrics_history(app)` +- **THEN** it SHALL collect pool status (including slow_query_active and slow_query_waiting), Redis info, route cache status, query latency metrics, and worker RSS memory every interval and write them to the store + +#### Scenario: Graceful shutdown +- **WHEN** `stop_metrics_history()` is called +- **THEN** the collector thread SHALL stop within one interval period + +#### Scenario: Subsystem unavailability +- **WHEN** a subsystem (e.g., Redis) is unavailable during collection +- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics + +### Requirement: Frontend trend charts +The system SHALL display 5 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts: connection pool saturation, query latency (P50/P95/P99), Redis memory, cache hit rates, and worker memory. + +#### Scenario: Trend charts with data +- **WHEN** historical snapshots contain more than 1 data point +- **THEN** the dashboard SHALL display trend charts for: connection pool saturation (including slow_query_active), query latency (P50/P95/P99), Redis memory, cache hit rates, and worker memory (RSS in MB) + +#### Scenario: Trend charts without data +- **WHEN** historical snapshots are empty or contain only 1 data point +- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`) + +#### Scenario: Auto-refresh +- **WHEN** the dashboard auto-refreshes +- **THEN** historical data SHALL also be refreshed alongside real-time metrics diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/slow-query-observability/spec.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/slow-query-observability/spec.md new file mode 100644 index 0000000..fb3e130 --- /dev/null +++ 
b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/slow-query-observability/spec.md @@ -0,0 +1,49 @@ +## ADDED Requirements + +### Requirement: Slow query active count in metrics history snapshots +The `MetricsHistoryCollector` SHALL include `slow_query_active` in each 30-second snapshot, recording the number of slow queries currently executing via dedicated connections. + +#### Scenario: Snapshot includes slow_query_active +- **WHEN** the collector writes a snapshot while 3 slow queries are executing +- **THEN** the `slow_query_active` column SHALL contain the value 3 + +#### Scenario: No slow queries active +- **WHEN** the collector writes a snapshot while no slow queries are executing +- **THEN** the `slow_query_active` column SHALL contain the value 0 + +### Requirement: Slow query waiting count tracked and persisted +The system SHALL maintain a thread-safe counter `_SLOW_QUERY_WAITING` in `database.py` that tracks the number of threads currently waiting to acquire the slow query semaphore. This counter SHALL be included in `get_pool_status()` and persisted to metrics history snapshots. 
+ +#### Scenario: Counter increments on semaphore wait +- **WHEN** a thread enters `read_sql_df_slow()` and the semaphore is full +- **THEN** `_SLOW_QUERY_WAITING` SHALL be incremented before `semaphore.acquire()` and decremented after acquire completes (success or timeout) + +#### Scenario: Counter in pool status API +- **WHEN** `get_pool_status()` is called +- **THEN** the returned dict SHALL include `slow_query_waiting` with the current waiting thread count + +#### Scenario: Counter persisted to metrics history +- **WHEN** the collector writes a snapshot +- **THEN** the `slow_query_waiting` column SHALL reflect the count at snapshot time + +### Requirement: Slow-path query latency recorded in QueryMetrics +The `read_sql_df_slow()` and `read_sql_df_slow_iter()` functions SHALL call `record_query_latency()` with the elapsed query time, so that P50/P95/P99 metrics reflect all query paths (pool + slow). + +#### Scenario: Slow query latency appears in percentiles +- **WHEN** a `read_sql_df_slow()` call completes in 5.2 seconds +- **THEN** `record_query_latency(5.2)` SHALL be called and the latency SHALL appear in subsequent `get_percentiles()` results + +#### Scenario: Slow iter latency recorded on completion +- **WHEN** a `read_sql_df_slow_iter()` generator completes after yielding all batches in 120 seconds total +- **THEN** `record_query_latency(120.0)` SHALL be called in the finally block + +### Requirement: Slow query metrics displayed in Vue SPA +The admin performance Vue SPA SHALL display `slow_query_active` and `slow_query_waiting` as StatCards in the connection pool panel, and include `slow_query_active` as a trend line in the connection pool trend chart. 
+ +#### Scenario: StatCards display current values +- **WHEN** the performance-detail API returns `db_pool.status.slow_query_active = 4` and `db_pool.status.slow_query_waiting = 2` +- **THEN** the connection pool panel SHALL display StatCards showing "慢查詢執行中: 4" and "慢查詢排隊中: 2" + +#### Scenario: Trend chart includes slow_query_active +- **WHEN** historical snapshots contain `slow_query_active` data points +- **THEN** the connection pool trend chart SHALL include a "慢查詢執行中" line series diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/worker-memory-tracking/spec.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/worker-memory-tracking/spec.md new file mode 100644 index 0000000..73c3716 --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/specs/worker-memory-tracking/spec.md @@ -0,0 +1,23 @@ +## ADDED Requirements + +### Requirement: Worker RSS memory in metrics history snapshots +The `MetricsHistoryCollector` SHALL include `worker_rss_bytes` in each 30-second snapshot, recording the current worker process peak RSS memory using Python's `resource.getrusage()`. + +#### Scenario: RSS recorded in snapshot +- **WHEN** the collector writes a snapshot and the worker process has 256 MB peak RSS +- **THEN** the `worker_rss_bytes` column SHALL contain approximately 268435456 + +#### Scenario: RSS collection failure +- **WHEN** `resource.getrusage()` raises an exception +- **THEN** the collector SHALL write NULL for `worker_rss_bytes` and continue collecting other metrics + +### Requirement: Worker memory trend chart in Vue SPA +The admin performance Vue SPA SHALL display a "Worker 記憶體趨勢" TrendChart showing RSS memory over time in megabytes. 
+ +#### Scenario: Memory trend displayed +- **WHEN** historical snapshots contain `worker_rss_bytes` data with more than 1 data point +- **THEN** the dashboard SHALL display a TrendChart with RSS values converted to MB + +#### Scenario: No memory data +- **WHEN** historical snapshots do not contain `worker_rss_bytes` data (all NULL) +- **THEN** the trend chart SHALL show "趨勢資料不足" message diff --git a/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/tasks.md b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/tasks.md new file mode 100644 index 0000000..e35cfd7 --- /dev/null +++ b/openspec/changes/archive/2026-02-26-admin-perf-vue-migration-monitoring-gaps/tasks.md @@ -0,0 +1,37 @@ +## 1. 後端:Semaphore 排隊計數器 + Slow-path latency + +- [x] 1.1 在 `src/mes_dashboard/core/database.py` 新增 `_SLOW_QUERY_WAITING` counter 和 `get_slow_query_waiting_count()` 函數 +- [x] 1.2 修改 `read_sql_df_slow()` 在 semaphore.acquire() 前後遞增/遞減 `_SLOW_QUERY_WAITING` +- [x] 1.3 修改 `read_sql_df_slow_iter()` 同上加入 waiting counter 邏輯 +- [x] 1.4 修改 `get_pool_status()` 回傳中加入 `slow_query_waiting` 欄位 +- [x] 1.5 在 `read_sql_df_slow()` finally block 呼叫 `record_query_latency(elapsed)` +- [x] 1.6 在 `read_sql_df_slow_iter()` finally block 呼叫 `record_query_latency(elapsed)` + +## 2. 
後端:metrics_history schema 擴充 + archive cleanup + +- [x] 2.1 在 `src/mes_dashboard/core/metrics_history.py` 的 schema 新增 `slow_query_active INTEGER`, `slow_query_waiting INTEGER`, `worker_rss_bytes INTEGER` 欄位 +- [x] 2.2 在 `MetricsHistoryStore.initialize()` 加入 ALTER TABLE ADD COLUMN migration(容錯 duplicate column) +- [x] 2.3 更新 `COLUMNS` list 加入新欄位 +- [x] 2.4 更新 `write_snapshot()` 加入新欄位的讀取和 INSERT +- [x] 2.5 更新 `_collect_snapshot()` 收集 `slow_query_active`、`slow_query_waiting`(從 `get_pool_status()`)和 `worker_rss_bytes`(從 `resource.getrusage()`) +- [x] 2.6 新增 `cleanup_archive_logs(archive_dir, keep_per_type)` 函數,含 `ARCHIVE_LOG_DIR` 和 `ARCHIVE_LOG_KEEP_COUNT` env var 配置 +- [x] 2.7 在 `MetricsHistoryCollector._run()` 的 cleanup cycle 呼叫 `cleanup_archive_logs()` + +## 3. 後端:移除 Jinja fallback + +- [x] 3.1 修改 `src/mes_dashboard/routes/admin_routes.py` 的 `performance()` 路由,移除 Jinja fallback 邏輯(改為直接 `send_from_directory`) +- [x] 3.2 刪除 `src/mes_dashboard/templates/admin/performance.html` +- [x] 3.3 更新 `scripts/check_full_modernization_gates.py` 將 `/admin/performance` 的 gate check 從 template 路徑改為 `frontend/src/admin-performance/style.css` + +## 4. 前端:Vue SPA 新增監控面板 + +- [x] 4.1 在 `frontend/src/admin-performance/App.vue` 連線池 section 新增 `slow_query_active` 和 `slow_query_waiting` StatCards +- [x] 4.2 在 `poolTrendSeries` 加入 `slow_query_active` 趨勢線 +- [x] 4.3 新增 `memoryTrendSeries` 定義和 Worker 記憶體 TrendChart 組件 +- [x] 4.4 新增 `historyData` 預處理邏輯:將 `worker_rss_bytes` 轉為 `worker_rss_mb` + +## 5. 
Build + 測試驗證 + +- [x] 5.1 執行 `cd frontend && npx vite build` 確認 build 成功 +- [x] 5.2 執行 `python -m pytest tests/ -v --tb=short` 確認既有測試通過 +- [x] 5.3 確認 `test_performance_page_loads` 測試通過(SPA 路徑驗證) diff --git a/openspec/specs/admin-performance-spa/spec.md b/openspec/specs/admin-performance-spa/spec.md index 431b7a0..843dfc2 100644 --- a/openspec/specs/admin-performance-spa/spec.md +++ b/openspec/specs/admin-performance-spa/spec.md @@ -1,100 +1,37 @@ -## ADDED Requirements - -### Requirement: Vue 3 SPA page replaces Jinja2 template -The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route. - -#### Scenario: Page loads as Vue SPA -- **WHEN** user navigates to `/admin/performance` -- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template) - -#### Scenario: Portal-shell integration -- **WHEN** the portal-shell renders `/admin/performance` -- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe) - -### Requirement: Status cards display system health -The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value. 
- -#### Scenario: All systems healthy -- **WHEN** all backend systems report healthy status via `/admin/api/system-status` -- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values - -#### Scenario: Redis disabled -- **WHEN** Redis is disabled (`REDIS_ENABLED=false`) -- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message - -### Requirement: Query performance panel with ECharts -The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation. - -#### Scenario: Metrics loaded successfully -- **WHEN** `/admin/api/metrics` returns valid performance data -- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution - -#### Scenario: No metrics data -- **WHEN** `/admin/api/metrics` returns empty or null metrics -- **THEN** the panel SHALL display placeholder text indicating no data available - -### Requirement: Redis cache detail panel -The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table. 
- -#### Scenario: Redis active with data -- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts -- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count - -#### Scenario: Redis disabled -- **WHEN** Redis is disabled -- **THEN** the Redis detail panel SHALL display a disabled state message without errors - -### Requirement: Memory cache panel -The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads). - -#### Scenario: Multiple caches registered -- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries -- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description - -#### Scenario: Route cache telemetry -- **WHEN** `/admin/api/performance-detail` returns route_cache data -- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads - -### Requirement: Connection pool panel -The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count. - -#### Scenario: Pool under normal load -- **WHEN** pool saturation is below 80% -- **THEN** the GaugeBar SHALL display in a normal color (green/blue) - -#### Scenario: Pool near saturation -- **WHEN** pool saturation exceeds 80% -- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red) - -### Requirement: Worker control panel -The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal. 
- -#### Scenario: Restart worker -- **WHEN** user clicks the restart button and confirms in the modal -- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result - -#### Scenario: Restart during cooldown -- **WHEN** worker is in cooldown period -- **THEN** the restart button SHALL be disabled with a cooldown indicator - -### Requirement: System logs panel with filtering and pagination -The dashboard SHALL display system logs with level filtering, text search, and pagination controls. - -#### Scenario: Filter by log level -- **WHEN** user selects a specific log level filter -- **THEN** only logs matching that level SHALL be displayed - -#### Scenario: Paginate logs -- **WHEN** logs exceed the page size -- **THEN** pagination controls SHALL allow navigating between pages - -### Requirement: Auto-refresh with toggle -The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh. - -#### Scenario: Auto-refresh enabled -- **WHEN** auto-refresh is enabled (default) -- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch - -#### Scenario: Manual refresh -- **WHEN** user clicks the manual refresh button -- **THEN** all panels SHALL immediately refresh their data +## MODIFIED Requirements + +### Requirement: Vue 3 SPA page replaces Jinja2 template +The `/admin/performance` route SHALL serve the Vite-built `admin-performance.html` static file directly. The Jinja2 template fallback SHALL be removed. If the SPA build artifact does not exist, the server SHALL return a standard HTTP error (no fallback rendering). 
+ +#### Scenario: Page loads as Vue SPA +- **WHEN** user navigates to `/admin/performance` +- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file via `send_from_directory` + +#### Scenario: Portal-shell integration +- **WHEN** the portal-shell renders `/admin/performance` +- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe) + +#### Scenario: Build artifact missing +- **WHEN** the SPA build artifact `admin-performance.html` does not exist in `static/dist/` +- **THEN** the server SHALL return an HTTP error (no Jinja2 fallback) + +### Requirement: Connection pool panel +The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, direct connection count, slow_query_active, and slow_query_waiting. + +#### Scenario: Pool under normal load +- **WHEN** pool saturation is below 80% +- **THEN** the GaugeBar SHALL display in a normal color (green/blue) + +#### Scenario: Pool near saturation +- **WHEN** pool saturation exceeds 80% +- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red) + +#### Scenario: Slow query metrics displayed +- **WHEN** `db_pool.status` includes `slow_query_active` and `slow_query_waiting` +- **THEN** the panel SHALL display StatCards for both values + +## REMOVED Requirements + +### Requirement: Jinja2 template fallback for performance page +**Reason**: The Vue SPA is the sole UI. Maintaining a 1249-line Jinja template as fallback adds maintenance burden and feature divergence. +**Migration**: Delete `templates/admin/performance.html`. The route handler serves the SPA directly. 
diff --git a/openspec/specs/archive-log-rotation/spec.md b/openspec/specs/archive-log-rotation/spec.md new file mode 100644 index 0000000..a8bbbeb --- /dev/null +++ b/openspec/specs/archive-log-rotation/spec.md @@ -0,0 +1,30 @@ +## ADDED Requirements + +### Requirement: Automatic archive log cleanup +The system SHALL provide a `cleanup_archive_logs()` function in `core/metrics_history.py` that deletes old rotated log files from `logs/archive/`, keeping the most recent N files per log type (access, error, watchdog, rq_worker, startup). + +#### Scenario: Cleanup keeps recent files +- **WHEN** `cleanup_archive_logs()` is called with `keep_per_type=20` and there are 30 access_*.log files +- **THEN** 10 oldest access_*.log files SHALL be deleted, keeping the 20 most recent by modification time + +#### Scenario: No excess files +- **WHEN** `cleanup_archive_logs()` is called and each type has fewer than `keep_per_type` files +- **THEN** no files SHALL be deleted + +#### Scenario: Archive directory missing +- **WHEN** `cleanup_archive_logs()` is called and the archive directory does not exist +- **THEN** the function SHALL return 0 without error + +### Requirement: Archive cleanup integrated into collector cycle +The `MetricsHistoryCollector` SHALL call `cleanup_archive_logs()` alongside the existing SQLite cleanup, running approximately every 50 minutes (every 100 collection intervals). + +#### Scenario: Periodic cleanup executes +- **WHEN** the cleanup counter reaches 100 intervals +- **THEN** both SQLite metrics cleanup and archive log cleanup SHALL execute + +### Requirement: Archive cleanup configuration +The archive log cleanup SHALL be configurable via environment variables: `ARCHIVE_LOG_DIR` (default: `logs/archive`) and `ARCHIVE_LOG_KEEP_COUNT` (default: 20). 
+ +#### Scenario: Custom keep count +- **WHEN** `ARCHIVE_LOG_KEEP_COUNT=10` is set +- **THEN** cleanup SHALL keep only the 10 most recent files per type diff --git a/openspec/specs/connection-pool-monitoring/spec.md b/openspec/specs/connection-pool-monitoring/spec.md index ae71a86..4902f47 100644 --- a/openspec/specs/connection-pool-monitoring/spec.md +++ b/openspec/specs/connection-pool-monitoring/spec.md @@ -1,27 +1,29 @@ -## ADDED Requirements - -### Requirement: Connection pool status in performance detail -The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`. - -#### Scenario: Pool status retrieved -- **WHEN** the API is called -- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values - -#### Scenario: Saturation calculation -- **WHEN** the pool has 8 checked_out connections and max_capacity is 30 -- **THEN** saturation SHALL be reported as approximately 26.7% - -### Requirement: Direct Oracle connection counter -The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection. 
- -#### Scenario: Counter increments on direct connection -- **WHEN** `get_db_connection()` successfully creates a connection -- **THEN** the direct connection counter SHALL increment by 1 - -#### Scenario: Counter in performance detail -- **WHEN** the performance-detail API is called -- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID) - -#### Scenario: Counter is per-worker -- **WHEN** multiple gunicorn workers are running -- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker +## MODIFIED Requirements + +### Requirement: Connection pool status in performance detail +The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation, slow_query_active, slow_query_waiting) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`. + +#### Scenario: Pool status retrieved +- **WHEN** the API is called +- **THEN** `db_pool.status` SHALL contain current pool utilization metrics including `slow_query_active` and `slow_query_waiting`, and `db_pool.config` SHALL contain the pool configuration values + +#### Scenario: Saturation calculation +- **WHEN** the pool has 8 checked_out connections and max_capacity is 30 +- **THEN** saturation SHALL be reported as approximately 26.7% + +#### Scenario: Slow query waiting included +- **WHEN** 2 threads are waiting for the slow query semaphore +- **THEN** `db_pool.status.slow_query_waiting` SHALL be 2 + +## ADDED Requirements + +### Requirement: Slow-path query latency included in QueryMetrics +The `read_sql_df_slow()` and `read_sql_df_slow_iter()` functions SHALL call `record_query_latency()` with the total elapsed time upon completion, ensuring P50/P95/P99 percentiles reflect queries from all paths (pooled and slow/direct). 
+ +#### Scenario: Slow query latency recorded +- **WHEN** `read_sql_df_slow()` completes a query in 8.5 seconds +- **THEN** `record_query_latency(8.5)` SHALL be called and the value SHALL appear in subsequent `get_percentiles()` results + +#### Scenario: Slow iter latency recorded +- **WHEN** `read_sql_df_slow_iter()` completes streaming in 45 seconds +- **THEN** `record_query_latency(45.0)` SHALL be called in the finally block diff --git a/openspec/specs/metrics-history-trending/spec.md b/openspec/specs/metrics-history-trending/spec.md index c13633b..5e1c4b6 100644 --- a/openspec/specs/metrics-history-trending/spec.md +++ b/openspec/specs/metrics-history-trending/spec.md @@ -1,65 +1,54 @@ -## ADDED Requirements - -### Requirement: SQLite metrics history store -The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`. 
- -#### Scenario: Write and query snapshots -- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics -- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID - -#### Scenario: Query by time range -- **WHEN** `query_snapshots(minutes=30)` is called -- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending - -#### Scenario: Retention cleanup -- **WHEN** `cleanup()` is called -- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000) - -#### Scenario: Thread safety -- **WHEN** multiple threads write snapshots concurrently -- **THEN** the write lock SHALL serialize writes and prevent database corruption - -### Requirement: Background metrics collector -The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var). - -#### Scenario: Automatic collection -- **WHEN** the collector is started via `start_metrics_history(app)` -- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store - -#### Scenario: Graceful shutdown -- **WHEN** `stop_metrics_history()` is called -- **THEN** the collector thread SHALL stop within one interval period - -#### Scenario: Subsystem unavailability -- **WHEN** a subsystem (e.g., Redis) is unavailable during collection -- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics - -### Requirement: Performance history API endpoint -The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots. 
- -#### Scenario: Query with time range -- **WHEN** the API is called with `?minutes=30` -- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}` - -#### Scenario: Time range bounds -- **WHEN** `minutes` is less than 1 or greater than 180 -- **THEN** it SHALL be clamped to the range [1, 180] - -#### Scenario: Admin authentication -- **WHEN** the API is called without admin authentication -- **THEN** it SHALL be rejected by the `@admin_required` decorator - -### Requirement: Frontend trend charts -The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts. - -#### Scenario: Trend charts with data -- **WHEN** historical snapshots contain more than 1 data point -- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates - -#### Scenario: Trend charts without data -- **WHEN** historical snapshots are empty or contain only 1 data point -- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`) - -#### Scenario: Auto-refresh -- **WHEN** the dashboard auto-refreshes -- **THEN** historical data SHALL also be refreshed alongside real-time metrics +## MODIFIED Requirements + +### Requirement: SQLite metrics history store +The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`. The schema SHALL include columns for `slow_query_active` (INTEGER), `slow_query_waiting` (INTEGER), and `worker_rss_bytes` (INTEGER) in addition to the existing pool, Redis, route cache, and latency columns. 
+ +#### Scenario: Write and query snapshots +- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency/slow_query/memory metrics +- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp, worker PID, and all metric columns + +#### Scenario: Query by time range +- **WHEN** `query_snapshots(minutes=30)` is called +- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending, including the new columns + +#### Scenario: Retention cleanup +- **WHEN** `cleanup()` is called +- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000) + +#### Scenario: Thread safety +- **WHEN** multiple threads write snapshots concurrently +- **THEN** the write lock SHALL serialize writes and prevent database corruption + +#### Scenario: Schema migration for existing databases +- **WHEN** the store initializes on an existing database without the new columns +- **THEN** it SHALL execute ALTER TABLE ADD COLUMN for each missing column, tolerating "duplicate column" errors + +### Requirement: Background metrics collector +The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var). The collector SHALL include `slow_query_active`, `slow_query_waiting`, and `worker_rss_bytes` in each snapshot. 
+ +#### Scenario: Automatic collection +- **WHEN** the collector is started via `start_metrics_history(app)` +- **THEN** it SHALL collect pool status (including slow_query_active and slow_query_waiting), Redis info, route cache status, query latency metrics, and worker RSS memory every interval and write them to the store + +#### Scenario: Graceful shutdown +- **WHEN** `stop_metrics_history()` is called +- **THEN** the collector thread SHALL stop within one interval period + +#### Scenario: Subsystem unavailability +- **WHEN** a subsystem (e.g., Redis) is unavailable during collection +- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics + +### Requirement: Frontend trend charts +The system SHALL display 5 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts: connection pool saturation, query latency (P50/P95/P99), Redis memory, cache hit rates, and worker memory. + +#### Scenario: Trend charts with data +- **WHEN** historical snapshots contain more than 1 data point +- **THEN** the dashboard SHALL display trend charts for: connection pool saturation (including slow_query_active), query latency (P50/P95/P99), Redis memory, cache hit rates, and worker memory (RSS in MB) + +#### Scenario: Trend charts without data +- **WHEN** historical snapshots are empty or contain only 1 data point +- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`) + +#### Scenario: Auto-refresh +- **WHEN** the dashboard auto-refreshes +- **THEN** historical data SHALL also be refreshed alongside real-time metrics diff --git a/openspec/specs/slow-query-observability/spec.md b/openspec/specs/slow-query-observability/spec.md new file mode 100644 index 0000000..fb3e130 --- /dev/null +++ b/openspec/specs/slow-query-observability/spec.md @@ -0,0 +1,49 @@ +## ADDED Requirements + +### Requirement: Slow query active count in metrics history snapshots +The `MetricsHistoryCollector` SHALL include 
`slow_query_active` in each 30-second snapshot, recording the number of slow queries currently executing via dedicated connections. + +#### Scenario: Snapshot includes slow_query_active +- **WHEN** the collector writes a snapshot while 3 slow queries are executing +- **THEN** the `slow_query_active` column SHALL contain the value 3 + +#### Scenario: No slow queries active +- **WHEN** the collector writes a snapshot while no slow queries are executing +- **THEN** the `slow_query_active` column SHALL contain the value 0 + +### Requirement: Slow query waiting count tracked and persisted +The system SHALL maintain a thread-safe counter `_SLOW_QUERY_WAITING` in `database.py` that tracks the number of threads currently waiting to acquire the slow query semaphore. This counter SHALL be included in `get_pool_status()` and persisted to metrics history snapshots. + +#### Scenario: Counter increments on semaphore wait +- **WHEN** a thread enters `read_sql_df_slow()` and the semaphore is full +- **THEN** `_SLOW_QUERY_WAITING` SHALL be incremented before `semaphore.acquire()` and decremented after acquire completes (success or timeout) + +#### Scenario: Counter in pool status API +- **WHEN** `get_pool_status()` is called +- **THEN** the returned dict SHALL include `slow_query_waiting` with the current waiting thread count + +#### Scenario: Counter persisted to metrics history +- **WHEN** the collector writes a snapshot +- **THEN** the `slow_query_waiting` column SHALL reflect the count at snapshot time + +### Requirement: Slow-path query latency recorded in QueryMetrics +The `read_sql_df_slow()` and `read_sql_df_slow_iter()` functions SHALL call `record_query_latency()` with the elapsed query time, so that P50/P95/P99 metrics reflect all query paths (pool + slow). 
+ +#### Scenario: Slow query latency appears in percentiles +- **WHEN** a `read_sql_df_slow()` call completes in 5.2 seconds +- **THEN** `record_query_latency(5.2)` SHALL be called and the latency SHALL appear in subsequent `get_percentiles()` results + +#### Scenario: Slow iter latency recorded on completion +- **WHEN** a `read_sql_df_slow_iter()` generator completes after yielding all batches in 120 seconds total +- **THEN** `record_query_latency(120.0)` SHALL be called in the finally block + +### Requirement: Slow query metrics displayed in Vue SPA +The admin performance Vue SPA SHALL display `slow_query_active` and `slow_query_waiting` as StatCards in the connection pool panel, and include `slow_query_active` as a trend line in the connection pool trend chart. + +#### Scenario: StatCards display current values +- **WHEN** the performance-detail API returns `db_pool.status.slow_query_active = 4` and `db_pool.status.slow_query_waiting = 2` +- **THEN** the connection pool panel SHALL display StatCards showing "慢查詢執行中: 4" and "慢查詢排隊中: 2" + +#### Scenario: Trend chart includes slow_query_active +- **WHEN** historical snapshots contain `slow_query_active` data points +- **THEN** the connection pool trend chart SHALL include a "慢查詢執行中" line series diff --git a/openspec/specs/worker-memory-tracking/spec.md b/openspec/specs/worker-memory-tracking/spec.md new file mode 100644 index 0000000..73c3716 --- /dev/null +++ b/openspec/specs/worker-memory-tracking/spec.md @@ -0,0 +1,23 @@ +## ADDED Requirements + +### Requirement: Worker RSS memory in metrics history snapshots +The `MetricsHistoryCollector` SHALL include `worker_rss_bytes` in each 30-second snapshot, recording the current worker process peak RSS memory using Python's `resource.getrusage()`. 
+ +#### Scenario: RSS recorded in snapshot +- **WHEN** the collector writes a snapshot and the worker process has 256 MB peak RSS +- **THEN** the `worker_rss_bytes` column SHALL contain approximately 268435456 + +#### Scenario: RSS collection failure +- **WHEN** `resource.getrusage()` raises an exception +- **THEN** the collector SHALL write 0 for `worker_rss_bytes` and continue collecting other metrics + +### Requirement: Worker memory trend chart in Vue SPA +The admin performance Vue SPA SHALL display a "Worker 記憶體趨勢" TrendChart showing RSS memory over time in megabytes. + +#### Scenario: Memory trend displayed +- **WHEN** historical snapshots contain `worker_rss_bytes` data with more than 1 data point +- **THEN** the dashboard SHALL display a TrendChart with RSS values converted to MB + +#### Scenario: No memory data +- **WHEN** historical snapshots do not contain `worker_rss_bytes` data (all NULL) +- **THEN** the trend chart SHALL show "趨勢資料不足" message diff --git a/scripts/check_full_modernization_gates.py b/scripts/check_full_modernization_gates.py index 887ad1e..b4d6514 100755 --- a/scripts/check_full_modernization_gates.py +++ b/scripts/check_full_modernization_gates.py @@ -86,7 +86,7 @@ def _route_css_targets() -> dict[str, list[Path]]: "/qc-gate": [ROOT / "frontend/src/qc-gate/style.css"], "/job-query": [ROOT / "frontend/src/job-query/style.css"], "/admin/pages": [ROOT / "src/mes_dashboard/templates/admin/pages.html"], - "/admin/performance": [ROOT / "src/mes_dashboard/templates/admin/performance.html"], + "/admin/performance": [ROOT / "frontend/src/admin-performance/style.css"], "/tables": [ROOT / "frontend/src/tables/style.css"], "/excel-query": [ROOT / "frontend/src/excel-query/style.css"], "/query-tool": [ROOT / "frontend/src/query-tool/style.css"], diff --git a/src/mes_dashboard/core/database.py b/src/mes_dashboard/core/database.py index cbe8dc0..5d3dca1 100644 --- a/src/mes_dashboard/core/database.py +++ b/src/mes_dashboard/core/database.py @@ 
-233,6 +233,7 @@ def get_pool_status() -> Dict[str, Any]: "max_capacity": max_capacity, "saturation": saturation, "slow_query_active": get_slow_query_active_count(), + "slow_query_waiting": get_slow_query_waiting_count(), } @@ -436,6 +437,7 @@ _DIRECT_CONN_LOCK = threading.Lock() # Slow-query concurrency control _SLOW_QUERY_SEMAPHORE: Optional[threading.Semaphore] = None _SLOW_QUERY_ACTIVE = 0 +_SLOW_QUERY_WAITING = 0 _SLOW_QUERY_LOCK = threading.Lock() @@ -454,6 +456,11 @@ def get_slow_query_active_count() -> int: return _SLOW_QUERY_ACTIVE +def get_slow_query_waiting_count() -> int: + """Return the number of threads waiting for the slow-query semaphore.""" + return _SLOW_QUERY_WAITING + + def get_direct_connection_count() -> int: """Return total direct (non-pooled) connections since worker start.""" return _DIRECT_CONN_COUNTER @@ -627,7 +634,8 @@ def read_sql_df_slow( Returns: DataFrame with query results. """ - global _SLOW_QUERY_ACTIVE + global _SLOW_QUERY_ACTIVE, _SLOW_QUERY_WAITING + from mes_dashboard.core.metrics import record_query_latency runtime = get_db_runtime_config() if timeout_seconds is None: @@ -636,7 +644,13 @@ def read_sql_df_slow( timeout_ms = timeout_seconds * 1000 sem = _get_slow_query_semaphore() - acquired = sem.acquire(timeout=60) + with _SLOW_QUERY_LOCK: + _SLOW_QUERY_WAITING += 1 + try: + acquired = sem.acquire(timeout=60) + finally: + with _SLOW_QUERY_LOCK: + _SLOW_QUERY_WAITING -= 1 if not acquired: raise RuntimeError( "Slow-query concurrency limit reached; try again later" @@ -690,6 +704,8 @@ def read_sql_df_slow( ) raise finally: + elapsed = time.time() - start_time + record_query_latency(elapsed) if conn: try: conn.close() @@ -718,7 +734,8 @@ def read_sql_df_slow_iter( batch_size: Number of rows per fetchmany call. None = use DB_SLOW_FETCHMANY_SIZE from config (default 5000). 
""" - global _SLOW_QUERY_ACTIVE + global _SLOW_QUERY_ACTIVE, _SLOW_QUERY_WAITING + from mes_dashboard.core.metrics import record_query_latency runtime = get_db_runtime_config() if timeout_seconds is None: @@ -730,7 +747,13 @@ def read_sql_df_slow_iter( batch_size = runtime["slow_fetchmany_size"] sem = _get_slow_query_semaphore() - acquired = sem.acquire(timeout=60) + with _SLOW_QUERY_LOCK: + _SLOW_QUERY_WAITING += 1 + try: + acquired = sem.acquire(timeout=60) + finally: + with _SLOW_QUERY_LOCK: + _SLOW_QUERY_WAITING -= 1 if not acquired: raise RuntimeError( "Slow-query concurrency limit reached; try again later" @@ -788,6 +811,8 @@ def read_sql_df_slow_iter( ) raise finally: + elapsed = time.time() - start_time + record_query_latency(elapsed) if conn: try: conn.close() diff --git a/src/mes_dashboard/core/metrics_history.py b/src/mes_dashboard/core/metrics_history.py index bdb08e2..99cd964 100644 --- a/src/mes_dashboard/core/metrics_history.py +++ b/src/mes_dashboard/core/metrics_history.py @@ -32,6 +32,9 @@ METRICS_HISTORY_INTERVAL = int(os.getenv('METRICS_HISTORY_INTERVAL', '30')) METRICS_HISTORY_RETENTION_DAYS = int(os.getenv('METRICS_HISTORY_RETENTION_DAYS', '3')) METRICS_HISTORY_MAX_ROWS = int(os.getenv('METRICS_HISTORY_MAX_ROWS', '50000')) +ARCHIVE_LOG_DIR = os.getenv('ARCHIVE_LOG_DIR', 'logs/archive') +ARCHIVE_LOG_KEEP_COUNT = int(os.getenv('ARCHIVE_LOG_KEEP_COUNT', '20')) + # ============================================================ # Database Schema # ============================================================ @@ -54,10 +57,20 @@ CREATE TABLE IF NOT EXISTS metrics_snapshots ( latency_p50_ms REAL, latency_p95_ms REAL, latency_p99_ms REAL, - latency_count INTEGER + latency_count INTEGER, + slow_query_active INTEGER, + slow_query_waiting INTEGER, + worker_rss_bytes INTEGER ); """ +# New columns added after initial schema — used for ALTER TABLE migration. 
+_MIGRATION_COLUMNS = [ + ("slow_query_active", "INTEGER"), + ("slow_query_waiting", "INTEGER"), + ("worker_rss_bytes", "INTEGER"), +] + CREATE_INDEX_SQL = ( "CREATE INDEX IF NOT EXISTS idx_metrics_ts ON metrics_snapshots(ts);" ) @@ -69,9 +82,49 @@ COLUMNS = [ "redis_used_memory", "redis_hit_rate", "rc_l1_hit_rate", "rc_l2_hit_rate", "rc_miss_rate", "latency_p50_ms", "latency_p95_ms", "latency_p99_ms", "latency_count", + "slow_query_active", "slow_query_waiting", "worker_rss_bytes", ] +# ============================================================ +# Archive Log Cleanup +# ============================================================ + +_ARCHIVE_LOG_PREFIXES = ("access_", "error_", "watchdog_", "rq_worker_", "startup_") + + +def cleanup_archive_logs( + archive_dir: str = ARCHIVE_LOG_DIR, + keep_per_type: int = ARCHIVE_LOG_KEEP_COUNT, +) -> int: + """Delete old rotated log files from the archive directory. + + Keeps the most recent *keep_per_type* files per log type (by mtime). + Returns the total number of files deleted. + """ + archive_path = Path(archive_dir) + if not archive_path.is_dir(): + return 0 + + deleted = 0 + for prefix in _ARCHIVE_LOG_PREFIXES: + files = sorted( + (f for f in archive_path.iterdir() if f.name.startswith(prefix) and f.is_file()), + key=lambda f: f.stat().st_mtime, + reverse=True, + ) + for old_file in files[keep_per_type:]: + try: + old_file.unlink() + deleted += 1 + except OSError as exc: + logger.warning("Failed to delete archive log %s: %s", old_file, exc) + + if deleted > 0: + logger.info("Cleaned up %d archive log files from %s", deleted, archive_dir) + return deleted + + # ============================================================ # Metrics History Store # ============================================================ @@ -94,6 +147,14 @@ class MetricsHistoryStore: cursor = conn.cursor() cursor.execute(CREATE_TABLE_SQL) cursor.execute(CREATE_INDEX_SQL) + # Migrate existing databases: add new columns if missing. 
+ for col_name, col_type in _MIGRATION_COLUMNS: + try: + cursor.execute( + f"ALTER TABLE metrics_snapshots ADD COLUMN {col_name} {col_type}" + ) + except sqlite3.OperationalError: + pass # Column already exists — tolerate duplicate column error. conn.commit() self._initialized = True logger.info("Metrics history store initialized at %s", self.db_path) @@ -136,8 +197,9 @@ class MetricsHistoryStore: pool_overflow, pool_max_capacity, redis_used_memory, redis_hit_rate, rc_l1_hit_rate, rc_l2_hit_rate, rc_miss_rate, - latency_p50_ms, latency_p95_ms, latency_p99_ms, latency_count) - VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) + latency_p50_ms, latency_p95_ms, latency_p99_ms, latency_count, + slow_query_active, slow_query_waiting, worker_rss_bytes) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) """, ( ts, pid, @@ -155,6 +217,9 @@ class MetricsHistoryStore: lat.get("p95_ms"), lat.get("p99_ms"), lat.get("count"), + data.get("slow_query_active"), + data.get("slow_query_waiting"), + data.get("worker_rss_bytes"), ), ) conn.commit() @@ -263,17 +328,36 @@ class MetricsHistoryCollector: if self._cleanup_counter >= 100: self._cleanup_counter = 0 self._store.cleanup() + try: + cleanup_archive_logs() + except Exception as exc: + logger.debug("Archive log cleanup failed: %s", exc) def _collect_snapshot(self) -> None: try: data: Dict[str, Any] = {} - # Pool status + # Pool status (includes slow_query_active and slow_query_waiting) try: from mes_dashboard.core.database import get_pool_status - data["pool"] = get_pool_status() + pool_status = get_pool_status() + data["pool"] = pool_status + data["slow_query_active"] = pool_status.get("slow_query_active", 0) + data["slow_query_waiting"] = pool_status.get("slow_query_waiting", 0) except Exception: data["pool"] = {} + data["slow_query_active"] = 0 + data["slow_query_waiting"] = 0 + + # Worker RSS memory + try: + import resource + # ru_maxrss is in KB on Linux + data["worker_rss_bytes"] = resource.getrusage( + resource.RUSAGE_SELF + 
).ru_maxrss * 1024 + except Exception: + data["worker_rss_bytes"] = 0 # Redis try: diff --git a/src/mes_dashboard/routes/admin_routes.py b/src/mes_dashboard/routes/admin_routes.py index 9dfc1b5..ef7f2fe 100644 --- a/src/mes_dashboard/routes/admin_routes.py +++ b/src/mes_dashboard/routes/admin_routes.py @@ -71,10 +71,7 @@ _last_restart_request: float = 0.0 def performance(): """Performance monitoring dashboard (Vue SPA).""" dist_dir = os.path.join(current_app.static_folder or "", "dist") - dist_html = os.path.join(dist_dir, "admin-performance.html") - if os.path.exists(dist_html): - return send_from_directory(dist_dir, "admin-performance.html") - return render_template("admin/performance.html") + return send_from_directory(dist_dir, "admin-performance.html") @admin_bp.route("/api/system-status", methods=["GET"]) diff --git a/src/mes_dashboard/templates/admin/performance.html b/src/mes_dashboard/templates/admin/performance.html deleted file mode 100644 index 54dfd0c..0000000 --- a/src/mes_dashboard/templates/admin/performance.html +++ /dev/null @@ -1,1249 +0,0 @@ -{% extends "_base.html" %} - -{% block title %}效能監控 - MES Dashboard{% endblock %} - -{% block head_extra %} - - -{% endblock %} - -{% block content %} -
-
-
-

效能監控儀表板

-

系統狀態、查詢效能與日誌記錄

-
-
-
- -- -
-
- - -
- ← 返回首頁 -
-
- - -
-
-
- Database - -
-
--
-
連線延遲
-
- -
-
- Redis - -
-
--
-
快取狀態
-
- -
-
- Circuit Breaker - -
-
--
-
--
-
- -
-
- Worker - -
-
--
-
Process ID
-
-
- - -
-
-
- 查詢效能 -
-
- P50 延遲 - -- ms -
-
- P95 延遲 - -- ms -
-
- P99 延遲 - -- ms -
-
- 總查詢數 - -- -
-
- 慢查詢數 (>1s) - -- -
-
- 慢查詢率 - --% -
-
- -
-
- 延遲分布 (最近 100 筆) -
-
- -
-
-
- - -
-
-

Worker 控制

-
-
-
-
-
- Worker PID - -- -
-
- 啟動時間 - -- -
-
- 冷卻狀態 - -- -
-
-
-
- 上次重啟 - -- -
-
- 重啟者 - -- -
-
- 重啟狀態 - -- -
-
-
-
- -

- 這將優雅地重新載入所有 Worker 程序 (不會中斷現有請求) -

-
-
-
- - -
-
-

確認重新啟動 Workers?

-

- 這將發送信號給 Gunicorn master 程序,優雅地重新載入所有 worker。 - 現有請求會完成處理後才關閉。 -

-
- - -
-
-
- - -
-
-

確認清理日誌?

-

- 這將刪除超過保留期限的舊日誌,以及超過上限筆數的記錄。此操作無法復原。 -

-
- - -
-
-
- - -
-
-

系統日誌

-
- -- - -
-
-
- -
-
-
- 總筆數 -
--
-
-
- 檔案大小 -
--
-
-
- 最舊記錄 -
--
-
-
- 保留天數 -
--
-
-
- 上限筆數 -
--
-
-
-
- -
- - -
-
- - - - - - - - - - - - -
時間等級來源訊息
載入中...
-
- - -
-
-
-{% endblock %} - -{% block scripts %} - -{% endblock %}