From 5d570ca7a201e25ca308daac242ce95a68801584 Mon Sep 17 00:00:00 2001 From: egg Date: Mon, 23 Feb 2026 09:18:10 +0800 Subject: [PATCH] feat(admin-performance): Vue 3 SPA dashboard with metrics history trending Rebuild /admin/performance from Jinja2 to Vue 3 SPA with ECharts, adding cache telemetry infrastructure, connection pool monitoring, and SQLite-backed historical metrics collection with trend chart visualization. Co-Authored-By: Claude Opus 4.6 --- frontend/package.json | 2 +- frontend/src/admin-performance/App.vue | 613 +++++++++++++++++ .../admin-performance/components/GaugeBar.vue | 49 ++ .../admin-performance/components/StatCard.vue | 24 + .../components/StatusDot.vue | 17 + .../components/TrendChart.vue | 98 +++ frontend/src/admin-performance/index.html | 12 + frontend/src/admin-performance/main.js | 6 + frontend/src/admin-performance/style.css | 544 +++++++++++++++ .../src/portal-shell/nativeModuleRegistry.js | 4 + frontend/src/portal-shell/routeContracts.js | 6 +- frontend/vite.config.js | 3 +- .../.openspec.yaml | 2 + .../design.md | 91 +++ .../proposal.md | 31 + .../specs/admin-performance-spa/spec.md | 100 +++ .../specs/cache-telemetry-api/spec.md | 56 ++ .../specs/connection-pool-monitoring/spec.md | 27 + .../specs/metrics-history-trending/spec.md | 65 ++ .../tasks.md | 80 +++ openspec/specs/admin-performance-spa/spec.md | 100 +++ openspec/specs/cache-telemetry-api/spec.md | 56 ++ .../specs/connection-pool-monitoring/spec.md | 27 + .../specs/metrics-history-trending/spec.md | 65 ++ src/mes_dashboard/app.py | 8 + src/mes_dashboard/core/cache.py | 29 + src/mes_dashboard/core/database.py | 14 + src/mes_dashboard/core/metrics_history.py | 369 ++++++++++ src/mes_dashboard/routes/admin_routes.py | 645 +++++++++++------- .../services/realtime_equipment_cache.py | 9 + .../services/reject_dataset_cache.py | 3 +- src/mes_dashboard/services/resource_cache.py | 9 + 32 files changed, 2903 insertions(+), 261 deletions(-) create mode 100644 
frontend/src/admin-performance/App.vue create mode 100644 frontend/src/admin-performance/components/GaugeBar.vue create mode 100644 frontend/src/admin-performance/components/StatCard.vue create mode 100644 frontend/src/admin-performance/components/StatusDot.vue create mode 100644 frontend/src/admin-performance/components/TrendChart.vue create mode 100644 frontend/src/admin-performance/index.html create mode 100644 frontend/src/admin-performance/main.js create mode 100644 frontend/src/admin-performance/style.css create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md create mode 100644 openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md create mode 100644 openspec/specs/admin-performance-spa/spec.md create mode 100644 openspec/specs/cache-telemetry-api/spec.md create mode 100644 openspec/specs/connection-pool-monitoring/spec.md create mode 100644 openspec/specs/metrics-history-trending/spec.md create mode 100644 src/mes_dashboard/core/metrics_history.py diff --git a/frontend/package.json b/frontend/package.json index 0465e5d..e95ffc2 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -5,7 +5,7 @@ "type": "module", "scripts": { "dev": "vite --host", - "build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html 
../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html", + "build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp 
../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html && cp ../src/mes_dashboard/static/dist/src/admin-performance/index.html ../src/mes_dashboard/static/dist/admin-performance.html", "test": "node --test tests/*.test.js" }, "devDependencies": { diff --git a/frontend/src/admin-performance/App.vue b/frontend/src/admin-performance/App.vue new file mode 100644 index 0000000..9732407 --- /dev/null +++ b/frontend/src/admin-performance/App.vue @@ -0,0 +1,613 @@ + + + diff --git a/frontend/src/admin-performance/components/GaugeBar.vue b/frontend/src/admin-performance/components/GaugeBar.vue new file mode 100644 index 0000000..688650a --- /dev/null +++ b/frontend/src/admin-performance/components/GaugeBar.vue @@ -0,0 +1,49 @@ + + + diff --git a/frontend/src/admin-performance/components/StatCard.vue b/frontend/src/admin-performance/components/StatCard.vue new file mode 100644 index 0000000..24b4ae2 --- /dev/null +++ b/frontend/src/admin-performance/components/StatCard.vue @@ -0,0 +1,24 @@ + + + diff --git a/frontend/src/admin-performance/components/StatusDot.vue b/frontend/src/admin-performance/components/StatusDot.vue new file mode 100644 index 0000000..ba5b3c1 --- /dev/null +++ 
b/frontend/src/admin-performance/components/StatusDot.vue @@ -0,0 +1,17 @@ + + + diff --git a/frontend/src/admin-performance/components/TrendChart.vue b/frontend/src/admin-performance/components/TrendChart.vue new file mode 100644 index 0000000..e5fa3b1 --- /dev/null +++ b/frontend/src/admin-performance/components/TrendChart.vue @@ -0,0 +1,98 @@ + + + diff --git a/frontend/src/admin-performance/index.html b/frontend/src/admin-performance/index.html new file mode 100644 index 0000000..63e5435 --- /dev/null +++ b/frontend/src/admin-performance/index.html @@ -0,0 +1,12 @@ + + + + + + Performance Monitor + + +
+ + + diff --git a/frontend/src/admin-performance/main.js b/frontend/src/admin-performance/main.js new file mode 100644 index 0000000..c56440b --- /dev/null +++ b/frontend/src/admin-performance/main.js @@ -0,0 +1,6 @@ +import { createApp } from 'vue'; + +import App from './App.vue'; +import './style.css'; + +createApp(App).mount('#app'); diff --git a/frontend/src/admin-performance/style.css b/frontend/src/admin-performance/style.css new file mode 100644 index 0000000..f9e9bce --- /dev/null +++ b/frontend/src/admin-performance/style.css @@ -0,0 +1,544 @@ +/* Admin Performance Dashboard */ +*, +*::before, +*::after { + box-sizing: border-box; + margin: 0; + padding: 0; +} + +body { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + background: #f1f5f9; + color: #1e293b; + line-height: 1.5; +} + +.perf-dashboard { + max-width: 1280px; + margin: 0 auto; + padding: 0 16px 32px; +} + +/* Header */ +.perf-header { + background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%); + color: #fff; + padding: 20px 24px; + border-radius: 0 0 12px 12px; + margin: 0 -16px 20px; +} + +.perf-header-inner { + display: flex; + align-items: center; + justify-content: space-between; + flex-wrap: wrap; + gap: 12px; +} + +.perf-title { + font-size: 1.4rem; + font-weight: 700; +} + +.perf-header-actions { + display: flex; + align-items: center; + gap: 12px; +} + +.auto-refresh-toggle { + display: flex; + align-items: center; + gap: 6px; + font-size: 0.85rem; + cursor: pointer; + user-select: none; +} + +.auto-refresh-toggle input[type='checkbox'] { + accent-color: #fff; +} + +/* Buttons */ +.btn { + display: inline-flex; + align-items: center; + gap: 6px; + padding: 8px 16px; + border: none; + border-radius: 6px; + font-size: 0.85rem; + font-weight: 500; + cursor: pointer; + background: rgba(255, 255, 255, 0.2); + color: #fff; + transition: background 0.15s; +} + +.btn:hover:not(:disabled) { + background: rgba(255, 255, 255, 0.3); +} + +.btn:disabled { + 
opacity: 0.5; + cursor: not-allowed; +} + +.btn-sm { + padding: 5px 10px; + font-size: 0.8rem; + background: #e2e8f0; + color: #334155; +} + +.btn-sm:hover:not(:disabled) { + background: #cbd5e1; +} + +.btn-danger { + background: #ef4444; + color: #fff; +} + +.btn-danger:hover:not(:disabled) { + background: #dc2626; +} + +/* Panel */ +.panel { + background: #fff; + border-radius: 10px; + padding: 20px; + margin-bottom: 16px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06); +} + +.panel-disabled { + opacity: 0.6; +} + +.panel-title { + font-size: 1rem; + font-weight: 600; + margin-bottom: 14px; + color: #334155; +} + +.sub-title { + font-size: 0.9rem; + font-weight: 600; + margin: 16px 0 10px; + color: #475569; +} + +.muted { + color: #94a3b8; + font-size: 0.85rem; +} + +/* Status Cards */ +.status-cards-grid { + display: grid; + grid-template-columns: repeat(4, 1fr); + gap: 12px; +} + +.status-card { + background: #f8fafc; + border-radius: 8px; + padding: 14px; + text-align: center; +} + +.status-card-title { + font-size: 0.75rem; + color: #64748b; + margin-bottom: 8px; + text-transform: uppercase; + letter-spacing: 0.05em; +} + +/* StatusDot */ +.status-dot-wrapper { + display: flex; + align-items: center; + justify-content: center; + gap: 6px; +} + +.status-dot { + width: 10px; + height: 10px; + border-radius: 50%; + flex-shrink: 0; +} + +.status-dot--healthy { + background: #22c55e; + box-shadow: 0 0 6px rgba(34, 197, 94, 0.4); +} + +.status-dot--degraded { + background: #f59e0b; + box-shadow: 0 0 6px rgba(245, 158, 11, 0.4); +} + +.status-dot--error { + background: #ef4444; + box-shadow: 0 0 6px rgba(239, 68, 68, 0.4); +} + +.status-dot--disabled { + background: #94a3b8; +} + +.status-dot-label { + font-size: 0.85rem; + font-weight: 500; +} + +/* GaugeBar */ +.gauge-bar { + margin-bottom: 12px; +} + +.gauge-bar-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 4px; +} + +.gauge-bar-label { + font-size: 0.8rem; + 
color: #64748b; +} + +.gauge-bar-value { + font-size: 0.8rem; + font-weight: 600; +} + +.gauge-bar-track { + height: 8px; + background: #e2e8f0; + border-radius: 4px; + overflow: hidden; +} + +.gauge-bar-fill { + height: 100%; + border-radius: 4px; + transition: width 0.4s ease, background-color 0.3s; + min-width: 2px; +} + +/* StatCard */ +.stat-card { + background: #f8fafc; + border-radius: 8px; + padding: 10px 12px; + text-align: center; +} + +.stat-card-value { + font-size: 1.1rem; + font-weight: 700; + color: #1e293b; + line-height: 1.2; +} + +.stat-card-label { + font-size: 0.7rem; + color: #64748b; + margin-top: 2px; +} + +/* Query Performance */ +.query-perf-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; +} + +.query-perf-stats { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 8px; + align-content: start; +} + +.query-perf-chart { + min-height: 200px; +} + +/* Redis */ +.redis-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; +} + +.redis-mini-stats { + display: grid; + grid-template-columns: repeat(2, 1fr); + gap: 8px; + margin-top: 12px; +} + +.redis-namespaces { + overflow-x: auto; +} + +/* Mini Table */ +.mini-table { + width: 100%; + border-collapse: collapse; + font-size: 0.82rem; +} + +.mini-table th, +.mini-table td { + padding: 6px 10px; + text-align: left; + border-bottom: 1px solid #e2e8f0; +} + +.mini-table th { + background: #f8fafc; + font-weight: 600; + color: #475569; +} + +/* Memory Cache Cards */ +.cache-cards-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); + gap: 12px; +} + +.cache-card { + background: #f8fafc; + border-radius: 8px; + padding: 14px; +} + +.cache-card-name { + font-size: 0.85rem; + font-weight: 600; + margin-bottom: 2px; +} + +.cache-card-desc { + font-size: 0.72rem; + color: #64748b; + margin-bottom: 8px; +} + +.cache-card-ttl { + font-size: 0.72rem; + color: #94a3b8; + margin-top: 4px; +} + +.route-cache-stats { + 
display: grid; + grid-template-columns: repeat(6, 1fr); + gap: 8px; +} + +/* Connection Pool */ +.pool-stats-grid { + display: grid; + grid-template-columns: repeat(4, 1fr); + gap: 8px; + margin-top: 14px; +} + +/* Worker */ +.worker-info { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 8px; + margin-bottom: 14px; +} + +/* Modal */ +.modal-backdrop { + position: fixed; + inset: 0; + background: rgba(0, 0, 0, 0.45); + display: flex; + align-items: center; + justify-content: center; + z-index: 1000; +} + +.modal-dialog { + background: #fff; + border-radius: 12px; + padding: 24px; + max-width: 400px; + width: 90%; + box-shadow: 0 20px 60px rgba(0, 0, 0, 0.15); +} + +.modal-dialog h3 { + margin-bottom: 8px; +} + +.modal-dialog p { + font-size: 0.9rem; + color: #475569; + margin-bottom: 16px; +} + +.modal-actions { + display: flex; + gap: 8px; + justify-content: flex-end; +} + +.modal-actions .btn { + background: #e2e8f0; + color: #334155; +} + +/* Log Controls */ +.log-controls { + display: flex; + gap: 8px; + margin-bottom: 12px; + flex-wrap: wrap; +} + +.log-controls select, +.log-controls input[type='text'] { + padding: 6px 10px; + border: 1px solid #cbd5e1; + border-radius: 6px; + font-size: 0.82rem; + outline: none; +} + +.log-controls input[type='text'] { + flex: 1; + min-width: 160px; +} + +/* Log Table */ +.log-table-wrapper { + overflow-x: auto; + max-height: 400px; + overflow-y: auto; +} + +.log-table { + width: 100%; + border-collapse: collapse; + font-size: 0.78rem; + font-family: 'SF Mono', 'Fira Code', monospace; +} + +.log-table th, +.log-table td { + padding: 5px 8px; + text-align: left; + border-bottom: 1px solid #f1f5f9; + white-space: nowrap; +} + +.log-table th { + background: #f8fafc; + font-weight: 600; + position: sticky; + top: 0; + z-index: 1; +} + +.log-msg { + white-space: pre-wrap; + word-break: break-all; + max-width: 600px; +} + +.log-time { + color: #64748b; +} + +.log-level { + font-weight: 600; +} + +.log-error 
.log-level { + color: #ef4444; +} + +.log-warning .log-level { + color: #f59e0b; +} + +.log-info .log-level { + color: #3b82f6; +} + +.log-debug .log-level { + color: #94a3b8; +} + +/* Log Pagination */ +.log-pagination { + display: flex; + align-items: center; + justify-content: center; + gap: 12px; + margin-top: 10px; + font-size: 0.82rem; +} + +/* Responsive */ +@media (max-width: 768px) { + .status-cards-grid { + grid-template-columns: repeat(2, 1fr); + } + + .query-perf-grid, + .redis-grid { + grid-template-columns: 1fr; + } + + .pool-stats-grid { + grid-template-columns: repeat(2, 1fr); + } + + .route-cache-stats { + grid-template-columns: repeat(3, 1fr); + } +} + +/* Trend Charts */ +.trend-chart-card { + margin-top: 4px; + background: #fff; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 16px; +} +.trend-chart-title { + font-size: 0.9rem; + font-weight: 600; + color: #475569; + margin-bottom: 8px; +} +.trend-chart-canvas { + width: 100%; + min-height: 200px; +} +.trend-chart-empty { + color: #94a3b8; + font-size: 0.85rem; + text-align: center; + padding: 32px 0; +} diff --git a/frontend/src/portal-shell/nativeModuleRegistry.js b/frontend/src/portal-shell/nativeModuleRegistry.js index 9e8e3d3..8e50775 100644 --- a/frontend/src/portal-shell/nativeModuleRegistry.js +++ b/frontend/src/portal-shell/nativeModuleRegistry.js @@ -70,6 +70,10 @@ const NATIVE_MODULE_LOADERS = Object.freeze({ () => import('../tables/App.vue'), [() => import('../tables/style.css')], ), + '/admin/performance': createNativeLoader( + () => import('../admin-performance/App.vue'), + [() => import('../admin-performance/style.css')], + ), }); export function getNativeModuleLoader(route) { diff --git a/frontend/src/portal-shell/routeContracts.js b/frontend/src/portal-shell/routeContracts.js index 9da0493..254a38a 100644 --- a/frontend/src/portal-shell/routeContracts.js +++ b/frontend/src/portal-shell/routeContracts.js @@ -190,13 +190,13 @@ const ROUTE_CONTRACTS = Object.freeze({ 
'/admin/performance': buildContract({ route: '/admin/performance', routeId: 'admin-performance', - renderMode: 'external', + renderMode: 'native', owner: 'frontend-platform-admin', title: '效能監控', - rollbackStrategy: 'external_route_reversion', + rollbackStrategy: 'fallback_to_legacy_route', visibilityPolicy: 'admin_only', scope: 'in-scope', - compatibilityPolicy: 'external_target_redirect', + compatibilityPolicy: 'redirect_to_shell_when_spa_enabled', }), '/tables': buildContract({ route: '/tables', diff --git a/frontend/vite.config.js b/frontend/vite.config.js index 8d13109..35fc4f4 100644 --- a/frontend/vite.config.js +++ b/frontend/vite.config.js @@ -28,7 +28,8 @@ export default defineConfig(({ mode }) => ({ 'query-tool': resolve(__dirname, 'src/query-tool/main.js'), 'tmtt-defect': resolve(__dirname, 'src/tmtt-defect/main.js'), 'qc-gate': resolve(__dirname, 'src/qc-gate/index.html'), - 'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html') + 'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html'), + 'admin-performance': resolve(__dirname, 'src/admin-performance/index.html') }, output: { entryFileNames: '[name].js', diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml new file mode 100644 index 0000000..cbbb578 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-02-22 diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md new file mode 100644 index 0000000..0bd2406 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md @@ -0,0 +1,91 @@ +## Context + +現有 `/admin/performance` 是 Jinja2 server-rendered 頁面(vanilla JS + Chart.js),是唯一未遷移至 Vue 3 SPA 的前端頁面。後端已具備豐富的監控數據(連線池 
`get_pool_status()`、Redis client、LayeredCache `.telemetry()`),但前端僅展示 4 張 status cards + query performance + worker control + logs,缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度等關鍵面板。 + +## Goals / Non-Goals + +**Goals:** +- 將 admin/performance 頁面從 Jinja2 切換為 Vue 3 SPA,與所有報表頁面架構一致 +- 新增完整的系統監控面板:Redis 快取詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤 +- 提供可複用的 gauge/stat card 組件,便於未來擴展監控項目 +- 保留所有既有功能(status cards、query performance、worker control、system logs) + +**Non-Goals:** +- 不新增告警/通知機制(未來可擴展) +- 不引入 WebSocket 即時推送(維持 30 秒輪詢) +- 不修改既有 API response format(`system-status`、`metrics`、`logs` 保持不變) +- 不新增使用者權限控制(沿用既有 admin 認證) + +## Decisions + +### 1. Vue 3 SPA + ECharts 取代 Jinja2 + Chart.js + +**選擇**: 全面重建為 Vue 3 SPA,使用 ECharts 繪製圖表 + +**理由**: 所有報表頁面已完成 Vue SPA 遷移,admin/performance 是最後一個 Jinja2 頁面。統一架構可複用 `apiGet`、`useAutoRefresh` 等共用基礎設施,減少維護成本。ECharts 已是專案標準圖表庫(query-tool、reject-history 等均使用)。 + +**替代方案**: 保留 Jinja2 僅加 API — 但會持續累積技術債,且無法複用 Vue 生態。 + +### 2. 單一 performance-detail API 聚合所有新增監控數據 + +**選擇**: 新增 `GET /admin/api/performance-detail` 一個 endpoint,回傳 `redis`、`process_caches`、`route_cache`、`db_pool`、`direct_connections` 五個 section。 + +**理由**: 減少前端並發請求數(已有 5 個 API,加 1 個共 6 個),後端可在同一 request 中順序收集各子系統狀態,避免多次 round-trip。 + +**替代方案**: 每個監控維度獨立 endpoint — 更 RESTful 但增加前端複雜度和網路開銷。 + +### 3. ProcessLevelCache 全域 registry 模式 + +**選擇**: 在 `core/cache.py` 新增 `_PROCESS_CACHE_REGISTRY` dict + `register_process_cache()` 函式,各服務在模組載入時自行註冊。 + +**理由**: 避免 admin_routes 硬編碼各快取實例的 import 路徑,新增快取時只需在該服務中加一行 `register_process_cache()` 即可自動出現在監控面板。 + +**替代方案**: admin_routes 直接 import 各快取實例 — 耦合度高,新增快取需改兩處。 + +### 4. Redis namespace 監控使用 SCAN 而非 KEYS + +**選擇**: 使用 `SCAN` 搭配 `MATCH` pattern 掃描各 namespace 的 key 數量。 + +**理由**: `KEYS *` 在生產環境會阻塞 Redis,`SCAN` 為非阻塞迭代器,安全性更高。 + +### 5. 
直連 Oracle 使用 thread-safe atomic counter + +**選擇**: 在 `database.py` 使用 `threading.Lock` 保護的全域計數器,在 `get_db_connection()` 和 `read_sql_df_slow()` 建立連線後 increment。 + +**理由**: 追蹤連線池外的直接連線使用量,幫助判斷是否需要調整池大小。計數器為 monotonic(只增不減),記錄的是自 worker 啟動以來的總數。 + +### 6. 前端組件複用 GaugeBar / StatCard / StatusDot + +**選擇**: 新增 3 個小型可複用組件放在 `admin-performance/components/` 下。 + +**理由**: Redis 記憶體、連線池飽和度、ProcessLevelCache 使用率等多處需要 gauge 視覺化;status cards 跨面板重複。組件化可統一視覺風格並減少重複 template。 + +### 7. SQLite 持久化 metrics history store + +**選擇**: 新增 `core/metrics_history.py`,使用 SQLite 儲存 metrics snapshots(仿 `core/log_store.py` 的 `LogStore` 模式),搭配 daemon thread 每 30 秒採集一次。 + +**理由**: in-memory deque 在 worker 重啟或 gunicorn prefork 下無法跨 worker 共享且不保留歷史。SQLite 提供跨 worker 讀取、重啟持久化、可配置保留天數(預設 3 天 / 50000 rows),且不需額外 infra。 + +**替代方案**: +- in-memory deque — 簡單但 worker 獨立、重啟即失 +- Redis TSDB — 需額外模組且增加 Redis 負擔 +- PostgreSQL — 太重,且此數據不需 ACID + +**Schema**: `metrics_snapshots` table 含 timestamp、worker PID、pool/redis/route_cache/latency 各欄位,`idx_metrics_ts` 索引加速時間查詢。 + +**背景採集**: `MetricsHistoryCollector` daemon thread,間隔可透過 `METRICS_HISTORY_INTERVAL` 環境變數配置。在 `app.py` lifecycle 中 start/stop。 + +## Risks / Trade-offs + +- **Redis SCAN 效能**: 大量 key 時 SCAN 可能較慢 → 設定 `COUNT 100` 限制每次迭代量,且 30 秒才掃一次,可接受 +- **ProcessLevelCache registry 依賴模組載入順序**: 服務未 import 時不會註冊 → 在 app factory 或 gunicorn post_fork 確保所有服務模組已載入 +- **直連計數器跨 worker 不共享**: gunicorn prefork 模式下每個 worker 有獨立計數 → API 回傳當前 worker PID 供辨識,可透過 `/admin/api/system-status` 的 worker info 交叉比對 +- **舊 Jinja2 模板保留但不維護**: 切換後舊模板不再更新 → 透過 `routeContracts.js` 的 `rollbackStrategy: 'fallback_to_legacy_route'` 保留回退能力 + +## Migration Plan + +1. 後端先行:加 `stats()`、registry、直連計數器、新 API(不影響既有功能) +2. 前端建構:新建 `admin-performance/` Vue SPA,Vite 註冊 entry +3. 路由切換:`admin_routes.py` 改為 `send_from_directory`,`routeContracts.js` 改 `renderMode: 'native'` +4. 驗證後部署:確認所有面板正確顯示後上線 +5. 
回退方案:`routeContracts.js` 改回 `renderMode: 'external'`,`admin_routes.py` 改回 `render_template` diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md new file mode 100644 index 0000000..a6620f6 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md @@ -0,0 +1,31 @@ +## Why + +現有 `/admin/performance` 是唯一仍使用 Jinja2 + vanilla JS + Chart.js 的頁面,與所有已遷移至 Vue 3 SPA 的報表頁面架構不一致。同時,隨著報表系統功能擴充(L1/L2 快取層、連線池、直連 Oracle 等),後端已具備豐富的遙測數據,但管理後台的監控面板覆蓋不足——缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤等關鍵資訊。 + +## What Changes + +- 將 `/admin/performance` 從 Jinja2 server-rendered 頁面重建為 Vue 3 SPA(ECharts 取代 Chart.js) +- 新增 `GET /admin/api/performance-detail` API,整合 Redis INFO/SCAN、ProcessLevelCache registry、連線池狀態、直連計數等完整監控數據 +- 後端 `ProcessLevelCache` 加入 `stats()` 方法與全域 registry,支援動態收集所有快取實例狀態 +- 後端 `database.py` 加入直連 Oracle 計數器,追蹤非連線池的直接連線使用量 +- 前端新增 GaugeBar / StatCard / StatusDot 可複用組件,提供 gauge 飽和度視覺化 +- portal-shell 路由從 `renderMode: 'external'` 切換為 `'native'` +- Vite 構建新增 `admin-performance` entry point + +## Capabilities + +### New Capabilities +- `admin-performance-spa`: Vue 3 SPA 重建管理效能儀表板,包含 status cards、query performance、Redis 快取、記憶體快取、連線池、worker 控制、系統日誌等完整面板 +- `cache-telemetry-api`: ProcessLevelCache stats() + 全域 registry + performance-detail API,提供所有記憶體快取、Redis 快取、route cache 的遙測數據 +- `connection-pool-monitoring`: 連線池飽和度追蹤 + 直連 Oracle 計數器,完整呈現資料庫連線使用狀況 +- `metrics-history-trending`: SQLite 持久化背景採集 + 時間序列趨勢圖,可回溯連線池飽和度、查詢延遲、Redis 記憶體、快取命中率等歷史數據 + +### Modified Capabilities + + +## Impact + +- **Backend** (7 files): `core/cache.py`、`core/database.py`、`core/metrics_history.py`(NEW)、`routes/admin_routes.py`、`services/resource_cache.py`、`services/realtime_equipment_cache.py`、`services/reject_dataset_cache.py`、`app.py` +- **Frontend** (8 new + 3 modified): 新建 `admin-performance/` 目錄(index.html、main.js、App.vue、style.css、4 個組件含 
TrendChart),修改 `vite.config.js`、`package.json`、`routeContracts.js` +- **API**: 新增 2 個 endpoint (`/admin/api/performance-detail`、`/admin/api/performance-history`),既有 5 個 endpoint 不變 +- **Rollback**: 舊 Jinja2 模板保留,可透過 `routeContracts.js` 切回 `renderMode: 'external'` diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md new file mode 100644 index 0000000..431b7a0 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md @@ -0,0 +1,100 @@ +## ADDED Requirements + +### Requirement: Vue 3 SPA page replaces Jinja2 template +The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route. + +#### Scenario: Page loads as Vue SPA +- **WHEN** user navigates to `/admin/performance` +- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template) + +#### Scenario: Portal-shell integration +- **WHEN** the portal-shell renders `/admin/performance` +- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe) + +### Requirement: Status cards display system health +The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value. 
+ +#### Scenario: All systems healthy +- **WHEN** all backend systems report healthy status via `/admin/api/system-status` +- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values + +#### Scenario: Redis disabled +- **WHEN** Redis is disabled (`REDIS_ENABLED=false`) +- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message + +### Requirement: Query performance panel with ECharts +The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation. + +#### Scenario: Metrics loaded successfully +- **WHEN** `/admin/api/metrics` returns valid performance data +- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution + +#### Scenario: No metrics data +- **WHEN** `/admin/api/metrics` returns empty or null metrics +- **THEN** the panel SHALL display placeholder text indicating no data available + +### Requirement: Redis cache detail panel +The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table. 
+ +#### Scenario: Redis active with data +- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts +- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count + +#### Scenario: Redis disabled +- **WHEN** Redis is disabled +- **THEN** the Redis detail panel SHALL display a disabled state message without errors + +### Requirement: Memory cache panel +The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads). + +#### Scenario: Multiple caches registered +- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries +- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description + +#### Scenario: Route cache telemetry +- **WHEN** `/admin/api/performance-detail` returns route_cache data +- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads + +### Requirement: Connection pool panel +The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count. + +#### Scenario: Pool under normal load +- **WHEN** pool saturation is below 80% +- **THEN** the GaugeBar SHALL display in a normal color (green/blue) + +#### Scenario: Pool near saturation +- **WHEN** pool saturation exceeds 80% +- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red) + +### Requirement: Worker control panel +The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal. 
+ +#### Scenario: Restart worker +- **WHEN** user clicks the restart button and confirms in the modal +- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result + +#### Scenario: Restart during cooldown +- **WHEN** worker is in cooldown period +- **THEN** the restart button SHALL be disabled with a cooldown indicator + +### Requirement: System logs panel with filtering and pagination +The dashboard SHALL display system logs with level filtering, text search, and pagination controls. + +#### Scenario: Filter by log level +- **WHEN** user selects a specific log level filter +- **THEN** only logs matching that level SHALL be displayed + +#### Scenario: Paginate logs +- **WHEN** logs exceed the page size +- **THEN** pagination controls SHALL allow navigating between pages + +### Requirement: Auto-refresh with toggle +The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh. + +#### Scenario: Auto-refresh enabled +- **WHEN** auto-refresh is enabled (default) +- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch + +#### Scenario: Manual refresh +- **WHEN** user clicks the manual refresh button +- **THEN** all panels SHALL immediately refresh their data diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md new file mode 100644 index 0000000..96cf778 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md @@ -0,0 +1,56 @@ +## ADDED Requirements + +### Requirement: ProcessLevelCache stats method +Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`. 
+ +#### Scenario: Stats on active cache +- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s) +- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}` + +#### Scenario: Stats with expired entries +- **WHEN** `stats()` is called and some entries have exceeded TTL +- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl` + +#### Scenario: Thread safety +- **WHEN** `stats()` is called concurrently with cache writes +- **THEN** it SHALL acquire the cache lock and return consistent data without races + +### Requirement: ProcessLevelCache global registry +The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`. + +#### Scenario: Register and retrieve all caches +- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called +- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches + +#### Scenario: Cache not registered +- **WHEN** a service's ProcessLevelCache is not registered +- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output + +### Requirement: Performance detail API endpoint +The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`. 
+ +#### Scenario: All systems available +- **WHEN** the API is called and all subsystems are healthy +- **THEN** it SHALL return all 5 sections with current telemetry data + +#### Scenario: Redis disabled +- **WHEN** Redis is disabled (`REDIS_ENABLED=false`) +- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally + +### Requirement: Redis namespace key distribution +The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`. + +#### Scenario: Keys exist across namespaces +- **WHEN** Redis contains keys across multiple namespaces +- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count` + +#### Scenario: SCAN safety +- **WHEN** scanning Redis keys +- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis + +### Requirement: Route cache telemetry in performance detail +The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`. 
+ +#### Scenario: LayeredCache active +- **WHEN** route cache is in layered mode +- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md new file mode 100644 index 0000000..ae71a86 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md @@ -0,0 +1,27 @@ +## ADDED Requirements + +### Requirement: Connection pool status in performance detail +The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`. + +#### Scenario: Pool status retrieved +- **WHEN** the API is called +- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values + +#### Scenario: Saturation calculation +- **WHEN** the pool has 8 checked_out connections and max_capacity is 30 +- **THEN** saturation SHALL be reported as approximately 26.7% + +### Requirement: Direct Oracle connection counter +The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection. 
+ +#### Scenario: Counter increments on direct connection +- **WHEN** `get_db_connection()` successfully creates a connection +- **THEN** the direct connection counter SHALL increment by 1 + +#### Scenario: Counter in performance detail +- **WHEN** the performance-detail API is called +- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID) + +#### Scenario: Counter is per-worker +- **WHEN** multiple gunicorn workers are running +- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md new file mode 100644 index 0000000..c13633b --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md @@ -0,0 +1,65 @@ +## ADDED Requirements + +### Requirement: SQLite metrics history store +The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`. 
+ +#### Scenario: Write and query snapshots +- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics +- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID + +#### Scenario: Query by time range +- **WHEN** `query_snapshots(minutes=30)` is called +- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending + +#### Scenario: Retention cleanup +- **WHEN** `cleanup()` is called +- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000) + +#### Scenario: Thread safety +- **WHEN** multiple threads write snapshots concurrently +- **THEN** the write lock SHALL serialize writes and prevent database corruption + +### Requirement: Background metrics collector +The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var). + +#### Scenario: Automatic collection +- **WHEN** the collector is started via `start_metrics_history(app)` +- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store + +#### Scenario: Graceful shutdown +- **WHEN** `stop_metrics_history()` is called +- **THEN** the collector thread SHALL stop within one interval period + +#### Scenario: Subsystem unavailability +- **WHEN** a subsystem (e.g., Redis) is unavailable during collection +- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics + +### Requirement: Performance history API endpoint +The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots. 
+ +#### Scenario: Query with time range +- **WHEN** the API is called with `?minutes=30` +- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}` + +#### Scenario: Time range bounds +- **WHEN** `minutes` is less than 1 or greater than 180 +- **THEN** it SHALL be clamped to the range [1, 180] + +#### Scenario: Admin authentication +- **WHEN** the API is called without admin authentication +- **THEN** it SHALL be rejected by the `@admin_required` decorator + +### Requirement: Frontend trend charts +The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts. + +#### Scenario: Trend charts with data +- **WHEN** historical snapshots contain more than 1 data point +- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates + +#### Scenario: Trend charts without data +- **WHEN** historical snapshots are empty or contain only 1 data point +- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`) + +#### Scenario: Auto-refresh +- **WHEN** the dashboard auto-refreshes +- **THEN** historical data SHALL also be refreshed alongside real-time metrics diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md new file mode 100644 index 0000000..6e7c7c0 --- /dev/null +++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md @@ -0,0 +1,80 @@ +## 1. 
Backend — Cache Telemetry Infrastructure + +- [x] 1.1 Add `stats()` method to `ProcessLevelCache` in `core/cache.py` (returns entries/max_size/ttl_seconds with lock) +- [x] 1.2 Add `_PROCESS_CACHE_REGISTRY`, `register_process_cache()`, and `get_all_process_cache_stats()` to `core/cache.py` +- [x] 1.3 Register `_wip_df_cache` in `core/cache.py` +- [x] 1.4 Add `stats()` + `register_process_cache()` to `services/resource_cache.py` +- [x] 1.5 Add `stats()` + `register_process_cache()` to `services/realtime_equipment_cache.py` +- [x] 1.6 Add `register_process_cache()` to `services/reject_dataset_cache.py` + +## 2. Backend — Direct Connection Counter + +- [x] 2.1 Add `_DIRECT_CONN_COUNTER`, `_DIRECT_CONN_LOCK`, and `get_direct_connection_count()` to `core/database.py` +- [x] 2.2 Increment counter in `get_db_connection()` and `read_sql_df_slow()` after successful connection creation + +## 3. Backend — Performance Detail API + +- [x] 3.1 Add `GET /admin/api/performance-detail` endpoint in `routes/admin_routes.py` returning redis, process_caches, route_cache, db_pool, and direct_connections sections +- [x] 3.2 Implement Redis INFO + SCAN namespace key distribution (data, route_cache, equipment_status, reject_dataset, meta, lock, scrap_exclusion) with graceful degradation when Redis is disabled + +## 4. Frontend — Page Scaffolding + +- [x] 4.1 Create `frontend/src/admin-performance/index.html` and `main.js` (standard Vue SPA entry) +- [x] 4.2 Register `admin-performance` entry in `vite.config.js` +- [x] 4.3 Add `cp` command for `admin-performance.html` in `package.json` build script + +## 5. Frontend — Reusable Components + +- [x] 5.1 Create `GaugeBar.vue` — horizontal gauge bar with label, value, max, and color threshold props +- [x] 5.2 Create `StatCard.vue` — mini card with numeric value, label, and optional unit/icon +- [x] 5.3 Create `StatusDot.vue` — colored dot indicator (healthy/degraded/error/disabled) with label + +## 6. 
Frontend — App.vue Main Dashboard + +- [x] 6.1 Implement data fetching layer: `loadSystemStatus()`, `loadMetrics()`, `loadPerformanceDetail()`, `loadLogs()`, `loadWorkerStatus()` with `Promise.all` parallel fetch and `useAutoRefresh` (30s) +- [x] 6.2 Build header section with gradient background, title, auto-refresh toggle, and manual refresh button +- [x] 6.3 Build status cards section (Database / Redis / Circuit Breaker / Worker PID) using StatusDot +- [x] 6.4 Build query performance panel with P50/P95/P99 stat cards and ECharts latency distribution chart +- [x] 6.5 Build Redis cache detail panel with memory GaugeBar, hit rate, client count, peak memory, and namespace key distribution table +- [x] 6.6 Build memory cache panel with ProcessLevelCache grid cards (entries/max gauge + TTL) and route cache telemetry (L1/L2 hit rate, miss rate, total reads) +- [x] 6.7 Build connection pool panel with saturation GaugeBar and stat card grid (checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, direct connections) +- [x] 6.8 Build worker control panel with PID/uptime/cooldown display, restart button, and confirmation modal +- [x] 6.9 Build system logs panel with level filter, text search, pagination, and log clearing +- [x] 6.10 Create `style.css` with all panel, grid, gauge, card, and responsive layout styles + +## 7. Route Integration + +- [x] 7.1 Change `/admin/performance` route handler in `admin_routes.py` from `render_template` to `send_from_directory` serving the Vue SPA +- [x] 7.2 Update `routeContracts.js`: change renderMode to `'native'`, rollbackStrategy to `'fallback_to_legacy_route'`, compatibilityPolicy to `'redirect_to_shell_when_spa_enabled'` + +## 8. Verification (Phase 1) + +- [x] 8.1 Run `cd frontend && npx vite build` — confirm no compilation errors and `admin-performance.html` is produced +- [x] 8.2 Verify all dashboard panels render correctly with live data after service restart + +## 9. 
Backend — Metrics History Store + +- [x] 9.1 Create `core/metrics_history.py` with `MetricsHistoryStore` class (SQLite schema, thread-local connections, write_lock, write_snapshot, query_snapshots, cleanup) +- [x] 9.2 Add `MetricsHistoryCollector` class (daemon thread, configurable interval, collect pool/redis/route_cache/latency) +- [x] 9.3 Add module-level `get_metrics_history_store()`, `start_metrics_history(app)`, `stop_metrics_history()` functions + +## 10. Backend — Lifecycle Integration + +- [x] 10.1 Call `start_metrics_history(app)` in `app.py` after other background services +- [x] 10.2 Call `stop_metrics_history()` in `_shutdown_runtime_resources()` in `app.py` + +## 11. Backend — Performance History API + +- [x] 11.1 Add `GET /admin/api/performance-history` endpoint in `admin_routes.py` (minutes param, clamped 1-180, returns snapshots array) + +## 12. Frontend — Trend Charts + +- [x] 12.1 Create `TrendChart.vue` component using vue-echarts VChart (line/area chart, dual yAxis support, time labels, autoresize) +- [x] 12.2 Add `loadPerformanceHistory()` fetch to `App.vue` and integrate into `refreshAll()` +- [x] 12.3 Add 4 TrendChart panels to `App.vue` template (pool saturation, query latency, Redis memory, cache hit rates) +- [x] 12.4 Add trend chart styles to `style.css` + +## 13. Verification (Phase 2) + +- [x] 13.1 Run `cd frontend && npm run build` — confirm no compilation errors +- [x] 13.2 Verify trend charts render with historical data after service restart + 60s collection diff --git a/openspec/specs/admin-performance-spa/spec.md b/openspec/specs/admin-performance-spa/spec.md new file mode 100644 index 0000000..431b7a0 --- /dev/null +++ b/openspec/specs/admin-performance-spa/spec.md @@ -0,0 +1,100 @@ +## ADDED Requirements + +### Requirement: Vue 3 SPA page replaces Jinja2 template +The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. 
The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route. + +#### Scenario: Page loads as Vue SPA +- **WHEN** user navigates to `/admin/performance` +- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template) + +#### Scenario: Portal-shell integration +- **WHEN** the portal-shell renders `/admin/performance` +- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe) + +### Requirement: Status cards display system health +The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value. + +#### Scenario: All systems healthy +- **WHEN** all backend systems report healthy status via `/admin/api/system-status` +- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values + +#### Scenario: Redis disabled +- **WHEN** Redis is disabled (`REDIS_ENABLED=false`) +- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message + +### Requirement: Query performance panel with ECharts +The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation. 
+ +#### Scenario: Metrics loaded successfully +- **WHEN** `/admin/api/metrics` returns valid performance data +- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution + +#### Scenario: No metrics data +- **WHEN** `/admin/api/metrics` returns empty or null metrics +- **THEN** the panel SHALL display placeholder text indicating no data available + +### Requirement: Redis cache detail panel +The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table. + +#### Scenario: Redis active with data +- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts +- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count + +#### Scenario: Redis disabled +- **WHEN** Redis is disabled +- **THEN** the Redis detail panel SHALL display a disabled state message without errors + +### Requirement: Memory cache panel +The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads). + +#### Scenario: Multiple caches registered +- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries +- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description + +#### Scenario: Route cache telemetry +- **WHEN** `/admin/api/performance-detail` returns route_cache data +- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads + +### Requirement: Connection pool panel +The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count. 
+ +#### Scenario: Pool under normal load +- **WHEN** pool saturation is below 80% +- **THEN** the GaugeBar SHALL display in a normal color (green/blue) + +#### Scenario: Pool near saturation +- **WHEN** pool saturation is at or above 80% +- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red) + +### Requirement: Worker control panel +The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal. + +#### Scenario: Restart worker +- **WHEN** user clicks the restart button and confirms in the modal +- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result + +#### Scenario: Restart during cooldown +- **WHEN** worker is in cooldown period +- **THEN** the restart button SHALL be disabled with a cooldown indicator + +### Requirement: System logs panel with filtering and pagination +The dashboard SHALL display system logs with level filtering, text search, and pagination controls. + +#### Scenario: Filter by log level +- **WHEN** user selects a specific log level filter +- **THEN** only logs matching that level SHALL be displayed + +#### Scenario: Paginate logs +- **WHEN** logs exceed the page size +- **THEN** pagination controls SHALL allow navigating between pages + +### Requirement: Auto-refresh with toggle +The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
+ +#### Scenario: Auto-refresh enabled +- **WHEN** auto-refresh is enabled (default) +- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch + +#### Scenario: Manual refresh +- **WHEN** user clicks the manual refresh button +- **THEN** all panels SHALL immediately refresh their data diff --git a/openspec/specs/cache-telemetry-api/spec.md b/openspec/specs/cache-telemetry-api/spec.md new file mode 100644 index 0000000..96cf778 --- /dev/null +++ b/openspec/specs/cache-telemetry-api/spec.md @@ -0,0 +1,56 @@ +## ADDED Requirements + +### Requirement: ProcessLevelCache stats method +Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`. + +#### Scenario: Stats on active cache +- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s) +- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}` + +#### Scenario: Stats with expired entries +- **WHEN** `stats()` is called and some entries have exceeded TTL +- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl` + +#### Scenario: Thread safety +- **WHEN** `stats()` is called concurrently with cache writes +- **THEN** it SHALL acquire the cache lock and return consistent data without races + +### Requirement: ProcessLevelCache global registry +The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`. 
+ +#### Scenario: Register and retrieve all caches +- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called +- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches + +#### Scenario: Cache not registered +- **WHEN** a service's ProcessLevelCache is not registered +- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output + +### Requirement: Performance detail API endpoint +The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`. + +#### Scenario: All systems available +- **WHEN** the API is called and all subsystems are healthy +- **THEN** it SHALL return all 5 sections with current telemetry data + +#### Scenario: Redis disabled +- **WHEN** Redis is disabled (`REDIS_ENABLED=false`) +- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally + +### Requirement: Redis namespace key distribution +The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`. + +#### Scenario: Keys exist across namespaces +- **WHEN** Redis contains keys across multiple namespaces +- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count` + +#### Scenario: SCAN safety +- **WHEN** scanning Redis keys +- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis + +### Requirement: Route cache telemetry in performance detail +The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`. 
+ +#### Scenario: LayeredCache active +- **WHEN** route cache is in layered mode +- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry diff --git a/openspec/specs/connection-pool-monitoring/spec.md b/openspec/specs/connection-pool-monitoring/spec.md new file mode 100644 index 0000000..ae71a86 --- /dev/null +++ b/openspec/specs/connection-pool-monitoring/spec.md @@ -0,0 +1,27 @@ +## ADDED Requirements + +### Requirement: Connection pool status in performance detail +The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`. + +#### Scenario: Pool status retrieved +- **WHEN** the API is called +- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values + +#### Scenario: Saturation calculation +- **WHEN** the pool has 8 checked_out connections and max_capacity is 30 +- **THEN** saturation SHALL be reported as approximately 26.7% + +### Requirement: Direct Oracle connection counter +The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection. 
+ +#### Scenario: Counter increments on direct connection +- **WHEN** `get_db_connection()` successfully creates a connection +- **THEN** the direct connection counter SHALL increment by 1 + +#### Scenario: Counter in performance detail +- **WHEN** the performance-detail API is called +- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID) + +#### Scenario: Counter is per-worker +- **WHEN** multiple gunicorn workers are running +- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker diff --git a/openspec/specs/metrics-history-trending/spec.md b/openspec/specs/metrics-history-trending/spec.md new file mode 100644 index 0000000..c13633b --- /dev/null +++ b/openspec/specs/metrics-history-trending/spec.md @@ -0,0 +1,65 @@ +## ADDED Requirements + +### Requirement: SQLite metrics history store +The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`. 
+ +#### Scenario: Write and query snapshots +- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics +- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID + +#### Scenario: Query by time range +- **WHEN** `query_snapshots(minutes=30)` is called +- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending + +#### Scenario: Retention cleanup +- **WHEN** `cleanup()` is called +- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000) + +#### Scenario: Thread safety +- **WHEN** multiple threads write snapshots concurrently +- **THEN** the write lock SHALL serialize writes and prevent database corruption + +### Requirement: Background metrics collector +The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var). + +#### Scenario: Automatic collection +- **WHEN** the collector is started via `start_metrics_history(app)` +- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store + +#### Scenario: Graceful shutdown +- **WHEN** `stop_metrics_history()` is called +- **THEN** the collector thread SHALL stop within one interval period + +#### Scenario: Subsystem unavailability +- **WHEN** a subsystem (e.g., Redis) is unavailable during collection +- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics + +### Requirement: Performance history API endpoint +The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots. 
+ +#### Scenario: Query with time range +- **WHEN** the API is called with `?minutes=30` +- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}` + +#### Scenario: Time range bounds +- **WHEN** `minutes` is less than 1 or greater than 180 +- **THEN** it SHALL be clamped to the range [1, 180] + +#### Scenario: Admin authentication +- **WHEN** the API is called without admin authentication +- **THEN** it SHALL be rejected by the `@admin_required` decorator + +### Requirement: Frontend trend charts +The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts. + +#### Scenario: Trend charts with data +- **WHEN** historical snapshots contain more than 1 data point +- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates + +#### Scenario: Trend charts without data +- **WHEN** historical snapshots are empty or contain only 1 data point +- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`) + +#### Scenario: Auto-refresh +- **WHEN** the dashboard auto-refreshes +- **THEN** historical data SHALL also be refreshed alongside real-time metrics diff --git a/src/mes_dashboard/app.py b/src/mes_dashboard/app.py index 028a9c9..f4a5f2e 100644 --- a/src/mes_dashboard/app.py +++ b/src/mes_dashboard/app.py @@ -295,6 +295,12 @@ def _shutdown_runtime_resources() -> None: except Exception as exc: logger.warning("Error stopping scrap exclusion cache worker: %s", exc) + try: + from mes_dashboard.core.metrics_history import stop_metrics_history + stop_metrics_history() + except Exception as exc: + logger.warning("Error stopping metrics history: %s", exc) + try: close_redis() except Exception as exc: @@ -390,6 +396,8 @@ def create_app(config_name: str | None = None) -> Flask: start_cache_updater() # Start Redis cache updater init_realtime_equipment_cache(app) # Start realtime equipment 
status cache init_scrap_reason_exclusion_cache(app) # Start exclusion-policy cache sync + from mes_dashboard.core.metrics_history import start_metrics_history + start_metrics_history(app) # Start metrics history collector _register_shutdown_hooks(app) # Register API routes diff --git a/src/mes_dashboard/core/cache.py b/src/mes_dashboard/core/cache.py index 6ffc966..442f96b 100644 --- a/src/mes_dashboard/core/cache.py +++ b/src/mes_dashboard/core/cache.py @@ -95,6 +95,34 @@ class ProcessLevelCache: with self._lock: self._cache.clear() + def stats(self) -> dict: + """Return live cache statistics for telemetry.""" + with self._lock: + now = time.time() + live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl) + return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl} + + +# ============================================================ +# Process-Level Cache Registry (for admin telemetry) +# ============================================================ + +_PROCESS_CACHE_REGISTRY: dict[str, tuple[str, Any]] = {} + + +def register_process_cache(name: str, cache_instance: Any, description: str = "") -> None: + """Register a ProcessLevelCache instance for admin telemetry.""" + _PROCESS_CACHE_REGISTRY[name] = (description, cache_instance) + + +def get_all_process_cache_stats() -> dict[str, dict]: + """Collect stats from all registered ProcessLevelCache instances.""" + return { + name: {**inst.stats(), "description": desc} + for name, (desc, inst) in _PROCESS_CACHE_REGISTRY.items() + if callable(getattr(inst, "stats", None)) + } + def _resolve_cache_max_size(env_name: str, default: int) -> int: value = os.getenv(env_name) @@ -116,6 +144,7 @@ _wip_df_cache = ProcessLevelCache( ttl_seconds=30, max_size=WIP_PROCESS_CACHE_MAX_SIZE, ) +register_process_cache("wip_dataframe", _wip_df_cache, "WIP DataFrame (L1, 30s)") _wip_parse_lock = threading.Lock() # ============================================================ diff --git 
a/src/mes_dashboard/core/database.py b/src/mes_dashboard/core/database.py index bdbf279..f27f2bf 100644 --- a/src/mes_dashboard/core/database.py +++ b/src/mes_dashboard/core/database.py @@ -416,6 +416,14 @@ def dispose_engine(): # Direct Connection Helpers # ============================================================ +_DIRECT_CONN_COUNTER = 0 +_DIRECT_CONN_LOCK = threading.Lock() + + +def get_direct_connection_count() -> int: + """Return total direct (non-pooled) connections since worker start.""" + return _DIRECT_CONN_COUNTER + def get_db_connection(): """Create a direct oracledb connection. @@ -432,6 +440,9 @@ def get_db_connection(): retry_delay=runtime["retry_delay"], ) conn.call_timeout = runtime["call_timeout_ms"] + with _DIRECT_CONN_LOCK: + global _DIRECT_CONN_COUNTER + _DIRECT_CONN_COUNTER += 1 logger.debug( "Direct oracledb connection established (call_timeout_ms=%s)", runtime["call_timeout_ms"], @@ -591,6 +602,9 @@ def read_sql_df_slow( retry_delay=runtime["retry_delay"], ) conn.call_timeout = timeout_ms + with _DIRECT_CONN_LOCK: + global _DIRECT_CONN_COUNTER + _DIRECT_CONN_COUNTER += 1 logger.debug( "Slow-query connection established (call_timeout_ms=%s)", timeout_ms ) diff --git a/src/mes_dashboard/core/metrics_history.py b/src/mes_dashboard/core/metrics_history.py new file mode 100644 index 0000000..bdb08e2 --- /dev/null +++ b/src/mes_dashboard/core/metrics_history.py @@ -0,0 +1,369 @@ +# -*- coding: utf-8 -*- +"""SQLite-based metrics history store for admin performance dashboard. + +Periodically snapshots system metrics (pool, redis, cache, latency) +into a SQLite database for historical trend visualization. +Follows the LogStore pattern from core/log_store.py. 
+""" + +from __future__ import annotations + +import logging +import os +import sqlite3 +import threading +import time +from contextlib import contextmanager +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any, Dict, Generator, List, Optional + +logger = logging.getLogger('mes_dashboard.metrics_history') + +# ============================================================ +# Configuration +# ============================================================ + +METRICS_HISTORY_PATH = os.getenv( + 'METRICS_HISTORY_PATH', + 'logs/metrics_history.sqlite', +) +METRICS_HISTORY_INTERVAL = int(os.getenv('METRICS_HISTORY_INTERVAL', '30')) +METRICS_HISTORY_RETENTION_DAYS = int(os.getenv('METRICS_HISTORY_RETENTION_DAYS', '3')) +METRICS_HISTORY_MAX_ROWS = int(os.getenv('METRICS_HISTORY_MAX_ROWS', '50000')) + +# ============================================================ +# Database Schema +# ============================================================ + +CREATE_TABLE_SQL = """ +CREATE TABLE IF NOT EXISTS metrics_snapshots ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + ts TEXT NOT NULL, + worker_pid INTEGER NOT NULL, + pool_saturation REAL, + pool_checked_out INTEGER, + pool_checked_in INTEGER, + pool_overflow INTEGER, + pool_max_capacity INTEGER, + redis_used_memory INTEGER, + redis_hit_rate REAL, + rc_l1_hit_rate REAL, + rc_l2_hit_rate REAL, + rc_miss_rate REAL, + latency_p50_ms REAL, + latency_p95_ms REAL, + latency_p99_ms REAL, + latency_count INTEGER +); +""" + +CREATE_INDEX_SQL = ( + "CREATE INDEX IF NOT EXISTS idx_metrics_ts ON metrics_snapshots(ts);" +) + +COLUMNS = [ + "ts", "worker_pid", + "pool_saturation", "pool_checked_out", "pool_checked_in", + "pool_overflow", "pool_max_capacity", + "redis_used_memory", "redis_hit_rate", + "rc_l1_hit_rate", "rc_l2_hit_rate", "rc_miss_rate", + "latency_p50_ms", "latency_p95_ms", "latency_p99_ms", "latency_count", +] + + +# ============================================================ +# Metrics History 
Store +# ============================================================ + +class MetricsHistoryStore: + """SQLite-based metrics history store (follows LogStore pattern).""" + + def __init__(self, db_path: str = METRICS_HISTORY_PATH): + self.db_path = db_path + self._local = threading.local() + self._write_lock = threading.Lock() + self._initialized = False + + def initialize(self) -> None: + if self._initialized: + return + db_dir = Path(self.db_path).parent + db_dir.mkdir(parents=True, exist_ok=True) + with self._get_connection() as conn: + cursor = conn.cursor() + cursor.execute(CREATE_TABLE_SQL) + cursor.execute(CREATE_INDEX_SQL) + conn.commit() + self._initialized = True + logger.info("Metrics history store initialized at %s", self.db_path) + + @contextmanager + def _get_connection(self) -> Generator[sqlite3.Connection, None, None]: + if not hasattr(self._local, 'connection') or self._local.connection is None: + self._local.connection = sqlite3.connect( + self.db_path, timeout=10.0, check_same_thread=False, + ) + self._local.connection.row_factory = sqlite3.Row + try: + yield self._local.connection + except sqlite3.Error as exc: + logger.error("Metrics history DB error: %s", exc) + try: + self._local.connection.close() + except Exception: + pass + self._local.connection = None + raise + + def write_snapshot(self, data: Dict[str, Any]) -> bool: + if not self._initialized: + self.initialize() + ts = datetime.now().isoformat() + pid = os.getpid() + pool = data.get("pool") or {} + redis = data.get("redis") or {} + rc = data.get("route_cache") or {} + lat = data.get("latency") or {} + try: + with self._write_lock: + with self._get_connection() as conn: + conn.execute( + """ + INSERT INTO metrics_snapshots + (ts, worker_pid, + pool_saturation, pool_checked_out, pool_checked_in, + pool_overflow, pool_max_capacity, + redis_used_memory, redis_hit_rate, + rc_l1_hit_rate, rc_l2_hit_rate, rc_miss_rate, + latency_p50_ms, latency_p95_ms, latency_p99_ms, latency_count) + VALUES 
(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) + """, + ( + ts, pid, + pool.get("saturation"), + pool.get("checked_out"), + pool.get("checked_in"), + pool.get("overflow"), + pool.get("max_capacity"), + redis.get("used_memory"), + redis.get("hit_rate"), + rc.get("l1_hit_rate"), + rc.get("l2_hit_rate"), + rc.get("miss_rate"), + lat.get("p50_ms"), + lat.get("p95_ms"), + lat.get("p99_ms"), + lat.get("count"), + ), + ) + conn.commit() + return True + except Exception as exc: + logger.debug("Failed to write metrics snapshot: %s", exc) + return False + + def query_snapshots(self, minutes: int = 30) -> List[Dict[str, Any]]: + if not self._initialized: + self.initialize() + cutoff = (datetime.now() - timedelta(minutes=minutes)).isoformat() + try: + with self._get_connection() as conn: + cursor = conn.execute( + "SELECT * FROM metrics_snapshots WHERE ts >= ? ORDER BY ts ASC", + (cutoff,), + ) + return [dict(row) for row in cursor.fetchall()] + except Exception as exc: + logger.error("Failed to query metrics snapshots: %s", exc) + return [] + + def cleanup(self) -> int: + if not self._initialized: + return 0 + deleted = 0 + try: + with self._write_lock: + with self._get_connection() as conn: + cutoff = ( + datetime.now() - timedelta(days=METRICS_HISTORY_RETENTION_DAYS) + ).isoformat() + cursor = conn.execute( + "DELETE FROM metrics_snapshots WHERE ts < ?", (cutoff,), + ) + deleted += cursor.rowcount + row = conn.execute( + "SELECT COUNT(*) FROM metrics_snapshots", + ).fetchone() + count = row[0] if row else 0 + if count > METRICS_HISTORY_MAX_ROWS: + excess = count - METRICS_HISTORY_MAX_ROWS + cursor = conn.execute( + """ + DELETE FROM metrics_snapshots WHERE id IN ( + SELECT id FROM metrics_snapshots ORDER BY ts ASC LIMIT ? 
+ ) + """, + (excess,), + ) + deleted += cursor.rowcount + conn.commit() + if deleted > 0: + logger.info("Cleaned up %d metrics history rows", deleted) + except Exception as exc: + logger.error("Failed to cleanup metrics history: %s", exc) + return deleted + + +# ============================================================ +# Background Collector +# ============================================================ + +class MetricsHistoryCollector: + """Daemon thread that snapshots metrics at a fixed interval.""" + + def __init__( + self, + app: Any = None, + store: Optional[MetricsHistoryStore] = None, + interval: int = METRICS_HISTORY_INTERVAL, + ): + self._app = app + self._store = store or get_metrics_history_store() + self.interval = interval + self._stop_event = threading.Event() + self._thread: Optional[threading.Thread] = None + self._cleanup_counter = 0 + + def start(self) -> None: + if self._thread is not None and self._thread.is_alive(): + return + self._stop_event.clear() + self._thread = threading.Thread( + target=self._run, daemon=True, name="metrics-history-collector", + ) + self._thread.start() + logger.info( + "Metrics history collector started (interval=%ds)", self.interval, + ) + + def stop(self) -> None: + if self._thread and self._thread.is_alive(): + self._stop_event.set() + self._thread.join(timeout=5) + logger.info("Metrics history collector stopped") + + def _run(self) -> None: + # Collect immediately on start, then loop. + self._collect_snapshot() + while not self._stop_event.wait(self.interval): + self._collect_snapshot() + # Run cleanup every ~100 intervals (~50 min at 30s). 
+ self._cleanup_counter += 1 + if self._cleanup_counter >= 100: + self._cleanup_counter = 0 + self._store.cleanup() + + def _collect_snapshot(self) -> None: + try: + data: Dict[str, Any] = {} + + # Pool status + try: + from mes_dashboard.core.database import get_pool_status + data["pool"] = get_pool_status() + except Exception: + data["pool"] = {} + + # Redis + try: + from mes_dashboard.core.redis_client import ( + get_redis_client, + REDIS_ENABLED, + ) + if REDIS_ENABLED: + client = get_redis_client() + if client is not None: + info = client.info(section="memory") + stats_info = client.info(section="stats") + hits = int(stats_info.get("keyspace_hits", 0)) + misses = int(stats_info.get("keyspace_misses", 0)) + total = hits + misses + data["redis"] = { + "used_memory": info.get("used_memory", 0), + "hit_rate": round(hits / total, 4) if total > 0 else 0, + } + else: + data["redis"] = {} + else: + data["redis"] = {} + except Exception: + data["redis"] = {} + + # Route cache + try: + if self._app: + with self._app.app_context(): + from mes_dashboard.routes.health_routes import ( + get_route_cache_status, + ) + rc = get_route_cache_status() + else: + from mes_dashboard.routes.health_routes import ( + get_route_cache_status, + ) + rc = get_route_cache_status() + data["route_cache"] = { + "l1_hit_rate": rc.get("l1_hit_rate"), + "l2_hit_rate": rc.get("l2_hit_rate"), + "miss_rate": rc.get("miss_rate"), + } + except Exception: + data["route_cache"] = {} + + # Query latency + try: + from mes_dashboard.core.metrics import get_metrics_summary + summary = get_metrics_summary() + data["latency"] = { + "p50_ms": summary.get("p50_ms", 0), + "p95_ms": summary.get("p95_ms", 0), + "p99_ms": summary.get("p99_ms", 0), + "count": summary.get("count", 0), + } + except Exception: + data["latency"] = {} + + self._store.write_snapshot(data) + except Exception as exc: + logger.debug("Metrics snapshot collection failed: %s", exc) + + +# 
============================================================ +# Global Instance & Lifecycle +# ============================================================ + +_STORE: Optional[MetricsHistoryStore] = None +_COLLECTOR: Optional[MetricsHistoryCollector] = None + + +def get_metrics_history_store() -> MetricsHistoryStore: + global _STORE + if _STORE is None: + _STORE = MetricsHistoryStore() + _STORE.initialize() + return _STORE + + +def start_metrics_history(app: Any = None) -> None: + global _COLLECTOR + store = get_metrics_history_store() + _COLLECTOR = MetricsHistoryCollector(app=app, store=store) + _COLLECTOR.start() + + +def stop_metrics_history() -> None: + global _COLLECTOR + if _COLLECTOR is not None: + _COLLECTOR.stop() + _COLLECTOR = None diff --git a/src/mes_dashboard/routes/admin_routes.py b/src/mes_dashboard/routes/admin_routes.py index 95f6a7a..9dfc1b5 100644 --- a/src/mes_dashboard/routes/admin_routes.py +++ b/src/mes_dashboard/routes/admin_routes.py @@ -1,8 +1,8 @@ -# -*- coding: utf-8 -*- -"""Admin routes for page management and performance monitoring.""" - -from __future__ import annotations - +# -*- coding: utf-8 -*- +"""Admin routes for page management and performance monitoring.""" + +from __future__ import annotations + import json import logging import os @@ -10,8 +10,8 @@ import time from datetime import datetime, timezone from pathlib import Path from typing import Any - -from flask import Blueprint, g, jsonify, render_template, request + +from flask import Blueprint, current_app, g, jsonify, render_template, request, send_from_directory from mes_dashboard.core.permissions import admin_required from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS @@ -42,14 +42,14 @@ from mes_dashboard.services.page_registry import ( set_page_status, update_drawer, ) - -admin_bp = Blueprint("admin", __name__, url_prefix="/admin") -logger = logging.getLogger("mes_dashboard.admin") - -# 
============================================================ -# Worker Restart Configuration -# ============================================================ - + +admin_bp = Blueprint("admin", __name__, url_prefix="/admin") +logger = logging.getLogger("mes_dashboard.admin") + +# ============================================================ +# Worker Restart Configuration +# ============================================================ + _RUNTIME_CONTRACT = load_runtime_contract() WATCHDOG_RUNTIME_DIR = _RUNTIME_CONTRACT["watchdog_runtime_dir"] RESTART_FLAG_PATH = _RUNTIME_CONTRACT["watchdog_restart_flag"] @@ -57,24 +57,28 @@ RESTART_STATE_PATH = _RUNTIME_CONTRACT["watchdog_state_file"] WATCHDOG_PID_PATH = _RUNTIME_CONTRACT["watchdog_pid_file"] GUNICORN_BIND = _RUNTIME_CONTRACT["gunicorn_bind"] RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT["version"] - -# Track last restart request time (in-memory for this worker) -_last_restart_request: float = 0.0 - - -# ============================================================ -# Performance Monitoring Routes -# ============================================================ - -@admin_bp.route("/performance") -@admin_required -def performance(): - """Performance monitoring dashboard.""" - return render_template("admin/performance.html") - - -@admin_bp.route("/api/system-status", methods=["GET"]) -@admin_required + +# Track last restart request time (in-memory for this worker) +_last_restart_request: float = 0.0 + + +# ============================================================ +# Performance Monitoring Routes +# ============================================================ + +@admin_bp.route("/performance") +@admin_required +def performance(): + """Performance monitoring dashboard (Vue SPA).""" + dist_dir = os.path.join(current_app.static_folder or "", "dist") + dist_html = os.path.join(dist_dir, "admin-performance.html") + if os.path.exists(dist_html): + return send_from_directory(dist_dir, "admin-performance.html") + return 
render_template("admin/performance.html") + + +@admin_bp.route("/api/system-status", methods=["GET"]) +@admin_required def api_system_status(): """API: Get system status for performance dashboard.""" from mes_dashboard.core.database import get_pool_runtime_config, get_pool_status @@ -85,15 +89,15 @@ def api_system_status(): check_redis, get_route_cache_status, ) - - # Database status - db_status, db_error = check_database() - - # Redis status - redis_status = 'disabled' - if REDIS_ENABLED: - redis_status, _ = check_redis() - + + # Database status + db_status, db_error = check_database() + + # Redis status + redis_status = 'disabled' + if REDIS_ENABLED: + redis_status, _ = check_redis() + # Circuit breaker status circuit_breaker = get_circuit_breaker_status() route_cache = get_route_cache_status() @@ -135,26 +139,26 @@ def api_system_status(): thresholds=thresholds, ) runtime_contract = build_runtime_contract_diagnostics(strict=False) - - # Cache status - from mes_dashboard.routes.health_routes import ( - get_cache_status, - get_resource_cache_status, - get_equipment_status_cache_status - ) - - return jsonify({ - "success": True, - "data": { - "database": { - "status": db_status, - "error": db_error - }, - "redis": { - "status": redis_status, - "enabled": REDIS_ENABLED - }, - "circuit_breaker": circuit_breaker, + + # Cache status + from mes_dashboard.routes.health_routes import ( + get_cache_status, + get_resource_cache_status, + get_equipment_status_cache_status + ) + + return jsonify({ + "success": True, + "data": { + "database": { + "status": db_status, + "error": db_error + }, + "redis": { + "status": redis_status, + "enabled": REDIS_ENABLED + }, + "circuit_breaker": circuit_breaker, "cache": { "wip": get_cache_status(), "resource": get_resource_cache_status(), @@ -186,134 +190,265 @@ def api_system_status(): "worker_pid": os.getpid() } }) - - -@admin_bp.route("/api/metrics", methods=["GET"]) -@admin_required -def api_metrics(): - """API: Get performance metrics 
for dashboard.""" - from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics - - summary = get_metrics_summary() - metrics = get_query_metrics() - - return jsonify({ - "success": True, - "data": { - "p50_ms": summary.get("p50_ms"), - "p95_ms": summary.get("p95_ms"), - "p99_ms": summary.get("p99_ms"), - "count": summary.get("count"), - "slow_count": summary.get("slow_count"), - "slow_rate": summary.get("slow_rate"), - "worker_pid": summary.get("worker_pid"), - "collected_at": summary.get("collected_at"), - # Include latency distribution for charts - "latencies": metrics.get_latencies()[-100:] # Last 100 for chart - } - }) - - -@admin_bp.route("/api/logs", methods=["GET"]) -@admin_required -def api_logs(): - """API: Get recent logs from SQLite log store.""" - from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED - - if not LOG_STORE_ENABLED: - return jsonify({ - "success": True, - "data": { - "logs": [], - "enabled": False, - "total": 0 - } - }) - - # Query parameters - level = request.args.get("level") - q = request.args.get("q") - limit = request.args.get("limit", 50, type=int) - offset = request.args.get("offset", 0, type=int) - since = request.args.get("since") - - log_store = get_log_store() - - # Get total count for pagination - total = log_store.count_logs(level=level, q=q, since=since) - - # Get paginated logs - logs = log_store.query_logs( - level=level, - q=q, - limit=min(limit, 100), # Cap at 100 per page - offset=offset, - since=since - ) - - return jsonify({ - "success": True, - "data": { - "logs": logs, - "count": len(logs), - "total": total, - "enabled": True, - "stats": log_store.get_stats() - } - }) - - -@admin_bp.route("/api/logs/cleanup", methods=["POST"]) -@admin_required -def api_logs_cleanup(): - """API: Manually trigger log cleanup. 
- - Supports optional parameters: - - older_than_days: Delete logs older than N days (default: use configured retention) - - keep_count: Keep only the most recent N logs (optional) - """ - from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED - - if not LOG_STORE_ENABLED: - return jsonify({ - "success": False, - "error": "Log store is disabled" - }), 400 - - log_store = get_log_store() - - # Get current stats before cleanup - stats_before = log_store.get_stats() - - # Perform cleanup - deleted = log_store.cleanup_old_logs() - - # Get stats after cleanup - stats_after = log_store.get_stats() - - user = getattr(g, "username", "unknown") - logger.info(f"Log cleanup triggered by {user}: deleted {deleted} entries") - - return jsonify({ - "success": True, - "data": { - "deleted": deleted, - "before": { - "count": stats_before.get("count", 0), - "size_bytes": stats_before.get("size_bytes", 0) - }, - "after": { - "count": stats_after.get("count", 0), - "size_bytes": stats_after.get("size_bytes", 0) - } - } - }) - - -# ============================================================ -# Worker Restart Control Routes -# ============================================================ - + + +@admin_bp.route("/api/metrics", methods=["GET"]) +@admin_required +def api_metrics(): + """API: Get performance metrics for dashboard.""" + from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics + + summary = get_metrics_summary() + metrics = get_query_metrics() + + return jsonify({ + "success": True, + "data": { + "p50_ms": summary.get("p50_ms"), + "p95_ms": summary.get("p95_ms"), + "p99_ms": summary.get("p99_ms"), + "count": summary.get("count"), + "slow_count": summary.get("slow_count"), + "slow_rate": summary.get("slow_rate"), + "worker_pid": summary.get("worker_pid"), + "collected_at": summary.get("collected_at"), + # Include latency distribution for charts + "latencies": metrics.get_latencies()[-100:] # Last 100 for chart + } + }) + + 
+@admin_bp.route("/api/logs", methods=["GET"]) +@admin_required +def api_logs(): + """API: Get recent logs from SQLite log store.""" + from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED + + if not LOG_STORE_ENABLED: + return jsonify({ + "success": True, + "data": { + "logs": [], + "enabled": False, + "total": 0 + } + }) + + # Query parameters + level = request.args.get("level") + q = request.args.get("q") + limit = request.args.get("limit", 50, type=int) + offset = request.args.get("offset", 0, type=int) + since = request.args.get("since") + + log_store = get_log_store() + + # Get total count for pagination + total = log_store.count_logs(level=level, q=q, since=since) + + # Get paginated logs + logs = log_store.query_logs( + level=level, + q=q, + limit=min(limit, 100), # Cap at 100 per page + offset=offset, + since=since + ) + + return jsonify({ + "success": True, + "data": { + "logs": logs, + "count": len(logs), + "total": total, + "enabled": True, + "stats": log_store.get_stats() + } + }) + + +@admin_bp.route("/api/performance-detail", methods=["GET"]) +@admin_required +def api_performance_detail(): + """API: Get detailed performance telemetry for admin dashboard. + + Returns redis, process_caches, route_cache, db_pool, and + direct_connections sections in a single response. 
+ """ + from mes_dashboard.core.cache import get_all_process_cache_stats + from mes_dashboard.core.database import ( + get_direct_connection_count, + get_pool_runtime_config, + get_pool_status, + ) + from mes_dashboard.core.redis_client import ( + get_redis_client, + REDIS_ENABLED, + REDIS_KEY_PREFIX, + ) + from mes_dashboard.routes.health_routes import get_route_cache_status + + # ---- Redis detail ---- + redis_detail = None + if REDIS_ENABLED: + client = get_redis_client() + if client is not None: + try: + info = client.info(section="memory") + stats_info = client.info(section="stats") + clients_info = client.info(section="clients") + + hits = int(stats_info.get("keyspace_hits", 0)) + misses = int(stats_info.get("keyspace_misses", 0)) + total = hits + misses + hit_rate = round(hits / total, 4) if total > 0 else 0 + + # Scan key counts per namespace + namespace_prefixes = [ + "data", "route_cache", "equipment_status", + "reject_dataset", "meta", "lock", "scrap_exclusion", + ] + namespaces = [] + for ns in namespace_prefixes: + pattern = f"{REDIS_KEY_PREFIX}:{ns}*" + count = 0 + cursor = 0 + while True: + cursor, keys = client.scan(cursor=cursor, match=pattern, count=100) + count += len(keys) + if cursor == 0: + break + namespaces.append({"name": ns, "key_count": count}) + + redis_detail = { + "used_memory_human": info.get("used_memory_human", "N/A"), + "used_memory": info.get("used_memory", 0), + "peak_memory_human": info.get("used_memory_peak_human", "N/A"), + "peak_memory": info.get("used_memory_peak", 0), + "maxmemory_human": info.get("maxmemory_human", "N/A"), + "maxmemory": info.get("maxmemory", 0), + "connected_clients": clients_info.get("connected_clients", 0), + "hit_rate": hit_rate, + "keyspace_hits": hits, + "keyspace_misses": misses, + "namespaces": namespaces, + } + except Exception as exc: + logger.warning("Failed to collect Redis detail: %s", exc) + redis_detail = {"error": str(exc)} + + # ---- Process caches ---- + process_caches = 
get_all_process_cache_stats() + + # ---- Route cache ---- + route_cache = get_route_cache_status() + + # ---- DB pool ---- + db_pool = None + try: + pool_status = get_pool_status() + pool_config = get_pool_runtime_config() + db_pool = { + "status": pool_status, + "config": { + "pool_size": pool_config.get("pool_size"), + "max_overflow": pool_config.get("max_overflow"), + "pool_timeout": pool_config.get("pool_timeout"), + "pool_recycle": pool_config.get("pool_recycle"), + }, + } + except Exception as exc: + logger.warning("Failed to collect DB pool status: %s", exc) + db_pool = {"error": str(exc)} + + # ---- Direct connections ---- + direct_connections = { + "total_since_start": get_direct_connection_count(), + "worker_pid": os.getpid(), + } + + return jsonify({ + "success": True, + "data": { + "redis": redis_detail, + "process_caches": process_caches, + "route_cache": route_cache, + "db_pool": db_pool, + "direct_connections": direct_connections, + }, + }) + + +@admin_bp.route("/api/performance-history", methods=["GET"]) +@admin_required +def api_performance_history(): + """API: Get historical metrics snapshots for trend charts.""" + from mes_dashboard.core.metrics_history import get_metrics_history_store + + minutes = request.args.get("minutes", 30, type=int) + minutes = max(1, min(minutes, 180)) + store = get_metrics_history_store() + snapshots = store.query_snapshots(minutes=minutes) + return jsonify({ + "success": True, + "data": { + "snapshots": snapshots, + "count": len(snapshots), + }, + }) + + +@admin_bp.route("/api/logs/cleanup", methods=["POST"]) +@admin_required +def api_logs_cleanup(): + """API: Manually trigger log cleanup. 
+ + Supports optional parameters: + - older_than_days: Delete logs older than N days (default: use configured retention) + - keep_count: Keep only the most recent N logs (optional) + """ + from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED + + if not LOG_STORE_ENABLED: + return jsonify({ + "success": False, + "error": "Log store is disabled" + }), 400 + + log_store = get_log_store() + + # Get current stats before cleanup + stats_before = log_store.get_stats() + + # Perform cleanup + deleted = log_store.cleanup_old_logs() + + # Get stats after cleanup + stats_after = log_store.get_stats() + + user = getattr(g, "username", "unknown") + logger.info(f"Log cleanup triggered by {user}: deleted {deleted} entries") + + return jsonify({ + "success": True, + "data": { + "deleted": deleted, + "before": { + "count": stats_before.get("count", 0), + "size_bytes": stats_before.get("size_bytes", 0) + }, + "after": { + "count": stats_after.get("count", 0), + "size_bytes": stats_after.get("size_bytes", 0) + } + } + }) + + +# ============================================================ +# Worker Restart Control Routes +# ============================================================ + def _get_restart_state() -> dict: """Read worker restart state from file.""" return load_restart_state(RESTART_STATE_PATH) @@ -323,14 +458,14 @@ def _iso_from_epoch(ts: float) -> str | None: if ts <= 0: return None return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() - - + + def _check_restart_cooldown() -> tuple[bool, float]: """Check if restart is in cooldown. - - Returns: - Tuple of (is_in_cooldown, remaining_seconds). - """ + + Returns: + Tuple of (is_in_cooldown, remaining_seconds). 
+ """ policy = _get_restart_policy_state() if policy.get("cooldown"): return True, float(policy.get("cooldown_remaining_seconds") or 0.0) @@ -401,18 +536,18 @@ def _log_restart_audit(event: str, payload: dict[str, Any]) -> None: **payload, } logger.info("worker_restart_audit %s", json.dumps(entry, ensure_ascii=False)) - - + + @admin_bp.route("/api/worker/restart", methods=["POST"]) @admin_required def api_worker_restart(): - """API: Request worker restart. - - Writes a restart flag file that the watchdog process monitors. - Enforces a 60-second cooldown between restart requests. - """ - global _last_restart_request - + """API: Request worker restart. + + Writes a restart flag file that the watchdog process monitors. + Enforces a 60-second cooldown between restart requests. + """ + global _last_restart_request + payload = request.get_json(silent=True) or {} manual_override = bool(payload.get("manual_override")) override_acknowledged = bool(payload.get("override_acknowledged")) @@ -496,10 +631,10 @@ def api_worker_restart(): f"Failed to request restart: {e}", status_code=500 ) - - # Update in-memory cooldown - _last_restart_request = time.time() - + + # Update in-memory cooldown + _last_restart_request = time.time() + _log_restart_audit( "restart_request_accepted", { @@ -534,10 +669,10 @@ def api_worker_restart(): }, } }) - - -@admin_bp.route("/api/worker/status", methods=["GET"]) -@admin_required + + +@admin_bp.route("/api/worker/status", methods=["GET"]) +@admin_required def api_worker_status(): """API: Get worker status and restart information.""" # Get last restart info @@ -555,29 +690,29 @@ def api_worker_status(): cooldown_active=bool(policy_state.get("cooldown")), ) runtime_contract = build_runtime_contract_diagnostics(strict=False) - - # Get worker start time (psutil is optional) - worker_start_time = None - try: - import psutil - process = psutil.Process(os.getpid()) - worker_start_time = datetime.fromtimestamp( - process.create_time() - ).isoformat() - 
except ImportError: - # psutil not installed, try /proc on Linux - try: - stat_path = f"/proc/{os.getpid()}/stat" - with open(stat_path) as f: - stat = f.read().split() - # Field 22 is starttime in clock ticks since boot - # This is a simplified fallback - pass - except Exception: - pass - except Exception: - pass - + + # Get worker start time (psutil is optional) + worker_start_time = None + try: + import psutil + process = psutil.Process(os.getpid()) + worker_start_time = datetime.fromtimestamp( + process.create_time() + ).isoformat() + except ImportError: + # psutil not installed, try /proc on Linux + try: + stat_path = f"/proc/{os.getpid()}/stat" + with open(stat_path) as f: + stat = f.read().split() + # Field 22 is starttime in clock ticks since boot + # This is a simplified fallback + pass + except Exception: + pass + except Exception: + pass + return jsonify({ "success": True, "data": { @@ -628,25 +763,25 @@ def api_worker_status(): "last_restart": { "requested_by": last_restart.get("requested_by"), "requested_at": last_restart.get("requested_at"), - "requested_ip": last_restart.get("requested_ip"), - "completed_at": last_restart.get("completed_at"), - "success": last_restart.get("success") - } - } - }) - - -# ============================================================ -# Page Management Routes -# ============================================================ - -@admin_bp.route("/pages") -@admin_required -def pages(): - """Page management interface.""" - return render_template("admin/pages.html") - - + "requested_ip": last_restart.get("requested_ip"), + "completed_at": last_restart.get("completed_at"), + "success": last_restart.get("success") + } + } + }) + + +# ============================================================ +# Page Management Routes +# ============================================================ + +@admin_bp.route("/pages") +@admin_required +def pages(): + """Page management interface.""" + return render_template("admin/pages.html") + + 
@admin_bp.route("/api/pages", methods=["GET"]) @admin_required def api_get_pages(): diff --git a/src/mes_dashboard/services/realtime_equipment_cache.py b/src/mes_dashboard/services/realtime_equipment_cache.py index 615c802..e71ee98 100644 --- a/src/mes_dashboard/services/realtime_equipment_cache.py +++ b/src/mes_dashboard/services/realtime_equipment_cache.py @@ -14,6 +14,7 @@ from collections import OrderedDict from datetime import datetime from typing import Any +from mes_dashboard.core.cache import register_process_cache from mes_dashboard.core.database import read_sql_df from mes_dashboard.core.redis_client import ( get_redis_client, @@ -92,6 +93,13 @@ class _ProcessLevelCache: with self._lock: self._cache.pop(key, None) + def stats(self) -> dict: + """Return live cache statistics for telemetry.""" + with self._lock: + now = time.time() + live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl) + return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl} + def _resolve_cache_max_size(env_name: str, default: int) -> int: value = os.getenv(env_name) @@ -113,6 +121,7 @@ _equipment_status_cache = _ProcessLevelCache( ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS, max_size=EQUIPMENT_PROCESS_CACHE_MAX_SIZE, ) +register_process_cache("equipment_status", _equipment_status_cache, "Equipment Status (L1, 30s)") _equipment_status_parse_lock = threading.Lock() _equipment_lookup_lock = threading.Lock() _equipment_status_lookup: dict[str, dict[str, Any]] = {} diff --git a/src/mes_dashboard/services/reject_dataset_cache.py b/src/mes_dashboard/services/reject_dataset_cache.py index 3373906..21a485c 100644 --- a/src/mes_dashboard/services/reject_dataset_cache.py +++ b/src/mes_dashboard/services/reject_dataset_cache.py @@ -20,7 +20,7 @@ from typing import Any, Dict, List, Optional import pandas as pd -from mes_dashboard.core.cache import ProcessLevelCache +from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache from 
mes_dashboard.core.database import read_sql_df from mes_dashboard.core.redis_client import ( REDIS_ENABLED, @@ -55,6 +55,7 @@ _CACHE_MAX_SIZE = 8 _REDIS_NAMESPACE = "reject_dataset" _dataset_cache = ProcessLevelCache(ttl_seconds=_CACHE_TTL, max_size=_CACHE_MAX_SIZE) +register_process_cache("reject_dataset", _dataset_cache, "Reject Dataset (L1, 15min)") # ============================================================ diff --git a/src/mes_dashboard/services/resource_cache.py b/src/mes_dashboard/services/resource_cache.py index 260611b..af4fe6b 100644 --- a/src/mes_dashboard/services/resource_cache.py +++ b/src/mes_dashboard/services/resource_cache.py @@ -19,6 +19,7 @@ from typing import Any import pandas as pd +from mes_dashboard.core.cache import register_process_cache from mes_dashboard.core.redis_client import ( get_redis_client, redis_available, @@ -109,6 +110,13 @@ class _ProcessLevelCache: with self._lock: self._cache.pop(key, None) + def stats(self) -> dict: + """Return live cache statistics for telemetry.""" + with self._lock: + now = time.time() + live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl) + return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl} + def _resolve_cache_max_size(env_name: str, default: int) -> int: value = os.getenv(env_name) @@ -130,6 +138,7 @@ _resource_df_cache = _ProcessLevelCache( ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS, max_size=RESOURCE_PROCESS_CACHE_MAX_SIZE, ) +register_process_cache("resource", _resource_df_cache, "Resource DataFrame (L1, 30s)") _resource_parse_lock = threading.Lock() _resource_index_lock = threading.Lock() _resource_index: ResourceIndex = {