diff --git a/frontend/package.json b/frontend/package.json
index 0465e5d..e95ffc2 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -5,7 +5,7 @@
"type": "module",
"scripts": {
"dev": "vite --host",
- "build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html",
+ "build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html && cp ../src/mes_dashboard/static/dist/src/admin-performance/index.html ../src/mes_dashboard/static/dist/admin-performance.html",
"test": "node --test tests/*.test.js"
},
"devDependencies": {
diff --git a/frontend/src/admin-performance/App.vue b/frontend/src/admin-performance/App.vue
new file mode 100644
index 0000000..9732407
--- /dev/null
+++ b/frontend/src/admin-performance/App.vue
@@ -0,0 +1,613 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Redis 快取
+
+
+
+
+ | Namespace | Key 數量 |
+
+
+ | {{ ns.name }} |
+ {{ ns.key_count }} |
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 記憶體快取
+
+
+
{{ name }}
+
{{ info.description }}
+
+
TTL: {{ info.ttl_seconds }}s
+
+
+
+
Route Cache
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 連線池
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Worker 控制
+
+
+
+
+
+
+
+
+
+
+
確認重啟 Worker
+
重啟將導致目前的請求暫時中斷,確定要繼續嗎?
+
+
+
+
+
+
+
+
+
+
+ 系統日誌
+
+
+
+
+
+
+
+
+
+ | 時間 |
+ 等級 |
+ 訊息 |
+
+
+
+
+ | {{ log.timestamp }} |
+ {{ log.level }} |
+ {{ log.message }} |
+
+
+
+
無日誌
+
+
+
+
+
+
+
diff --git a/frontend/src/admin-performance/components/GaugeBar.vue b/frontend/src/admin-performance/components/GaugeBar.vue
new file mode 100644
index 0000000..688650a
--- /dev/null
+++ b/frontend/src/admin-performance/components/GaugeBar.vue
@@ -0,0 +1,49 @@
+
+
+
+
+
diff --git a/frontend/src/admin-performance/components/StatCard.vue b/frontend/src/admin-performance/components/StatCard.vue
new file mode 100644
index 0000000..24b4ae2
--- /dev/null
+++ b/frontend/src/admin-performance/components/StatCard.vue
@@ -0,0 +1,24 @@
+
+
+
{{ formattedValue }}
+
{{ label }}
+
+
+
+
diff --git a/frontend/src/admin-performance/components/StatusDot.vue b/frontend/src/admin-performance/components/StatusDot.vue
new file mode 100644
index 0000000..ba5b3c1
--- /dev/null
+++ b/frontend/src/admin-performance/components/StatusDot.vue
@@ -0,0 +1,17 @@
+
+
+
+ {{ label }}
+
+
+
+
diff --git a/frontend/src/admin-performance/components/TrendChart.vue b/frontend/src/admin-performance/components/TrendChart.vue
new file mode 100644
index 0000000..e5fa3b1
--- /dev/null
+++ b/frontend/src/admin-performance/components/TrendChart.vue
@@ -0,0 +1,98 @@
+
+
+
+
+
{{ title }}
+
+
+
+
趨勢資料不足(需至少 2 筆快照)
+
+
diff --git a/frontend/src/admin-performance/index.html b/frontend/src/admin-performance/index.html
new file mode 100644
index 0000000..63e5435
--- /dev/null
+++ b/frontend/src/admin-performance/index.html
@@ -0,0 +1,12 @@
+
+
+
+
+
+ Performance Monitor
+
+
+
+
+
+
diff --git a/frontend/src/admin-performance/main.js b/frontend/src/admin-performance/main.js
new file mode 100644
index 0000000..c56440b
--- /dev/null
+++ b/frontend/src/admin-performance/main.js
@@ -0,0 +1,6 @@
+import { createApp } from 'vue';
+
+import App from './App.vue';
+import './style.css';
+
+createApp(App).mount('#app');
diff --git a/frontend/src/admin-performance/style.css b/frontend/src/admin-performance/style.css
new file mode 100644
index 0000000..f9e9bce
--- /dev/null
+++ b/frontend/src/admin-performance/style.css
@@ -0,0 +1,544 @@
+/* Admin Performance Dashboard */
+*,
+*::before,
+*::after {
+ box-sizing: border-box;
+ margin: 0;
+ padding: 0;
+}
+
+body {
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ background: #f1f5f9;
+ color: #1e293b;
+ line-height: 1.5;
+}
+
+.perf-dashboard {
+ max-width: 1280px;
+ margin: 0 auto;
+ padding: 0 16px 32px;
+}
+
+/* Header */
+.perf-header {
+ background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+ color: #fff;
+ padding: 20px 24px;
+ border-radius: 0 0 12px 12px;
+ margin: 0 -16px 20px;
+}
+
+.perf-header-inner {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ flex-wrap: wrap;
+ gap: 12px;
+}
+
+.perf-title {
+ font-size: 1.4rem;
+ font-weight: 700;
+}
+
+.perf-header-actions {
+ display: flex;
+ align-items: center;
+ gap: 12px;
+}
+
+.auto-refresh-toggle {
+ display: flex;
+ align-items: center;
+ gap: 6px;
+ font-size: 0.85rem;
+ cursor: pointer;
+ user-select: none;
+}
+
+.auto-refresh-toggle input[type='checkbox'] {
+ accent-color: #fff;
+}
+
+/* Buttons */
+.btn {
+ display: inline-flex;
+ align-items: center;
+ gap: 6px;
+ padding: 8px 16px;
+ border: none;
+ border-radius: 6px;
+ font-size: 0.85rem;
+ font-weight: 500;
+ cursor: pointer;
+ background: rgba(255, 255, 255, 0.2);
+ color: #fff;
+ transition: background 0.15s;
+}
+
+.btn:hover:not(:disabled) {
+ background: rgba(255, 255, 255, 0.3);
+}
+
+.btn:disabled {
+ opacity: 0.5;
+ cursor: not-allowed;
+}
+
+.btn-sm {
+ padding: 5px 10px;
+ font-size: 0.8rem;
+ background: #e2e8f0;
+ color: #334155;
+}
+
+.btn-sm:hover:not(:disabled) {
+ background: #cbd5e1;
+}
+
+.btn-danger {
+ background: #ef4444;
+ color: #fff;
+}
+
+.btn-danger:hover:not(:disabled) {
+ background: #dc2626;
+}
+
+/* Panel */
+.panel {
+ background: #fff;
+ border-radius: 10px;
+ padding: 20px;
+ margin-bottom: 16px;
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06);
+}
+
+.panel-disabled {
+ opacity: 0.6;
+}
+
+.panel-title {
+ font-size: 1rem;
+ font-weight: 600;
+ margin-bottom: 14px;
+ color: #334155;
+}
+
+.sub-title {
+ font-size: 0.9rem;
+ font-weight: 600;
+ margin: 16px 0 10px;
+ color: #475569;
+}
+
+.muted {
+ color: #94a3b8;
+ font-size: 0.85rem;
+}
+
+/* Status Cards */
+.status-cards-grid {
+ display: grid;
+ grid-template-columns: repeat(4, 1fr);
+ gap: 12px;
+}
+
+.status-card {
+ background: #f8fafc;
+ border-radius: 8px;
+ padding: 14px;
+ text-align: center;
+}
+
+.status-card-title {
+ font-size: 0.75rem;
+ color: #64748b;
+ margin-bottom: 8px;
+ text-transform: uppercase;
+ letter-spacing: 0.05em;
+}
+
+/* StatusDot */
+.status-dot-wrapper {
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ gap: 6px;
+}
+
+.status-dot {
+ width: 10px;
+ height: 10px;
+ border-radius: 50%;
+ flex-shrink: 0;
+}
+
+.status-dot--healthy {
+ background: #22c55e;
+ box-shadow: 0 0 6px rgba(34, 197, 94, 0.4);
+}
+
+.status-dot--degraded {
+ background: #f59e0b;
+ box-shadow: 0 0 6px rgba(245, 158, 11, 0.4);
+}
+
+.status-dot--error {
+ background: #ef4444;
+ box-shadow: 0 0 6px rgba(239, 68, 68, 0.4);
+}
+
+.status-dot--disabled {
+ background: #94a3b8;
+}
+
+.status-dot-label {
+ font-size: 0.85rem;
+ font-weight: 500;
+}
+
+/* GaugeBar */
+.gauge-bar {
+ margin-bottom: 12px;
+}
+
+.gauge-bar-header {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ margin-bottom: 4px;
+}
+
+.gauge-bar-label {
+ font-size: 0.8rem;
+ color: #64748b;
+}
+
+.gauge-bar-value {
+ font-size: 0.8rem;
+ font-weight: 600;
+}
+
+.gauge-bar-track {
+ height: 8px;
+ background: #e2e8f0;
+ border-radius: 4px;
+ overflow: hidden;
+}
+
+.gauge-bar-fill {
+ height: 100%;
+ border-radius: 4px;
+ transition: width 0.4s ease, background-color 0.3s;
+ min-width: 2px;
+}
+
+/* StatCard */
+.stat-card {
+ background: #f8fafc;
+ border-radius: 8px;
+ padding: 10px 12px;
+ text-align: center;
+}
+
+.stat-card-value {
+ font-size: 1.1rem;
+ font-weight: 700;
+ color: #1e293b;
+ line-height: 1.2;
+}
+
+.stat-card-label {
+ font-size: 0.7rem;
+ color: #64748b;
+ margin-top: 2px;
+}
+
+/* Query Performance */
+.query-perf-grid {
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 16px;
+}
+
+.query-perf-stats {
+ display: grid;
+ grid-template-columns: repeat(3, 1fr);
+ gap: 8px;
+ align-content: start;
+}
+
+.query-perf-chart {
+ min-height: 200px;
+}
+
+/* Redis */
+.redis-grid {
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 16px;
+}
+
+.redis-mini-stats {
+ display: grid;
+ grid-template-columns: repeat(2, 1fr);
+ gap: 8px;
+ margin-top: 12px;
+}
+
+.redis-namespaces {
+ overflow-x: auto;
+}
+
+/* Mini Table */
+.mini-table {
+ width: 100%;
+ border-collapse: collapse;
+ font-size: 0.82rem;
+}
+
+.mini-table th,
+.mini-table td {
+ padding: 6px 10px;
+ text-align: left;
+ border-bottom: 1px solid #e2e8f0;
+}
+
+.mini-table th {
+ background: #f8fafc;
+ font-weight: 600;
+ color: #475569;
+}
+
+/* Memory Cache Cards */
+.cache-cards-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
+ gap: 12px;
+}
+
+.cache-card {
+ background: #f8fafc;
+ border-radius: 8px;
+ padding: 14px;
+}
+
+.cache-card-name {
+ font-size: 0.85rem;
+ font-weight: 600;
+ margin-bottom: 2px;
+}
+
+.cache-card-desc {
+ font-size: 0.72rem;
+ color: #64748b;
+ margin-bottom: 8px;
+}
+
+.cache-card-ttl {
+ font-size: 0.72rem;
+ color: #94a3b8;
+ margin-top: 4px;
+}
+
+.route-cache-stats {
+ display: grid;
+ grid-template-columns: repeat(6, 1fr);
+ gap: 8px;
+}
+
+/* Connection Pool */
+.pool-stats-grid {
+ display: grid;
+ grid-template-columns: repeat(4, 1fr);
+ gap: 8px;
+ margin-top: 14px;
+}
+
+/* Worker */
+.worker-info {
+ display: grid;
+ grid-template-columns: repeat(3, 1fr);
+ gap: 8px;
+ margin-bottom: 14px;
+}
+
+/* Modal */
+.modal-backdrop {
+ position: fixed;
+ inset: 0;
+ background: rgba(0, 0, 0, 0.45);
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ z-index: 1000;
+}
+
+.modal-dialog {
+ background: #fff;
+ border-radius: 12px;
+ padding: 24px;
+ max-width: 400px;
+ width: 90%;
+ box-shadow: 0 20px 60px rgba(0, 0, 0, 0.15);
+}
+
+.modal-dialog h3 {
+ margin-bottom: 8px;
+}
+
+.modal-dialog p {
+ font-size: 0.9rem;
+ color: #475569;
+ margin-bottom: 16px;
+}
+
+.modal-actions {
+ display: flex;
+ gap: 8px;
+ justify-content: flex-end;
+}
+
+.modal-actions .btn {
+ background: #e2e8f0;
+ color: #334155;
+}
+
+/* Log Controls */
+.log-controls {
+ display: flex;
+ gap: 8px;
+ margin-bottom: 12px;
+ flex-wrap: wrap;
+}
+
+.log-controls select,
+.log-controls input[type='text'] {
+ padding: 6px 10px;
+ border: 1px solid #cbd5e1;
+ border-radius: 6px;
+ font-size: 0.82rem;
+ outline: none;
+}
+
+.log-controls input[type='text'] {
+ flex: 1;
+ min-width: 160px;
+}
+
+/* Log Table */
+.log-table-wrapper {
+ overflow-x: auto;
+ max-height: 400px;
+ overflow-y: auto;
+}
+
+.log-table {
+ width: 100%;
+ border-collapse: collapse;
+ font-size: 0.78rem;
+ font-family: 'SF Mono', 'Fira Code', monospace;
+}
+
+.log-table th,
+.log-table td {
+ padding: 5px 8px;
+ text-align: left;
+ border-bottom: 1px solid #f1f5f9;
+ white-space: nowrap;
+}
+
+.log-table th {
+ background: #f8fafc;
+ font-weight: 600;
+ position: sticky;
+ top: 0;
+ z-index: 1;
+}
+
+.log-msg {
+ white-space: pre-wrap;
+ word-break: break-all;
+ max-width: 600px;
+}
+
+.log-time {
+ color: #64748b;
+}
+
+.log-level {
+ font-weight: 600;
+}
+
+.log-error .log-level {
+ color: #ef4444;
+}
+
+.log-warning .log-level {
+ color: #f59e0b;
+}
+
+.log-info .log-level {
+ color: #3b82f6;
+}
+
+.log-debug .log-level {
+ color: #94a3b8;
+}
+
+/* Log Pagination */
+.log-pagination {
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ gap: 12px;
+ margin-top: 10px;
+ font-size: 0.82rem;
+}
+
+/* Responsive */
+@media (max-width: 768px) {
+ .status-cards-grid {
+ grid-template-columns: repeat(2, 1fr);
+ }
+
+ .query-perf-grid,
+ .redis-grid {
+ grid-template-columns: 1fr;
+ }
+
+ .pool-stats-grid {
+ grid-template-columns: repeat(2, 1fr);
+ }
+
+ .route-cache-stats {
+ grid-template-columns: repeat(3, 1fr);
+ }
+}
+
+/* Trend Charts */
+.trend-chart-card {
+ margin-top: 4px;
+ background: #fff;
+ border: 1px solid #e2e8f0;
+ border-radius: 8px;
+ padding: 16px;
+}
+.trend-chart-title {
+ font-size: 0.9rem;
+ font-weight: 600;
+ color: #475569;
+ margin-bottom: 8px;
+}
+.trend-chart-canvas {
+ width: 100%;
+ min-height: 200px;
+}
+.trend-chart-empty {
+ color: #94a3b8;
+ font-size: 0.85rem;
+ text-align: center;
+ padding: 32px 0;
+}
diff --git a/frontend/src/portal-shell/nativeModuleRegistry.js b/frontend/src/portal-shell/nativeModuleRegistry.js
index 9e8e3d3..8e50775 100644
--- a/frontend/src/portal-shell/nativeModuleRegistry.js
+++ b/frontend/src/portal-shell/nativeModuleRegistry.js
@@ -70,6 +70,10 @@ const NATIVE_MODULE_LOADERS = Object.freeze({
() => import('../tables/App.vue'),
[() => import('../tables/style.css')],
),
+ '/admin/performance': createNativeLoader(
+ () => import('../admin-performance/App.vue'),
+ [() => import('../admin-performance/style.css')],
+ ),
});
export function getNativeModuleLoader(route) {
diff --git a/frontend/src/portal-shell/routeContracts.js b/frontend/src/portal-shell/routeContracts.js
index 9da0493..254a38a 100644
--- a/frontend/src/portal-shell/routeContracts.js
+++ b/frontend/src/portal-shell/routeContracts.js
@@ -190,13 +190,13 @@ const ROUTE_CONTRACTS = Object.freeze({
'/admin/performance': buildContract({
route: '/admin/performance',
routeId: 'admin-performance',
- renderMode: 'external',
+ renderMode: 'native',
owner: 'frontend-platform-admin',
title: '效能監控',
- rollbackStrategy: 'external_route_reversion',
+ rollbackStrategy: 'fallback_to_legacy_route',
visibilityPolicy: 'admin_only',
scope: 'in-scope',
- compatibilityPolicy: 'external_target_redirect',
+ compatibilityPolicy: 'redirect_to_shell_when_spa_enabled',
}),
'/tables': buildContract({
route: '/tables',
diff --git a/frontend/vite.config.js b/frontend/vite.config.js
index 8d13109..35fc4f4 100644
--- a/frontend/vite.config.js
+++ b/frontend/vite.config.js
@@ -28,7 +28,8 @@ export default defineConfig(({ mode }) => ({
'query-tool': resolve(__dirname, 'src/query-tool/main.js'),
'tmtt-defect': resolve(__dirname, 'src/tmtt-defect/main.js'),
'qc-gate': resolve(__dirname, 'src/qc-gate/index.html'),
- 'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html')
+ 'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html'),
+ 'admin-performance': resolve(__dirname, 'src/admin-performance/index.html')
},
output: {
entryFileNames: '[name].js',
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml
new file mode 100644
index 0000000..cbbb578
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-02-22
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md
new file mode 100644
index 0000000..0bd2406
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/design.md
@@ -0,0 +1,91 @@
+## Context
+
+現有 `/admin/performance` 是 Jinja2 server-rendered 頁面(vanilla JS + Chart.js),是唯一未遷移至 Vue 3 SPA 的前端頁面。後端已具備豐富的監控數據(連線池 `get_pool_status()`、Redis client、LayeredCache `.telemetry()`),但前端僅展示 4 張 status cards + query performance + worker control + logs,缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度等關鍵面板。
+
+## Goals / Non-Goals
+
+**Goals:**
+- 將 admin/performance 頁面從 Jinja2 切換為 Vue 3 SPA,與所有報表頁面架構一致
+- 新增完整的系統監控面板:Redis 快取詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤
+- 提供可複用的 gauge/stat card 組件,便於未來擴展監控項目
+- 保留所有既有功能(status cards、query performance、worker control、system logs)
+
+**Non-Goals:**
+- 不新增告警/通知機制(未來可擴展)
+- 不引入 WebSocket 即時推送(維持 30 秒輪詢)
+- 不修改既有 API response format(`system-status`、`metrics`、`logs` 保持不變)
+- 不新增使用者權限控制(沿用既有 admin 認證)
+
+## Decisions
+
+### 1. Vue 3 SPA + ECharts 取代 Jinja2 + Chart.js
+
+**選擇**: 全面重建為 Vue 3 SPA,使用 ECharts 繪製圖表
+
+**理由**: 所有報表頁面已完成 Vue SPA 遷移,admin/performance 是最後一個 Jinja2 頁面。統一架構可複用 `apiGet`、`useAutoRefresh` 等共用基礎設施,減少維護成本。ECharts 已是專案標準圖表庫(query-tool、reject-history 等均使用)。
+
+**替代方案**: 保留 Jinja2 僅加 API — 但會持續累積技術債,且無法複用 Vue 生態。
+
+### 2. 單一 performance-detail API 聚合所有新增監控數據
+
+**選擇**: 新增 `GET /admin/api/performance-detail` 一個 endpoint,回傳 `redis`、`process_caches`、`route_cache`、`db_pool`、`direct_connections` 五個 section。
+
+**理由**: 減少前端並發請求數(已有 5 個 API,加 1 個共 6 個),後端可在同一 request 中順序收集各子系統狀態,避免多次 round-trip。
+
+**替代方案**: 每個監控維度獨立 endpoint — 更 RESTful 但增加前端複雜度和網路開銷。
+
+### 3. ProcessLevelCache 全域 registry 模式
+
+**選擇**: 在 `core/cache.py` 新增 `_PROCESS_CACHE_REGISTRY` dict + `register_process_cache()` 函式,各服務在模組載入時自行註冊。
+
+**理由**: 避免 admin_routes 硬編碼各快取實例的 import 路徑,新增快取時只需在該服務中加一行 `register_process_cache()` 即可自動出現在監控面板。
+
+**替代方案**: admin_routes 直接 import 各快取實例 — 耦合度高,新增快取需改兩處。
+
+### 4. Redis namespace 監控使用 SCAN 而非 KEYS
+
+**選擇**: 使用 `SCAN` 搭配 `MATCH` pattern 掃描各 namespace 的 key 數量。
+
+**理由**: `KEYS *` 在生產環境會阻塞 Redis,`SCAN` 為非阻塞迭代器,安全性更高。
+
+### 5. 直連 Oracle 使用 thread-safe atomic counter
+
+**選擇**: 在 `database.py` 使用 `threading.Lock` 保護的全域計數器,在 `get_db_connection()` 和 `read_sql_df_slow()` 建立連線後 increment。
+
+**理由**: 追蹤連線池外的直接連線使用量,幫助判斷是否需要調整池大小。計數器為 monotonic(只增不減),記錄的是自 worker 啟動以來的總數。
+
+### 6. 前端組件複用 GaugeBar / StatCard / StatusDot
+
+**選擇**: 新增 3 個小型可複用組件放在 `admin-performance/components/` 下。
+
+**理由**: Redis 記憶體、連線池飽和度、ProcessLevelCache 使用率等多處需要 gauge 視覺化;status cards 跨面板重複。組件化可統一視覺風格並減少重複 template。
+
+### 7. SQLite 持久化 metrics history store
+
+**選擇**: 新增 `core/metrics_history.py`,使用 SQLite 儲存 metrics snapshots(仿 `core/log_store.py` 的 `LogStore` 模式),搭配 daemon thread 每 30 秒採集一次。
+
+**理由**: in-memory deque 在 worker 重啟或 gunicorn prefork 下無法跨 worker 共享且不保留歷史。SQLite 提供跨 worker 讀取、重啟持久化、可配置保留天數(預設 3 天 / 50000 rows),且不需額外 infra。
+
+**替代方案**:
+- in-memory deque — 簡單但 worker 獨立、重啟即失
+- Redis TSDB — 需額外模組且增加 Redis 負擔
+- PostgreSQL — 太重,且此數據不需 ACID
+
+**Schema**: `metrics_snapshots` table 含 timestamp、worker PID、pool/redis/route_cache/latency 各欄位,`idx_metrics_ts` 索引加速時間查詢。
+
+**背景採集**: `MetricsHistoryCollector` daemon thread,間隔可透過 `METRICS_HISTORY_INTERVAL` 環境變數配置。在 `app.py` lifecycle 中 start/stop。
+
+## Risks / Trade-offs
+
+- **Redis SCAN 效能**: 大量 key 時 SCAN 可能較慢 → 設定 `COUNT 100` 限制每次迭代量,且 30 秒才掃一次,可接受
+- **ProcessLevelCache registry 依賴模組載入順序**: 服務未 import 時不會註冊 → 在 app factory 或 gunicorn post_fork 確保所有服務模組已載入
+- **直連計數器跨 worker 不共享**: gunicorn prefork 模式下每個 worker 有獨立計數 → API 回傳當前 worker PID 供辨識,可透過 `/admin/api/system-status` 的 worker info 交叉比對
+- **舊 Jinja2 模板保留但不維護**: 切換後舊模板不再更新 → 透過 `routeContracts.js` 的 `rollbackStrategy: 'fallback_to_legacy_route'` 保留回退能力
+
+## Migration Plan
+
+1. 後端先行:加 `stats()`、registry、直連計數器、新 API(不影響既有功能)
+2. 前端建構:新建 `admin-performance/` Vue SPA,Vite 註冊 entry
+3. 路由切換:`admin_routes.py` 改為 `send_from_directory`,`routeContracts.js` 改 `renderMode: 'native'`
+4. 驗證後部署:確認所有面板正確顯示後上線
+5. 回退方案:`routeContracts.js` 改回 `renderMode: 'external'`,`admin_routes.py` 改回 `render_template`
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md
new file mode 100644
index 0000000..a6620f6
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/proposal.md
@@ -0,0 +1,31 @@
+## Why
+
+現有 `/admin/performance` 是唯一仍使用 Jinja2 + vanilla JS + Chart.js 的頁面,與所有已遷移至 Vue 3 SPA 的報表頁面架構不一致。同時,隨著報表系統功能擴充(L1/L2 快取層、連線池、直連 Oracle 等),後端已具備豐富的遙測數據,但管理後台的監控面板覆蓋不足——缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤等關鍵資訊。
+
+## What Changes
+
+- 將 `/admin/performance` 從 Jinja2 server-rendered 頁面重建為 Vue 3 SPA(ECharts 取代 Chart.js)
+- 新增 `GET /admin/api/performance-detail` API,整合 Redis INFO/SCAN、ProcessLevelCache registry、連線池狀態、直連計數等完整監控數據
+- 後端 `ProcessLevelCache` 加入 `stats()` 方法與全域 registry,支援動態收集所有快取實例狀態
+- 後端 `database.py` 加入直連 Oracle 計數器,追蹤非連線池的直接連線使用量
+- 前端新增 GaugeBar / StatCard / StatusDot 可複用組件,提供 gauge 飽和度視覺化
+- portal-shell 路由從 `renderMode: 'external'` 切換為 `'native'`
+- Vite 構建新增 `admin-performance` entry point
+
+## Capabilities
+
+### New Capabilities
+- `admin-performance-spa`: Vue 3 SPA 重建管理效能儀表板,包含 status cards、query performance、Redis 快取、記憶體快取、連線池、worker 控制、系統日誌等完整面板
+- `cache-telemetry-api`: ProcessLevelCache stats() + 全域 registry + performance-detail API,提供所有記憶體快取、Redis 快取、route cache 的遙測數據
+- `connection-pool-monitoring`: 連線池飽和度追蹤 + 直連 Oracle 計數器,完整呈現資料庫連線使用狀況
+- `metrics-history-trending`: SQLite 持久化背景採集 + 時間序列趨勢圖,可回溯連線池飽和度、查詢延遲、Redis 記憶體、快取命中率等歷史數據
+
+### Modified Capabilities
+
+- 無(本次變更僅新增 capabilities,未修改既有 capability)
+## Impact
+
+- **Backend** (7 files): `core/cache.py`、`core/database.py`、`core/metrics_history.py`(NEW)、`routes/admin_routes.py`、`services/resource_cache.py`、`services/realtime_equipment_cache.py`、`services/reject_dataset_cache.py`、`app.py`
+- **Frontend** (8 new + 3 modified): 新建 `admin-performance/` 目錄(index.html、main.js、App.vue、style.css、4 個組件含 TrendChart),修改 `vite.config.js`、`package.json`、`routeContracts.js`
+- **API**: 新增 2 個 endpoint (`/admin/api/performance-detail`、`/admin/api/performance-history`),既有 5 個 endpoint 不變
+- **Rollback**: 舊 Jinja2 模板保留,可透過 `routeContracts.js` 切回 `renderMode: 'external'`
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md
new file mode 100644
index 0000000..431b7a0
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/admin-performance-spa/spec.md
@@ -0,0 +1,100 @@
+## ADDED Requirements
+
+### Requirement: Vue 3 SPA page replaces Jinja2 template
+The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route.
+
+#### Scenario: Page loads as Vue SPA
+- **WHEN** user navigates to `/admin/performance`
+- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template)
+
+#### Scenario: Portal-shell integration
+- **WHEN** the portal-shell renders `/admin/performance`
+- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe)
+
+### Requirement: Status cards display system health
+The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value.
+
+#### Scenario: All systems healthy
+- **WHEN** all backend systems report healthy status via `/admin/api/system-status`
+- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values
+
+#### Scenario: Redis disabled
+- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
+- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message
+
+### Requirement: Query performance panel with ECharts
+The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation.
+
+#### Scenario: Metrics loaded successfully
+- **WHEN** `/admin/api/metrics` returns valid performance data
+- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution
+
+#### Scenario: No metrics data
+- **WHEN** `/admin/api/metrics` returns empty or null metrics
+- **THEN** the panel SHALL display placeholder text indicating no data available
+
+### Requirement: Redis cache detail panel
+The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table.
+
+#### Scenario: Redis active with data
+- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts
+- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count
+
+#### Scenario: Redis disabled
+- **WHEN** Redis is disabled
+- **THEN** the Redis detail panel SHALL display a disabled state message without errors
+
+### Requirement: Memory cache panel
+The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads).
+
+#### Scenario: Multiple caches registered
+- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries
+- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description
+
+#### Scenario: Route cache telemetry
+- **WHEN** `/admin/api/performance-detail` returns route_cache data
+- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads
+
+### Requirement: Connection pool panel
+The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count.
+
+#### Scenario: Pool under normal load
+- **WHEN** pool saturation is below 80%
+- **THEN** the GaugeBar SHALL display in a normal color (green/blue)
+
+#### Scenario: Pool near saturation
+- **WHEN** pool saturation exceeds 80%
+- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red)
+
+### Requirement: Worker control panel
+The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal.
+
+#### Scenario: Restart worker
+- **WHEN** user clicks the restart button and confirms in the modal
+- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result
+
+#### Scenario: Restart during cooldown
+- **WHEN** worker is in cooldown period
+- **THEN** the restart button SHALL be disabled with a cooldown indicator
+
+### Requirement: System logs panel with filtering and pagination
+The dashboard SHALL display system logs with level filtering, text search, and pagination controls.
+
+#### Scenario: Filter by log level
+- **WHEN** user selects a specific log level filter
+- **THEN** only logs matching that level SHALL be displayed
+
+#### Scenario: Paginate logs
+- **WHEN** logs exceed the page size
+- **THEN** pagination controls SHALL allow navigating between pages
+
+### Requirement: Auto-refresh with toggle
+The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
+
+#### Scenario: Auto-refresh enabled
+- **WHEN** auto-refresh is enabled (default)
+- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch
+
+#### Scenario: Manual refresh
+- **WHEN** user clicks the manual refresh button
+- **THEN** all panels SHALL immediately refresh their data
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md
new file mode 100644
index 0000000..96cf778
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/cache-telemetry-api/spec.md
@@ -0,0 +1,56 @@
+## ADDED Requirements
+
+### Requirement: ProcessLevelCache stats method
+Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`.
+
+#### Scenario: Stats on active cache
+- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s)
+- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}`
+
+#### Scenario: Stats with expired entries
+- **WHEN** `stats()` is called and some entries have exceeded TTL
+- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl`
+
+#### Scenario: Thread safety
+- **WHEN** `stats()` is called concurrently with cache writes
+- **THEN** it SHALL acquire the cache lock and return consistent data without races
+
+### Requirement: ProcessLevelCache global registry
+The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`.
+
+#### Scenario: Register and retrieve all caches
+- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called
+- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches
+
+#### Scenario: Cache not registered
+- **WHEN** a service's ProcessLevelCache is not registered
+- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output
+
+### Requirement: Performance detail API endpoint
+The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`.
+
+#### Scenario: All systems available
+- **WHEN** the API is called and all subsystems are healthy
+- **THEN** it SHALL return all 5 sections with current telemetry data
+
+#### Scenario: Redis disabled
+- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
+- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally
+
+### Requirement: Redis namespace key distribution
+The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`.
+
+#### Scenario: Keys exist across namespaces
+- **WHEN** Redis contains keys across multiple namespaces
+- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count`
+
+#### Scenario: SCAN safety
+- **WHEN** scanning Redis keys
+- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis
+
+### Requirement: Route cache telemetry in performance detail
+The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`.
+
+#### Scenario: LayeredCache active
+- **WHEN** route cache is in layered mode
+- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md
new file mode 100644
index 0000000..ae71a86
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/connection-pool-monitoring/spec.md
@@ -0,0 +1,27 @@
+## ADDED Requirements
+
+### Requirement: Connection pool status in performance detail
+The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`.
+
+#### Scenario: Pool status retrieved
+- **WHEN** the API is called
+- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values
+
+#### Scenario: Saturation calculation
+- **WHEN** the pool has 8 checked_out connections and max_capacity is 30
+- **THEN** saturation SHALL be reported as approximately 26.7%
+
+### Requirement: Direct Oracle connection counter
+The system SHALL maintain a thread-safe monotonic counter in `core/database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection.
+
+#### Scenario: Counter increments on direct connection
+- **WHEN** `get_db_connection()` successfully creates a connection
+- **THEN** the direct connection counter SHALL increment by 1
+
+#### Scenario: Counter in performance detail
+- **WHEN** the performance-detail API is called
+- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID)
+
+#### Scenario: Counter is per-worker
+- **WHEN** multiple gunicorn workers are running
+- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md
new file mode 100644
index 0000000..c13633b
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/specs/metrics-history-trending/spec.md
@@ -0,0 +1,65 @@
+## ADDED Requirements
+
+### Requirement: SQLite metrics history store
+The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`.
+
+#### Scenario: Write and query snapshots
+- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics
+- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID
+
+#### Scenario: Query by time range
+- **WHEN** `query_snapshots(minutes=30)` is called
+- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending
+
+#### Scenario: Retention cleanup
+- **WHEN** `cleanup()` is called
+- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000)
+
+#### Scenario: Thread safety
+- **WHEN** multiple threads write snapshots concurrently
+- **THEN** the write lock SHALL serialize writes and prevent database corruption
+
+### Requirement: Background metrics collector
+The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var).
+
+#### Scenario: Automatic collection
+- **WHEN** the collector is started via `start_metrics_history(app)`
+- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store
+
+#### Scenario: Graceful shutdown
+- **WHEN** `stop_metrics_history()` is called
+- **THEN** the collector thread SHALL stop within one interval period
+
+#### Scenario: Subsystem unavailability
+- **WHEN** a subsystem (e.g., Redis) is unavailable during collection
+- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics
+
+### Requirement: Performance history API endpoint
+The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots.
+
+#### Scenario: Query with time range
+- **WHEN** the API is called with `?minutes=30`
+- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}`
+
+#### Scenario: Time range bounds
+- **WHEN** `minutes` is less than 1 or greater than 180
+- **THEN** it SHALL be clamped to the range [1, 180]
+
+#### Scenario: Admin authentication
+- **WHEN** the API is called without admin authentication
+- **THEN** it SHALL be rejected by the `@admin_required` decorator
+
+### Requirement: Frontend trend charts
+The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts.
+
+#### Scenario: Trend charts with data
+- **WHEN** historical snapshots contain more than 1 data point
+- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates
+
+#### Scenario: Trend charts without data
+- **WHEN** historical snapshots are empty or contain only 1 data point
+- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`)
+
+#### Scenario: Auto-refresh
+- **WHEN** the dashboard auto-refreshes
+- **THEN** historical data SHALL also be refreshed alongside real-time metrics
diff --git a/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md
new file mode 100644
index 0000000..6e7c7c0
--- /dev/null
+++ b/openspec/changes/archive/2026-02-23-admin-performance-vue-spa/tasks.md
@@ -0,0 +1,80 @@
+## 1. Backend — Cache Telemetry Infrastructure
+
+- [x] 1.1 Add `stats()` method to `ProcessLevelCache` in `core/cache.py` (returns entries/max_size/ttl_seconds with lock)
+- [x] 1.2 Add `_PROCESS_CACHE_REGISTRY`, `register_process_cache()`, and `get_all_process_cache_stats()` to `core/cache.py`
+- [x] 1.3 Register `_wip_df_cache` in `core/cache.py`
+- [x] 1.4 Add `stats()` + `register_process_cache()` to `services/resource_cache.py`
+- [x] 1.5 Add `stats()` + `register_process_cache()` to `services/realtime_equipment_cache.py`
+- [x] 1.6 Add `register_process_cache()` to `services/reject_dataset_cache.py`
+
+## 2. Backend — Direct Connection Counter
+
+- [x] 2.1 Add `_DIRECT_CONN_COUNTER`, `_DIRECT_CONN_LOCK`, and `get_direct_connection_count()` to `core/database.py`
+- [x] 2.2 Increment counter in `get_db_connection()` and `read_sql_df_slow()` after successful connection creation
+
+## 3. Backend — Performance Detail API
+
+- [x] 3.1 Add `GET /admin/api/performance-detail` endpoint in `routes/admin_routes.py` returning redis, process_caches, route_cache, db_pool, and direct_connections sections
+- [x] 3.2 Implement Redis INFO + SCAN namespace key distribution (data, route_cache, equipment_status, reject_dataset, meta, lock, scrap_exclusion) with graceful degradation when Redis is disabled
+
+## 4. Frontend — Page Scaffolding
+
+- [x] 4.1 Create `frontend/src/admin-performance/index.html` and `main.js` (standard Vue SPA entry)
+- [x] 4.2 Register `admin-performance` entry in `vite.config.js`
+- [x] 4.3 Add `cp` command for `admin-performance.html` in `package.json` build script
+
+## 5. Frontend — Reusable Components
+
+- [x] 5.1 Create `GaugeBar.vue` — horizontal gauge bar with label, value, max, and color threshold props
+- [x] 5.2 Create `StatCard.vue` — mini card with numeric value, label, and optional unit/icon
+- [x] 5.3 Create `StatusDot.vue` — colored dot indicator (healthy/degraded/error/disabled) with label
+
+## 6. Frontend — App.vue Main Dashboard
+
+- [x] 6.1 Implement data fetching layer: `loadSystemStatus()`, `loadMetrics()`, `loadPerformanceDetail()`, `loadLogs()`, `loadWorkerStatus()` with `Promise.all` parallel fetch and `useAutoRefresh` (30s)
+- [x] 6.2 Build header section with gradient background, title, auto-refresh toggle, and manual refresh button
+- [x] 6.3 Build status cards section (Database / Redis / Circuit Breaker / Worker PID) using StatusDot
+- [x] 6.4 Build query performance panel with P50/P95/P99 stat cards and ECharts latency distribution chart
+- [x] 6.5 Build Redis cache detail panel with memory GaugeBar, hit rate, client count, peak memory, and namespace key distribution table
+- [x] 6.6 Build memory cache panel with ProcessLevelCache grid cards (entries/max gauge + TTL) and route cache telemetry (L1/L2 hit rate, miss rate, total reads)
+- [x] 6.7 Build connection pool panel with saturation GaugeBar and stat card grid (checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, direct connections)
+- [x] 6.8 Build worker control panel with PID/uptime/cooldown display, restart button, and confirmation modal
+- [x] 6.9 Build system logs panel with level filter, text search, pagination, and log clearing
+- [x] 6.10 Create `style.css` with all panel, grid, gauge, card, and responsive layout styles
+
+## 7. Route Integration
+
+- [x] 7.1 Change `/admin/performance` route handler in `admin_routes.py` from `render_template` to `send_from_directory` serving the Vue SPA
+- [x] 7.2 Update `routeContracts.js`: change renderMode to `'native'`, rollbackStrategy to `'fallback_to_legacy_route'`, compatibilityPolicy to `'redirect_to_shell_when_spa_enabled'`
+
+## 8. Verification (Phase 1)
+
+- [x] 8.1 Run `cd frontend && npx vite build` — confirm no compilation errors and `admin-performance.html` is produced
+- [x] 8.2 Verify all dashboard panels render correctly with live data after service restart
+
+## 9. Backend — Metrics History Store
+
+- [x] 9.1 Create `core/metrics_history.py` with `MetricsHistoryStore` class (SQLite schema, thread-local connections, write_lock, write_snapshot, query_snapshots, cleanup)
+- [x] 9.2 Add `MetricsHistoryCollector` class (daemon thread, configurable interval, collect pool/redis/route_cache/latency)
+- [x] 9.3 Add module-level `get_metrics_history_store()`, `start_metrics_history(app)`, `stop_metrics_history()` functions
+
+## 10. Backend — Lifecycle Integration
+
+- [x] 10.1 Call `start_metrics_history(app)` in `app.py` after other background services
+- [x] 10.2 Call `stop_metrics_history()` in `_shutdown_runtime_resources()` in `app.py`
+
+## 11. Backend — Performance History API
+
+- [x] 11.1 Add `GET /admin/api/performance-history` endpoint in `admin_routes.py` (minutes param, clamped 1-180, returns snapshots array)
+
+## 12. Frontend — Trend Charts
+
+- [x] 12.1 Create `TrendChart.vue` component using vue-echarts VChart (line/area chart, dual yAxis support, time labels, autoresize)
+- [x] 12.2 Add `loadPerformanceHistory()` fetch to `App.vue` and integrate into `refreshAll()`
+- [x] 12.3 Add 4 TrendChart panels to `App.vue` template (pool saturation, query latency, Redis memory, cache hit rates)
+- [x] 12.4 Add trend chart styles to `style.css`
+
+## 13. Verification (Phase 2)
+
+- [x] 13.1 Run `cd frontend && npm run build` — confirm no compilation errors
+- [x] 13.2 Verify trend charts render with historical data after service restart + 60s collection
diff --git a/openspec/specs/admin-performance-spa/spec.md b/openspec/specs/admin-performance-spa/spec.md
new file mode 100644
index 0000000..431b7a0
--- /dev/null
+++ b/openspec/specs/admin-performance-spa/spec.md
@@ -0,0 +1,100 @@
+## ADDED Requirements
+
+### Requirement: Vue 3 SPA page replaces Jinja2 template
+The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route.
+
+#### Scenario: Page loads as Vue SPA
+- **WHEN** user navigates to `/admin/performance`
+- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template)
+
+#### Scenario: Portal-shell integration
+- **WHEN** the portal-shell renders `/admin/performance`
+- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe)
+
+### Requirement: Status cards display system health
+The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value.
+
+#### Scenario: All systems healthy
+- **WHEN** all backend systems report healthy status via `/admin/api/system-status`
+- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values
+
+#### Scenario: Redis disabled
+- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
+- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message
+
+### Requirement: Query performance panel with ECharts
+The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation.
+
+#### Scenario: Metrics loaded successfully
+- **WHEN** `/admin/api/metrics` returns valid performance data
+- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution
+
+#### Scenario: No metrics data
+- **WHEN** `/admin/api/metrics` returns empty or null metrics
+- **THEN** the panel SHALL display placeholder text indicating no data available
+
+### Requirement: Redis cache detail panel
+The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table.
+
+#### Scenario: Redis active with data
+- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts
+- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count
+
+#### Scenario: Redis disabled
+- **WHEN** Redis is disabled
+- **THEN** the Redis detail panel SHALL display a disabled state message without errors
+
+### Requirement: Memory cache panel
+The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads).
+
+#### Scenario: Multiple caches registered
+- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries
+- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description
+
+#### Scenario: Route cache telemetry
+- **WHEN** `/admin/api/performance-detail` returns route_cache data
+- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads
+
+### Requirement: Connection pool panel
+The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count.
+
+#### Scenario: Pool under normal load
+- **WHEN** pool saturation is below 80%
+- **THEN** the GaugeBar SHALL display in a normal color (green/blue)
+
+#### Scenario: Pool near saturation
+- **WHEN** pool saturation exceeds 80%
+- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red)
+
+### Requirement: Worker control panel
+The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal.
+
+#### Scenario: Restart worker
+- **WHEN** user clicks the restart button and confirms in the modal
+- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result
+
+#### Scenario: Restart during cooldown
+- **WHEN** worker is in cooldown period
+- **THEN** the restart button SHALL be disabled with a cooldown indicator
+
+### Requirement: System logs panel with filtering and pagination
+The dashboard SHALL display system logs with level filtering, text search, and pagination controls.
+
+#### Scenario: Filter by log level
+- **WHEN** user selects a specific log level filter
+- **THEN** only logs matching that level SHALL be displayed
+
+#### Scenario: Paginate logs
+- **WHEN** logs exceed the page size
+- **THEN** pagination controls SHALL allow navigating between pages
+
+### Requirement: Auto-refresh with toggle
+The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
+
+#### Scenario: Auto-refresh enabled
+- **WHEN** auto-refresh is enabled (default)
+- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch
+
+#### Scenario: Manual refresh
+- **WHEN** user clicks the manual refresh button
+- **THEN** all panels SHALL immediately refresh their data
diff --git a/openspec/specs/cache-telemetry-api/spec.md b/openspec/specs/cache-telemetry-api/spec.md
new file mode 100644
index 0000000..96cf778
--- /dev/null
+++ b/openspec/specs/cache-telemetry-api/spec.md
@@ -0,0 +1,56 @@
+## ADDED Requirements
+
+### Requirement: ProcessLevelCache stats method
+Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`.
+
+#### Scenario: Stats on active cache
+- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s)
+- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}`
+
+#### Scenario: Stats with expired entries
+- **WHEN** `stats()` is called and some entries have exceeded TTL
+- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl`
+
+#### Scenario: Thread safety
+- **WHEN** `stats()` is called concurrently with cache writes
+- **THEN** it SHALL acquire the cache lock and return consistent data without races
+
+### Requirement: ProcessLevelCache global registry
+The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`.
+
+#### Scenario: Register and retrieve all caches
+- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called
+- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches
+
+#### Scenario: Cache not registered
+- **WHEN** a service's ProcessLevelCache is not registered
+- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output
+
+### Requirement: Performance detail API endpoint
+The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`.
+
+#### Scenario: All systems available
+- **WHEN** the API is called and all subsystems are healthy
+- **THEN** it SHALL return all 5 sections with current telemetry data
+
+#### Scenario: Redis disabled
+- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
+- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally
+
+### Requirement: Redis namespace key distribution
+The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`.
+
+#### Scenario: Keys exist across namespaces
+- **WHEN** Redis contains keys across multiple namespaces
+- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count`
+
+#### Scenario: SCAN safety
+- **WHEN** scanning Redis keys
+- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis
+
+### Requirement: Route cache telemetry in performance detail
+The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`.
+
+#### Scenario: LayeredCache active
+- **WHEN** route cache is in layered mode
+- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry
diff --git a/openspec/specs/connection-pool-monitoring/spec.md b/openspec/specs/connection-pool-monitoring/spec.md
new file mode 100644
index 0000000..ae71a86
--- /dev/null
+++ b/openspec/specs/connection-pool-monitoring/spec.md
@@ -0,0 +1,27 @@
+## ADDED Requirements
+
+### Requirement: Connection pool status in performance detail
+The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`.
+
+#### Scenario: Pool status retrieved
+- **WHEN** the API is called
+- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values
+
+#### Scenario: Saturation calculation
+- **WHEN** the pool has 8 checked_out connections and max_capacity is 30
+- **THEN** saturation SHALL be reported as approximately 26.7%
+
+### Requirement: Direct Oracle connection counter
+The system SHALL maintain a thread-safe monotonic counter in `core/database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection.
+
+#### Scenario: Counter increments on direct connection
+- **WHEN** `get_db_connection()` successfully creates a connection
+- **THEN** the direct connection counter SHALL increment by 1
+
+#### Scenario: Counter in performance detail
+- **WHEN** the performance-detail API is called
+- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID)
+
+#### Scenario: Counter is per-worker
+- **WHEN** multiple gunicorn workers are running
+- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker
diff --git a/openspec/specs/metrics-history-trending/spec.md b/openspec/specs/metrics-history-trending/spec.md
new file mode 100644
index 0000000..c13633b
--- /dev/null
+++ b/openspec/specs/metrics-history-trending/spec.md
@@ -0,0 +1,65 @@
+## ADDED Requirements
+
+### Requirement: SQLite metrics history store
+The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`.
+
+#### Scenario: Write and query snapshots
+- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics
+- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID
+
+#### Scenario: Query by time range
+- **WHEN** `query_snapshots(minutes=30)` is called
+- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending
+
+#### Scenario: Retention cleanup
+- **WHEN** `cleanup()` is called
+- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000)
+
+#### Scenario: Thread safety
+- **WHEN** multiple threads write snapshots concurrently
+- **THEN** the write lock SHALL serialize writes and prevent database corruption
+
+### Requirement: Background metrics collector
+The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var).
+
+#### Scenario: Automatic collection
+- **WHEN** the collector is started via `start_metrics_history(app)`
+- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store
+
+#### Scenario: Graceful shutdown
+- **WHEN** `stop_metrics_history()` is called
+- **THEN** the collector thread SHALL stop within one interval period
+
+#### Scenario: Subsystem unavailability
+- **WHEN** a subsystem (e.g., Redis) is unavailable during collection
+- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics
+
+### Requirement: Performance history API endpoint
+The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots.
+
+#### Scenario: Query with time range
+- **WHEN** the API is called with `?minutes=30`
+- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}`
+
+#### Scenario: Time range bounds
+- **WHEN** `minutes` is less than 1 or greater than 180
+- **THEN** it SHALL be clamped to the range [1, 180]
+
+#### Scenario: Admin authentication
+- **WHEN** the API is called without admin authentication
+- **THEN** it SHALL be rejected by the `@admin_required` decorator
+
+### Requirement: Frontend trend charts
+The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts.
+
+#### Scenario: Trend charts with data
+- **WHEN** historical snapshots contain more than 1 data point
+- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates
+
+#### Scenario: Trend charts without data
+- **WHEN** historical snapshots are empty or contain only 1 data point
+- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`)
+
+#### Scenario: Auto-refresh
+- **WHEN** the dashboard auto-refreshes
+- **THEN** historical data SHALL also be refreshed alongside real-time metrics
diff --git a/src/mes_dashboard/app.py b/src/mes_dashboard/app.py
index 028a9c9..f4a5f2e 100644
--- a/src/mes_dashboard/app.py
+++ b/src/mes_dashboard/app.py
@@ -295,6 +295,12 @@ def _shutdown_runtime_resources() -> None:
except Exception as exc:
logger.warning("Error stopping scrap exclusion cache worker: %s", exc)
+ try:
+ from mes_dashboard.core.metrics_history import stop_metrics_history
+ stop_metrics_history()
+ except Exception as exc:
+ logger.warning("Error stopping metrics history: %s", exc)
+
try:
close_redis()
except Exception as exc:
@@ -390,6 +396,8 @@ def create_app(config_name: str | None = None) -> Flask:
start_cache_updater() # Start Redis cache updater
init_realtime_equipment_cache(app) # Start realtime equipment status cache
init_scrap_reason_exclusion_cache(app) # Start exclusion-policy cache sync
+ from mes_dashboard.core.metrics_history import start_metrics_history
+ start_metrics_history(app) # Start metrics history collector
_register_shutdown_hooks(app)
# Register API routes
diff --git a/src/mes_dashboard/core/cache.py b/src/mes_dashboard/core/cache.py
index 6ffc966..442f96b 100644
--- a/src/mes_dashboard/core/cache.py
+++ b/src/mes_dashboard/core/cache.py
@@ -95,6 +95,34 @@ class ProcessLevelCache:
with self._lock:
self._cache.clear()
+ def stats(self) -> dict:
+ """Return live cache statistics for telemetry."""
+ with self._lock:
+ now = time.time()
+ live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl)
+ return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
+
+
+# ============================================================
+# Process-Level Cache Registry (for admin telemetry)
+# ============================================================
+
+_PROCESS_CACHE_REGISTRY: dict[str, tuple[str, Any]] = {}
+
+
+def register_process_cache(name: str, cache_instance: Any, description: str = "") -> None:
+ """Register a ProcessLevelCache instance for admin telemetry."""
+ _PROCESS_CACHE_REGISTRY[name] = (description, cache_instance)
+
+
+def get_all_process_cache_stats() -> dict[str, dict]:
+ """Collect stats from all registered ProcessLevelCache instances."""
+ return {
+ name: {**inst.stats(), "description": desc}
+ for name, (desc, inst) in _PROCESS_CACHE_REGISTRY.items()
+ if callable(getattr(inst, "stats", None))
+ }
+
def _resolve_cache_max_size(env_name: str, default: int) -> int:
value = os.getenv(env_name)
@@ -116,6 +144,7 @@ _wip_df_cache = ProcessLevelCache(
ttl_seconds=30,
max_size=WIP_PROCESS_CACHE_MAX_SIZE,
)
+register_process_cache("wip_dataframe", _wip_df_cache, "WIP DataFrame (L1, 30s)")
_wip_parse_lock = threading.Lock()
# ============================================================
diff --git a/src/mes_dashboard/core/database.py b/src/mes_dashboard/core/database.py
index bdbf279..f27f2bf 100644
--- a/src/mes_dashboard/core/database.py
+++ b/src/mes_dashboard/core/database.py
@@ -416,6 +416,14 @@ def dispose_engine():
# Direct Connection Helpers
# ============================================================
+_DIRECT_CONN_COUNTER = 0
+_DIRECT_CONN_LOCK = threading.Lock()
+
+
+def get_direct_connection_count() -> int:
+ """Return total direct (non-pooled) connections since worker start."""
+ return _DIRECT_CONN_COUNTER
+
def get_db_connection():
"""Create a direct oracledb connection.
@@ -432,6 +440,9 @@ def get_db_connection():
retry_delay=runtime["retry_delay"],
)
conn.call_timeout = runtime["call_timeout_ms"]
+ with _DIRECT_CONN_LOCK:
+ global _DIRECT_CONN_COUNTER
+ _DIRECT_CONN_COUNTER += 1
logger.debug(
"Direct oracledb connection established (call_timeout_ms=%s)",
runtime["call_timeout_ms"],
@@ -591,6 +602,9 @@ def read_sql_df_slow(
retry_delay=runtime["retry_delay"],
)
conn.call_timeout = timeout_ms
+ with _DIRECT_CONN_LOCK:
+ global _DIRECT_CONN_COUNTER
+ _DIRECT_CONN_COUNTER += 1
logger.debug(
"Slow-query connection established (call_timeout_ms=%s)", timeout_ms
)
diff --git a/src/mes_dashboard/core/metrics_history.py b/src/mes_dashboard/core/metrics_history.py
new file mode 100644
index 0000000..bdb08e2
--- /dev/null
+++ b/src/mes_dashboard/core/metrics_history.py
@@ -0,0 +1,369 @@
+# -*- coding: utf-8 -*-
+"""SQLite-based metrics history store for admin performance dashboard.
+
+Periodically snapshots system metrics (pool, redis, cache, latency)
+into a SQLite database for historical trend visualization.
+Follows the LogStore pattern from core/log_store.py.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sqlite3
+import threading
+import time
+from contextlib import contextmanager
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any, Dict, Generator, List, Optional
+
+logger = logging.getLogger('mes_dashboard.metrics_history')
+
+# ============================================================
+# Configuration
+# ============================================================
+
+METRICS_HISTORY_PATH = os.getenv(
+ 'METRICS_HISTORY_PATH',
+ 'logs/metrics_history.sqlite',
+)
+METRICS_HISTORY_INTERVAL = int(os.getenv('METRICS_HISTORY_INTERVAL', '30'))
+METRICS_HISTORY_RETENTION_DAYS = int(os.getenv('METRICS_HISTORY_RETENTION_DAYS', '3'))
+METRICS_HISTORY_MAX_ROWS = int(os.getenv('METRICS_HISTORY_MAX_ROWS', '50000'))
+
+# ============================================================
+# Database Schema
+# ============================================================
+
+CREATE_TABLE_SQL = """
+CREATE TABLE IF NOT EXISTS metrics_snapshots (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ ts TEXT NOT NULL,
+ worker_pid INTEGER NOT NULL,
+ pool_saturation REAL,
+ pool_checked_out INTEGER,
+ pool_checked_in INTEGER,
+ pool_overflow INTEGER,
+ pool_max_capacity INTEGER,
+ redis_used_memory INTEGER,
+ redis_hit_rate REAL,
+ rc_l1_hit_rate REAL,
+ rc_l2_hit_rate REAL,
+ rc_miss_rate REAL,
+ latency_p50_ms REAL,
+ latency_p95_ms REAL,
+ latency_p99_ms REAL,
+ latency_count INTEGER
+);
+"""
+
+CREATE_INDEX_SQL = (
+ "CREATE INDEX IF NOT EXISTS idx_metrics_ts ON metrics_snapshots(ts);"
+)
+
+COLUMNS = [
+ "ts", "worker_pid",
+ "pool_saturation", "pool_checked_out", "pool_checked_in",
+ "pool_overflow", "pool_max_capacity",
+ "redis_used_memory", "redis_hit_rate",
+ "rc_l1_hit_rate", "rc_l2_hit_rate", "rc_miss_rate",
+ "latency_p50_ms", "latency_p95_ms", "latency_p99_ms", "latency_count",
+]
+
+
+# ============================================================
+# Metrics History Store
+# ============================================================
+
+class MetricsHistoryStore:
+ """SQLite-based metrics history store (follows LogStore pattern)."""
+
+ def __init__(self, db_path: str = METRICS_HISTORY_PATH):
+ self.db_path = db_path
+ self._local = threading.local()
+ self._write_lock = threading.Lock()
+ self._initialized = False
+
+ def initialize(self) -> None:
+ if self._initialized:
+ return
+ db_dir = Path(self.db_path).parent
+ db_dir.mkdir(parents=True, exist_ok=True)
+ with self._get_connection() as conn:
+ cursor = conn.cursor()
+ cursor.execute(CREATE_TABLE_SQL)
+ cursor.execute(CREATE_INDEX_SQL)
+ conn.commit()
+ self._initialized = True
+ logger.info("Metrics history store initialized at %s", self.db_path)
+
+ @contextmanager
+ def _get_connection(self) -> Generator[sqlite3.Connection, None, None]:
+ if not hasattr(self._local, 'connection') or self._local.connection is None:
+ self._local.connection = sqlite3.connect(
+ self.db_path, timeout=10.0, check_same_thread=False,
+ )
+ self._local.connection.row_factory = sqlite3.Row
+ try:
+ yield self._local.connection
+ except sqlite3.Error as exc:
+ logger.error("Metrics history DB error: %s", exc)
+ try:
+ self._local.connection.close()
+ except Exception:
+ pass
+ self._local.connection = None
+ raise
+
+ def write_snapshot(self, data: Dict[str, Any]) -> bool:
+ if not self._initialized:
+ self.initialize()
+ ts = datetime.now().isoformat()
+ pid = os.getpid()
+ pool = data.get("pool") or {}
+ redis = data.get("redis") or {}
+ rc = data.get("route_cache") or {}
+ lat = data.get("latency") or {}
+ try:
+ with self._write_lock:
+ with self._get_connection() as conn:
+ conn.execute(
+ """
+ INSERT INTO metrics_snapshots
+ (ts, worker_pid,
+ pool_saturation, pool_checked_out, pool_checked_in,
+ pool_overflow, pool_max_capacity,
+ redis_used_memory, redis_hit_rate,
+ rc_l1_hit_rate, rc_l2_hit_rate, rc_miss_rate,
+ latency_p50_ms, latency_p95_ms, latency_p99_ms, latency_count)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
+ """,
+ (
+ ts, pid,
+ pool.get("saturation"),
+ pool.get("checked_out"),
+ pool.get("checked_in"),
+ pool.get("overflow"),
+ pool.get("max_capacity"),
+ redis.get("used_memory"),
+ redis.get("hit_rate"),
+ rc.get("l1_hit_rate"),
+ rc.get("l2_hit_rate"),
+ rc.get("miss_rate"),
+ lat.get("p50_ms"),
+ lat.get("p95_ms"),
+ lat.get("p99_ms"),
+ lat.get("count"),
+ ),
+ )
+ conn.commit()
+ return True
+ except Exception as exc:
+ logger.debug("Failed to write metrics snapshot: %s", exc)
+ return False
+
+ def query_snapshots(self, minutes: int = 30) -> List[Dict[str, Any]]:
+ if not self._initialized:
+ self.initialize()
+ cutoff = (datetime.now() - timedelta(minutes=minutes)).isoformat()
+ try:
+ with self._get_connection() as conn:
+ cursor = conn.execute(
+ "SELECT * FROM metrics_snapshots WHERE ts >= ? ORDER BY ts ASC",
+ (cutoff,),
+ )
+ return [dict(row) for row in cursor.fetchall()]
+ except Exception as exc:
+ logger.error("Failed to query metrics snapshots: %s", exc)
+ return []
+
+ def cleanup(self) -> int:
+ if not self._initialized:
+ return 0
+ deleted = 0
+ try:
+ with self._write_lock:
+ with self._get_connection() as conn:
+ cutoff = (
+ datetime.now() - timedelta(days=METRICS_HISTORY_RETENTION_DAYS)
+ ).isoformat()
+ cursor = conn.execute(
+ "DELETE FROM metrics_snapshots WHERE ts < ?", (cutoff,),
+ )
+ deleted += cursor.rowcount
+ row = conn.execute(
+ "SELECT COUNT(*) FROM metrics_snapshots",
+ ).fetchone()
+ count = row[0] if row else 0
+ if count > METRICS_HISTORY_MAX_ROWS:
+ excess = count - METRICS_HISTORY_MAX_ROWS
+ cursor = conn.execute(
+ """
+ DELETE FROM metrics_snapshots WHERE id IN (
+ SELECT id FROM metrics_snapshots ORDER BY ts ASC LIMIT ?
+ )
+ """,
+ (excess,),
+ )
+ deleted += cursor.rowcount
+ conn.commit()
+ if deleted > 0:
+ logger.info("Cleaned up %d metrics history rows", deleted)
+ except Exception as exc:
+ logger.error("Failed to cleanup metrics history: %s", exc)
+ return deleted
+
+
+# ============================================================
+# Background Collector
+# ============================================================
+
+class MetricsHistoryCollector:
+ """Daemon thread that snapshots metrics at a fixed interval."""
+
+ def __init__(
+ self,
+ app: Any = None,
+ store: Optional[MetricsHistoryStore] = None,
+ interval: int = METRICS_HISTORY_INTERVAL,
+ ):
+ self._app = app
+ self._store = store or get_metrics_history_store()
+ self.interval = interval
+ self._stop_event = threading.Event()
+ self._thread: Optional[threading.Thread] = None
+ self._cleanup_counter = 0
+
+ def start(self) -> None:
+ if self._thread is not None and self._thread.is_alive():
+ return
+ self._stop_event.clear()
+ self._thread = threading.Thread(
+ target=self._run, daemon=True, name="metrics-history-collector",
+ )
+ self._thread.start()
+ logger.info(
+ "Metrics history collector started (interval=%ds)", self.interval,
+ )
+
+ def stop(self) -> None:
+ if self._thread and self._thread.is_alive():
+ self._stop_event.set()
+ self._thread.join(timeout=5)
+ logger.info("Metrics history collector stopped")
+
+ def _run(self) -> None:
+ # Collect immediately on start, then loop.
+ self._collect_snapshot()
+ while not self._stop_event.wait(self.interval):
+ self._collect_snapshot()
+ # Run cleanup every ~100 intervals (~50 min at 30s).
+ self._cleanup_counter += 1
+ if self._cleanup_counter >= 100:
+ self._cleanup_counter = 0
+ self._store.cleanup()
+
+ def _collect_snapshot(self) -> None:
+ try:
+ data: Dict[str, Any] = {}
+
+ # Pool status
+ try:
+ from mes_dashboard.core.database import get_pool_status
+ data["pool"] = get_pool_status()
+ except Exception:
+ data["pool"] = {}
+
+ # Redis
+ try:
+ from mes_dashboard.core.redis_client import (
+ get_redis_client,
+ REDIS_ENABLED,
+ )
+ if REDIS_ENABLED:
+ client = get_redis_client()
+ if client is not None:
+ info = client.info(section="memory")
+ stats_info = client.info(section="stats")
+ hits = int(stats_info.get("keyspace_hits", 0))
+ misses = int(stats_info.get("keyspace_misses", 0))
+ total = hits + misses
+ data["redis"] = {
+ "used_memory": info.get("used_memory", 0),
+ "hit_rate": round(hits / total, 4) if total > 0 else 0,
+ }
+ else:
+ data["redis"] = {}
+ else:
+ data["redis"] = {}
+ except Exception:
+ data["redis"] = {}
+
+ # Route cache
+ try:
+ if self._app:
+ with self._app.app_context():
+ from mes_dashboard.routes.health_routes import (
+ get_route_cache_status,
+ )
+ rc = get_route_cache_status()
+ else:
+ from mes_dashboard.routes.health_routes import (
+ get_route_cache_status,
+ )
+ rc = get_route_cache_status()
+ data["route_cache"] = {
+ "l1_hit_rate": rc.get("l1_hit_rate"),
+ "l2_hit_rate": rc.get("l2_hit_rate"),
+ "miss_rate": rc.get("miss_rate"),
+ }
+ except Exception:
+ data["route_cache"] = {}
+
+ # Query latency
+ try:
+ from mes_dashboard.core.metrics import get_metrics_summary
+ summary = get_metrics_summary()
+ data["latency"] = {
+ "p50_ms": summary.get("p50_ms", 0),
+ "p95_ms": summary.get("p95_ms", 0),
+ "p99_ms": summary.get("p99_ms", 0),
+ "count": summary.get("count", 0),
+ }
+ except Exception:
+ data["latency"] = {}
+
+ self._store.write_snapshot(data)
+ except Exception as exc:
+ logger.debug("Metrics snapshot collection failed: %s", exc)
+
+
+# ============================================================
+# Global Instance & Lifecycle
+# ============================================================
+
+_STORE: Optional[MetricsHistoryStore] = None
+_COLLECTOR: Optional[MetricsHistoryCollector] = None
+
+
+def get_metrics_history_store() -> MetricsHistoryStore:
+ global _STORE
+ if _STORE is None:
+ _STORE = MetricsHistoryStore()
+ _STORE.initialize()
+ return _STORE
+
+
+def start_metrics_history(app: Any = None) -> None:
+ global _COLLECTOR
+ store = get_metrics_history_store()
+ _COLLECTOR = MetricsHistoryCollector(app=app, store=store)
+ _COLLECTOR.start()
+
+
+def stop_metrics_history() -> None:
+ global _COLLECTOR
+ if _COLLECTOR is not None:
+ _COLLECTOR.stop()
+ _COLLECTOR = None
diff --git a/src/mes_dashboard/routes/admin_routes.py b/src/mes_dashboard/routes/admin_routes.py
index 95f6a7a..9dfc1b5 100644
--- a/src/mes_dashboard/routes/admin_routes.py
+++ b/src/mes_dashboard/routes/admin_routes.py
@@ -1,8 +1,8 @@
-# -*- coding: utf-8 -*-
-"""Admin routes for page management and performance monitoring."""
-
-from __future__ import annotations
-
+# -*- coding: utf-8 -*-
+"""Admin routes for page management and performance monitoring."""
+
+from __future__ import annotations
+
import json
import logging
import os
@@ -10,8 +10,8 @@ import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
-
-from flask import Blueprint, g, jsonify, render_template, request
+
+from flask import Blueprint, current_app, g, jsonify, render_template, request, send_from_directory
from mes_dashboard.core.permissions import admin_required
from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS
@@ -42,14 +42,14 @@ from mes_dashboard.services.page_registry import (
set_page_status,
update_drawer,
)
-
-admin_bp = Blueprint("admin", __name__, url_prefix="/admin")
-logger = logging.getLogger("mes_dashboard.admin")
-
-# ============================================================
-# Worker Restart Configuration
-# ============================================================
-
+
+admin_bp = Blueprint("admin", __name__, url_prefix="/admin")
+logger = logging.getLogger("mes_dashboard.admin")
+
+# ============================================================
+# Worker Restart Configuration
+# ============================================================
+
_RUNTIME_CONTRACT = load_runtime_contract()
WATCHDOG_RUNTIME_DIR = _RUNTIME_CONTRACT["watchdog_runtime_dir"]
RESTART_FLAG_PATH = _RUNTIME_CONTRACT["watchdog_restart_flag"]
@@ -57,24 +57,28 @@ RESTART_STATE_PATH = _RUNTIME_CONTRACT["watchdog_state_file"]
WATCHDOG_PID_PATH = _RUNTIME_CONTRACT["watchdog_pid_file"]
GUNICORN_BIND = _RUNTIME_CONTRACT["gunicorn_bind"]
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT["version"]
-
-# Track last restart request time (in-memory for this worker)
-_last_restart_request: float = 0.0
-
-
-# ============================================================
-# Performance Monitoring Routes
-# ============================================================
-
-@admin_bp.route("/performance")
-@admin_required
-def performance():
- """Performance monitoring dashboard."""
- return render_template("admin/performance.html")
-
-
-@admin_bp.route("/api/system-status", methods=["GET"])
-@admin_required
+
+# Track last restart request time (in-memory for this worker)
+_last_restart_request: float = 0.0
+
+
+# ============================================================
+# Performance Monitoring Routes
+# ============================================================
+
+@admin_bp.route("/performance")
+@admin_required
+def performance():
+ """Performance monitoring dashboard (Vue SPA)."""
+ dist_dir = os.path.join(current_app.static_folder or "", "dist")
+ dist_html = os.path.join(dist_dir, "admin-performance.html")
+ if os.path.exists(dist_html):
+ return send_from_directory(dist_dir, "admin-performance.html")
+ return render_template("admin/performance.html")
+
+
+@admin_bp.route("/api/system-status", methods=["GET"])
+@admin_required
def api_system_status():
"""API: Get system status for performance dashboard."""
from mes_dashboard.core.database import get_pool_runtime_config, get_pool_status
@@ -85,15 +89,15 @@ def api_system_status():
check_redis,
get_route_cache_status,
)
-
- # Database status
- db_status, db_error = check_database()
-
- # Redis status
- redis_status = 'disabled'
- if REDIS_ENABLED:
- redis_status, _ = check_redis()
-
+
+ # Database status
+ db_status, db_error = check_database()
+
+ # Redis status
+ redis_status = 'disabled'
+ if REDIS_ENABLED:
+ redis_status, _ = check_redis()
+
# Circuit breaker status
circuit_breaker = get_circuit_breaker_status()
route_cache = get_route_cache_status()
@@ -135,26 +139,26 @@ def api_system_status():
thresholds=thresholds,
)
runtime_contract = build_runtime_contract_diagnostics(strict=False)
-
- # Cache status
- from mes_dashboard.routes.health_routes import (
- get_cache_status,
- get_resource_cache_status,
- get_equipment_status_cache_status
- )
-
- return jsonify({
- "success": True,
- "data": {
- "database": {
- "status": db_status,
- "error": db_error
- },
- "redis": {
- "status": redis_status,
- "enabled": REDIS_ENABLED
- },
- "circuit_breaker": circuit_breaker,
+
+ # Cache status
+ from mes_dashboard.routes.health_routes import (
+ get_cache_status,
+ get_resource_cache_status,
+ get_equipment_status_cache_status
+ )
+
+ return jsonify({
+ "success": True,
+ "data": {
+ "database": {
+ "status": db_status,
+ "error": db_error
+ },
+ "redis": {
+ "status": redis_status,
+ "enabled": REDIS_ENABLED
+ },
+ "circuit_breaker": circuit_breaker,
"cache": {
"wip": get_cache_status(),
"resource": get_resource_cache_status(),
@@ -186,134 +190,265 @@ def api_system_status():
"worker_pid": os.getpid()
}
})
-
-
-@admin_bp.route("/api/metrics", methods=["GET"])
-@admin_required
-def api_metrics():
- """API: Get performance metrics for dashboard."""
- from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics
-
- summary = get_metrics_summary()
- metrics = get_query_metrics()
-
- return jsonify({
- "success": True,
- "data": {
- "p50_ms": summary.get("p50_ms"),
- "p95_ms": summary.get("p95_ms"),
- "p99_ms": summary.get("p99_ms"),
- "count": summary.get("count"),
- "slow_count": summary.get("slow_count"),
- "slow_rate": summary.get("slow_rate"),
- "worker_pid": summary.get("worker_pid"),
- "collected_at": summary.get("collected_at"),
- # Include latency distribution for charts
- "latencies": metrics.get_latencies()[-100:] # Last 100 for chart
- }
- })
-
-
-@admin_bp.route("/api/logs", methods=["GET"])
-@admin_required
-def api_logs():
- """API: Get recent logs from SQLite log store."""
- from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
-
- if not LOG_STORE_ENABLED:
- return jsonify({
- "success": True,
- "data": {
- "logs": [],
- "enabled": False,
- "total": 0
- }
- })
-
- # Query parameters
- level = request.args.get("level")
- q = request.args.get("q")
- limit = request.args.get("limit", 50, type=int)
- offset = request.args.get("offset", 0, type=int)
- since = request.args.get("since")
-
- log_store = get_log_store()
-
- # Get total count for pagination
- total = log_store.count_logs(level=level, q=q, since=since)
-
- # Get paginated logs
- logs = log_store.query_logs(
- level=level,
- q=q,
- limit=min(limit, 100), # Cap at 100 per page
- offset=offset,
- since=since
- )
-
- return jsonify({
- "success": True,
- "data": {
- "logs": logs,
- "count": len(logs),
- "total": total,
- "enabled": True,
- "stats": log_store.get_stats()
- }
- })
-
-
-@admin_bp.route("/api/logs/cleanup", methods=["POST"])
-@admin_required
-def api_logs_cleanup():
- """API: Manually trigger log cleanup.
-
- Supports optional parameters:
- - older_than_days: Delete logs older than N days (default: use configured retention)
- - keep_count: Keep only the most recent N logs (optional)
- """
- from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
-
- if not LOG_STORE_ENABLED:
- return jsonify({
- "success": False,
- "error": "Log store is disabled"
- }), 400
-
- log_store = get_log_store()
-
- # Get current stats before cleanup
- stats_before = log_store.get_stats()
-
- # Perform cleanup
- deleted = log_store.cleanup_old_logs()
-
- # Get stats after cleanup
- stats_after = log_store.get_stats()
-
- user = getattr(g, "username", "unknown")
- logger.info(f"Log cleanup triggered by {user}: deleted {deleted} entries")
-
- return jsonify({
- "success": True,
- "data": {
- "deleted": deleted,
- "before": {
- "count": stats_before.get("count", 0),
- "size_bytes": stats_before.get("size_bytes", 0)
- },
- "after": {
- "count": stats_after.get("count", 0),
- "size_bytes": stats_after.get("size_bytes", 0)
- }
- }
- })
-
-
-# ============================================================
-# Worker Restart Control Routes
-# ============================================================
-
+
+
+@admin_bp.route("/api/metrics", methods=["GET"])
+@admin_required
+def api_metrics():
+ """API: Get performance metrics for dashboard."""
+ from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics
+
+ summary = get_metrics_summary()
+ metrics = get_query_metrics()
+
+ return jsonify({
+ "success": True,
+ "data": {
+ "p50_ms": summary.get("p50_ms"),
+ "p95_ms": summary.get("p95_ms"),
+ "p99_ms": summary.get("p99_ms"),
+ "count": summary.get("count"),
+ "slow_count": summary.get("slow_count"),
+ "slow_rate": summary.get("slow_rate"),
+ "worker_pid": summary.get("worker_pid"),
+ "collected_at": summary.get("collected_at"),
+ # Include latency distribution for charts
+ "latencies": metrics.get_latencies()[-100:] # Last 100 for chart
+ }
+ })
+
+
+@admin_bp.route("/api/logs", methods=["GET"])
+@admin_required
+def api_logs():
+ """API: Get recent logs from SQLite log store."""
+ from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
+
+ if not LOG_STORE_ENABLED:
+ return jsonify({
+ "success": True,
+ "data": {
+ "logs": [],
+ "enabled": False,
+ "total": 0
+ }
+ })
+
+ # Query parameters
+ level = request.args.get("level")
+ q = request.args.get("q")
+ limit = request.args.get("limit", 50, type=int)
+ offset = request.args.get("offset", 0, type=int)
+ since = request.args.get("since")
+
+ log_store = get_log_store()
+
+ # Get total count for pagination
+ total = log_store.count_logs(level=level, q=q, since=since)
+
+ # Get paginated logs
+ logs = log_store.query_logs(
+ level=level,
+ q=q,
+ limit=min(limit, 100), # Cap at 100 per page
+ offset=offset,
+ since=since
+ )
+
+ return jsonify({
+ "success": True,
+ "data": {
+ "logs": logs,
+ "count": len(logs),
+ "total": total,
+ "enabled": True,
+ "stats": log_store.get_stats()
+ }
+ })
+
+
+@admin_bp.route("/api/performance-detail", methods=["GET"])
+@admin_required
+def api_performance_detail():
+ """API: Get detailed performance telemetry for admin dashboard.
+
+ Returns redis, process_caches, route_cache, db_pool, and
+ direct_connections sections in a single response.
+ """
+ from mes_dashboard.core.cache import get_all_process_cache_stats
+ from mes_dashboard.core.database import (
+ get_direct_connection_count,
+ get_pool_runtime_config,
+ get_pool_status,
+ )
+ from mes_dashboard.core.redis_client import (
+ get_redis_client,
+ REDIS_ENABLED,
+ REDIS_KEY_PREFIX,
+ )
+ from mes_dashboard.routes.health_routes import get_route_cache_status
+
+ # ---- Redis detail ----
+ redis_detail = None
+ if REDIS_ENABLED:
+ client = get_redis_client()
+ if client is not None:
+ try:
+ info = client.info(section="memory")
+ stats_info = client.info(section="stats")
+ clients_info = client.info(section="clients")
+
+ hits = int(stats_info.get("keyspace_hits", 0))
+ misses = int(stats_info.get("keyspace_misses", 0))
+ total = hits + misses
+ hit_rate = round(hits / total, 4) if total > 0 else 0
+
+ # Scan key counts per namespace
+ namespace_prefixes = [
+ "data", "route_cache", "equipment_status",
+ "reject_dataset", "meta", "lock", "scrap_exclusion",
+ ]
+ namespaces = []
+ for ns in namespace_prefixes:
+ pattern = f"{REDIS_KEY_PREFIX}:{ns}*"
+ count = 0
+ cursor = 0
+ while True:
+ cursor, keys = client.scan(cursor=cursor, match=pattern, count=100)
+ count += len(keys)
+ if cursor == 0:
+ break
+ namespaces.append({"name": ns, "key_count": count})
+
+ redis_detail = {
+ "used_memory_human": info.get("used_memory_human", "N/A"),
+ "used_memory": info.get("used_memory", 0),
+ "peak_memory_human": info.get("used_memory_peak_human", "N/A"),
+ "peak_memory": info.get("used_memory_peak", 0),
+ "maxmemory_human": info.get("maxmemory_human", "N/A"),
+ "maxmemory": info.get("maxmemory", 0),
+ "connected_clients": clients_info.get("connected_clients", 0),
+ "hit_rate": hit_rate,
+ "keyspace_hits": hits,
+ "keyspace_misses": misses,
+ "namespaces": namespaces,
+ }
+ except Exception as exc:
+ logger.warning("Failed to collect Redis detail: %s", exc)
+ redis_detail = {"error": str(exc)}
+
+ # ---- Process caches ----
+ process_caches = get_all_process_cache_stats()
+
+ # ---- Route cache ----
+ route_cache = get_route_cache_status()
+
+ # ---- DB pool ----
+ db_pool = None
+ try:
+ pool_status = get_pool_status()
+ pool_config = get_pool_runtime_config()
+ db_pool = {
+ "status": pool_status,
+ "config": {
+ "pool_size": pool_config.get("pool_size"),
+ "max_overflow": pool_config.get("max_overflow"),
+ "pool_timeout": pool_config.get("pool_timeout"),
+ "pool_recycle": pool_config.get("pool_recycle"),
+ },
+ }
+ except Exception as exc:
+ logger.warning("Failed to collect DB pool status: %s", exc)
+ db_pool = {"error": str(exc)}
+
+ # ---- Direct connections ----
+ direct_connections = {
+ "total_since_start": get_direct_connection_count(),
+ "worker_pid": os.getpid(),
+ }
+
+ return jsonify({
+ "success": True,
+ "data": {
+ "redis": redis_detail,
+ "process_caches": process_caches,
+ "route_cache": route_cache,
+ "db_pool": db_pool,
+ "direct_connections": direct_connections,
+ },
+ })
+
+
+@admin_bp.route("/api/performance-history", methods=["GET"])
+@admin_required
+def api_performance_history():
+ """API: Get historical metrics snapshots for trend charts."""
+ from mes_dashboard.core.metrics_history import get_metrics_history_store
+
+ minutes = request.args.get("minutes", 30, type=int)
+ minutes = max(1, min(minutes, 180))
+ store = get_metrics_history_store()
+ snapshots = store.query_snapshots(minutes=minutes)
+ return jsonify({
+ "success": True,
+ "data": {
+ "snapshots": snapshots,
+ "count": len(snapshots),
+ },
+ })
+
+
+@admin_bp.route("/api/logs/cleanup", methods=["POST"])
+@admin_required
+def api_logs_cleanup():
+ """API: Manually trigger log cleanup.
+
+ Reads no request parameters: entries are removed according to the
+ log store's configured retention policy (cleanup_old_logs), and the
+ response reports entry counts and sizes before and after cleanup.
+ """
+ from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
+
+ if not LOG_STORE_ENABLED:
+ return jsonify({
+ "success": False,
+ "error": "Log store is disabled"
+ }), 400
+
+ log_store = get_log_store()
+
+ # Get current stats before cleanup
+ stats_before = log_store.get_stats()
+
+ # Perform cleanup
+ deleted = log_store.cleanup_old_logs()
+
+ # Get stats after cleanup
+ stats_after = log_store.get_stats()
+
+ user = getattr(g, "username", "unknown")
+ logger.info(f"Log cleanup triggered by {user}: deleted {deleted} entries")
+
+ return jsonify({
+ "success": True,
+ "data": {
+ "deleted": deleted,
+ "before": {
+ "count": stats_before.get("count", 0),
+ "size_bytes": stats_before.get("size_bytes", 0)
+ },
+ "after": {
+ "count": stats_after.get("count", 0),
+ "size_bytes": stats_after.get("size_bytes", 0)
+ }
+ }
+ })
+
+
+# ============================================================
+# Worker Restart Control Routes
+# ============================================================
+
def _get_restart_state() -> dict:
"""Read worker restart state from file."""
return load_restart_state(RESTART_STATE_PATH)
@@ -323,14 +458,14 @@ def _iso_from_epoch(ts: float) -> str | None:
if ts <= 0:
return None
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
-
-
+
+
def _check_restart_cooldown() -> tuple[bool, float]:
"""Check if restart is in cooldown.
-
- Returns:
- Tuple of (is_in_cooldown, remaining_seconds).
- """
+
+ Returns:
+ Tuple of (is_in_cooldown, remaining_seconds).
+ """
policy = _get_restart_policy_state()
if policy.get("cooldown"):
return True, float(policy.get("cooldown_remaining_seconds") or 0.0)
@@ -401,18 +536,18 @@ def _log_restart_audit(event: str, payload: dict[str, Any]) -> None:
**payload,
}
logger.info("worker_restart_audit %s", json.dumps(entry, ensure_ascii=False))
-
-
+
+
@admin_bp.route("/api/worker/restart", methods=["POST"])
@admin_required
def api_worker_restart():
- """API: Request worker restart.
-
- Writes a restart flag file that the watchdog process monitors.
- Enforces a 60-second cooldown between restart requests.
- """
- global _last_restart_request
-
+ """API: Request worker restart.
+
+ Writes a restart flag file that the watchdog process monitors.
+ Enforces a 60-second cooldown between restart requests.
+ """
+ global _last_restart_request
+
payload = request.get_json(silent=True) or {}
manual_override = bool(payload.get("manual_override"))
override_acknowledged = bool(payload.get("override_acknowledged"))
@@ -496,10 +631,10 @@ def api_worker_restart():
f"Failed to request restart: {e}",
status_code=500
)
-
- # Update in-memory cooldown
- _last_restart_request = time.time()
-
+
+ # Update in-memory cooldown
+ _last_restart_request = time.time()
+
_log_restart_audit(
"restart_request_accepted",
{
@@ -534,10 +669,10 @@ def api_worker_restart():
},
}
})
-
-
-@admin_bp.route("/api/worker/status", methods=["GET"])
-@admin_required
+
+
+@admin_bp.route("/api/worker/status", methods=["GET"])
+@admin_required
def api_worker_status():
"""API: Get worker status and restart information."""
# Get last restart info
@@ -555,29 +690,29 @@ def api_worker_status():
cooldown_active=bool(policy_state.get("cooldown")),
)
runtime_contract = build_runtime_contract_diagnostics(strict=False)
-
- # Get worker start time (psutil is optional)
- worker_start_time = None
- try:
- import psutil
- process = psutil.Process(os.getpid())
- worker_start_time = datetime.fromtimestamp(
- process.create_time()
- ).isoformat()
- except ImportError:
- # psutil not installed, try /proc on Linux
- try:
- stat_path = f"/proc/{os.getpid()}/stat"
- with open(stat_path) as f:
- stat = f.read().split()
- # Field 22 is starttime in clock ticks since boot
- # This is a simplified fallback
- pass
- except Exception:
- pass
- except Exception:
- pass
-
+
+ # Get worker start time (psutil is optional)
+ worker_start_time = None
+ try:
+ import psutil
+ process = psutil.Process(os.getpid())
+ worker_start_time = datetime.fromtimestamp(
+ process.create_time()
+ ).isoformat()
+ except ImportError:
+ # psutil not installed, try /proc on Linux
+ try:
+ stat_path = f"/proc/{os.getpid()}/stat"
+ with open(stat_path) as f:
+ stat = f.read().split()
+ # Field 22 would be starttime in clock ticks since boot, but this
+ # fallback is currently a stub: it reads /proc and sets nothing.
+ pass
+ except Exception:
+ pass
+ except Exception:
+ pass
+
return jsonify({
"success": True,
"data": {
@@ -628,25 +763,25 @@ def api_worker_status():
"last_restart": {
"requested_by": last_restart.get("requested_by"),
"requested_at": last_restart.get("requested_at"),
- "requested_ip": last_restart.get("requested_ip"),
- "completed_at": last_restart.get("completed_at"),
- "success": last_restart.get("success")
- }
- }
- })
-
-
-# ============================================================
-# Page Management Routes
-# ============================================================
-
-@admin_bp.route("/pages")
-@admin_required
-def pages():
- """Page management interface."""
- return render_template("admin/pages.html")
-
-
+ "requested_ip": last_restart.get("requested_ip"),
+ "completed_at": last_restart.get("completed_at"),
+ "success": last_restart.get("success")
+ }
+ }
+ })
+
+
+# ============================================================
+# Page Management Routes
+# ============================================================
+
+@admin_bp.route("/pages")
+@admin_required
+def pages():
+ """Page management interface."""
+ return render_template("admin/pages.html")
+
+
@admin_bp.route("/api/pages", methods=["GET"])
@admin_required
def api_get_pages():
diff --git a/src/mes_dashboard/services/realtime_equipment_cache.py b/src/mes_dashboard/services/realtime_equipment_cache.py
index 615c802..e71ee98 100644
--- a/src/mes_dashboard/services/realtime_equipment_cache.py
+++ b/src/mes_dashboard/services/realtime_equipment_cache.py
@@ -14,6 +14,7 @@ from collections import OrderedDict
from datetime import datetime
from typing import Any
+from mes_dashboard.core.cache import register_process_cache
from mes_dashboard.core.database import read_sql_df
from mes_dashboard.core.redis_client import (
get_redis_client,
@@ -92,6 +93,13 @@ class _ProcessLevelCache:
with self._lock:
self._cache.pop(key, None)
+ def stats(self) -> dict:
+ """Return live cache statistics for telemetry."""
+ with self._lock:
+ now = time.time()
+ live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl)
+ return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
+
def _resolve_cache_max_size(env_name: str, default: int) -> int:
value = os.getenv(env_name)
@@ -113,6 +121,7 @@ _equipment_status_cache = _ProcessLevelCache(
ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS,
max_size=EQUIPMENT_PROCESS_CACHE_MAX_SIZE,
)
+register_process_cache("equipment_status", _equipment_status_cache, "Equipment Status (L1, 30s)")
_equipment_status_parse_lock = threading.Lock()
_equipment_lookup_lock = threading.Lock()
_equipment_status_lookup: dict[str, dict[str, Any]] = {}
diff --git a/src/mes_dashboard/services/reject_dataset_cache.py b/src/mes_dashboard/services/reject_dataset_cache.py
index 3373906..21a485c 100644
--- a/src/mes_dashboard/services/reject_dataset_cache.py
+++ b/src/mes_dashboard/services/reject_dataset_cache.py
@@ -20,7 +20,7 @@ from typing import Any, Dict, List, Optional
import pandas as pd
-from mes_dashboard.core.cache import ProcessLevelCache
+from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache
from mes_dashboard.core.database import read_sql_df
from mes_dashboard.core.redis_client import (
REDIS_ENABLED,
@@ -55,6 +55,7 @@ _CACHE_MAX_SIZE = 8
_REDIS_NAMESPACE = "reject_dataset"
_dataset_cache = ProcessLevelCache(ttl_seconds=_CACHE_TTL, max_size=_CACHE_MAX_SIZE)
+register_process_cache("reject_dataset", _dataset_cache, "Reject Dataset (L1, 15min)")
# ============================================================
diff --git a/src/mes_dashboard/services/resource_cache.py b/src/mes_dashboard/services/resource_cache.py
index 260611b..af4fe6b 100644
--- a/src/mes_dashboard/services/resource_cache.py
+++ b/src/mes_dashboard/services/resource_cache.py
@@ -19,6 +19,7 @@ from typing import Any
import pandas as pd
+from mes_dashboard.core.cache import register_process_cache
from mes_dashboard.core.redis_client import (
get_redis_client,
redis_available,
@@ -109,6 +110,13 @@ class _ProcessLevelCache:
with self._lock:
self._cache.pop(key, None)
+ def stats(self) -> dict:
+ """Return live cache statistics for telemetry."""
+ with self._lock:
+ now = time.time()
+ live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl)
+ return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
+
def _resolve_cache_max_size(env_name: str, default: int) -> int:
value = os.getenv(env_name)
@@ -130,6 +138,7 @@ _resource_df_cache = _ProcessLevelCache(
ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS,
max_size=RESOURCE_PROCESS_CACHE_MAX_SIZE,
)
+register_process_cache("resource", _resource_df_cache, "Resource DataFrame (L1, 30s)")
_resource_parse_lock = threading.Lock()
_resource_index_lock = threading.Lock()
_resource_index: ResourceIndex = {