feat(admin-performance): Vue 3 SPA dashboard with metrics history trending
Rebuild /admin/performance from Jinja2 to Vue 3 SPA with ECharts, adding cache telemetry infrastructure, connection pool monitoring, and SQLite-backed historical metrics collection with trend chart visualization. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite --host",
|
||||
"build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html",
|
||||
"build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html && cp ../src/mes_dashboard/static/dist/src/admin-performance/index.html ../src/mes_dashboard/static/dist/admin-performance.html",
|
||||
"test": "node --test tests/*.test.js"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
||||
613
frontend/src/admin-performance/App.vue
Normal file
613
frontend/src/admin-performance/App.vue
Normal file
@@ -0,0 +1,613 @@
|
||||
<template>
|
||||
<div class="perf-dashboard">
|
||||
<!-- Header -->
|
||||
<header class="perf-header">
|
||||
<div class="perf-header-inner">
|
||||
<h1 class="perf-title">效能監控儀表板</h1>
|
||||
<div class="perf-header-actions">
|
||||
<label class="auto-refresh-toggle">
|
||||
<input type="checkbox" v-model="autoRefreshEnabled" @change="toggleAutoRefresh" />
|
||||
自動更新 (30s)
|
||||
</label>
|
||||
<button class="btn btn-sm" @click="refreshAll" :disabled="loading">
|
||||
<template v-if="loading">更新中...</template>
|
||||
<template v-else>重新整理</template>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- Status Cards -->
|
||||
<section class="panel">
|
||||
<div class="status-cards-grid">
|
||||
<div class="status-card">
|
||||
<div class="status-card-title">Database</div>
|
||||
<StatusDot :status="dbStatus" :label="dbStatusLabel" />
|
||||
</div>
|
||||
<div class="status-card">
|
||||
<div class="status-card-title">Redis</div>
|
||||
<StatusDot :status="redisStatus" :label="redisStatusLabel" />
|
||||
</div>
|
||||
<div class="status-card">
|
||||
<div class="status-card-title">Circuit Breaker</div>
|
||||
<StatusDot :status="cbStatus" :label="cbStatusLabel" />
|
||||
</div>
|
||||
<div class="status-card">
|
||||
<div class="status-card-title">Worker PID</div>
|
||||
<StatusDot status="healthy" :label="String(systemData?.worker_pid || '-')" />
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Query Performance -->
|
||||
<section class="panel">
|
||||
<h2 class="panel-title">查詢效能</h2>
|
||||
<div class="query-perf-grid">
|
||||
<div class="query-perf-stats">
|
||||
<StatCard :value="metricsData?.p50_ms" label="P50 (ms)" />
|
||||
<StatCard :value="metricsData?.p95_ms" label="P95 (ms)" />
|
||||
<StatCard :value="metricsData?.p99_ms" label="P99 (ms)" />
|
||||
<StatCard :value="metricsData?.count" label="查詢數" />
|
||||
<StatCard :value="metricsData?.slow_count" label="慢查詢" />
|
||||
<StatCard :value="slowRateDisplay" label="慢查詢率" />
|
||||
</div>
|
||||
<div class="query-perf-chart" ref="latencyChartRef"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Query Latency Trend -->
|
||||
<TrendChart
|
||||
v-if="historyData.length > 1"
|
||||
title="查詢延遲趨勢"
|
||||
:snapshots="historyData"
|
||||
:series="latencyTrendSeries"
|
||||
yAxisLabel="ms"
|
||||
/>
|
||||
|
||||
<!-- Redis Cache Detail -->
|
||||
<section class="panel" v-if="perfDetail?.redis">
|
||||
<h2 class="panel-title">Redis 快取</h2>
|
||||
<div class="redis-grid">
|
||||
<div class="redis-stats">
|
||||
<GaugeBar
|
||||
label="記憶體使用"
|
||||
:value="redisMemoryRatio"
|
||||
:max="1"
|
||||
:displayText="redisMemoryLabel"
|
||||
/>
|
||||
<div class="redis-mini-stats">
|
||||
<StatCard :value="perfDetail.redis.used_memory_human" label="已使用" />
|
||||
<StatCard :value="perfDetail.redis.peak_memory_human" label="峰值" />
|
||||
<StatCard :value="perfDetail.redis.connected_clients" label="連線數" />
|
||||
<StatCard :value="hitRateDisplay" label="命中率" />
|
||||
</div>
|
||||
</div>
|
||||
<div class="redis-namespaces">
|
||||
<table class="mini-table">
|
||||
<thead><tr><th>Namespace</th><th>Key 數量</th></tr></thead>
|
||||
<tbody>
|
||||
<tr v-for="ns in perfDetail.redis.namespaces" :key="ns.name">
|
||||
<td>{{ ns.name }}</td>
|
||||
<td>{{ ns.key_count }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section class="panel panel-disabled" v-else-if="perfDetail && !perfDetail.redis">
|
||||
<h2 class="panel-title">Redis 快取</h2>
|
||||
<p class="muted">Redis 未啟用</p>
|
||||
</section>
|
||||
|
||||
<!-- Redis Memory Trend -->
|
||||
<TrendChart
|
||||
v-if="historyData.length > 1"
|
||||
title="Redis 記憶體趨勢"
|
||||
:snapshots="historyData"
|
||||
:series="redisTrendSeries"
|
||||
/>
|
||||
|
||||
<!-- Memory Caches -->
|
||||
<section class="panel" v-if="perfDetail">
|
||||
<h2 class="panel-title">記憶體快取</h2>
|
||||
<div class="cache-cards-grid">
|
||||
<div class="cache-card" v-for="(info, name) in perfDetail.process_caches" :key="name">
|
||||
<div class="cache-card-name">{{ name }}</div>
|
||||
<div class="cache-card-desc">{{ info.description }}</div>
|
||||
<GaugeBar
|
||||
label="使用率"
|
||||
:value="info.entries"
|
||||
:max="info.max_size"
|
||||
/>
|
||||
<div class="cache-card-ttl">TTL: {{ info.ttl_seconds }}s</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="route-cache-section" v-if="perfDetail.route_cache">
|
||||
<h3 class="sub-title">Route Cache</h3>
|
||||
<div class="route-cache-stats">
|
||||
<StatCard :value="perfDetail.route_cache.mode" label="模式" />
|
||||
<StatCard :value="perfDetail.route_cache.l1_size" label="L1 大小" />
|
||||
<StatCard :value="routeCacheL1HitRate" label="L1 命中率" />
|
||||
<StatCard :value="routeCacheL2HitRate" label="L2 命中率" />
|
||||
<StatCard :value="routeCacheMissRate" label="未命中率" />
|
||||
<StatCard :value="perfDetail.route_cache.reads_total" label="總讀取" />
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Cache Hit Rate Trend -->
|
||||
<TrendChart
|
||||
v-if="historyData.length > 1"
|
||||
title="快取命中率趨勢"
|
||||
:snapshots="historyData"
|
||||
:series="hitRateTrendSeries"
|
||||
yAxisLabel=""
|
||||
:yMax="1"
|
||||
/>
|
||||
|
||||
<!-- Connection Pool -->
|
||||
<section class="panel" v-if="perfDetail?.db_pool?.status">
|
||||
<h2 class="panel-title">連線池</h2>
|
||||
<GaugeBar
|
||||
label="飽和度"
|
||||
:value="perfDetail.db_pool.status.saturation"
|
||||
:max="1"
|
||||
/>
|
||||
<div class="pool-stats-grid">
|
||||
<StatCard :value="perfDetail.db_pool.status.checked_out" label="使用中" />
|
||||
<StatCard :value="perfDetail.db_pool.status.checked_in" label="閒置" />
|
||||
<StatCard :value="poolTotalConnections" label="總連線數" />
|
||||
<StatCard :value="perfDetail.db_pool.status.max_capacity" label="最大容量" />
|
||||
<StatCard :value="poolOverflowDisplay" label="溢出連線" />
|
||||
<StatCard :value="perfDetail.db_pool.config?.pool_size" label="池大小" />
|
||||
<StatCard :value="perfDetail.db_pool.config?.pool_recycle" label="回收週期 (s)" />
|
||||
<StatCard :value="perfDetail.db_pool.config?.pool_timeout" label="逾時 (s)" />
|
||||
<StatCard :value="perfDetail.direct_connections?.total_since_start" label="直連次數" />
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Connection Pool Trend -->
|
||||
<TrendChart
|
||||
v-if="historyData.length > 1"
|
||||
title="連線池趨勢"
|
||||
:snapshots="historyData"
|
||||
:series="poolTrendSeries"
|
||||
/>
|
||||
|
||||
<!-- Worker Control -->
|
||||
<section class="panel">
|
||||
<h2 class="panel-title">Worker 控制</h2>
|
||||
<div class="worker-info">
|
||||
<StatCard :value="workerData?.worker_pid" label="PID" />
|
||||
<StatCard :value="workerStartTimeDisplay" label="啟動時間" />
|
||||
<StatCard :value="cooldownDisplay" label="冷卻狀態" />
|
||||
</div>
|
||||
<button
|
||||
class="btn btn-danger"
|
||||
:disabled="workerCooldownActive"
|
||||
@click="showRestartModal = true"
|
||||
>
|
||||
重啟 Worker
|
||||
</button>
|
||||
|
||||
<!-- Restart Modal -->
|
||||
<div class="modal-backdrop" v-if="showRestartModal" @click.self="showRestartModal = false">
|
||||
<div class="modal-dialog">
|
||||
<h3>確認重啟 Worker</h3>
|
||||
<p>重啟將導致目前的請求暫時中斷,確定要繼續嗎?</p>
|
||||
<div class="modal-actions">
|
||||
<button class="btn" @click="showRestartModal = false">取消</button>
|
||||
<button class="btn btn-danger" @click="doRestart" :disabled="restartLoading">
|
||||
{{ restartLoading ? '重啟中...' : '確認重啟' }}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- System Logs -->
|
||||
<section class="panel">
|
||||
<h2 class="panel-title">系統日誌</h2>
|
||||
<div class="log-controls">
|
||||
<select v-model="logLevel" @change="loadLogs">
|
||||
<option value="">全部等級</option>
|
||||
<option value="ERROR">ERROR</option>
|
||||
<option value="WARNING">WARNING</option>
|
||||
<option value="INFO">INFO</option>
|
||||
<option value="DEBUG">DEBUG</option>
|
||||
</select>
|
||||
<input
|
||||
type="text"
|
||||
v-model="logSearch"
|
||||
placeholder="搜尋日誌..."
|
||||
@input="debouncedLoadLogs"
|
||||
/>
|
||||
<button class="btn btn-sm" @click="cleanupLogs" :disabled="cleanupLoading">
|
||||
{{ cleanupLoading ? '清理中...' : '清理日誌' }}
|
||||
</button>
|
||||
</div>
|
||||
<div class="log-table-wrapper">
|
||||
<table class="log-table" v-if="logsData?.logs?.length">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>時間</th>
|
||||
<th>等級</th>
|
||||
<th>訊息</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr v-for="(log, i) in logsData.logs" :key="i" :class="'log-' + (log.level || '').toLowerCase()">
|
||||
<td class="log-time">{{ log.timestamp }}</td>
|
||||
<td class="log-level">{{ log.level }}</td>
|
||||
<td class="log-msg">{{ log.message }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p v-else class="muted">無日誌</p>
|
||||
</div>
|
||||
<div class="log-pagination" v-if="logsData?.total > logLimit">
|
||||
<button class="btn btn-sm" :disabled="logOffset === 0" @click="logOffset -= logLimit; loadLogs()">上一頁</button>
|
||||
<span>{{ logOffset / logLimit + 1 }} / {{ Math.ceil(logsData.total / logLimit) }}</span>
|
||||
<button class="btn btn-sm" :disabled="logOffset + logLimit >= logsData.total" @click="logOffset += logLimit; loadLogs()">下一頁</button>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, computed, onMounted, onBeforeUnmount } from 'vue';
|
||||
import * as echarts from 'echarts/core';
|
||||
import { BarChart } from 'echarts/charts';
|
||||
import { GridComponent, TooltipComponent } from 'echarts/components';
|
||||
import { CanvasRenderer } from 'echarts/renderers';
|
||||
|
||||
import { apiGet, apiPost } from '../core/api.js';
|
||||
import { useAutoRefresh } from '../shared-composables/useAutoRefresh.js';
|
||||
|
||||
import GaugeBar from './components/GaugeBar.vue';
|
||||
import StatCard from './components/StatCard.vue';
|
||||
import StatusDot from './components/StatusDot.vue';
|
||||
import TrendChart from './components/TrendChart.vue';
|
||||
|
||||
echarts.use([BarChart, GridComponent, TooltipComponent, CanvasRenderer]);
|
||||
|
||||
// --- State ---

// In-flight flag for the global refresh action (disables the refresh button).
const loading = ref(false);
// Whether the 30 s auto-refresh timer should be running.
const autoRefreshEnabled = ref(true);

// Raw API payloads (null / empty until first load).
const systemData = ref(null);
const metricsData = ref(null);
const perfDetail = ref(null);
const historyData = ref([]);
const logsData = ref(null);
const workerData = ref(null);

// Log filter + pagination state.
const logLevel = ref('');
const logSearch = ref('');
const logOffset = ref(0);
const logLimit = 50;

// Modal visibility and per-action in-flight flags.
const showRestartModal = ref(false);
const restartLoading = ref(false);
const cleanupLoading = ref(false);

// Latency histogram: plain DOM ref driven imperatively via echarts.init.
const latencyChartRef = ref(null);
let chartInstance = null;
|
||||
|
||||
// --- Computed ---

// Database health → StatusDot state ('healthy' | 'error' | 'disabled').
const dbStatus = computed(() => {
  const s = systemData.value?.database?.status;
  if (s === 'healthy' || s === 'ok') return 'healthy';
  return s === 'error' ? 'error' : 'disabled';
});
const dbStatusLabel = computed(() => systemData.value?.database?.status || '-');

// Redis health; anything unknown while enabled is treated as degraded.
const redisStatus = computed(() => {
  const r = systemData.value?.redis;
  if (!r?.enabled) return 'disabled';
  switch (r.status) {
    case 'healthy':
    case 'ok':
      return 'healthy';
    case 'error':
      return 'error';
    default:
      return 'degraded';
  }
});
const redisStatusLabel = computed(() => {
  const r = systemData.value?.redis;
  return r?.enabled ? (r.status || '-') : '未啟用';
});

// Circuit breaker state → dot color; CLOSED is the healthy state.
const CB_STATE_TO_DOT = { CLOSED: 'healthy', OPEN: 'error', HALF_OPEN: 'degraded' };
const cbStatus = computed(
  () => CB_STATE_TO_DOT[systemData.value?.circuit_breaker?.state] || 'disabled'
);
const cbStatusLabel = computed(() => systemData.value?.circuit_breaker?.state || '-');
|
||||
|
||||
// Format a 0–1 ratio as "12.3%", or '-' when the value is missing.
// Shared by every rate display below so the formatting lives in one place
// (previously duplicated five times).
function formatPercent(ratio) {
  return ratio != null ? `${(ratio * 100).toFixed(1)}%` : '-';
}

// Share of queries exceeding the slow-query threshold.
const slowRateDisplay = computed(() => formatPercent(metricsData.value?.slow_rate));

// Redis memory fill ratio for the gauge: used/maxmemory when a limit is
// configured, otherwise used/peak as a best-effort proxy.
const redisMemoryRatio = computed(() => {
  const r = perfDetail.value?.redis;
  if (!r) return 0;
  const used = r.used_memory || 0;
  const max = r.maxmemory || 0;
  if (max > 0) return used / max;
  const peak = r.peak_memory || used;
  return peak > 0 ? used / peak : 0;
});

// Human-readable "used / limit" text shown on the memory gauge.
const redisMemoryLabel = computed(() => {
  const r = perfDetail.value?.redis;
  if (!r) return '';
  const used = r.used_memory_human || 'N/A';
  const max = r.maxmemory && r.maxmemory > 0 ? r.maxmemory_human : r.peak_memory_human;
  return `${used} / ${max || 'N/A'}`;
});

// Cache hit-rate displays, all rendered via formatPercent.
const hitRateDisplay = computed(() => formatPercent(perfDetail.value?.redis?.hit_rate));
const routeCacheL1HitRate = computed(() => formatPercent(perfDetail.value?.route_cache?.l1_hit_rate));
const routeCacheL2HitRate = computed(() => formatPercent(perfDetail.value?.route_cache?.l2_hit_rate));
const routeCacheMissRate = computed(() => formatPercent(perfDetail.value?.route_cache?.miss_rate));
|
||||
|
||||
// Overflow connections clamped to zero (the backend can report negative
// overflow while the pool sits below its base size — presumed; verify).
const poolOverflowDisplay = computed(() => {
  const overflow = perfDetail.value?.db_pool?.status?.overflow;
  return overflow == null ? '-' : Math.max(0, overflow);
});

// Active + idle connections currently held by the pool.
const poolTotalConnections = computed(() => {
  const status = perfDetail.value?.db_pool?.status;
  return status ? (status.checked_out || 0) + (status.checked_in || 0) : '-';
});

// Worker start time rendered in the zh-TW locale; falls back to the raw
// value if it cannot be parsed as a Date.
const workerStartTimeDisplay = computed(() => {
  const raw = workerData.value?.worker_start_time;
  if (!raw) return '-';
  try {
    return new Date(raw).toLocaleString('zh-TW');
  } catch {
    return raw;
  }
});

// Restart cooldown: while active, the restart button stays disabled.
const workerCooldownActive = computed(() => workerData.value?.cooldown?.active || false);
const cooldownDisplay = computed(() => {
  if (!workerCooldownActive.value) return '就緒';
  const remaining = workerData.value?.cooldown?.remaining_seconds || 0;
  return `冷卻中 (${remaining}s)`;
});
|
||||
|
||||
// --- Data Fetching ---

// Shared loader: GET an admin API endpoint and hand its `data` payload to
// `apply`. Failures are logged and swallowed deliberately — one failing
// panel must not break the rest of the dashboard refresh. `apply` runs only
// on success, so stale data stays visible when a request fails (matching
// the previous per-function behavior).
async function loadInto(url, options, label, apply) {
  try {
    const res = await apiGet(url, options);
    apply(res?.data || null);
  } catch (e) {
    console.error(`Failed to load ${label}:`, e);
  }
}

function loadSystemStatus() {
  return loadInto('/admin/api/system-status', undefined, 'system status', (d) => {
    systemData.value = d;
  });
}

function loadMetrics() {
  return loadInto('/admin/api/metrics', undefined, 'metrics', (d) => {
    metricsData.value = d;
    // Redraw the latency histogram only after a successful fetch.
    updateLatencyChart();
  });
}

function loadPerformanceDetail() {
  return loadInto('/admin/api/performance-detail', undefined, 'performance detail', (d) => {
    perfDetail.value = d;
  });
}

function loadLogs() {
  const params = { limit: logLimit, offset: logOffset.value };
  if (logLevel.value) params.level = logLevel.value;
  if (logSearch.value) params.q = logSearch.value;
  return loadInto('/admin/api/logs', { params }, 'logs', (d) => {
    logsData.value = d;
  });
}

function loadWorkerStatus() {
  return loadInto('/admin/api/worker/status', undefined, 'worker status', (d) => {
    workerData.value = d;
  });
}

function loadPerformanceHistory() {
  return loadInto('/admin/api/performance-history', { params: { minutes: 30 } }, 'performance history', (d) => {
    historyData.value = d?.snapshots || [];
  });
}
|
||||
|
||||
// --- Trend Chart Series Configs ---
// Each entry maps one TrendChart line onto a key of a history-snapshot row.

// Connection pool: saturation ratio + checked-out count.
const poolTrendSeries = [
  { key: 'pool_saturation', name: '飽和度', color: '#6366f1' },
  { key: 'pool_checked_out', name: '使用中', color: '#f59e0b' },
];

// Query latency percentiles (ms).
const latencyTrendSeries = [
  { key: 'latency_p50_ms', name: 'P50', color: '#22c55e' },
  { key: 'latency_p95_ms', name: 'P95', color: '#f59e0b' },
  { key: 'latency_p99_ms', name: 'P99', color: '#ef4444' },
];

// Redis memory footprint over time.
const redisTrendSeries = [
  { key: 'redis_used_memory', name: '記憶體 (bytes)', color: '#06b6d4' },
];

// Cache hit rates: Redis plus route-cache L1/L2.
const hitRateTrendSeries = [
  { key: 'redis_hit_rate', name: 'Redis 命中率', color: '#22c55e' },
  { key: 'rc_l1_hit_rate', name: 'L1 命中率', color: '#2563eb' },
  { key: 'rc_l2_hit_rate', name: 'L2 命中率', color: '#f59e0b' },
];
|
||||
|
||||
// Reload every dashboard panel in parallel; the loading flag gates the
// refresh button for the whole batch.
async function refreshAll() {
  loading.value = true;
  const loaders = [
    loadSystemStatus,
    loadMetrics,
    loadPerformanceDetail,
    loadPerformanceHistory,
    loadLogs,
    loadWorkerStatus,
  ];
  try {
    // Each loader swallows its own errors, so Promise.all cannot reject here.
    await Promise.all(loaders.map((fn) => fn()));
  } finally {
    loading.value = false;
  }
}
|
||||
|
||||
// --- Auto Refresh ---
// autoStart is false on purpose: onMounted arms the timer after the first
// load completes, based on the checkbox state.
const { startAutoRefresh, stopAutoRefresh } = useAutoRefresh({
  onRefresh: refreshAll,
  intervalMs: 30_000,
  autoStart: false,
});

// Checkbox change handler: keep the timer in sync with the toggle.
function toggleAutoRefresh() {
  const action = autoRefreshEnabled.value ? startAutoRefresh : stopAutoRefresh;
  action();
}
|
||||
|
||||
// --- Worker Restart ---
// POST the restart request; on success close the confirm modal and re-read
// worker status (which then reports the restart cooldown).
async function doRestart() {
  restartLoading.value = true;
  try {
    await apiPost('/admin/api/worker/restart', {});
    showRestartModal.value = false;
    await loadWorkerStatus();
  } catch (err) {
    alert(err.message || '重啟失敗');
  } finally {
    restartLoading.value = false;
  }
}
|
||||
|
||||
// --- Log Cleanup ---
// Ask the backend to prune old log rows, then reload the current page.
async function cleanupLogs() {
  cleanupLoading.value = true;
  try {
    await apiPost('/admin/api/logs/cleanup', {});
    await loadLogs();
  } catch (err) {
    console.error('Failed to cleanup logs:', err);
  } finally {
    cleanupLoading.value = false;
  }
}
|
||||
|
||||
// --- Debounce ---
let debounceTimer = null;

// Coalesce rapid search keystrokes: reset to page 1 and reload logs once
// the input has been quiet for 300 ms.
function debouncedLoadLogs() {
  if (debounceTimer) clearTimeout(debounceTimer);
  debounceTimer = setTimeout(() => {
    logOffset.value = 0;
    loadLogs();
  }, 300);
}
|
||||
|
||||
// --- ECharts ---
// Render the latency histogram into the imperative ECharts instance bound
// to latencyChartRef. Called after every successful metrics load.
function updateLatencyChart() {
  const el = latencyChartRef.value;
  if (!el) return;

  if (!chartInstance) chartInstance = echarts.init(el);

  const latencies = metricsData.value?.latencies || [];
  if (latencies.length === 0) {
    chartInstance.clear();
    return;
  }

  // Fixed histogram buckets; the Infinity bucket catches everything else.
  const buckets = [
    { label: '<100ms', max: 100 },
    { label: '100-500ms', max: 500 },
    { label: '500ms-1s', max: 1000 },
    { label: '1-5s', max: 5000 },
    { label: '>5s', max: Infinity },
  ];
  const counts = buckets.map(() => 0);
  for (const value of latencies) {
    const ms = value * 1000; // ×1000 scaling: values presumably arrive in seconds
    let idx = buckets.findIndex((b) => ms < b.max);
    if (idx < 0) idx = buckets.length - 1; // non-comparable values (NaN) → last bucket
    counts[idx] += 1;
  }

  chartInstance.setOption({
    tooltip: { trigger: 'axis' },
    grid: { left: 40, right: 20, top: 20, bottom: 30 },
    xAxis: { type: 'category', data: buckets.map((b) => b.label) },
    yAxis: { type: 'value' },
    series: [
      {
        type: 'bar',
        data: counts,
        itemStyle: { color: '#6366f1' },
        barMaxWidth: 40,
      },
    ],
  });
}
|
||||
|
||||
// --- Lifecycle ---
onMounted(async () => {
  // Initial load, then arm the 30 s timer if the toggle is on.
  await refreshAll();
  if (autoRefreshEnabled.value) startAutoRefresh();
});

onBeforeUnmount(() => {
  // Tear down the timer, the imperative chart, and any pending debounce.
  stopAutoRefresh();
  chartInstance?.dispose();
  chartInstance = null;
  clearTimeout(debounceTimer);
});
|
||||
</script>
|
||||
49
frontend/src/admin-performance/components/GaugeBar.vue
Normal file
49
frontend/src/admin-performance/components/GaugeBar.vue
Normal file
@@ -0,0 +1,49 @@
|
||||
<template>
|
||||
<div class="gauge-bar">
|
||||
<div class="gauge-bar-header">
|
||||
<span class="gauge-bar-label">{{ label }}</span>
|
||||
<span class="gauge-bar-value">{{ displayValue }}</span>
|
||||
</div>
|
||||
<div class="gauge-bar-track">
|
||||
<div class="gauge-bar-fill" :style="fillStyle"></div>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
import { computed } from 'vue';

// Horizontal gauge: renders value/max as a colored fill whose color shifts
// green → amber → red as the ratio crosses the thresholds.
const props = defineProps({
  label: { type: String, default: '' },
  value: { type: Number, default: 0 },
  max: { type: Number, default: 100 },
  unit: { type: String, default: '%' },
  displayText: { type: String, default: '' },
  warningThreshold: { type: Number, default: 0.7 },
  dangerThreshold: { type: Number, default: 0.9 },
});

// Fill ratio clamped to [0, 1]; a non-positive max renders an empty bar.
const ratio = computed(() => {
  if (props.max <= 0) return 0;
  const raw = props.value / props.max;
  return raw < 0 ? 0 : raw > 1 ? 1 : raw;
});

// Text next to the label: explicit displayText wins, then a percentage for
// the default '%' unit, then "value unit".
const displayValue = computed(() => {
  if (props.displayText) return props.displayText;
  return props.unit === '%'
    ? `${(ratio.value * 100).toFixed(1)}%`
    : `${props.value}${props.unit ? ' ' + props.unit : ''}`;
});

// Threshold-driven fill color (danger takes precedence over warning).
const fillColor = computed(() => {
  if (ratio.value >= props.dangerThreshold) return '#ef4444';
  return ratio.value >= props.warningThreshold ? '#f59e0b' : '#22c55e';
});

const fillStyle = computed(() => ({
  width: `${ratio.value * 100}%`,
  backgroundColor: fillColor.value,
}));
</script>
|
||||
24
frontend/src/admin-performance/components/StatCard.vue
Normal file
24
frontend/src/admin-performance/components/StatCard.vue
Normal file
@@ -0,0 +1,24 @@
|
||||
<template>
|
||||
<div class="stat-card">
|
||||
<div class="stat-card-value">{{ formattedValue }}</div>
|
||||
<div class="stat-card-label">{{ label }}</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
import { computed } from 'vue';

// Small value-over-label card. Integral numbers are shown as-is, other
// numbers with two decimals; null/undefined renders as '-'.
const props = defineProps({
  value: { type: [Number, String], default: '-' },
  label: { type: String, default: '' },
  unit: { type: String, default: '' },
});

const formattedValue = computed(() => {
  const { value, unit } = props;
  if (value == null) return '-';
  let text = value;
  if (typeof value === 'number' && !Number.isInteger(value)) {
    text = value.toFixed(2);
  }
  return unit ? `${text} ${unit}` : String(text);
});
</script>
|
||||
17
frontend/src/admin-performance/components/StatusDot.vue
Normal file
17
frontend/src/admin-performance/components/StatusDot.vue
Normal file
@@ -0,0 +1,17 @@
|
||||
<template>
|
||||
<div class="status-dot-wrapper">
|
||||
<span class="status-dot" :class="'status-dot--' + status"></span>
|
||||
<span class="status-dot-label">{{ label }}</span>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
// Colored status-indicator dot with a text label. The status prop maps
// directly onto a `status-dot--<status>` CSS modifier class in the template.
defineProps({
  status: {
    type: String,
    default: 'disabled',
    // Only these four states have matching CSS modifiers.
    validator: (v) => ['healthy', 'degraded', 'error', 'disabled'].includes(v),
  },
  label: { type: String, default: '' },
});
</script>
|
||||
98
frontend/src/admin-performance/components/TrendChart.vue
Normal file
98
frontend/src/admin-performance/components/TrendChart.vue
Normal file
@@ -0,0 +1,98 @@
|
||||
<script setup>
|
||||
import { computed } from 'vue';
|
||||
|
||||
import { LineChart } from 'echarts/charts';
|
||||
import {
|
||||
GridComponent,
|
||||
LegendComponent,
|
||||
TooltipComponent,
|
||||
} from 'echarts/components';
|
||||
import { use } from 'echarts/core';
|
||||
import { CanvasRenderer } from 'echarts/renderers';
|
||||
import VChart from 'vue-echarts';
|
||||
|
||||
use([CanvasRenderer, LineChart, GridComponent, TooltipComponent, LegendComponent]);
|
||||
|
||||
// Chart inputs: `snapshots` is an array of history rows (each carrying a
// `ts` timestamp plus metric keys); `series` describes which row keys to
// plot and how (name/key/color, optional yAxisIndex).
const props = defineProps({
  title: { type: String, default: '' },
  snapshots: { type: Array, default: () => [] },
  series: { type: Array, default: () => [] },
  height: { type: String, default: '220px' },
  yAxisLabel: { type: String, default: '' },
  yMax: { type: Number, default: undefined },
});

// A trend line needs at least two points to be worth drawing.
const hasData = computed(() => props.snapshots.length > 1);
|
||||
|
||||
// Read one metric out of a snapshot row, normalising a missing value to
// null so ECharts renders a gap instead of a zero.
function extractValue(row, key) {
  const v = row[key];
  return v == null ? null : v;
}

// Format a timestamp as local HH:MM:SS; empty string for falsy input.
function formatTime(ts) {
  if (!ts) return '';
  const d = new Date(ts);
  return [d.getHours(), d.getMinutes(), d.getSeconds()]
    .map((part) => String(part).padStart(2, '0'))
    .join(':');
}
|
||||
|
||||
// Full ECharts option object, rebuilt whenever snapshots or series change.
const chartOption = computed(() => {
  const rows = props.snapshots || [];
  const defs = props.series || [];

  // Y axis: always starts at 0; optional hard max and unit-suffix labels.
  const yAxis = { type: 'value', min: 0 };
  if (props.yMax != null) yAxis.max = props.yMax;
  if (props.yAxisLabel) {
    yAxis.axisLabel = { formatter: `{value}${props.yAxisLabel}` };
  }

  return {
    tooltip: {
      trigger: 'axis',
      axisPointer: { type: 'cross' },
    },
    legend: {
      data: defs.map((s) => s.name),
      bottom: 0,
    },
    grid: {
      left: 50,
      right: 20,
      top: 16,
      bottom: 40,
    },
    xAxis: {
      type: 'category',
      data: rows.map((row) => formatTime(row.ts)),
      axisLabel: { fontSize: 10 },
    },
    yAxis,
    series: defs.map((s) => ({
      name: s.name,
      type: 'line',
      smooth: true,
      symbol: 'none',
      areaStyle: { opacity: 0.12 },
      lineStyle: { width: 2 },
      itemStyle: { color: s.color },
      yAxisIndex: s.yAxisIndex || 0,
      data: rows.map((row) => extractValue(row, s.key)),
    })),
  };
});
|
||||
</script>
|
||||
|
||||
<template>
|
||||
<div class="trend-chart-card">
|
||||
<h4 v-if="title" class="trend-chart-title">{{ title }}</h4>
|
||||
<div v-if="hasData" class="trend-chart-canvas" :style="{ height }">
|
||||
<VChart :option="chartOption" autoresize />
|
||||
</div>
|
||||
<div v-else class="trend-chart-empty">趨勢資料不足(需至少 2 筆快照)</div>
|
||||
</div>
|
||||
</template>
|
||||
12
frontend/src/admin-performance/index.html
Normal file
12
frontend/src/admin-performance/index.html
Normal file
@@ -0,0 +1,12 @@
|
||||
<!doctype html>
|
||||
<html lang="zh-Hant">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Performance Monitor</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="app"></div>
|
||||
<script type="module" src="./main.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
6
frontend/src/admin-performance/main.js
Normal file
6
frontend/src/admin-performance/main.js
Normal file
@@ -0,0 +1,6 @@
|
||||
import { createApp } from 'vue';
|
||||
|
||||
import App from './App.vue';
|
||||
import './style.css';
|
||||
|
||||
createApp(App).mount('#app');
|
||||
544
frontend/src/admin-performance/style.css
Normal file
544
frontend/src/admin-performance/style.css
Normal file
@@ -0,0 +1,544 @@
|
||||
/* Admin Performance Dashboard */
|
||||
*,
|
||||
*::before,
|
||||
*::after {
|
||||
box-sizing: border-box;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
background: #f1f5f9;
|
||||
color: #1e293b;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.perf-dashboard {
|
||||
max-width: 1280px;
|
||||
margin: 0 auto;
|
||||
padding: 0 16px 32px;
|
||||
}
|
||||
|
||||
/* Header */
|
||||
.perf-header {
|
||||
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
|
||||
color: #fff;
|
||||
padding: 20px 24px;
|
||||
border-radius: 0 0 12px 12px;
|
||||
margin: 0 -16px 20px;
|
||||
}
|
||||
|
||||
.perf-header-inner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
flex-wrap: wrap;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.perf-title {
|
||||
font-size: 1.4rem;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.perf-header-actions {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.auto-refresh-toggle {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
font-size: 0.85rem;
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
}
|
||||
|
||||
.auto-refresh-toggle input[type='checkbox'] {
|
||||
accent-color: #fff;
|
||||
}
|
||||
|
||||
/* Buttons */
|
||||
.btn {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 8px 16px;
|
||||
border: none;
|
||||
border-radius: 6px;
|
||||
font-size: 0.85rem;
|
||||
font-weight: 500;
|
||||
cursor: pointer;
|
||||
background: rgba(255, 255, 255, 0.2);
|
||||
color: #fff;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
|
||||
.btn:hover:not(:disabled) {
|
||||
background: rgba(255, 255, 255, 0.3);
|
||||
}
|
||||
|
||||
.btn:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.btn-sm {
|
||||
padding: 5px 10px;
|
||||
font-size: 0.8rem;
|
||||
background: #e2e8f0;
|
||||
color: #334155;
|
||||
}
|
||||
|
||||
.btn-sm:hover:not(:disabled) {
|
||||
background: #cbd5e1;
|
||||
}
|
||||
|
||||
.btn-danger {
|
||||
background: #ef4444;
|
||||
color: #fff;
|
||||
}
|
||||
|
||||
.btn-danger:hover:not(:disabled) {
|
||||
background: #dc2626;
|
||||
}
|
||||
|
||||
/* Panel */
|
||||
.panel {
|
||||
background: #fff;
|
||||
border-radius: 10px;
|
||||
padding: 20px;
|
||||
margin-bottom: 16px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06);
|
||||
}
|
||||
|
||||
.panel-disabled {
|
||||
opacity: 0.6;
|
||||
}
|
||||
|
||||
.panel-title {
|
||||
font-size: 1rem;
|
||||
font-weight: 600;
|
||||
margin-bottom: 14px;
|
||||
color: #334155;
|
||||
}
|
||||
|
||||
.sub-title {
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
margin: 16px 0 10px;
|
||||
color: #475569;
|
||||
}
|
||||
|
||||
.muted {
|
||||
color: #94a3b8;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
/* Status Cards */
|
||||
.status-cards-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.status-card {
|
||||
background: #f8fafc;
|
||||
border-radius: 8px;
|
||||
padding: 14px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.status-card-title {
|
||||
font-size: 0.75rem;
|
||||
color: #64748b;
|
||||
margin-bottom: 8px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.05em;
|
||||
}
|
||||
|
||||
/* StatusDot */
|
||||
.status-dot-wrapper {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 6px;
|
||||
}
|
||||
|
||||
.status-dot {
|
||||
width: 10px;
|
||||
height: 10px;
|
||||
border-radius: 50%;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.status-dot--healthy {
|
||||
background: #22c55e;
|
||||
box-shadow: 0 0 6px rgba(34, 197, 94, 0.4);
|
||||
}
|
||||
|
||||
.status-dot--degraded {
|
||||
background: #f59e0b;
|
||||
box-shadow: 0 0 6px rgba(245, 158, 11, 0.4);
|
||||
}
|
||||
|
||||
.status-dot--error {
|
||||
background: #ef4444;
|
||||
box-shadow: 0 0 6px rgba(239, 68, 68, 0.4);
|
||||
}
|
||||
|
||||
.status-dot--disabled {
|
||||
background: #94a3b8;
|
||||
}
|
||||
|
||||
.status-dot-label {
|
||||
font-size: 0.85rem;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
/* GaugeBar */
|
||||
.gauge-bar {
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
.gauge-bar-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
.gauge-bar-label {
|
||||
font-size: 0.8rem;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.gauge-bar-value {
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.gauge-bar-track {
|
||||
height: 8px;
|
||||
background: #e2e8f0;
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.gauge-bar-fill {
|
||||
height: 100%;
|
||||
border-radius: 4px;
|
||||
transition: width 0.4s ease, background-color 0.3s;
|
||||
min-width: 2px;
|
||||
}
|
||||
|
||||
/* StatCard */
|
||||
.stat-card {
|
||||
background: #f8fafc;
|
||||
border-radius: 8px;
|
||||
padding: 10px 12px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.stat-card-value {
|
||||
font-size: 1.1rem;
|
||||
font-weight: 700;
|
||||
color: #1e293b;
|
||||
line-height: 1.2;
|
||||
}
|
||||
|
||||
.stat-card-label {
|
||||
font-size: 0.7rem;
|
||||
color: #64748b;
|
||||
margin-top: 2px;
|
||||
}
|
||||
|
||||
/* Query Performance */
|
||||
.query-perf-grid {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 16px;
|
||||
}
|
||||
|
||||
.query-perf-stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 8px;
|
||||
align-content: start;
|
||||
}
|
||||
|
||||
.query-perf-chart {
|
||||
min-height: 200px;
|
||||
}
|
||||
|
||||
/* Redis */
|
||||
.redis-grid {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 16px;
|
||||
}
|
||||
|
||||
.redis-mini-stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
gap: 8px;
|
||||
margin-top: 12px;
|
||||
}
|
||||
|
||||
.redis-namespaces {
|
||||
overflow-x: auto;
|
||||
}
|
||||
|
||||
/* Mini Table */
|
||||
.mini-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
|
||||
.mini-table th,
|
||||
.mini-table td {
|
||||
padding: 6px 10px;
|
||||
text-align: left;
|
||||
border-bottom: 1px solid #e2e8f0;
|
||||
}
|
||||
|
||||
.mini-table th {
|
||||
background: #f8fafc;
|
||||
font-weight: 600;
|
||||
color: #475569;
|
||||
}
|
||||
|
||||
/* Memory Cache Cards */
|
||||
.cache-cards-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.cache-card {
|
||||
background: #f8fafc;
|
||||
border-radius: 8px;
|
||||
padding: 14px;
|
||||
}
|
||||
|
||||
.cache-card-name {
|
||||
font-size: 0.85rem;
|
||||
font-weight: 600;
|
||||
margin-bottom: 2px;
|
||||
}
|
||||
|
||||
.cache-card-desc {
|
||||
font-size: 0.72rem;
|
||||
color: #64748b;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.cache-card-ttl {
|
||||
font-size: 0.72rem;
|
||||
color: #94a3b8;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.route-cache-stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(6, 1fr);
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
/* Connection Pool */
|
||||
.pool-stats-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
gap: 8px;
|
||||
margin-top: 14px;
|
||||
}
|
||||
|
||||
/* Worker */
|
||||
.worker-info {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 8px;
|
||||
margin-bottom: 14px;
|
||||
}
|
||||
|
||||
/* Modal */
|
||||
.modal-backdrop {
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
background: rgba(0, 0, 0, 0.45);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
z-index: 1000;
|
||||
}
|
||||
|
||||
.modal-dialog {
|
||||
background: #fff;
|
||||
border-radius: 12px;
|
||||
padding: 24px;
|
||||
max-width: 400px;
|
||||
width: 90%;
|
||||
box-shadow: 0 20px 60px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
.modal-dialog h3 {
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.modal-dialog p {
|
||||
font-size: 0.9rem;
|
||||
color: #475569;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.modal-actions {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
justify-content: flex-end;
|
||||
}
|
||||
|
||||
.modal-actions .btn {
|
||||
background: #e2e8f0;
|
||||
color: #334155;
|
||||
}
|
||||
|
||||
/* Log Controls */
|
||||
.log-controls {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
margin-bottom: 12px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.log-controls select,
|
||||
.log-controls input[type='text'] {
|
||||
padding: 6px 10px;
|
||||
border: 1px solid #cbd5e1;
|
||||
border-radius: 6px;
|
||||
font-size: 0.82rem;
|
||||
outline: none;
|
||||
}
|
||||
|
||||
.log-controls input[type='text'] {
|
||||
flex: 1;
|
||||
min-width: 160px;
|
||||
}
|
||||
|
||||
/* Log Table */
|
||||
.log-table-wrapper {
|
||||
overflow-x: auto;
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.log-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.78rem;
|
||||
font-family: 'SF Mono', 'Fira Code', monospace;
|
||||
}
|
||||
|
||||
.log-table th,
|
||||
.log-table td {
|
||||
padding: 5px 8px;
|
||||
text-align: left;
|
||||
border-bottom: 1px solid #f1f5f9;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.log-table th {
|
||||
background: #f8fafc;
|
||||
font-weight: 600;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 1;
|
||||
}
|
||||
|
||||
.log-msg {
|
||||
white-space: pre-wrap;
|
||||
word-break: break-all;
|
||||
max-width: 600px;
|
||||
}
|
||||
|
||||
.log-time {
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.log-level {
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.log-error .log-level {
|
||||
color: #ef4444;
|
||||
}
|
||||
|
||||
.log-warning .log-level {
|
||||
color: #f59e0b;
|
||||
}
|
||||
|
||||
.log-info .log-level {
|
||||
color: #3b82f6;
|
||||
}
|
||||
|
||||
.log-debug .log-level {
|
||||
color: #94a3b8;
|
||||
}
|
||||
|
||||
/* Log Pagination */
|
||||
.log-pagination {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 12px;
|
||||
margin-top: 10px;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
|
||||
/* Responsive */
|
||||
@media (max-width: 768px) {
|
||||
.status-cards-grid {
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
|
||||
.query-perf-grid,
|
||||
.redis-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.pool-stats-grid {
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
|
||||
.route-cache-stats {
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Trend Charts */
|
||||
.trend-chart-card {
|
||||
margin-top: 4px;
|
||||
background: #fff;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 16px;
|
||||
}
|
||||
.trend-chart-title {
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
color: #475569;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
.trend-chart-canvas {
|
||||
width: 100%;
|
||||
min-height: 200px;
|
||||
}
|
||||
.trend-chart-empty {
|
||||
color: #94a3b8;
|
||||
font-size: 0.85rem;
|
||||
text-align: center;
|
||||
padding: 32px 0;
|
||||
}
|
||||
@@ -70,6 +70,10 @@ const NATIVE_MODULE_LOADERS = Object.freeze({
|
||||
() => import('../tables/App.vue'),
|
||||
[() => import('../tables/style.css')],
|
||||
),
|
||||
'/admin/performance': createNativeLoader(
|
||||
() => import('../admin-performance/App.vue'),
|
||||
[() => import('../admin-performance/style.css')],
|
||||
),
|
||||
});
|
||||
|
||||
export function getNativeModuleLoader(route) {
|
||||
|
||||
@@ -190,13 +190,13 @@ const ROUTE_CONTRACTS = Object.freeze({
|
||||
'/admin/performance': buildContract({
|
||||
route: '/admin/performance',
|
||||
routeId: 'admin-performance',
|
||||
renderMode: 'external',
|
||||
renderMode: 'native',
|
||||
owner: 'frontend-platform-admin',
|
||||
title: '效能監控',
|
||||
rollbackStrategy: 'external_route_reversion',
|
||||
rollbackStrategy: 'fallback_to_legacy_route',
|
||||
visibilityPolicy: 'admin_only',
|
||||
scope: 'in-scope',
|
||||
compatibilityPolicy: 'external_target_redirect',
|
||||
compatibilityPolicy: 'redirect_to_shell_when_spa_enabled',
|
||||
}),
|
||||
'/tables': buildContract({
|
||||
route: '/tables',
|
||||
|
||||
@@ -28,7 +28,8 @@ export default defineConfig(({ mode }) => ({
|
||||
'query-tool': resolve(__dirname, 'src/query-tool/main.js'),
|
||||
'tmtt-defect': resolve(__dirname, 'src/tmtt-defect/main.js'),
|
||||
'qc-gate': resolve(__dirname, 'src/qc-gate/index.html'),
|
||||
'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html')
|
||||
'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html'),
|
||||
'admin-performance': resolve(__dirname, 'src/admin-performance/index.html')
|
||||
},
|
||||
output: {
|
||||
entryFileNames: '[name].js',
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
schema: spec-driven
|
||||
created: 2026-02-22
|
||||
@@ -0,0 +1,91 @@
|
||||
## Context
|
||||
|
||||
現有 `/admin/performance` 是 Jinja2 server-rendered 頁面(vanilla JS + Chart.js),是唯一未遷移至 Vue 3 SPA 的前端頁面。後端已具備豐富的監控數據(連線池 `get_pool_status()`、Redis client、LayeredCache `.telemetry()`),但前端僅展示 4 張 status cards + query performance + worker control + logs,缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度等關鍵面板。
|
||||
|
||||
## Goals / Non-Goals
|
||||
|
||||
**Goals:**
|
||||
- 將 admin/performance 頁面從 Jinja2 切換為 Vue 3 SPA,與所有報表頁面架構一致
|
||||
- 新增完整的系統監控面板:Redis 快取詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤
|
||||
- 提供可複用的 gauge/stat card 組件,便於未來擴展監控項目
|
||||
- 保留所有既有功能(status cards、query performance、worker control、system logs)
|
||||
|
||||
**Non-Goals:**
|
||||
- 不新增告警/通知機制(未來可擴展)
|
||||
- 不引入 WebSocket 即時推送(維持 30 秒輪詢)
|
||||
- 不修改既有 API response format(`system-status`、`metrics`、`logs` 保持不變)
|
||||
- 不新增使用者權限控制(沿用既有 admin 認證)
|
||||
|
||||
## Decisions
|
||||
|
||||
### 1. Vue 3 SPA + ECharts 取代 Jinja2 + Chart.js
|
||||
|
||||
**選擇**: 全面重建為 Vue 3 SPA,使用 ECharts 繪製圖表
|
||||
|
||||
**理由**: 所有報表頁面已完成 Vue SPA 遷移,admin/performance 是最後一個 Jinja2 頁面。統一架構可複用 `apiGet`、`useAutoRefresh` 等共用基礎設施,減少維護成本。ECharts 已是專案標準圖表庫(query-tool、reject-history 等均使用)。
|
||||
|
||||
**替代方案**: 保留 Jinja2 僅加 API — 但會持續累積技術債,且無法複用 Vue 生態。
|
||||
|
||||
### 2. 單一 performance-detail API 聚合所有新增監控數據
|
||||
|
||||
**選擇**: 新增 `GET /admin/api/performance-detail` 一個 endpoint,回傳 `redis`、`process_caches`、`route_cache`、`db_pool`、`direct_connections` 五個 section。
|
||||
|
||||
**理由**: 減少前端並發請求數(已有 5 個 API,加 1 個共 6 個;後續 trend 功能另增 `performance-history`,合計 7 個),後端可在同一 request 中順序收集各子系統狀態,避免多次 round-trip。
|
||||
|
||||
**替代方案**: 每個監控維度獨立 endpoint — 更 RESTful 但增加前端複雜度和網路開銷。
|
||||
|
||||
### 3. ProcessLevelCache 全域 registry 模式
|
||||
|
||||
**選擇**: 在 `core/cache.py` 新增 `_PROCESS_CACHE_REGISTRY` dict + `register_process_cache()` 函式,各服務在模組載入時自行註冊。
|
||||
|
||||
**理由**: 避免 admin_routes 硬編碼各快取實例的 import 路徑,新增快取時只需在該服務中加一行 `register_process_cache()` 即可自動出現在監控面板。
|
||||
|
||||
**替代方案**: admin_routes 直接 import 各快取實例 — 耦合度高,新增快取需改兩處。
|
||||
|
||||
### 4. Redis namespace 監控使用 SCAN 而非 KEYS
|
||||
|
||||
**選擇**: 使用 `SCAN` 搭配 `MATCH` pattern 掃描各 namespace 的 key 數量。
|
||||
|
||||
**理由**: `KEYS *` 在生產環境會阻塞 Redis,`SCAN` 為非阻塞迭代器,安全性更高。
|
||||
|
||||
### 5. 直連 Oracle 使用 thread-safe atomic counter
|
||||
|
||||
**選擇**: 在 `database.py` 使用 `threading.Lock` 保護的全域計數器,在 `get_db_connection()` 和 `read_sql_df_slow()` 建立連線後 increment。
|
||||
|
||||
**理由**: 追蹤連線池外的直接連線使用量,幫助判斷是否需要調整池大小。計數器為 monotonic(只增不減),記錄的是自 worker 啟動以來的總數。
|
||||
|
||||
### 6. 前端組件複用 GaugeBar / StatCard / StatusDot
|
||||
|
||||
**選擇**: 新增 3 個小型可複用組件放在 `admin-performance/components/` 下。
|
||||
|
||||
**理由**: Redis 記憶體、連線池飽和度、ProcessLevelCache 使用率等多處需要 gauge 視覺化;status cards 跨面板重複。組件化可統一視覺風格並減少重複 template。
|
||||
|
||||
### 7. SQLite 持久化 metrics history store
|
||||
|
||||
**選擇**: 新增 `core/metrics_history.py`,使用 SQLite 儲存 metrics snapshots(仿 `core/log_store.py` 的 `LogStore` 模式),搭配 daemon thread 每 30 秒採集一次。
|
||||
|
||||
**理由**: in-memory deque 在 worker 重啟或 gunicorn prefork 下無法跨 worker 共享且不保留歷史。SQLite 提供跨 worker 讀取、重啟持久化、可配置保留天數(預設 3 天 / 50000 rows),且不需額外 infra。
|
||||
|
||||
**替代方案**:
|
||||
- in-memory deque — 簡單但 worker 獨立、重啟即失
|
||||
- Redis TSDB — 需額外模組且增加 Redis 負擔
|
||||
- PostgreSQL — 太重,且此數據不需 ACID
|
||||
|
||||
**Schema**: `metrics_snapshots` table 含 timestamp、worker PID、pool/redis/route_cache/latency 各欄位,`idx_metrics_ts` 索引加速時間查詢。
|
||||
|
||||
**背景採集**: `MetricsHistoryCollector` daemon thread,間隔可透過 `METRICS_HISTORY_INTERVAL` 環境變數配置。在 `app.py` lifecycle 中 start/stop。
|
||||
|
||||
## Risks / Trade-offs
|
||||
|
||||
- **Redis SCAN 效能**: 大量 key 時 SCAN 可能較慢 → 設定 `COUNT 100` 限制每次迭代量,且 30 秒才掃一次,可接受
|
||||
- **ProcessLevelCache registry 依賴模組載入順序**: 服務未 import 時不會註冊 → 在 app factory 或 gunicorn post_fork 確保所有服務模組已載入
|
||||
- **直連計數器跨 worker 不共享**: gunicorn prefork 模式下每個 worker 有獨立計數 → API 回傳當前 worker PID 供辨識,可透過 `/admin/api/system-status` 的 worker info 交叉比對
|
||||
- **舊 Jinja2 模板保留但不維護**: 切換後舊模板不再更新 → 透過 `routeContracts.js` 的 `rollbackStrategy: 'fallback_to_legacy_route'` 保留回退能力
|
||||
|
||||
## Migration Plan
|
||||
|
||||
1. 後端先行:加 `stats()`、registry、直連計數器、新 API(不影響既有功能)
|
||||
2. 前端建構:新建 `admin-performance/` Vue SPA,Vite 註冊 entry
|
||||
3. 路由切換:`admin_routes.py` 改為 `send_from_directory`,`routeContracts.js` 改 `renderMode: 'native'`
|
||||
4. 驗證後部署:確認所有面板正確顯示後上線
|
||||
5. 回退方案:`routeContracts.js` 改回 `renderMode: 'external'`,`admin_routes.py` 改回 `render_template`
|
||||
@@ -0,0 +1,31 @@
|
||||
## Why
|
||||
|
||||
現有 `/admin/performance` 是唯一仍使用 Jinja2 + vanilla JS + Chart.js 的頁面,與所有已遷移至 Vue 3 SPA 的報表頁面架構不一致。同時,隨著報表系統功能擴充(L1/L2 快取層、連線池、直連 Oracle 等),後端已具備豐富的遙測數據,但管理後台的監控面板覆蓋不足——缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤等關鍵資訊。
|
||||
|
||||
## What Changes
|
||||
|
||||
- 將 `/admin/performance` 從 Jinja2 server-rendered 頁面重建為 Vue 3 SPA(ECharts 取代 Chart.js)
|
||||
- 新增 `GET /admin/api/performance-detail` API,整合 Redis INFO/SCAN、ProcessLevelCache registry、連線池狀態、直連計數等完整監控數據
|
||||
- 後端 `ProcessLevelCache` 加入 `stats()` 方法與全域 registry,支援動態收集所有快取實例狀態
|
||||
- 後端 `database.py` 加入直連 Oracle 計數器,追蹤非連線池的直接連線使用量
|
||||
- 前端新增 GaugeBar / StatCard / StatusDot 可複用組件,提供 gauge 飽和度視覺化
|
||||
- portal-shell 路由從 `renderMode: 'external'` 切換為 `'native'`
|
||||
- Vite 構建新增 `admin-performance` entry point
|
||||
|
||||
## Capabilities
|
||||
|
||||
### New Capabilities
|
||||
- `admin-performance-spa`: Vue 3 SPA 重建管理效能儀表板,包含 status cards、query performance、Redis 快取、記憶體快取、連線池、worker 控制、系統日誌等完整面板
|
||||
- `cache-telemetry-api`: ProcessLevelCache stats() + 全域 registry + performance-detail API,提供所有記憶體快取、Redis 快取、route cache 的遙測數據
|
||||
- `connection-pool-monitoring`: 連線池飽和度追蹤 + 直連 Oracle 計數器,完整呈現資料庫連線使用狀況
|
||||
- `metrics-history-trending`: SQLite 持久化背景採集 + 時間序列趨勢圖,可回溯連線池飽和度、查詢延遲、Redis 記憶體、快取命中率等歷史數據
|
||||
|
||||
### Modified Capabilities
|
||||
<!-- No existing spec-level requirements are changing -->
|
||||
|
||||
## Impact
|
||||
|
||||
- **Backend** (8 files): `core/cache.py`、`core/database.py`、`core/metrics_history.py`(NEW)、`routes/admin_routes.py`、`services/resource_cache.py`、`services/realtime_equipment_cache.py`、`services/reject_dataset_cache.py`、`app.py`
|
||||
- **Frontend** (8 new + 3 modified): 新建 `admin-performance/` 目錄(index.html、main.js、App.vue、style.css、4 個組件含 TrendChart),修改 `vite.config.js`、`package.json`、`routeContracts.js`
|
||||
- **API**: 新增 2 個 endpoint (`/admin/api/performance-detail`、`/admin/api/performance-history`),既有 5 個 endpoint 不變
|
||||
- **Rollback**: 舊 Jinja2 模板保留,可透過 `routeContracts.js` 切回 `renderMode: 'external'`
|
||||
@@ -0,0 +1,100 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: Vue 3 SPA page replaces Jinja2 template
|
||||
The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route.
|
||||
|
||||
#### Scenario: Page loads as Vue SPA
|
||||
- **WHEN** user navigates to `/admin/performance`
|
||||
- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template)
|
||||
|
||||
#### Scenario: Portal-shell integration
|
||||
- **WHEN** the portal-shell renders `/admin/performance`
|
||||
- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe)
|
||||
|
||||
### Requirement: Status cards display system health
|
||||
The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value.
|
||||
|
||||
#### Scenario: All systems healthy
|
||||
- **WHEN** all backend systems report healthy status via `/admin/api/system-status`
|
||||
- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values
|
||||
|
||||
#### Scenario: Redis disabled
|
||||
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
|
||||
- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message
|
||||
|
||||
### Requirement: Query performance panel with ECharts
|
||||
The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation.
|
||||
|
||||
#### Scenario: Metrics loaded successfully
|
||||
- **WHEN** `/admin/api/metrics` returns valid performance data
|
||||
- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution
|
||||
|
||||
#### Scenario: No metrics data
|
||||
- **WHEN** `/admin/api/metrics` returns empty or null metrics
|
||||
- **THEN** the panel SHALL display placeholder text indicating no data available
|
||||
|
||||
### Requirement: Redis cache detail panel
|
||||
The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table.
|
||||
|
||||
#### Scenario: Redis active with data
|
||||
- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts
|
||||
- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count
|
||||
|
||||
#### Scenario: Redis disabled
|
||||
- **WHEN** Redis is disabled
|
||||
- **THEN** the Redis detail panel SHALL display a disabled state message without errors
|
||||
|
||||
### Requirement: Memory cache panel
|
||||
The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads).
|
||||
|
||||
#### Scenario: Multiple caches registered
|
||||
- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries
|
||||
- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description
|
||||
|
||||
#### Scenario: Route cache telemetry
|
||||
- **WHEN** `/admin/api/performance-detail` returns route_cache data
|
||||
- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads
|
||||
|
||||
### Requirement: Connection pool panel
|
||||
The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count.
|
||||
|
||||
#### Scenario: Pool under normal load
|
||||
- **WHEN** pool saturation is below 80%
|
||||
- **THEN** the GaugeBar SHALL display in a normal color (green/blue)
|
||||
|
||||
#### Scenario: Pool near saturation
|
||||
- **WHEN** pool saturation exceeds 80%
|
||||
- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red)
|
||||
|
||||
### Requirement: Worker control panel
|
||||
The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal.
|
||||
|
||||
#### Scenario: Restart worker
|
||||
- **WHEN** user clicks the restart button and confirms in the modal
|
||||
- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result
|
||||
|
||||
#### Scenario: Restart during cooldown
|
||||
- **WHEN** worker is in cooldown period
|
||||
- **THEN** the restart button SHALL be disabled with a cooldown indicator
|
||||
|
||||
### Requirement: System logs panel with filtering and pagination
|
||||
The dashboard SHALL display system logs with level filtering, text search, and pagination controls.
|
||||
|
||||
#### Scenario: Filter by log level
|
||||
- **WHEN** user selects a specific log level filter
|
||||
- **THEN** only logs matching that level SHALL be displayed
|
||||
|
||||
#### Scenario: Paginate logs
|
||||
- **WHEN** logs exceed the page size
|
||||
- **THEN** pagination controls SHALL allow navigating between pages
|
||||
|
||||
### Requirement: Auto-refresh with toggle
|
||||
The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
|
||||
|
||||
#### Scenario: Auto-refresh enabled
|
||||
- **WHEN** auto-refresh is enabled (default)
|
||||
- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch
|
||||
|
||||
#### Scenario: Manual refresh
|
||||
- **WHEN** user clicks the manual refresh button
|
||||
- **THEN** all panels SHALL immediately refresh their data
|
||||
@@ -0,0 +1,56 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: ProcessLevelCache stats method
|
||||
Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`.
|
||||
|
||||
#### Scenario: Stats on active cache
|
||||
- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s)
|
||||
- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}`
|
||||
|
||||
#### Scenario: Stats with expired entries
|
||||
- **WHEN** `stats()` is called and some entries have exceeded TTL
|
||||
- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl`
|
||||
|
||||
#### Scenario: Thread safety
|
||||
- **WHEN** `stats()` is called concurrently with cache writes
|
||||
- **THEN** it SHALL acquire the cache lock and return consistent data without races
|
||||
|
||||
### Requirement: ProcessLevelCache global registry
|
||||
The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`.
|
||||
|
||||
#### Scenario: Register and retrieve all caches
|
||||
- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called
|
||||
- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches
|
||||
|
||||
#### Scenario: Cache not registered
|
||||
- **WHEN** a service's ProcessLevelCache is not registered
|
||||
- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output
|
||||
|
||||
### Requirement: Performance detail API endpoint
|
||||
The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`.
|
||||
|
||||
#### Scenario: All systems available
|
||||
- **WHEN** the API is called and all subsystems are healthy
|
||||
- **THEN** it SHALL return all 5 sections with current telemetry data
|
||||
|
||||
#### Scenario: Redis disabled
|
||||
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
|
||||
- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally
|
||||
|
||||
### Requirement: Redis namespace key distribution
|
||||
The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`.
|
||||
|
||||
#### Scenario: Keys exist across namespaces
|
||||
- **WHEN** Redis contains keys across multiple namespaces
|
||||
- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count`
|
||||
|
||||
#### Scenario: SCAN safety
|
||||
- **WHEN** scanning Redis keys
|
||||
- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis
|
||||
|
||||
### Requirement: Route cache telemetry in performance detail
|
||||
The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`.
|
||||
|
||||
#### Scenario: LayeredCache active
|
||||
- **WHEN** route cache is in layered mode
|
||||
- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry
|
||||
@@ -0,0 +1,27 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: Connection pool status in performance detail
|
||||
The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`.
|
||||
|
||||
#### Scenario: Pool status retrieved
|
||||
- **WHEN** the API is called
|
||||
- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values
|
||||
|
||||
#### Scenario: Saturation calculation
|
||||
- **WHEN** the pool has 8 checked_out connections and max_capacity is 30
|
||||
- **THEN** saturation SHALL be reported as approximately 26.7%
|
||||
|
||||
### Requirement: Direct Oracle connection counter
|
||||
The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection.
|
||||
|
||||
#### Scenario: Counter increments on direct connection
|
||||
- **WHEN** `get_db_connection()` successfully creates a connection
|
||||
- **THEN** the direct connection counter SHALL increment by 1
|
||||
|
||||
#### Scenario: Counter in performance detail
|
||||
- **WHEN** the performance-detail API is called
|
||||
- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID)
|
||||
|
||||
#### Scenario: Counter is per-worker
|
||||
- **WHEN** multiple gunicorn workers are running
|
||||
- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker
|
||||
@@ -0,0 +1,65 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: SQLite metrics history store
|
||||
The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`.
|
||||
|
||||
#### Scenario: Write and query snapshots
|
||||
- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics
|
||||
- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID
|
||||
|
||||
#### Scenario: Query by time range
|
||||
- **WHEN** `query_snapshots(minutes=30)` is called
|
||||
- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending
|
||||
|
||||
#### Scenario: Retention cleanup
|
||||
- **WHEN** `cleanup()` is called
|
||||
- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000)
|
||||
|
||||
#### Scenario: Thread safety
|
||||
- **WHEN** multiple threads write snapshots concurrently
|
||||
- **THEN** the write lock SHALL serialize writes and prevent database corruption
|
||||
|
||||
### Requirement: Background metrics collector
|
||||
The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var).
|
||||
|
||||
#### Scenario: Automatic collection
|
||||
- **WHEN** the collector is started via `start_metrics_history(app)`
|
||||
- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store
|
||||
|
||||
#### Scenario: Graceful shutdown
|
||||
- **WHEN** `stop_metrics_history()` is called
|
||||
- **THEN** the collector thread SHALL stop within one interval period
|
||||
|
||||
#### Scenario: Subsystem unavailability
|
||||
- **WHEN** a subsystem (e.g., Redis) is unavailable during collection
|
||||
- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics
|
||||
|
||||
### Requirement: Performance history API endpoint
|
||||
The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots.
|
||||
|
||||
#### Scenario: Query with time range
|
||||
- **WHEN** the API is called with `?minutes=30`
|
||||
- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}`
|
||||
|
||||
#### Scenario: Time range bounds
|
||||
- **WHEN** `minutes` is less than 1 or greater than 180
|
||||
- **THEN** it SHALL be clamped to the range [1, 180]
|
||||
|
||||
#### Scenario: Admin authentication
|
||||
- **WHEN** the API is called without admin authentication
|
||||
- **THEN** it SHALL be rejected by the `@admin_required` decorator
|
||||
|
||||
### Requirement: Frontend trend charts
|
||||
The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts.
|
||||
|
||||
#### Scenario: Trend charts with data
|
||||
- **WHEN** historical snapshots contain more than 1 data point
|
||||
- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates
|
||||
|
||||
#### Scenario: Trend charts without data
|
||||
- **WHEN** historical snapshots are empty or contain only 1 data point
|
||||
- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`)
|
||||
|
||||
#### Scenario: Auto-refresh
|
||||
- **WHEN** the dashboard auto-refreshes
|
||||
- **THEN** historical data SHALL also be refreshed alongside real-time metrics
|
||||
@@ -0,0 +1,80 @@
|
||||
## 1. Backend — Cache Telemetry Infrastructure
|
||||
|
||||
- [x] 1.1 Add `stats()` method to `ProcessLevelCache` in `core/cache.py` (returns entries/max_size/ttl_seconds with lock)
|
||||
- [x] 1.2 Add `_PROCESS_CACHE_REGISTRY`, `register_process_cache()`, and `get_all_process_cache_stats()` to `core/cache.py`
|
||||
- [x] 1.3 Register `_wip_df_cache` in `core/cache.py`
|
||||
- [x] 1.4 Add `stats()` + `register_process_cache()` to `services/resource_cache.py`
|
||||
- [x] 1.5 Add `stats()` + `register_process_cache()` to `services/realtime_equipment_cache.py`
|
||||
- [x] 1.6 Add `register_process_cache()` to `services/reject_dataset_cache.py`
|
||||
|
||||
## 2. Backend — Direct Connection Counter
|
||||
|
||||
- [x] 2.1 Add `_DIRECT_CONN_COUNTER`, `_DIRECT_CONN_LOCK`, and `get_direct_connection_count()` to `core/database.py`
|
||||
- [x] 2.2 Increment counter in `get_db_connection()` and `read_sql_df_slow()` after successful connection creation
|
||||
|
||||
## 3. Backend — Performance Detail API
|
||||
|
||||
- [x] 3.1 Add `GET /admin/api/performance-detail` endpoint in `routes/admin_routes.py` returning redis, process_caches, route_cache, db_pool, and direct_connections sections
|
||||
- [x] 3.2 Implement Redis INFO + SCAN namespace key distribution (data, route_cache, equipment_status, reject_dataset, meta, lock, scrap_exclusion) with graceful degradation when Redis is disabled
|
||||
|
||||
## 4. Frontend — Page Scaffolding
|
||||
|
||||
- [x] 4.1 Create `frontend/src/admin-performance/index.html` and `main.js` (standard Vue SPA entry)
|
||||
- [x] 4.2 Register `admin-performance` entry in `vite.config.js`
|
||||
- [x] 4.3 Add `cp` command for `admin-performance.html` in `package.json` build script
|
||||
|
||||
## 5. Frontend — Reusable Components
|
||||
|
||||
- [x] 5.1 Create `GaugeBar.vue` — horizontal gauge bar with label, value, max, and color threshold props
|
||||
- [x] 5.2 Create `StatCard.vue` — mini card with numeric value, label, and optional unit/icon
|
||||
- [x] 5.3 Create `StatusDot.vue` — colored dot indicator (healthy/degraded/error/disabled) with label
|
||||
|
||||
## 6. Frontend — App.vue Main Dashboard
|
||||
|
||||
- [x] 6.1 Implement data fetching layer: `loadSystemStatus()`, `loadMetrics()`, `loadPerformanceDetail()`, `loadLogs()`, `loadWorkerStatus()` with `Promise.all` parallel fetch and `useAutoRefresh` (30s)
|
||||
- [x] 6.2 Build header section with gradient background, title, auto-refresh toggle, and manual refresh button
|
||||
- [x] 6.3 Build status cards section (Database / Redis / Circuit Breaker / Worker PID) using StatusDot
|
||||
- [x] 6.4 Build query performance panel with P50/P95/P99 stat cards and ECharts latency distribution chart
|
||||
- [x] 6.5 Build Redis cache detail panel with memory GaugeBar, hit rate, client count, peak memory, and namespace key distribution table
|
||||
- [x] 6.6 Build memory cache panel with ProcessLevelCache grid cards (entries/max gauge + TTL) and route cache telemetry (L1/L2 hit rate, miss rate, total reads)
|
||||
- [x] 6.7 Build connection pool panel with saturation GaugeBar and stat card grid (checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, direct connections)
|
||||
- [x] 6.8 Build worker control panel with PID/uptime/cooldown display, restart button, and confirmation modal
|
||||
- [x] 6.9 Build system logs panel with level filter, text search, pagination, and log clearing
|
||||
- [x] 6.10 Create `style.css` with all panel, grid, gauge, card, and responsive layout styles
|
||||
|
||||
## 7. Route Integration
|
||||
|
||||
- [x] 7.1 Change `/admin/performance` route handler in `admin_routes.py` from `render_template` to `send_from_directory` serving the Vue SPA
|
||||
- [x] 7.2 Update `routeContracts.js`: change renderMode to `'native'`, rollbackStrategy to `'fallback_to_legacy_route'`, compatibilityPolicy to `'redirect_to_shell_when_spa_enabled'`
|
||||
|
||||
## 8. Verification (Phase 1)
|
||||
|
||||
- [x] 8.1 Run `cd frontend && npx vite build` — confirm no compilation errors and `admin-performance.html` is produced
|
||||
- [x] 8.2 Verify all dashboard panels render correctly with live data after service restart
|
||||
|
||||
## 9. Backend — Metrics History Store
|
||||
|
||||
- [x] 9.1 Create `core/metrics_history.py` with `MetricsHistoryStore` class (SQLite schema, thread-local connections, write_lock, write_snapshot, query_snapshots, cleanup)
|
||||
- [x] 9.2 Add `MetricsHistoryCollector` class (daemon thread, configurable interval, collect pool/redis/route_cache/latency)
|
||||
- [x] 9.3 Add module-level `get_metrics_history_store()`, `start_metrics_history(app)`, `stop_metrics_history()` functions
|
||||
|
||||
## 10. Backend — Lifecycle Integration
|
||||
|
||||
- [x] 10.1 Call `start_metrics_history(app)` in `app.py` after other background services
|
||||
- [x] 10.2 Call `stop_metrics_history()` in `_shutdown_runtime_resources()` in `app.py`
|
||||
|
||||
## 11. Backend — Performance History API
|
||||
|
||||
- [x] 11.1 Add `GET /admin/api/performance-history` endpoint in `admin_routes.py` (minutes param, clamped 1-180, returns snapshots array)
|
||||
|
||||
## 12. Frontend — Trend Charts
|
||||
|
||||
- [x] 12.1 Create `TrendChart.vue` component using vue-echarts VChart (line/area chart, dual yAxis support, time labels, autoresize)
|
||||
- [x] 12.2 Add `loadPerformanceHistory()` fetch to `App.vue` and integrate into `refreshAll()`
|
||||
- [x] 12.3 Add 4 TrendChart panels to `App.vue` template (pool saturation, query latency, Redis memory, cache hit rates)
|
||||
- [x] 12.4 Add trend chart styles to `style.css`
|
||||
|
||||
## 13. Verification (Phase 2)
|
||||
|
||||
- [x] 13.1 Run `cd frontend && npm run build` — confirm no compilation errors
|
||||
- [x] 13.2 Verify trend charts render with historical data after service restart + 60s collection
|
||||
100
openspec/specs/admin-performance-spa/spec.md
Normal file
100
openspec/specs/admin-performance-spa/spec.md
Normal file
@@ -0,0 +1,100 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: Vue 3 SPA page replaces Jinja2 template
|
||||
The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route.
|
||||
|
||||
#### Scenario: Page loads as Vue SPA
|
||||
- **WHEN** user navigates to `/admin/performance`
|
||||
- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template)
|
||||
|
||||
#### Scenario: Portal-shell integration
|
||||
- **WHEN** the portal-shell renders `/admin/performance`
|
||||
- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe)
|
||||
|
||||
### Requirement: Status cards display system health
|
||||
The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value.
|
||||
|
||||
#### Scenario: All systems healthy
|
||||
- **WHEN** all backend systems report healthy status via `/admin/api/system-status`
|
||||
- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values
|
||||
|
||||
#### Scenario: Redis disabled
|
||||
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
|
||||
- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message
|
||||
|
||||
### Requirement: Query performance panel with ECharts
|
||||
The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation.
|
||||
|
||||
#### Scenario: Metrics loaded successfully
|
||||
- **WHEN** `/admin/api/metrics` returns valid performance data
|
||||
- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution
|
||||
|
||||
#### Scenario: No metrics data
|
||||
- **WHEN** `/admin/api/metrics` returns empty or null metrics
|
||||
- **THEN** the panel SHALL display placeholder text indicating no data available
|
||||
|
||||
### Requirement: Redis cache detail panel
|
||||
The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table.
|
||||
|
||||
#### Scenario: Redis active with data
|
||||
- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts
|
||||
- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count
|
||||
|
||||
#### Scenario: Redis disabled
|
||||
- **WHEN** Redis is disabled
|
||||
- **THEN** the Redis detail panel SHALL display a disabled state message without errors
|
||||
|
||||
### Requirement: Memory cache panel
|
||||
The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads).
|
||||
|
||||
#### Scenario: Multiple caches registered
|
||||
- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries
|
||||
- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description
|
||||
|
||||
#### Scenario: Route cache telemetry
|
||||
- **WHEN** `/admin/api/performance-detail` returns route_cache data
|
||||
- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads
|
||||
|
||||
### Requirement: Connection pool panel
|
||||
The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count.
|
||||
|
||||
#### Scenario: Pool under normal load
|
||||
- **WHEN** pool saturation is below 80%
|
||||
- **THEN** the GaugeBar SHALL display in a normal color (green/blue)
|
||||
|
||||
#### Scenario: Pool near saturation
|
||||
- **WHEN** pool saturation exceeds 80%
|
||||
- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red)
|
||||
|
||||
### Requirement: Worker control panel
|
||||
The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal.
|
||||
|
||||
#### Scenario: Restart worker
|
||||
- **WHEN** user clicks the restart button and confirms in the modal
|
||||
- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result
|
||||
|
||||
#### Scenario: Restart during cooldown
|
||||
- **WHEN** worker is in cooldown period
|
||||
- **THEN** the restart button SHALL be disabled with a cooldown indicator
|
||||
|
||||
### Requirement: System logs panel with filtering and pagination
|
||||
The dashboard SHALL display system logs with level filtering, text search, and pagination controls.
|
||||
|
||||
#### Scenario: Filter by log level
|
||||
- **WHEN** user selects a specific log level filter
|
||||
- **THEN** only logs matching that level SHALL be displayed
|
||||
|
||||
#### Scenario: Paginate logs
|
||||
- **WHEN** logs exceed the page size
|
||||
- **THEN** pagination controls SHALL allow navigating between pages
|
||||
|
||||
### Requirement: Auto-refresh with toggle
|
||||
The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
|
||||
|
||||
#### Scenario: Auto-refresh enabled
|
||||
- **WHEN** auto-refresh is enabled (default)
|
||||
- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch
|
||||
|
||||
#### Scenario: Manual refresh
|
||||
- **WHEN** user clicks the manual refresh button
|
||||
- **THEN** all panels SHALL immediately refresh their data
|
||||
56
openspec/specs/cache-telemetry-api/spec.md
Normal file
56
openspec/specs/cache-telemetry-api/spec.md
Normal file
@@ -0,0 +1,56 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: ProcessLevelCache stats method
|
||||
Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`.
|
||||
|
||||
#### Scenario: Stats on active cache
|
||||
- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s)
|
||||
- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}`
|
||||
|
||||
#### Scenario: Stats with expired entries
|
||||
- **WHEN** `stats()` is called and some entries have exceeded TTL
|
||||
- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl`
|
||||
|
||||
#### Scenario: Thread safety
|
||||
- **WHEN** `stats()` is called concurrently with cache writes
|
||||
- **THEN** it SHALL acquire the cache lock and return consistent data without races
|
||||
|
||||
### Requirement: ProcessLevelCache global registry
|
||||
The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`.
|
||||
|
||||
#### Scenario: Register and retrieve all caches
|
||||
- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called
|
||||
- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches
|
||||
|
||||
#### Scenario: Cache not registered
|
||||
- **WHEN** a service's ProcessLevelCache is not registered
|
||||
- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output
|
||||
|
||||
### Requirement: Performance detail API endpoint
|
||||
The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`.
|
||||
|
||||
#### Scenario: All systems available
|
||||
- **WHEN** the API is called and all subsystems are healthy
|
||||
- **THEN** it SHALL return all 5 sections with current telemetry data
|
||||
|
||||
#### Scenario: Redis disabled
|
||||
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
|
||||
- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally
|
||||
|
||||
### Requirement: Redis namespace key distribution
|
||||
The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`.
|
||||
|
||||
#### Scenario: Keys exist across namespaces
|
||||
- **WHEN** Redis contains keys across multiple namespaces
|
||||
- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count`
|
||||
|
||||
#### Scenario: SCAN safety
|
||||
- **WHEN** scanning Redis keys
|
||||
- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis
|
||||
|
||||
### Requirement: Route cache telemetry in performance detail
|
||||
The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`.
|
||||
|
||||
#### Scenario: LayeredCache active
|
||||
- **WHEN** route cache is in layered mode
|
||||
- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry
|
||||
27
openspec/specs/connection-pool-monitoring/spec.md
Normal file
27
openspec/specs/connection-pool-monitoring/spec.md
Normal file
@@ -0,0 +1,27 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: Connection pool status in performance detail
|
||||
The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`.
|
||||
|
||||
#### Scenario: Pool status retrieved
|
||||
- **WHEN** the API is called
|
||||
- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values
|
||||
|
||||
#### Scenario: Saturation calculation
|
||||
- **WHEN** the pool has 8 checked_out connections and max_capacity is 30
|
||||
- **THEN** saturation SHALL be reported as approximately 26.7%
|
||||
|
||||
### Requirement: Direct Oracle connection counter
|
||||
The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection.
|
||||
|
||||
#### Scenario: Counter increments on direct connection
|
||||
- **WHEN** `get_db_connection()` successfully creates a connection
|
||||
- **THEN** the direct connection counter SHALL increment by 1
|
||||
|
||||
#### Scenario: Counter in performance detail
|
||||
- **WHEN** the performance-detail API is called
|
||||
- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID)
|
||||
|
||||
#### Scenario: Counter is per-worker
|
||||
- **WHEN** multiple gunicorn workers are running
|
||||
- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker
|
||||
65
openspec/specs/metrics-history-trending/spec.md
Normal file
65
openspec/specs/metrics-history-trending/spec.md
Normal file
@@ -0,0 +1,65 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: SQLite metrics history store
|
||||
The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`.
|
||||
|
||||
#### Scenario: Write and query snapshots
|
||||
- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics
|
||||
- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID
|
||||
|
||||
#### Scenario: Query by time range
|
||||
- **WHEN** `query_snapshots(minutes=30)` is called
|
||||
- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending
|
||||
|
||||
#### Scenario: Retention cleanup
|
||||
- **WHEN** `cleanup()` is called
|
||||
- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000)
|
||||
|
||||
#### Scenario: Thread safety
|
||||
- **WHEN** multiple threads write snapshots concurrently
|
||||
- **THEN** the write lock SHALL serialize writes and prevent database corruption
|
||||
|
||||
### Requirement: Background metrics collector
|
||||
The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var).
|
||||
|
||||
#### Scenario: Automatic collection
|
||||
- **WHEN** the collector is started via `start_metrics_history(app)`
|
||||
- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store
|
||||
|
||||
#### Scenario: Graceful shutdown
|
||||
- **WHEN** `stop_metrics_history()` is called
|
||||
- **THEN** the collector thread SHALL stop within one interval period
|
||||
|
||||
#### Scenario: Subsystem unavailability
|
||||
- **WHEN** a subsystem (e.g., Redis) is unavailable during collection
|
||||
- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics
|
||||
|
||||
### Requirement: Performance history API endpoint
|
||||
The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots.
|
||||
|
||||
#### Scenario: Query with time range
|
||||
- **WHEN** the API is called with `?minutes=30`
|
||||
- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}`
|
||||
|
||||
#### Scenario: Time range bounds
|
||||
- **WHEN** `minutes` is less than 1 or greater than 180
|
||||
- **THEN** it SHALL be clamped to the range [1, 180]
|
||||
|
||||
#### Scenario: Admin authentication
|
||||
- **WHEN** the API is called without admin authentication
|
||||
- **THEN** it SHALL be rejected by the `@admin_required` decorator
|
||||
|
||||
### Requirement: Frontend trend charts
|
||||
The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts.
|
||||
|
||||
#### Scenario: Trend charts with data
|
||||
- **WHEN** historical snapshots contain more than 1 data point
|
||||
- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates
|
||||
|
||||
#### Scenario: Trend charts without data
|
||||
- **WHEN** historical snapshots are empty or contain only 1 data point
|
||||
- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`)
|
||||
|
||||
#### Scenario: Auto-refresh
|
||||
- **WHEN** the dashboard auto-refreshes
|
||||
- **THEN** historical data SHALL also be refreshed alongside real-time metrics
|
||||
@@ -295,6 +295,12 @@ def _shutdown_runtime_resources() -> None:
|
||||
except Exception as exc:
|
||||
logger.warning("Error stopping scrap exclusion cache worker: %s", exc)
|
||||
|
||||
try:
|
||||
from mes_dashboard.core.metrics_history import stop_metrics_history
|
||||
stop_metrics_history()
|
||||
except Exception as exc:
|
||||
logger.warning("Error stopping metrics history: %s", exc)
|
||||
|
||||
try:
|
||||
close_redis()
|
||||
except Exception as exc:
|
||||
@@ -390,6 +396,8 @@ def create_app(config_name: str | None = None) -> Flask:
|
||||
start_cache_updater() # Start Redis cache updater
|
||||
init_realtime_equipment_cache(app) # Start realtime equipment status cache
|
||||
init_scrap_reason_exclusion_cache(app) # Start exclusion-policy cache sync
|
||||
from mes_dashboard.core.metrics_history import start_metrics_history
|
||||
start_metrics_history(app) # Start metrics history collector
|
||||
_register_shutdown_hooks(app)
|
||||
|
||||
# Register API routes
|
||||
|
||||
@@ -95,6 +95,34 @@ class ProcessLevelCache:
|
||||
with self._lock:
|
||||
self._cache.clear()
|
||||
|
||||
def stats(self) -> dict:
|
||||
"""Return live cache statistics for telemetry."""
|
||||
with self._lock:
|
||||
now = time.time()
|
||||
live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl)
|
||||
return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Process-Level Cache Registry (for admin telemetry)
|
||||
# ============================================================
|
||||
|
||||
# Registry of ProcessLevelCache instances exposed to admin telemetry,
# keyed by cache name -> (description, cache instance).
_PROCESS_CACHE_REGISTRY: dict[str, tuple[str, Any]] = {}


def register_process_cache(name: str, cache_instance: Any, description: str = "") -> None:
    """Register a ProcessLevelCache instance for admin telemetry.

    Re-registering an existing name replaces the previous entry.
    """
    _PROCESS_CACHE_REGISTRY[name] = (description, cache_instance)


def get_all_process_cache_stats() -> dict[str, dict]:
    """Collect stats from every registered cache that supports them.

    Returns:
        Mapping of cache name to its ``stats()`` payload augmented with
        the registered ``description``. Registered objects without a
        callable ``stats`` attribute are silently skipped.
    """
    collected: dict[str, dict] = {}
    for cache_name, (description, instance) in _PROCESS_CACHE_REGISTRY.items():
        stats_fn = getattr(instance, "stats", None)
        if not callable(stats_fn):
            continue
        entry = dict(stats_fn())
        entry["description"] = description
        collected[cache_name] = entry
    return collected
|
||||
|
||||
|
||||
def _resolve_cache_max_size(env_name: str, default: int) -> int:
|
||||
value = os.getenv(env_name)
|
||||
@@ -116,6 +144,7 @@ _wip_df_cache = ProcessLevelCache(
|
||||
ttl_seconds=30,
|
||||
max_size=WIP_PROCESS_CACHE_MAX_SIZE,
|
||||
)
|
||||
register_process_cache("wip_dataframe", _wip_df_cache, "WIP DataFrame (L1, 30s)")
|
||||
_wip_parse_lock = threading.Lock()
|
||||
|
||||
# ============================================================
|
||||
|
||||
@@ -416,6 +416,14 @@ def dispose_engine():
|
||||
# Direct Connection Helpers
|
||||
# ============================================================
|
||||
|
||||
# Monotonic count of direct (non-pooled) Oracle connections created by
# this worker process since start. Guarded by _DIRECT_CONN_LOCK; each
# gunicorn worker keeps its own independent counter.
_DIRECT_CONN_COUNTER = 0
_DIRECT_CONN_LOCK = threading.Lock()


def get_direct_connection_count() -> int:
    """Return total direct (non-pooled) connections since worker start.

    Acquires _DIRECT_CONN_LOCK so the read is consistent with the
    lock-protected increments performed after each successful direct
    connection; previously the counter was read without the lock.
    """
    with _DIRECT_CONN_LOCK:
        return _DIRECT_CONN_COUNTER
|
||||
|
||||
|
||||
def get_db_connection():
|
||||
"""Create a direct oracledb connection.
|
||||
@@ -432,6 +440,9 @@ def get_db_connection():
|
||||
retry_delay=runtime["retry_delay"],
|
||||
)
|
||||
conn.call_timeout = runtime["call_timeout_ms"]
|
||||
with _DIRECT_CONN_LOCK:
|
||||
global _DIRECT_CONN_COUNTER
|
||||
_DIRECT_CONN_COUNTER += 1
|
||||
logger.debug(
|
||||
"Direct oracledb connection established (call_timeout_ms=%s)",
|
||||
runtime["call_timeout_ms"],
|
||||
@@ -591,6 +602,9 @@ def read_sql_df_slow(
|
||||
retry_delay=runtime["retry_delay"],
|
||||
)
|
||||
conn.call_timeout = timeout_ms
|
||||
with _DIRECT_CONN_LOCK:
|
||||
global _DIRECT_CONN_COUNTER
|
||||
_DIRECT_CONN_COUNTER += 1
|
||||
logger.debug(
|
||||
"Slow-query connection established (call_timeout_ms=%s)", timeout_ms
|
||||
)
|
||||
|
||||
369
src/mes_dashboard/core/metrics_history.py
Normal file
369
src/mes_dashboard/core/metrics_history.py
Normal file
@@ -0,0 +1,369 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""SQLite-based metrics history store for admin performance dashboard.
|
||||
|
||||
Periodically snapshots system metrics (pool, redis, cache, latency)
|
||||
into a SQLite database for historical trend visualization.
|
||||
Follows the LogStore pattern from core/log_store.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Generator, List, Optional
|
||||
|
||||
logger = logging.getLogger('mes_dashboard.metrics_history')
|
||||
|
||||
# ============================================================
|
||||
# Configuration
|
||||
# ============================================================
|
||||
|
||||
# Filesystem location of the metrics SQLite database.
METRICS_HISTORY_PATH = os.getenv(
    'METRICS_HISTORY_PATH',
    'logs/metrics_history.sqlite',
)
# Seconds between background snapshot collections.
METRICS_HISTORY_INTERVAL = int(os.getenv('METRICS_HISTORY_INTERVAL', '30'))
# Snapshots older than this many days are purged during cleanup.
METRICS_HISTORY_RETENTION_DAYS = int(os.getenv('METRICS_HISTORY_RETENTION_DAYS', '3'))
# Hard cap on total retained snapshot rows.
METRICS_HISTORY_MAX_ROWS = int(os.getenv('METRICS_HISTORY_MAX_ROWS', '50000'))
|
||||
|
||||
# ============================================================
|
||||
# Database Schema
|
||||
# ============================================================
|
||||
|
||||
CREATE_TABLE_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS metrics_snapshots (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
ts TEXT NOT NULL,
|
||||
worker_pid INTEGER NOT NULL,
|
||||
pool_saturation REAL,
|
||||
pool_checked_out INTEGER,
|
||||
pool_checked_in INTEGER,
|
||||
pool_overflow INTEGER,
|
||||
pool_max_capacity INTEGER,
|
||||
redis_used_memory INTEGER,
|
||||
redis_hit_rate REAL,
|
||||
rc_l1_hit_rate REAL,
|
||||
rc_l2_hit_rate REAL,
|
||||
rc_miss_rate REAL,
|
||||
latency_p50_ms REAL,
|
||||
latency_p95_ms REAL,
|
||||
latency_p99_ms REAL,
|
||||
latency_count INTEGER
|
||||
);
|
||||
"""
|
||||
|
||||
CREATE_INDEX_SQL = (
|
||||
"CREATE INDEX IF NOT EXISTS idx_metrics_ts ON metrics_snapshots(ts);"
|
||||
)
|
||||
|
||||
COLUMNS = [
|
||||
"ts", "worker_pid",
|
||||
"pool_saturation", "pool_checked_out", "pool_checked_in",
|
||||
"pool_overflow", "pool_max_capacity",
|
||||
"redis_used_memory", "redis_hit_rate",
|
||||
"rc_l1_hit_rate", "rc_l2_hit_rate", "rc_miss_rate",
|
||||
"latency_p50_ms", "latency_p95_ms", "latency_p99_ms", "latency_count",
|
||||
]
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Metrics History Store
|
||||
# ============================================================
|
||||
|
||||
class MetricsHistoryStore:
    """SQLite-based metrics history store (follows LogStore pattern).

    Connections are cached per thread; writes are additionally serialized
    with a process-wide lock. All write/read methods are best-effort and
    never raise to the caller.
    """

    def __init__(self, db_path: str = METRICS_HISTORY_PATH):
        """Bind the store to *db_path*; no I/O happens until initialize()."""
        self.db_path = db_path
        # One sqlite3 connection per thread; sqlite connections should not
        # be shared across threads, so each thread lazily opens its own.
        self._local = threading.local()
        # Serializes all writers (insert + cleanup) against each other.
        self._write_lock = threading.Lock()
        self._initialized = False

    def initialize(self) -> None:
        """Create the DB directory, table and index if needed (idempotent)."""
        if self._initialized:
            return
        db_dir = Path(self.db_path).parent
        db_dir.mkdir(parents=True, exist_ok=True)
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(CREATE_TABLE_SQL)
            cursor.execute(CREATE_INDEX_SQL)
            conn.commit()
        self._initialized = True
        logger.info("Metrics history store initialized at %s", self.db_path)

    @contextmanager
    def _get_connection(self) -> Generator[sqlite3.Connection, None, None]:
        """Yield this thread's cached connection, opening it on first use.

        On any sqlite3.Error the cached connection is closed and dropped so
        the next call opens a fresh one; the error is re-raised.
        """
        if not hasattr(self._local, 'connection') or self._local.connection is None:
            self._local.connection = sqlite3.connect(
                self.db_path, timeout=10.0, check_same_thread=False,
            )
            # Row factory lets query_snapshots() build dicts directly.
            self._local.connection.row_factory = sqlite3.Row
        try:
            yield self._local.connection
        except sqlite3.Error as exc:
            logger.error("Metrics history DB error: %s", exc)
            try:
                self._local.connection.close()
            except Exception:
                pass
            self._local.connection = None
            raise

    def write_snapshot(self, data: Dict[str, Any]) -> bool:
        """Persist one snapshot; returns True on success, False otherwise.

        *data* may contain "pool", "redis", "route_cache" and "latency"
        sub-dicts; missing sections/keys are stored as NULL. Best-effort:
        failures are logged at debug level and swallowed so the background
        collector never crashes on storage problems.
        """
        if not self._initialized:
            self.initialize()
        # NOTE(review): naive local time; consistent with the cutoffs used
        # by query_snapshots()/cleanup(), which are also naive local time.
        ts = datetime.now().isoformat()
        pid = os.getpid()
        pool = data.get("pool") or {}
        redis = data.get("redis") or {}
        rc = data.get("route_cache") or {}
        lat = data.get("latency") or {}
        try:
            with self._write_lock:
                with self._get_connection() as conn:
                    conn.execute(
                        """
                        INSERT INTO metrics_snapshots
                        (ts, worker_pid,
                         pool_saturation, pool_checked_out, pool_checked_in,
                         pool_overflow, pool_max_capacity,
                         redis_used_memory, redis_hit_rate,
                         rc_l1_hit_rate, rc_l2_hit_rate, rc_miss_rate,
                         latency_p50_ms, latency_p95_ms, latency_p99_ms, latency_count)
                        VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                        """,
                        (
                            ts, pid,
                            pool.get("saturation"),
                            pool.get("checked_out"),
                            pool.get("checked_in"),
                            pool.get("overflow"),
                            pool.get("max_capacity"),
                            redis.get("used_memory"),
                            redis.get("hit_rate"),
                            rc.get("l1_hit_rate"),
                            rc.get("l2_hit_rate"),
                            rc.get("miss_rate"),
                            lat.get("p50_ms"),
                            lat.get("p95_ms"),
                            lat.get("p99_ms"),
                            lat.get("count"),
                        ),
                    )
                    conn.commit()
            return True
        except Exception as exc:
            logger.debug("Failed to write metrics snapshot: %s", exc)
            return False

    def query_snapshots(self, minutes: int = 30) -> List[Dict[str, Any]]:
        """Return snapshots from the last *minutes*, oldest first.

        Returns [] on any DB error (logged at error level).
        """
        if not self._initialized:
            self.initialize()
        # ISO-8601 strings compare lexicographically in timestamp order.
        cutoff = (datetime.now() - timedelta(minutes=minutes)).isoformat()
        try:
            with self._get_connection() as conn:
                cursor = conn.execute(
                    "SELECT * FROM metrics_snapshots WHERE ts >= ? ORDER BY ts ASC",
                    (cutoff,),
                )
                return [dict(row) for row in cursor.fetchall()]
        except Exception as exc:
            logger.error("Failed to query metrics snapshots: %s", exc)
            return []

    def cleanup(self) -> int:
        """Purge expired rows and enforce the row cap; return rows deleted.

        Two passes: (1) delete rows older than the retention window,
        (2) if the table still exceeds METRICS_HISTORY_MAX_ROWS, delete
        the oldest excess rows. Best-effort: errors are logged, not raised.
        """
        if not self._initialized:
            return 0
        deleted = 0
        try:
            with self._write_lock:
                with self._get_connection() as conn:
                    cutoff = (
                        datetime.now() - timedelta(days=METRICS_HISTORY_RETENTION_DAYS)
                    ).isoformat()
                    cursor = conn.execute(
                        "DELETE FROM metrics_snapshots WHERE ts < ?", (cutoff,),
                    )
                    deleted += cursor.rowcount
                    row = conn.execute(
                        "SELECT COUNT(*) FROM metrics_snapshots",
                    ).fetchone()
                    count = row[0] if row else 0
                    if count > METRICS_HISTORY_MAX_ROWS:
                        excess = count - METRICS_HISTORY_MAX_ROWS
                        # Subquery form: DELETE ... LIMIT is not available
                        # in default SQLite builds.
                        cursor = conn.execute(
                            """
                            DELETE FROM metrics_snapshots WHERE id IN (
                                SELECT id FROM metrics_snapshots ORDER BY ts ASC LIMIT ?
                            )
                            """,
                            (excess,),
                        )
                        deleted += cursor.rowcount
                    conn.commit()
            if deleted > 0:
                logger.info("Cleaned up %d metrics history rows", deleted)
        except Exception as exc:
            logger.error("Failed to cleanup metrics history: %s", exc)
        return deleted
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Background Collector
|
||||
# ============================================================
|
||||
|
||||
class MetricsHistoryCollector:
    """Daemon thread that snapshots metrics at a fixed interval.

    Each section (DB pool, Redis, route cache, query latency) is gathered
    best-effort: a failure in one section degrades to an empty dict for
    that section instead of aborting the snapshot.
    """

    def __init__(
        self,
        app: Any = None,
        store: Optional[MetricsHistoryStore] = None,
        interval: int = METRICS_HISTORY_INTERVAL,
    ):
        """Bind the collector to an optional Flask app and a history store.

        Args:
            app: Flask app used to push an app context for route-cache stats.
            store: Target store; defaults to the process-wide singleton.
            interval: Seconds between snapshots.
        """
        self._app = app
        self._store = store or get_metrics_history_store()
        self.interval = interval
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._cleanup_counter = 0

    def start(self) -> None:
        """Start the daemon thread (no-op if it is already running)."""
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._thread = threading.Thread(
            target=self._run, daemon=True, name="metrics-history-collector",
        )
        self._thread.start()
        logger.info(
            "Metrics history collector started (interval=%ds)", self.interval,
        )

    def stop(self) -> None:
        """Signal the thread to stop and wait up to 5s for it to exit."""
        if self._thread and self._thread.is_alive():
            self._stop_event.set()
            self._thread.join(timeout=5)
            logger.info("Metrics history collector stopped")

    def _run(self) -> None:
        """Thread main loop: collect immediately on start, then each interval."""
        self._collect_snapshot()
        while not self._stop_event.wait(self.interval):
            self._collect_snapshot()
            # Run cleanup every ~100 intervals (~50 min at 30s).
            self._cleanup_counter += 1
            if self._cleanup_counter >= 100:
                self._cleanup_counter = 0
                self._store.cleanup()

    def _collect_snapshot(self) -> None:
        """Gather one snapshot from all sources and persist it (best-effort)."""
        try:
            data: Dict[str, Any] = {
                "pool": self._pool_section(),
                "redis": self._redis_section(),
                "route_cache": self._route_cache_section(),
                "latency": self._latency_section(),
            }
            self._store.write_snapshot(data)
        except Exception as exc:
            logger.debug("Metrics snapshot collection failed: %s", exc)

    @staticmethod
    def _pool_section() -> Dict[str, Any]:
        """Connection-pool counters from the core database module, or {}."""
        try:
            from mes_dashboard.core.database import get_pool_status
            return get_pool_status()
        except Exception:
            return {}

    @staticmethod
    def _redis_section() -> Dict[str, Any]:
        """Redis memory usage and keyspace hit rate, or {} if unavailable."""
        try:
            from mes_dashboard.core.redis_client import (
                get_redis_client,
                REDIS_ENABLED,
            )
            if not REDIS_ENABLED:
                return {}
            client = get_redis_client()
            if client is None:
                return {}
            info = client.info(section="memory")
            stats_info = client.info(section="stats")
            hits = int(stats_info.get("keyspace_hits", 0))
            misses = int(stats_info.get("keyspace_misses", 0))
            total = hits + misses
            return {
                "used_memory": info.get("used_memory", 0),
                "hit_rate": round(hits / total, 4) if total > 0 else 0,
            }
        except Exception:
            return {}

    def _route_cache_section(self) -> Dict[str, Any]:
        """L1/L2 route-cache hit rates, or {} on failure.

        Pushes an app context when an app was supplied, since the route
        cache status helper may need it.
        """
        try:
            from mes_dashboard.routes.health_routes import get_route_cache_status
            if self._app:
                with self._app.app_context():
                    rc = get_route_cache_status()
            else:
                rc = get_route_cache_status()
            return {
                "l1_hit_rate": rc.get("l1_hit_rate"),
                "l2_hit_rate": rc.get("l2_hit_rate"),
                "miss_rate": rc.get("miss_rate"),
            }
        except Exception:
            return {}

    @staticmethod
    def _latency_section() -> Dict[str, Any]:
        """Query-latency percentiles from the in-process metrics, or {}."""
        try:
            from mes_dashboard.core.metrics import get_metrics_summary
            summary = get_metrics_summary()
            return {
                "p50_ms": summary.get("p50_ms", 0),
                "p95_ms": summary.get("p95_ms", 0),
                "p99_ms": summary.get("p99_ms", 0),
                "count": summary.get("count", 0),
            }
        except Exception:
            return {}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Global Instance & Lifecycle
|
||||
# ============================================================
|
||||
|
||||
# Process-wide singletons, created lazily by the accessors below.
_STORE: Optional[MetricsHistoryStore] = None
_COLLECTOR: Optional[MetricsHistoryCollector] = None
|
||||
|
||||
|
||||
def get_metrics_history_store() -> MetricsHistoryStore:
    """Return the process-wide metrics history store, creating it lazily."""
    global _STORE
    if _STORE is not None:
        return _STORE
    _STORE = MetricsHistoryStore()
    _STORE.initialize()
    return _STORE
|
||||
|
||||
|
||||
def start_metrics_history(app: Any = None) -> None:
    """Create the global collector bound to *app* and start its thread."""
    global _COLLECTOR
    history_store = get_metrics_history_store()
    collector = MetricsHistoryCollector(app=app, store=history_store)
    _COLLECTOR = collector
    collector.start()
|
||||
|
||||
|
||||
def stop_metrics_history() -> None:
    """Stop and discard the global collector, if one is running."""
    global _COLLECTOR
    if _COLLECTOR is None:
        return
    _COLLECTOR.stop()
    _COLLECTOR = None
|
||||
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from flask import Blueprint, g, jsonify, render_template, request
|
||||
from flask import Blueprint, current_app, g, jsonify, render_template, request, send_from_directory
|
||||
|
||||
from mes_dashboard.core.permissions import admin_required
|
||||
from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS
|
||||
@@ -69,7 +69,11 @@ _last_restart_request: float = 0.0
|
||||
@admin_bp.route("/performance")
|
||||
@admin_required
|
||||
def performance():
|
||||
"""Performance monitoring dashboard."""
|
||||
"""Performance monitoring dashboard (Vue SPA)."""
|
||||
dist_dir = os.path.join(current_app.static_folder or "", "dist")
|
||||
dist_html = os.path.join(dist_dir, "admin-performance.html")
|
||||
if os.path.exists(dist_html):
|
||||
return send_from_directory(dist_dir, "admin-performance.html")
|
||||
return render_template("admin/performance.html")
|
||||
|
||||
|
||||
@@ -263,6 +267,137 @@ def api_logs():
|
||||
})
|
||||
|
||||
|
||||
@admin_bp.route("/api/performance-detail", methods=["GET"])
|
||||
@admin_required
|
||||
def api_performance_detail():
|
||||
"""API: Get detailed performance telemetry for admin dashboard.
|
||||
|
||||
Returns redis, process_caches, route_cache, db_pool, and
|
||||
direct_connections sections in a single response.
|
||||
"""
|
||||
from mes_dashboard.core.cache import get_all_process_cache_stats
|
||||
from mes_dashboard.core.database import (
|
||||
get_direct_connection_count,
|
||||
get_pool_runtime_config,
|
||||
get_pool_status,
|
||||
)
|
||||
from mes_dashboard.core.redis_client import (
|
||||
get_redis_client,
|
||||
REDIS_ENABLED,
|
||||
REDIS_KEY_PREFIX,
|
||||
)
|
||||
from mes_dashboard.routes.health_routes import get_route_cache_status
|
||||
|
||||
# ---- Redis detail ----
|
||||
redis_detail = None
|
||||
if REDIS_ENABLED:
|
||||
client = get_redis_client()
|
||||
if client is not None:
|
||||
try:
|
||||
info = client.info(section="memory")
|
||||
stats_info = client.info(section="stats")
|
||||
clients_info = client.info(section="clients")
|
||||
|
||||
hits = int(stats_info.get("keyspace_hits", 0))
|
||||
misses = int(stats_info.get("keyspace_misses", 0))
|
||||
total = hits + misses
|
||||
hit_rate = round(hits / total, 4) if total > 0 else 0
|
||||
|
||||
# Scan key counts per namespace
|
||||
namespace_prefixes = [
|
||||
"data", "route_cache", "equipment_status",
|
||||
"reject_dataset", "meta", "lock", "scrap_exclusion",
|
||||
]
|
||||
namespaces = []
|
||||
for ns in namespace_prefixes:
|
||||
pattern = f"{REDIS_KEY_PREFIX}:{ns}*"
|
||||
count = 0
|
||||
cursor = 0
|
||||
while True:
|
||||
cursor, keys = client.scan(cursor=cursor, match=pattern, count=100)
|
||||
count += len(keys)
|
||||
if cursor == 0:
|
||||
break
|
||||
namespaces.append({"name": ns, "key_count": count})
|
||||
|
||||
redis_detail = {
|
||||
"used_memory_human": info.get("used_memory_human", "N/A"),
|
||||
"used_memory": info.get("used_memory", 0),
|
||||
"peak_memory_human": info.get("used_memory_peak_human", "N/A"),
|
||||
"peak_memory": info.get("used_memory_peak", 0),
|
||||
"maxmemory_human": info.get("maxmemory_human", "N/A"),
|
||||
"maxmemory": info.get("maxmemory", 0),
|
||||
"connected_clients": clients_info.get("connected_clients", 0),
|
||||
"hit_rate": hit_rate,
|
||||
"keyspace_hits": hits,
|
||||
"keyspace_misses": misses,
|
||||
"namespaces": namespaces,
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to collect Redis detail: %s", exc)
|
||||
redis_detail = {"error": str(exc)}
|
||||
|
||||
# ---- Process caches ----
|
||||
process_caches = get_all_process_cache_stats()
|
||||
|
||||
# ---- Route cache ----
|
||||
route_cache = get_route_cache_status()
|
||||
|
||||
# ---- DB pool ----
|
||||
db_pool = None
|
||||
try:
|
||||
pool_status = get_pool_status()
|
||||
pool_config = get_pool_runtime_config()
|
||||
db_pool = {
|
||||
"status": pool_status,
|
||||
"config": {
|
||||
"pool_size": pool_config.get("pool_size"),
|
||||
"max_overflow": pool_config.get("max_overflow"),
|
||||
"pool_timeout": pool_config.get("pool_timeout"),
|
||||
"pool_recycle": pool_config.get("pool_recycle"),
|
||||
},
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to collect DB pool status: %s", exc)
|
||||
db_pool = {"error": str(exc)}
|
||||
|
||||
# ---- Direct connections ----
|
||||
direct_connections = {
|
||||
"total_since_start": get_direct_connection_count(),
|
||||
"worker_pid": os.getpid(),
|
||||
}
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"data": {
|
||||
"redis": redis_detail,
|
||||
"process_caches": process_caches,
|
||||
"route_cache": route_cache,
|
||||
"db_pool": db_pool,
|
||||
"direct_connections": direct_connections,
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
@admin_bp.route("/api/performance-history", methods=["GET"])
|
||||
@admin_required
|
||||
def api_performance_history():
|
||||
"""API: Get historical metrics snapshots for trend charts."""
|
||||
from mes_dashboard.core.metrics_history import get_metrics_history_store
|
||||
|
||||
minutes = request.args.get("minutes", 30, type=int)
|
||||
minutes = max(1, min(minutes, 180))
|
||||
store = get_metrics_history_store()
|
||||
snapshots = store.query_snapshots(minutes=minutes)
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"data": {
|
||||
"snapshots": snapshots,
|
||||
"count": len(snapshots),
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
@admin_bp.route("/api/logs/cleanup", methods=["POST"])
|
||||
@admin_required
|
||||
def api_logs_cleanup():
|
||||
|
||||
@@ -14,6 +14,7 @@ from collections import OrderedDict
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from mes_dashboard.core.cache import register_process_cache
|
||||
from mes_dashboard.core.database import read_sql_df
|
||||
from mes_dashboard.core.redis_client import (
|
||||
get_redis_client,
|
||||
@@ -92,6 +93,13 @@ class _ProcessLevelCache:
|
||||
with self._lock:
|
||||
self._cache.pop(key, None)
|
||||
|
||||
def stats(self) -> dict:
    """Return a telemetry snapshot: live entry count, capacity, and TTL."""
    with self._lock:
        now = time.time()
        live_entries = 0
        for _key, (_value, stored_at) in self._cache.items():
            # An entry is live while its age is within the TTL.
            if now - stored_at <= self._ttl:
                live_entries += 1
        return {"entries": live_entries, "max_size": self._max_size, "ttl_seconds": self._ttl}
|
||||
|
||||
|
||||
def _resolve_cache_max_size(env_name: str, default: int) -> int:
|
||||
value = os.getenv(env_name)
|
||||
@@ -113,6 +121,7 @@ _equipment_status_cache = _ProcessLevelCache(
|
||||
ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS,
|
||||
max_size=EQUIPMENT_PROCESS_CACHE_MAX_SIZE,
|
||||
)
|
||||
register_process_cache("equipment_status", _equipment_status_cache, "Equipment Status (L1, 30s)")
|
||||
_equipment_status_parse_lock = threading.Lock()
|
||||
_equipment_lookup_lock = threading.Lock()
|
||||
_equipment_status_lookup: dict[str, dict[str, Any]] = {}
|
||||
|
||||
@@ -20,7 +20,7 @@ from typing import Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from mes_dashboard.core.cache import ProcessLevelCache
|
||||
from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache
|
||||
from mes_dashboard.core.database import read_sql_df
|
||||
from mes_dashboard.core.redis_client import (
|
||||
REDIS_ENABLED,
|
||||
@@ -55,6 +55,7 @@ _CACHE_MAX_SIZE = 8
|
||||
_REDIS_NAMESPACE = "reject_dataset"
|
||||
|
||||
_dataset_cache = ProcessLevelCache(ttl_seconds=_CACHE_TTL, max_size=_CACHE_MAX_SIZE)
|
||||
register_process_cache("reject_dataset", _dataset_cache, "Reject Dataset (L1, 15min)")
|
||||
|
||||
|
||||
# ============================================================
|
||||
|
||||
@@ -19,6 +19,7 @@ from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from mes_dashboard.core.cache import register_process_cache
|
||||
from mes_dashboard.core.redis_client import (
|
||||
get_redis_client,
|
||||
redis_available,
|
||||
@@ -109,6 +110,13 @@ class _ProcessLevelCache:
|
||||
with self._lock:
|
||||
self._cache.pop(key, None)
|
||||
|
||||
def stats(self) -> dict:
    """Return a telemetry snapshot: live entry count, capacity, and TTL."""
    with self._lock:
        now = time.time()
        live_entries = 0
        for _key, (_value, stored_at) in self._cache.items():
            # An entry is live while its age is within the TTL.
            if now - stored_at <= self._ttl:
                live_entries += 1
        return {"entries": live_entries, "max_size": self._max_size, "ttl_seconds": self._ttl}
|
||||
|
||||
|
||||
def _resolve_cache_max_size(env_name: str, default: int) -> int:
|
||||
value = os.getenv(env_name)
|
||||
@@ -130,6 +138,7 @@ _resource_df_cache = _ProcessLevelCache(
|
||||
ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS,
|
||||
max_size=RESOURCE_PROCESS_CACHE_MAX_SIZE,
|
||||
)
|
||||
register_process_cache("resource", _resource_df_cache, "Resource DataFrame (L1, 30s)")
|
||||
_resource_parse_lock = threading.Lock()
|
||||
_resource_index_lock = threading.Lock()
|
||||
_resource_index: ResourceIndex = {
|
||||
|
||||
Reference in New Issue
Block a user