feat(admin-performance): Vue 3 SPA dashboard with metrics history trending

Rebuild /admin/performance from Jinja2 to Vue 3 SPA with ECharts, adding
cache telemetry infrastructure, connection pool monitoring, and SQLite-backed
historical metrics collection with trend chart visualization.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
egg
2026-02-23 09:18:10 +08:00
parent 1c46f5eb69
commit 5d570ca7a2
32 changed files with 2903 additions and 261 deletions

View File

@@ -5,7 +5,7 @@
"type": "module", "type": "module",
"scripts": { "scripts": {
"dev": "vite --host", "dev": "vite --host",
"build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html ../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html", "build": "vite build && cp ../src/mes_dashboard/static/dist/src/portal-shell/index.html ../src/mes_dashboard/static/dist/portal-shell.html && cp ../src/mes_dashboard/static/dist/src/tables/index.html ../src/mes_dashboard/static/dist/tables.html && cp ../src/mes_dashboard/static/dist/src/qc-gate/index.html ../src/mes_dashboard/static/dist/qc-gate.html && cp ../src/mes_dashboard/static/dist/src/wip-overview/index.html ../src/mes_dashboard/static/dist/wip-overview.html && cp ../src/mes_dashboard/static/dist/src/wip-detail/index.html 
../src/mes_dashboard/static/dist/wip-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-detail/index.html ../src/mes_dashboard/static/dist/hold-detail.html && cp ../src/mes_dashboard/static/dist/src/hold-overview/index.html ../src/mes_dashboard/static/dist/hold-overview.html && cp ../src/mes_dashboard/static/dist/src/hold-history/index.html ../src/mes_dashboard/static/dist/hold-history.html && cp ../src/mes_dashboard/static/dist/src/reject-history/index.html ../src/mes_dashboard/static/dist/reject-history.html && cp ../src/mes_dashboard/static/dist/src/resource-status/index.html ../src/mes_dashboard/static/dist/resource-status.html && cp ../src/mes_dashboard/static/dist/src/resource-history/index.html ../src/mes_dashboard/static/dist/resource-history.html && cp ../src/mes_dashboard/static/dist/src/mid-section-defect/index.html ../src/mes_dashboard/static/dist/mid-section-defect.html && cp ../src/mes_dashboard/static/dist/src/admin-performance/index.html ../src/mes_dashboard/static/dist/admin-performance.html",
"test": "node --test tests/*.test.js" "test": "node --test tests/*.test.js"
}, },
"devDependencies": { "devDependencies": {

View File

@@ -0,0 +1,613 @@
<template>
<div class="perf-dashboard">
<!-- Header -->
<header class="perf-header">
<div class="perf-header-inner">
<h1 class="perf-title">效能監控儀表板</h1>
<div class="perf-header-actions">
<label class="auto-refresh-toggle">
<input type="checkbox" v-model="autoRefreshEnabled" @change="toggleAutoRefresh" />
自動更新 (30s)
</label>
<button class="btn btn-sm" @click="refreshAll" :disabled="loading">
<template v-if="loading">更新中...</template>
<template v-else>重新整理</template>
</button>
</div>
</div>
</header>
<!-- Status Cards -->
<section class="panel">
<div class="status-cards-grid">
<div class="status-card">
<div class="status-card-title">Database</div>
<StatusDot :status="dbStatus" :label="dbStatusLabel" />
</div>
<div class="status-card">
<div class="status-card-title">Redis</div>
<StatusDot :status="redisStatus" :label="redisStatusLabel" />
</div>
<div class="status-card">
<div class="status-card-title">Circuit Breaker</div>
<StatusDot :status="cbStatus" :label="cbStatusLabel" />
</div>
<div class="status-card">
<div class="status-card-title">Worker PID</div>
<StatusDot status="healthy" :label="String(systemData?.worker_pid || '-')" />
</div>
</div>
</section>
<!-- Query Performance -->
<section class="panel">
<h2 class="panel-title">查詢效能</h2>
<div class="query-perf-grid">
<div class="query-perf-stats">
<StatCard :value="metricsData?.p50_ms" label="P50 (ms)" />
<StatCard :value="metricsData?.p95_ms" label="P95 (ms)" />
<StatCard :value="metricsData?.p99_ms" label="P99 (ms)" />
<StatCard :value="metricsData?.count" label="查詢數" />
<StatCard :value="metricsData?.slow_count" label="慢查詢" />
<StatCard :value="slowRateDisplay" label="慢查詢率" />
</div>
<div class="query-perf-chart" ref="latencyChartRef"></div>
</div>
</section>
<!-- Query Latency Trend -->
<TrendChart
v-if="historyData.length > 1"
title="查詢延遲趨勢"
:snapshots="historyData"
:series="latencyTrendSeries"
yAxisLabel="ms"
/>
<!-- Redis Cache Detail -->
<section class="panel" v-if="perfDetail?.redis">
<h2 class="panel-title">Redis 快取</h2>
<div class="redis-grid">
<div class="redis-stats">
<GaugeBar
label="記憶體使用"
:value="redisMemoryRatio"
:max="1"
:displayText="redisMemoryLabel"
/>
<div class="redis-mini-stats">
<StatCard :value="perfDetail.redis.used_memory_human" label="已使用" />
<StatCard :value="perfDetail.redis.peak_memory_human" label="峰值" />
<StatCard :value="perfDetail.redis.connected_clients" label="連線數" />
<StatCard :value="hitRateDisplay" label="命中率" />
</div>
</div>
<div class="redis-namespaces">
<table class="mini-table">
<thead><tr><th>Namespace</th><th>Key 數量</th></tr></thead>
<tbody>
<tr v-for="ns in perfDetail.redis.namespaces" :key="ns.name">
<td>{{ ns.name }}</td>
<td>{{ ns.key_count }}</td>
</tr>
</tbody>
</table>
</div>
</div>
</section>
<section class="panel panel-disabled" v-else-if="perfDetail && !perfDetail.redis">
<h2 class="panel-title">Redis 快取</h2>
<p class="muted">Redis 未啟用</p>
</section>
<!-- Redis Memory Trend -->
<TrendChart
v-if="historyData.length > 1"
title="Redis 記憶體趨勢"
:snapshots="historyData"
:series="redisTrendSeries"
/>
<!-- Memory Caches -->
<section class="panel" v-if="perfDetail">
<h2 class="panel-title">記憶體快取</h2>
<div class="cache-cards-grid">
<div class="cache-card" v-for="(info, name) in perfDetail.process_caches" :key="name">
<div class="cache-card-name">{{ name }}</div>
<div class="cache-card-desc">{{ info.description }}</div>
<GaugeBar
label="使用率"
:value="info.entries"
:max="info.max_size"
/>
<div class="cache-card-ttl">TTL: {{ info.ttl_seconds }}s</div>
</div>
</div>
<div class="route-cache-section" v-if="perfDetail.route_cache">
<h3 class="sub-title">Route Cache</h3>
<div class="route-cache-stats">
<StatCard :value="perfDetail.route_cache.mode" label="模式" />
<StatCard :value="perfDetail.route_cache.l1_size" label="L1 大小" />
<StatCard :value="routeCacheL1HitRate" label="L1 命中率" />
<StatCard :value="routeCacheL2HitRate" label="L2 命中率" />
<StatCard :value="routeCacheMissRate" label="未命中率" />
<StatCard :value="perfDetail.route_cache.reads_total" label="總讀取" />
</div>
</div>
</section>
<!-- Cache Hit Rate Trend -->
<TrendChart
v-if="historyData.length > 1"
title="快取命中率趨勢"
:snapshots="historyData"
:series="hitRateTrendSeries"
yAxisLabel=""
:yMax="1"
/>
<!-- Connection Pool -->
<section class="panel" v-if="perfDetail?.db_pool?.status">
<h2 class="panel-title">連線池</h2>
<GaugeBar
label="飽和度"
:value="perfDetail.db_pool.status.saturation"
:max="1"
/>
<div class="pool-stats-grid">
<StatCard :value="perfDetail.db_pool.status.checked_out" label="使用中" />
<StatCard :value="perfDetail.db_pool.status.checked_in" label="閒置" />
<StatCard :value="poolTotalConnections" label="總連線數" />
<StatCard :value="perfDetail.db_pool.status.max_capacity" label="最大容量" />
<StatCard :value="poolOverflowDisplay" label="溢出連線" />
<StatCard :value="perfDetail.db_pool.config?.pool_size" label="池大小" />
<StatCard :value="perfDetail.db_pool.config?.pool_recycle" label="回收週期 (s)" />
<StatCard :value="perfDetail.db_pool.config?.pool_timeout" label="逾時 (s)" />
<StatCard :value="perfDetail.direct_connections?.total_since_start" label="直連次數" />
</div>
</section>
<!-- Connection Pool Trend -->
<TrendChart
v-if="historyData.length > 1"
title="連線池趨勢"
:snapshots="historyData"
:series="poolTrendSeries"
/>
<!-- Worker Control -->
<section class="panel">
<h2 class="panel-title">Worker 控制</h2>
<div class="worker-info">
<StatCard :value="workerData?.worker_pid" label="PID" />
<StatCard :value="workerStartTimeDisplay" label="啟動時間" />
<StatCard :value="cooldownDisplay" label="冷卻狀態" />
</div>
<button
class="btn btn-danger"
:disabled="workerCooldownActive"
@click="showRestartModal = true"
>
重啟 Worker
</button>
<!-- Restart Modal -->
<div class="modal-backdrop" v-if="showRestartModal" @click.self="showRestartModal = false">
<div class="modal-dialog">
<h3>確認重啟 Worker</h3>
<p>重啟將導致目前的請求暫時中斷確定要繼續嗎</p>
<div class="modal-actions">
<button class="btn" @click="showRestartModal = false">取消</button>
<button class="btn btn-danger" @click="doRestart" :disabled="restartLoading">
{{ restartLoading ? '重啟中...' : '確認重啟' }}
</button>
</div>
</div>
</div>
</section>
<!-- System Logs -->
<section class="panel">
<h2 class="panel-title">系統日誌</h2>
<div class="log-controls">
<select v-model="logLevel" @change="loadLogs">
<option value="">全部等級</option>
<option value="ERROR">ERROR</option>
<option value="WARNING">WARNING</option>
<option value="INFO">INFO</option>
<option value="DEBUG">DEBUG</option>
</select>
<input
type="text"
v-model="logSearch"
placeholder="搜尋日誌..."
@input="debouncedLoadLogs"
/>
<button class="btn btn-sm" @click="cleanupLogs" :disabled="cleanupLoading">
{{ cleanupLoading ? '清理中...' : '清理日誌' }}
</button>
</div>
<div class="log-table-wrapper">
<table class="log-table" v-if="logsData?.logs?.length">
<thead>
<tr>
<th>時間</th>
<th>等級</th>
<th>訊息</th>
</tr>
</thead>
<tbody>
<tr v-for="(log, i) in logsData.logs" :key="i" :class="'log-' + (log.level || '').toLowerCase()">
<td class="log-time">{{ log.timestamp }}</td>
<td class="log-level">{{ log.level }}</td>
<td class="log-msg">{{ log.message }}</td>
</tr>
</tbody>
</table>
<p v-else class="muted">無日誌</p>
</div>
<div class="log-pagination" v-if="logsData?.total > logLimit">
<button class="btn btn-sm" :disabled="logOffset === 0" @click="logOffset -= logLimit; loadLogs()">上一頁</button>
<span>{{ logOffset / logLimit + 1 }} / {{ Math.ceil(logsData.total / logLimit) }}</span>
<button class="btn btn-sm" :disabled="logOffset + logLimit >= logsData.total" @click="logOffset += logLimit; loadLogs()">下一頁</button>
</div>
</section>
</div>
</template>
<script setup>
import { ref, computed, onMounted, onBeforeUnmount } from 'vue';
import * as echarts from 'echarts/core';
import { BarChart } from 'echarts/charts';
import { GridComponent, TooltipComponent } from 'echarts/components';
import { CanvasRenderer } from 'echarts/renderers';
import { apiGet, apiPost } from '../core/api.js';
import { useAutoRefresh } from '../shared-composables/useAutoRefresh.js';
import GaugeBar from './components/GaugeBar.vue';
import StatCard from './components/StatCard.vue';
import StatusDot from './components/StatusDot.vue';
import TrendChart from './components/TrendChart.vue';
echarts.use([BarChart, GridComponent, TooltipComponent, CanvasRenderer]);
// --- State ---
const loading = ref(false);               // true while refreshAll() is in flight (disables the refresh button)
const autoRefreshEnabled = ref(true);     // bound to the "auto refresh (30s)" checkbox
const systemData = ref(null);             // /admin/api/system-status payload
const metricsData = ref(null);            // /admin/api/metrics payload (latency percentiles + samples)
const perfDetail = ref(null);             // /admin/api/performance-detail payload (redis, caches, db pool)
const historyData = ref([]);              // /admin/api/performance-history snapshots for the trend charts
const logsData = ref(null);               // /admin/api/logs payload (paged)
const workerData = ref(null);             // /admin/api/worker/status payload
const logLevel = ref('');                 // '' = all levels
const logSearch = ref('');                // free-text log filter (debounced)
const logOffset = ref(0);                 // pagination offset into the log list
const logLimit = 50;                      // fixed page size for the log table
const showRestartModal = ref(false);      // worker-restart confirmation dialog visibility
const restartLoading = ref(false);        // true while the restart POST is pending
const cleanupLoading = ref(false);        // true while the log-cleanup POST is pending
const latencyChartRef = ref(null);        // DOM element hosting the latency histogram
let chartInstance = null;                 // lazily created ECharts instance (disposed on unmount)
// --- Computed ---
// Health indicator for the database connection, normalized to the
// StatusDot vocabulary (healthy / error / disabled).
const dbStatus = computed(() => {
  const state = systemData.value?.database?.status;
  switch (state) {
    case 'healthy':
    case 'ok':
      return 'healthy';
    case 'error':
      return 'error';
    default:
      return 'disabled';
  }
});
const dbStatusLabel = computed(() => systemData.value?.database?.status || '-');
// Redis health; a disabled Redis is reported distinctly from a degraded one.
const redisStatus = computed(() => {
  const redis = systemData.value?.redis;
  if (!redis?.enabled) return 'disabled';
  switch (redis.status) {
    case 'healthy':
    case 'ok':
      return 'healthy';
    case 'error':
      return 'error';
    default:
      return 'degraded';
  }
});
const redisStatusLabel = computed(() => {
  const redis = systemData.value?.redis;
  return redis?.enabled ? (redis.status || '-') : '未啟用';
});
// Circuit-breaker state mapped onto dot colors; unknown states show as disabled.
const cbStatus = computed(() => {
  const mapping = new Map([
    ['CLOSED', 'healthy'],
    ['OPEN', 'error'],
    ['HALF_OPEN', 'degraded'],
  ]);
  return mapping.get(systemData.value?.circuit_breaker?.state) ?? 'disabled';
});
const cbStatusLabel = computed(() => systemData.value?.circuit_breaker?.state || '-');
// "12.3%" style display for the slow-query ratio, or '-' when unknown.
const slowRateDisplay = computed(() => {
  const rate = metricsData.value?.slow_rate;
  if (rate == null) return '-';
  return `${(rate * 100).toFixed(1)}%`;
});
// Redis memory usage as a 0..1 ratio. Prefers usage vs the configured
// maxmemory limit; when no limit is set, falls back to usage vs peak.
const redisMemoryRatio = computed(() => {
  const redis = perfDetail.value?.redis;
  if (!redis) return 0;
  const used = redis.used_memory || 0;
  const limit = redis.maxmemory || 0;
  if (limit > 0) return used / limit;
  const peak = redis.peak_memory || used;
  return peak > 0 ? used / peak : 0;
});
// "<used> / <limit-or-peak>" text shown on the memory gauge.
const redisMemoryLabel = computed(() => {
  const redis = perfDetail.value?.redis;
  if (!redis) return '';
  const usedText = redis.used_memory_human || 'N/A';
  const capText = redis.maxmemory && redis.maxmemory > 0
    ? redis.maxmemory_human
    : redis.peak_memory_human;
  return `${usedText} / ${capText || 'N/A'}`;
});
// Redis cache hit rate as a percentage string.
const hitRateDisplay = computed(() => {
  const rate = perfDetail.value?.redis?.hit_rate;
  if (rate == null) return '-';
  return `${(rate * 100).toFixed(1)}%`;
});
// Render a 0..1 ratio as "12.3%", or '-' when the value is missing.
// Shared by the three route-cache rate cards below (was triplicated).
function pctOrDash(rate) {
  return rate != null ? `${(rate * 100).toFixed(1)}%` : '-';
}
const routeCacheL1HitRate = computed(() => pctOrDash(perfDetail.value?.route_cache?.l1_hit_rate));
const routeCacheL2HitRate = computed(() => pctOrDash(perfDetail.value?.route_cache?.l2_hit_rate));
const routeCacheMissRate = computed(() => pctOrDash(perfDetail.value?.route_cache?.miss_rate));
// Overflow connections beyond pool_size; clamped at 0 because the pool
// can report a negative overflow when running under capacity —
// presumably SQLAlchemy semantics, TODO confirm against the backend.
const poolOverflowDisplay = computed(() => {
  const overflow = perfDetail.value?.db_pool?.status?.overflow;
  if (overflow == null) return '-';
  return Math.max(0, overflow);
});
// Total live connections = in-use + idle.
const poolTotalConnections = computed(() => {
  const status = perfDetail.value?.db_pool?.status;
  if (!status) return '-';
  return (status.checked_out || 0) + (status.checked_in || 0);
});
// Worker start time rendered in the zh-TW locale; falls back to the raw
// value if Date parsing/formatting throws.
const workerStartTimeDisplay = computed(() => {
  const startedAt = workerData.value?.worker_start_time;
  if (!startedAt) return '-';
  try {
    return new Date(startedAt).toLocaleString('zh-TW');
  } catch {
    return startedAt;
  }
});
// True while the restart cooldown is in force (restart button disabled).
const workerCooldownActive = computed(() => workerData.value?.cooldown?.active || false);
// Human-readable cooldown status for the stat card.
const cooldownDisplay = computed(() => {
  if (!workerCooldownActive.value) return '就緒';
  const remaining = workerData.value?.cooldown?.remaining_seconds || 0;
  return `冷卻中 (${remaining}s)`;
});
// --- Data Fetching ---
// Shared GET wrapper: assigns the payload's `data` into `targetRef` on
// success (leaving the previous value intact on failure, so transient
// errors don't blank the dashboard) and logs — rather than throws — so
// one failing endpoint never aborts the Promise.all in refreshAll().
// Returns true on success. `extract` customizes payload extraction.
async function loadInto(targetRef, url, label, { options, extract } = {}) {
  try {
    const res = options === undefined ? await apiGet(url) : await apiGet(url, options);
    targetRef.value = extract ? extract(res) : (res?.data || null);
    return true;
  } catch (e) {
    console.error(`Failed to load ${label}:`, e);
    return false;
  }
}
async function loadSystemStatus() {
  await loadInto(systemData, '/admin/api/system-status', 'system status');
}
async function loadMetrics() {
  // Redraw the histogram only when fresh data actually arrived.
  if (await loadInto(metricsData, '/admin/api/metrics', 'metrics')) {
    updateLatencyChart();
  }
}
async function loadPerformanceDetail() {
  await loadInto(perfDetail, '/admin/api/performance-detail', 'performance detail');
}
// NOTE(review): the level <select> calls loadLogs() directly without
// resetting logOffset, so a stale offset can land on an empty page after
// a filter change — confirm whether that is intended.
async function loadLogs() {
  const params = { limit: logLimit, offset: logOffset.value };
  if (logLevel.value) params.level = logLevel.value;
  if (logSearch.value) params.q = logSearch.value;
  await loadInto(logsData, '/admin/api/logs', 'logs', { options: { params } });
}
async function loadWorkerStatus() {
  await loadInto(workerData, '/admin/api/worker/status', 'worker status');
}
async function loadPerformanceHistory() {
  await loadInto(historyData, '/admin/api/performance-history', 'performance history', {
    options: { params: { minutes: 30 } },
    extract: (res) => res?.data?.snapshots || [],
  });
}
// --- Trend Chart Series Configs ---
// Each entry describes one TrendChart line: `name` is the legend label,
// `key` is the field read from each /admin/api/performance-history
// snapshot row, and `color` is the line color.
const poolTrendSeries = [
  { name: '飽和度', key: 'pool_saturation', color: '#6366f1' },
  { name: '使用中', key: 'pool_checked_out', color: '#f59e0b' },
];
// Query latency percentiles, green → red with increasing percentile.
const latencyTrendSeries = [
  { name: 'P50', key: 'latency_p50_ms', color: '#22c55e' },
  { name: 'P95', key: 'latency_p95_ms', color: '#f59e0b' },
  { name: 'P99', key: 'latency_p99_ms', color: '#ef4444' },
];
const redisTrendSeries = [
  { name: '記憶體 (bytes)', key: 'redis_used_memory', color: '#06b6d4' },
];
// Hit-rate series share one 0..1 Y axis (the chart sets :yMax="1").
const hitRateTrendSeries = [
  { name: 'Redis 命中率', key: 'redis_hit_rate', color: '#22c55e' },
  { name: 'L1 命中率', key: 'rc_l1_hit_rate', color: '#2563eb' },
  { name: 'L2 命中率', key: 'rc_l2_hit_rate', color: '#f59e0b' },
];
// Reload every dashboard data source in parallel. Each loader swallows
// its own errors, so a single failing endpoint can't break the batch;
// `loading` guards the refresh button for the whole round trip.
async function refreshAll() {
  loading.value = true;
  const tasks = [
    loadSystemStatus(),
    loadMetrics(),
    loadPerformanceDetail(),
    loadPerformanceHistory(),
    loadLogs(),
    loadWorkerStatus(),
  ];
  try {
    await Promise.all(tasks);
  } finally {
    loading.value = false;
  }
}
// --- Auto Refresh ---
// Shared composable drives the 30s polling loop; started manually in
// onMounted so the first paint isn't raced by the timer.
const { startAutoRefresh, stopAutoRefresh } = useAutoRefresh({
  onRefresh: refreshAll,
  intervalMs: 30_000,
  autoStart: false,
});
// Checkbox handler: sync the polling loop with the toggle state.
function toggleAutoRefresh() {
  if (!autoRefreshEnabled.value) {
    stopAutoRefresh();
    return;
  }
  startAutoRefresh();
}
// --- Worker Restart ---
// POST the restart request; on success close the confirmation modal and
// re-read worker status (to pick up the new PID / cooldown). Failures
// surface to the operator via alert().
async function doRestart() {
  restartLoading.value = true;
  try {
    await apiPost('/admin/api/worker/restart', {});
    showRestartModal.value = false;
    await loadWorkerStatus();
  } catch (err) {
    alert(err.message || '重啟失敗');
  } finally {
    restartLoading.value = false;
  }
}
// --- Log Cleanup ---
// Trigger server-side log cleanup, then reload from the first page.
// The offset is reset because after cleanup the old offset may point
// past the shrunken total, which would render an empty table.
async function cleanupLogs() {
  cleanupLoading.value = true;
  try {
    await apiPost('/admin/api/logs/cleanup', {});
    logOffset.value = 0;
    await loadLogs();
  } catch (e) {
    console.error('Failed to cleanup logs:', e);
  } finally {
    cleanupLoading.value = false;
  }
}
// --- Debounce ---
// Search-as-you-type: wait 300ms of keyboard idle before hitting the
// API, and jump back to page 1 since the filter just changed.
let debounceTimer = null;
function debouncedLoadLogs() {
  if (debounceTimer) clearTimeout(debounceTimer);
  debounceTimer = setTimeout(() => {
    logOffset.value = 0;
    loadLogs();
  }, 300);
}
// --- ECharts ---
// (Re)draw the latency histogram. The chart instance is created lazily
// on first call and cleared when there are no samples yet.
function updateLatencyChart() {
  const el = latencyChartRef.value;
  if (!el) return;
  if (!chartInstance) chartInstance = echarts.init(el);
  const latencies = metricsData.value?.latencies || [];
  if (latencies.length === 0) {
    chartInstance.clear();
    return;
  }
  // Fixed histogram buckets; `max` is the exclusive upper bound in ms.
  const buckets = [
    { label: '<100ms', max: 100 },
    { label: '100-500ms', max: 500 },
    { label: '500ms-1s', max: 1000 },
    { label: '1-5s', max: 5000 },
    { label: '>5s', max: Infinity },
  ];
  const counts = new Array(buckets.length).fill(0);
  for (const seconds of latencies) {
    // Samples appear to arrive in seconds (scaled ×1000) — TODO confirm.
    const ms = seconds * 1000;
    const idx = buckets.findIndex((b) => ms < b.max);
    counts[idx === -1 ? buckets.length - 1 : idx] += 1;
  }
  chartInstance.setOption({
    tooltip: { trigger: 'axis' },
    grid: { left: 40, right: 20, top: 20, bottom: 30 },
    xAxis: { type: 'category', data: buckets.map((b) => b.label) },
    yAxis: { type: 'value' },
    series: [
      {
        type: 'bar',
        data: counts,
        itemStyle: { color: '#6366f1' },
        barMaxWidth: 40,
      },
    ],
  });
}
// --- Lifecycle ---
// Initial load happens before the polling loop starts so the timer
// can't race the first render.
onMounted(async () => {
  await refreshAll();
  if (autoRefreshEnabled.value) startAutoRefresh();
});
// Stop timers and release the ECharts canvas so nothing leaks when the
// SPA shell swaps this view out.
onBeforeUnmount(() => {
  stopAutoRefresh();
  if (chartInstance) {
    chartInstance.dispose();
    chartInstance = null;
  }
  clearTimeout(debounceTimer);
});
</script>

View File

@@ -0,0 +1,49 @@
<template>
<div class="gauge-bar">
<div class="gauge-bar-header">
<span class="gauge-bar-label">{{ label }}</span>
<span class="gauge-bar-value">{{ displayValue }}</span>
</div>
<div class="gauge-bar-track">
<div class="gauge-bar-fill" :style="fillStyle"></div>
</div>
</div>
</template>
<script setup>
import { computed } from 'vue';
// Public props: `value`/`max` drive the fill ratio, `displayText`
// overrides the auto-formatted readout, and the thresholds select the
// fill color band.
const props = defineProps({
  label: { type: String, default: '' },
  value: { type: Number, default: 0 },
  max: { type: Number, default: 100 },
  unit: { type: String, default: '%' },
  displayText: { type: String, default: '' },
  warningThreshold: { type: Number, default: 0.7 },
  dangerThreshold: { type: Number, default: 0.9 },
});
// Fill fraction clamped into [0, 1]; a non-positive max reads as empty.
const ratio = computed(() => {
  if (props.max <= 0) return 0;
  const fraction = props.value / props.max;
  if (fraction < 0) return 0;
  return fraction > 1 ? 1 : fraction;
});
// Text at the right of the bar header: explicit displayText wins,
// otherwise '%' units show the ratio and other units show the raw value.
const displayValue = computed(() => {
  if (props.displayText) return props.displayText;
  return props.unit === '%'
    ? `${(ratio.value * 100).toFixed(1)}%`
    : `${props.value}${props.unit ? ' ' + props.unit : ''}`;
});
// Green → amber → red as the ratio crosses the thresholds.
const fillColor = computed(() => {
  if (ratio.value >= props.dangerThreshold) return '#ef4444';
  if (ratio.value >= props.warningThreshold) return '#f59e0b';
  return '#22c55e';
});
const fillStyle = computed(() => ({
  width: `${ratio.value * 100}%`,
  backgroundColor: fillColor.value,
}));
</script>

View File

@@ -0,0 +1,24 @@
<template>
<div class="stat-card">
<div class="stat-card-value">{{ formattedValue }}</div>
<div class="stat-card-label">{{ label }}</div>
</div>
</template>
<script setup>
import { computed } from 'vue';
const props = defineProps({
  value: { type: [Number, String], default: '-' },
  label: { type: String, default: '' },
  unit: { type: String, default: '' },
});
// Integers render as-is, other numbers to 2 decimals, strings verbatim;
// null/undefined collapse to '-'. The unit (when set) is space-joined.
const formattedValue = computed(() => {
  const raw = props.value;
  if (raw == null) return '-';
  let text;
  if (typeof raw === 'number') {
    text = Number.isInteger(raw) ? String(raw) : raw.toFixed(2);
  } else {
    text = String(raw);
  }
  return props.unit ? `${text} ${props.unit}` : text;
});
</script>

View File

@@ -0,0 +1,17 @@
<template>
<div class="status-dot-wrapper">
<span class="status-dot" :class="'status-dot--' + status"></span>
<span class="status-dot-label">{{ label }}</span>
</div>
</template>
<script setup>
defineProps({
  // Visual state; rendered as the .status-dot--<status> color class.
  status: {
    type: String,
    default: 'disabled',
    validator: (v) => ['healthy', 'degraded', 'error', 'disabled'].includes(v),
  },
  // Text shown beside the dot.
  label: { type: String, default: '' },
});
</script>

View File

@@ -0,0 +1,98 @@
<script setup>
import { computed } from 'vue';
import { LineChart } from 'echarts/charts';
import {
GridComponent,
LegendComponent,
TooltipComponent,
} from 'echarts/components';
import { use } from 'echarts/core';
import { CanvasRenderer } from 'echarts/renderers';
import VChart from 'vue-echarts';
use([CanvasRenderer, LineChart, GridComponent, TooltipComponent, LegendComponent]);
const props = defineProps({
  // Card heading; hidden when empty.
  title: { type: String, default: '' },
  // History snapshot rows, one object per sample (fields keyed by metric).
  snapshots: { type: Array, default: () => [] },
  // Line definitions: { name, key, color, yAxisIndex? }.
  series: { type: Array, default: () => [] },
  height: { type: String, default: '220px' },
  // Suffix appended to Y-axis tick labels (e.g. 'ms').
  yAxisLabel: { type: String, default: '' },
  // Optional fixed Y-axis maximum (e.g. 1 for hit-rate charts).
  yMax: { type: Number, default: undefined },
});
// A single point can't show a trend; the template shows a notice instead.
const hasData = computed(() => props.snapshots.length > 1);
// Read one series value from a snapshot row. Missing keys become null so
// ECharts renders a gap instead of treating the point as zero.
function extractValue(row, key) {
  const value = row[key];
  return value === undefined || value === null ? null : value;
}
// hh:mm:ss label (local time) for the X axis; empty string on falsy input.
function formatTime(ts) {
  if (!ts) return '';
  const stamp = new Date(ts);
  const pad = (n) => String(n).padStart(2, '0');
  return [stamp.getHours(), stamp.getMinutes(), stamp.getSeconds()].map(pad).join(':');
}
// Complete ECharts option object, rebuilt whenever snapshots or series
// definitions change; vue-echarts diffs it into the live chart.
const chartOption = computed(() => {
  const rows = props.snapshots || [];
  const defs = props.series || [];
  // Y axis starts at zero; optional hard cap and tick-label suffix.
  const yAxis = { type: 'value', min: 0 };
  if (props.yMax != null) yAxis.max = props.yMax;
  if (props.yAxisLabel) {
    yAxis.axisLabel = { formatter: `{value}${props.yAxisLabel}` };
  }
  return {
    tooltip: {
      trigger: 'axis',
      axisPointer: { type: 'cross' },
    },
    legend: {
      data: defs.map((def) => def.name),
      bottom: 0,
    },
    grid: {
      left: 50,
      right: 20,
      top: 16,
      bottom: 40,
    },
    xAxis: {
      type: 'category',
      data: rows.map((row) => formatTime(row.ts)),
      axisLabel: { fontSize: 10 },
    },
    yAxis,
    series: defs.map((def) => ({
      name: def.name,
      type: 'line',
      smooth: true,
      symbol: 'none',
      areaStyle: { opacity: 0.12 },
      lineStyle: { width: 2 },
      itemStyle: { color: def.color },
      yAxisIndex: def.yAxisIndex || 0,
      data: rows.map((row) => extractValue(row, def.key)),
    })),
  };
});
</script>
<template>
<div class="trend-chart-card">
<h4 v-if="title" class="trend-chart-title">{{ title }}</h4>
<div v-if="hasData" class="trend-chart-canvas" :style="{ height }">
<VChart :option="chartOption" autoresize />
</div>
<div v-else class="trend-chart-empty">趨勢資料不足需至少 2 筆快照</div>
</div>
</template>

View File

@@ -0,0 +1,12 @@
<!doctype html>
<html lang="zh-Hant">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Performance Monitor</title>
</head>
<body>
<div id="app"></div>
<script type="module" src="./main.js"></script>
</body>
</html>

View File

@@ -0,0 +1,6 @@
import { createApp } from 'vue';
import App from './App.vue';
import './style.css';

// Entry point for the admin-performance SPA bundle.
const app = createApp(App);
app.mount('#app');

View File

@@ -0,0 +1,544 @@
/* Admin Performance Dashboard */
*,
*::before,
*::after {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: #f1f5f9;
color: #1e293b;
line-height: 1.5;
}
.perf-dashboard {
max-width: 1280px;
margin: 0 auto;
padding: 0 16px 32px;
}
/* Header */
.perf-header {
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
color: #fff;
padding: 20px 24px;
border-radius: 0 0 12px 12px;
margin: 0 -16px 20px;
}
.perf-header-inner {
display: flex;
align-items: center;
justify-content: space-between;
flex-wrap: wrap;
gap: 12px;
}
.perf-title {
font-size: 1.4rem;
font-weight: 700;
}
.perf-header-actions {
display: flex;
align-items: center;
gap: 12px;
}
.auto-refresh-toggle {
display: flex;
align-items: center;
gap: 6px;
font-size: 0.85rem;
cursor: pointer;
user-select: none;
}
.auto-refresh-toggle input[type='checkbox'] {
accent-color: #fff;
}
/* Buttons */
.btn {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 8px 16px;
border: none;
border-radius: 6px;
font-size: 0.85rem;
font-weight: 500;
cursor: pointer;
background: rgba(255, 255, 255, 0.2);
color: #fff;
transition: background 0.15s;
}
.btn:hover:not(:disabled) {
background: rgba(255, 255, 255, 0.3);
}
.btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.btn-sm {
padding: 5px 10px;
font-size: 0.8rem;
background: #e2e8f0;
color: #334155;
}
.btn-sm:hover:not(:disabled) {
background: #cbd5e1;
}
.btn-danger {
background: #ef4444;
color: #fff;
}
.btn-danger:hover:not(:disabled) {
background: #dc2626;
}
/* Panel */
.panel {
background: #fff;
border-radius: 10px;
padding: 20px;
margin-bottom: 16px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.06);
}
.panel-disabled {
opacity: 0.6;
}
.panel-title {
font-size: 1rem;
font-weight: 600;
margin-bottom: 14px;
color: #334155;
}
.sub-title {
font-size: 0.9rem;
font-weight: 600;
margin: 16px 0 10px;
color: #475569;
}
.muted {
color: #94a3b8;
font-size: 0.85rem;
}
/* Status Cards */
.status-cards-grid {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 12px;
}
.status-card {
background: #f8fafc;
border-radius: 8px;
padding: 14px;
text-align: center;
}
.status-card-title {
font-size: 0.75rem;
color: #64748b;
margin-bottom: 8px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
/* StatusDot */
.status-dot-wrapper {
display: flex;
align-items: center;
justify-content: center;
gap: 6px;
}
.status-dot {
width: 10px;
height: 10px;
border-radius: 50%;
flex-shrink: 0;
}
.status-dot--healthy {
background: #22c55e;
box-shadow: 0 0 6px rgba(34, 197, 94, 0.4);
}
.status-dot--degraded {
background: #f59e0b;
box-shadow: 0 0 6px rgba(245, 158, 11, 0.4);
}
.status-dot--error {
background: #ef4444;
box-shadow: 0 0 6px rgba(239, 68, 68, 0.4);
}
.status-dot--disabled {
background: #94a3b8;
}
.status-dot-label {
font-size: 0.85rem;
font-weight: 500;
}
/* GaugeBar */
.gauge-bar {
margin-bottom: 12px;
}
.gauge-bar-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 4px;
}
.gauge-bar-label {
font-size: 0.8rem;
color: #64748b;
}
.gauge-bar-value {
font-size: 0.8rem;
font-weight: 600;
}
.gauge-bar-track {
height: 8px;
background: #e2e8f0;
border-radius: 4px;
overflow: hidden;
}
.gauge-bar-fill {
height: 100%;
border-radius: 4px;
transition: width 0.4s ease, background-color 0.3s;
min-width: 2px;
}
/* StatCard */
.stat-card {
background: #f8fafc;
border-radius: 8px;
padding: 10px 12px;
text-align: center;
}
.stat-card-value {
font-size: 1.1rem;
font-weight: 700;
color: #1e293b;
line-height: 1.2;
}
.stat-card-label {
font-size: 0.7rem;
color: #64748b;
margin-top: 2px;
}
/* Query Performance */
.query-perf-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 16px;
}
.query-perf-stats {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 8px;
align-content: start;
}
.query-perf-chart {
min-height: 200px;
}
/* Redis */
.redis-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 16px;
}
.redis-mini-stats {
display: grid;
grid-template-columns: repeat(2, 1fr);
gap: 8px;
margin-top: 12px;
}
.redis-namespaces {
overflow-x: auto;
}
/* Mini Table */
.mini-table {
width: 100%;
border-collapse: collapse;
font-size: 0.82rem;
}
.mini-table th,
.mini-table td {
padding: 6px 10px;
text-align: left;
border-bottom: 1px solid #e2e8f0;
}
.mini-table th {
background: #f8fafc;
font-weight: 600;
color: #475569;
}
/* Memory Cache Cards */
.cache-cards-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
gap: 12px;
}
.cache-card {
background: #f8fafc;
border-radius: 8px;
padding: 14px;
}
.cache-card-name {
font-size: 0.85rem;
font-weight: 600;
margin-bottom: 2px;
}
.cache-card-desc {
font-size: 0.72rem;
color: #64748b;
margin-bottom: 8px;
}
.cache-card-ttl {
font-size: 0.72rem;
color: #94a3b8;
margin-top: 4px;
}
.route-cache-stats {
display: grid;
grid-template-columns: repeat(6, 1fr);
gap: 8px;
}
/* Connection Pool */
.pool-stats-grid {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 8px;
margin-top: 14px;
}
/* Worker */
.worker-info {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 8px;
margin-bottom: 14px;
}
/* Modal */
.modal-backdrop {
position: fixed;
inset: 0;
background: rgba(0, 0, 0, 0.45);
display: flex;
align-items: center;
justify-content: center;
z-index: 1000;
}
.modal-dialog {
background: #fff;
border-radius: 12px;
padding: 24px;
max-width: 400px;
width: 90%;
box-shadow: 0 20px 60px rgba(0, 0, 0, 0.15);
}
.modal-dialog h3 {
margin-bottom: 8px;
}
.modal-dialog p {
font-size: 0.9rem;
color: #475569;
margin-bottom: 16px;
}
.modal-actions {
display: flex;
gap: 8px;
justify-content: flex-end;
}
.modal-actions .btn {
background: #e2e8f0;
color: #334155;
}
/* Log Controls */
.log-controls {
display: flex;
gap: 8px;
margin-bottom: 12px;
flex-wrap: wrap;
}
.log-controls select,
.log-controls input[type='text'] {
padding: 6px 10px;
border: 1px solid #cbd5e1;
border-radius: 6px;
font-size: 0.82rem;
outline: none;
}
.log-controls input[type='text'] {
flex: 1;
min-width: 160px;
}
/* Log Table */
.log-table-wrapper {
overflow-x: auto;
max-height: 400px;
overflow-y: auto;
}
.log-table {
width: 100%;
border-collapse: collapse;
font-size: 0.78rem;
font-family: 'SF Mono', 'Fira Code', monospace;
}
.log-table th,
.log-table td {
padding: 5px 8px;
text-align: left;
border-bottom: 1px solid #f1f5f9;
white-space: nowrap;
}
.log-table th {
background: #f8fafc;
font-weight: 600;
position: sticky;
top: 0;
z-index: 1;
}
.log-msg {
white-space: pre-wrap;
word-break: break-all;
max-width: 600px;
}
.log-time {
color: #64748b;
}
.log-level {
font-weight: 600;
}
.log-error .log-level {
color: #ef4444;
}
.log-warning .log-level {
color: #f59e0b;
}
.log-info .log-level {
color: #3b82f6;
}
.log-debug .log-level {
color: #94a3b8;
}
/* Log Pagination */
.log-pagination {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
margin-top: 10px;
font-size: 0.82rem;
}
/* Responsive */
/* Narrow-screen overrides (≤768px): collapse the multi-column grids defined
   earlier in this file to fewer columns so cards remain readable on mobile. */
@media (max-width: 768px) {
.status-cards-grid {
grid-template-columns: repeat(2, 1fr);
}
/* Query-performance and Redis panels stack to a single column. */
.query-perf-grid,
.redis-grid {
grid-template-columns: 1fr;
}
.pool-stats-grid {
grid-template-columns: repeat(2, 1fr);
}
.route-cache-stats {
grid-template-columns: repeat(3, 1fr);
}
}
/* Trend Charts */
/* Card container for one historical-metrics trend chart panel. */
.trend-chart-card {
margin-top: 4px;
background: #fff;
border: 1px solid #e2e8f0;
border-radius: 8px;
padding: 16px;
}
.trend-chart-title {
font-size: 0.9rem;
font-weight: 600;
color: #475569;
margin-bottom: 8px;
}
/* Chart mount point; min-height reserves space before the chart renders. */
.trend-chart-canvas {
width: 100%;
min-height: 200px;
}
/* Placeholder shown when there is no (or insufficient) historical data. */
.trend-chart-empty {
color: #94a3b8;
font-size: 0.85rem;
text-align: center;
padding: 32px 0;
}

View File

@@ -70,6 +70,10 @@ const NATIVE_MODULE_LOADERS = Object.freeze({
() => import('../tables/App.vue'), () => import('../tables/App.vue'),
[() => import('../tables/style.css')], [() => import('../tables/style.css')],
), ),
'/admin/performance': createNativeLoader(
() => import('../admin-performance/App.vue'),
[() => import('../admin-performance/style.css')],
),
}); });
export function getNativeModuleLoader(route) { export function getNativeModuleLoader(route) {

View File

@@ -190,13 +190,13 @@ const ROUTE_CONTRACTS = Object.freeze({
'/admin/performance': buildContract({ '/admin/performance': buildContract({
route: '/admin/performance', route: '/admin/performance',
routeId: 'admin-performance', routeId: 'admin-performance',
renderMode: 'external', renderMode: 'native',
owner: 'frontend-platform-admin', owner: 'frontend-platform-admin',
title: '效能監控', title: '效能監控',
rollbackStrategy: 'external_route_reversion', rollbackStrategy: 'fallback_to_legacy_route',
visibilityPolicy: 'admin_only', visibilityPolicy: 'admin_only',
scope: 'in-scope', scope: 'in-scope',
compatibilityPolicy: 'external_target_redirect', compatibilityPolicy: 'redirect_to_shell_when_spa_enabled',
}), }),
'/tables': buildContract({ '/tables': buildContract({
route: '/tables', route: '/tables',

View File

@@ -28,7 +28,8 @@ export default defineConfig(({ mode }) => ({
'query-tool': resolve(__dirname, 'src/query-tool/main.js'), 'query-tool': resolve(__dirname, 'src/query-tool/main.js'),
'tmtt-defect': resolve(__dirname, 'src/tmtt-defect/main.js'), 'tmtt-defect': resolve(__dirname, 'src/tmtt-defect/main.js'),
'qc-gate': resolve(__dirname, 'src/qc-gate/index.html'), 'qc-gate': resolve(__dirname, 'src/qc-gate/index.html'),
'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html') 'mid-section-defect': resolve(__dirname, 'src/mid-section-defect/index.html'),
'admin-performance': resolve(__dirname, 'src/admin-performance/index.html')
}, },
output: { output: {
entryFileNames: '[name].js', entryFileNames: '[name].js',

View File

@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-02-22

View File

@@ -0,0 +1,91 @@
## Context
現有 `/admin/performance` 是 Jinja2 server-rendered 頁面vanilla JS + Chart.js是唯一未遷移至 Vue 3 SPA 的前端頁面。後端已具備豐富的監控數據(連線池 `get_pool_status()`、Redis client、LayeredCache `.telemetry()`),但前端僅展示 4 張 status cards + query performance + worker control + logs缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度等關鍵面板。
## Goals / Non-Goals
**Goals:**
- 將 admin/performance 頁面從 Jinja2 切換為 Vue 3 SPA與所有報表頁面架構一致
- 新增完整的系統監控面板Redis 快取詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤
- 提供可複用的 gauge/stat card 組件,便於未來擴展監控項目
- 保留所有既有功能status cards、query performance、worker control、system logs
**Non-Goals:**
- 不新增告警/通知機制(未來可擴展)
- 不引入 WebSocket 即時推送(維持 30 秒輪詢)
- 不修改既有 API response format`system-status``metrics``logs` 保持不變)
- 不新增使用者權限控制(沿用既有 admin 認證)
## Decisions
### 1. Vue 3 SPA + ECharts 取代 Jinja2 + Chart.js
**選擇**: 全面重建為 Vue 3 SPA使用 ECharts 繪製圖表
**理由**: 所有報表頁面已完成 Vue SPA 遷移admin/performance 是最後一個 Jinja2 頁面。統一架構可複用 `apiGet``useAutoRefresh` 等共用基礎設施減少維護成本。ECharts 已是專案標準圖表庫query-tool、reject-history 等均使用)。
**替代方案**: 保留 Jinja2 僅加 API — 但會持續累積技術債,且無法複用 Vue 生態。
### 2. 單一 performance-detail API 聚合所有新增監控數據
**選擇**: 新增 `GET /admin/api/performance-detail` 一個 endpoint回傳 `redis``process_caches``route_cache``db_pool``direct_connections` 五個 section。
**理由**: 減少前端並發請求數(已有 5 個 API加 1 個共 6 個),後端可在同一 request 中順序收集各子系統狀態,避免多次 round-trip。
**替代方案**: 每個監控維度獨立 endpoint — 更 RESTful 但增加前端複雜度和網路開銷。
### 3. ProcessLevelCache 全域 registry 模式
**選擇**: 在 `core/cache.py` 新增 `_PROCESS_CACHE_REGISTRY` dict + `register_process_cache()` 函式,各服務在模組載入時自行註冊。
**理由**: 避免 admin_routes 硬編碼各快取實例的 import 路徑,新增快取時只需在該服務中加一行 `register_process_cache()` 即可自動出現在監控面板。
**替代方案**: admin_routes 直接 import 各快取實例 — 耦合度高,新增快取需改兩處。
### 4. Redis namespace 監控使用 SCAN 而非 KEYS
**選擇**: 使用 `SCAN` 搭配 `MATCH` pattern 掃描各 namespace 的 key 數量。
**理由**: `KEYS *` 在生產環境會阻塞 Redis`SCAN` 為非阻塞迭代器,安全性更高。
### 5. 直連 Oracle 使用 thread-safe atomic counter
**選擇**: 在 `database.py` 使用 `threading.Lock` 保護的全域計數器,在 `get_db_connection()``read_sql_df_slow()` 建立連線後 increment。
**理由**: 追蹤連線池外的直接連線使用量,幫助判斷是否需要調整池大小。計數器為 monotonic只增不減記錄的是自 worker 啟動以來的總數。
### 6. 前端組件複用 GaugeBar / StatCard / StatusDot
**選擇**: 新增 3 個小型可複用組件放在 `admin-performance/components/` 下。
**理由**: Redis 記憶體、連線池飽和度、ProcessLevelCache 使用率等多處需要 gauge 視覺化status cards 跨面板重複。組件化可統一視覺風格並減少重複 template。
### 7. SQLite 持久化 metrics history store
**選擇**: 新增 `core/metrics_history.py`,使用 SQLite 儲存 metrics snapshots仿 `core/log_store.py``LogStore` 模式),搭配 daemon thread 每 30 秒採集一次。
**理由**: in-memory deque 在 worker 重啟或 gunicorn prefork 下無法跨 worker 共享且不保留歷史。SQLite 提供跨 worker 讀取、重啟持久化、可配置保留天數(預設 3 天 / 50000 rows且不需額外 infra。
**替代方案**:
- in-memory deque — 簡單但 worker 獨立、重啟即失
- Redis TSDB — 需額外模組且增加 Redis 負擔
- PostgreSQL — 太重,且此數據不需 ACID
**Schema**: `metrics_snapshots` table 含 timestamp、worker PID、pool/redis/route_cache/latency 各欄位,`idx_metrics_ts` 索引加速時間查詢。
**背景採集**: `MetricsHistoryCollector` daemon thread間隔可透過 `METRICS_HISTORY_INTERVAL` 環境變數配置。在 `app.py` lifecycle 中 start/stop。
## Risks / Trade-offs
- **Redis SCAN 效能**: 大量 key 時 SCAN 可能較慢 → 設定 `COUNT 100` 限制每次迭代量,且 30 秒才掃一次,可接受
- **ProcessLevelCache registry 依賴模組載入順序**: 服務未 import 時不會註冊 → 在 app factory 或 gunicorn post_fork 確保所有服務模組已載入
- **直連計數器跨 worker 不共享**: gunicorn prefork 模式下每個 worker 有獨立計數 → API 回傳當前 worker PID 供辨識,可透過 `/admin/api/system-status` 的 worker info 交叉比對
- **舊 Jinja2 模板保留但不維護**: 切換後舊模板不再更新 → 透過 `routeContracts.js``rollbackStrategy: 'fallback_to_legacy_route'` 保留回退能力
## Migration Plan
1. 後端先行:加 `stats()`、registry、直連計數器、新 API不影響既有功能
2. 前端建構:新建 `admin-performance/` Vue SPAVite 註冊 entry
3. 路由切換:`admin_routes.py` 改為 `send_from_directory``routeContracts.js``renderMode: 'native'`
4. 驗證後部署:確認所有面板正確顯示後上線
5. 回退方案:`routeContracts.js` 改回 `renderMode: 'external'``admin_routes.py` 改回 `render_template`

View File

@@ -0,0 +1,31 @@
## Why
現有 `/admin/performance` 是唯一仍使用 Jinja2 + vanilla JS + Chart.js 的頁面,與所有已遷移至 Vue 3 SPA 的報表頁面架構不一致。同時隨著報表系統功能擴充L1/L2 快取層、連線池、直連 Oracle 等),後端已具備豐富的遙測數據,但管理後台的監控面板覆蓋不足——缺少 Redis 詳情、ProcessLevelCache 統計、連線池飽和度、直連 Oracle 追蹤等關鍵資訊。
## What Changes
- 將 `/admin/performance` 從 Jinja2 server-rendered 頁面重建為 Vue 3 SPAECharts 取代 Chart.js
- 新增 `GET /admin/api/performance-detail` API整合 Redis INFO/SCAN、ProcessLevelCache registry、連線池狀態、直連計數等完整監控數據
- 後端 `ProcessLevelCache` 加入 `stats()` 方法與全域 registry支援動態收集所有快取實例狀態
- 後端 `database.py` 加入直連 Oracle 計數器,追蹤非連線池的直接連線使用量
- 前端新增 GaugeBar / StatCard / StatusDot 可複用組件,提供 gauge 飽和度視覺化
- portal-shell 路由從 `renderMode: 'external'` 切換為 `'native'`
- Vite 構建新增 `admin-performance` entry point
## Capabilities
### New Capabilities
- `admin-performance-spa`: Vue 3 SPA 重建管理效能儀表板,包含 status cards、query performance、Redis 快取、記憶體快取、連線池、worker 控制、系統日誌等完整面板
- `cache-telemetry-api`: ProcessLevelCache stats() + 全域 registry + performance-detail API提供所有記憶體快取、Redis 快取、route cache 的遙測數據
- `connection-pool-monitoring`: 連線池飽和度追蹤 + 直連 Oracle 計數器,完整呈現資料庫連線使用狀況
- `metrics-history-trending`: SQLite 持久化背景採集 + 時間序列趨勢圖可回溯連線池飽和度、查詢延遲、Redis 記憶體、快取命中率等歷史數據
### Modified Capabilities
<!-- No existing spec-level requirements are changing -->
## Impact
- **Backend** (7 files): `core/cache.py``core/database.py``core/metrics_history.py`(NEW)、`routes/admin_routes.py``services/resource_cache.py``services/realtime_equipment_cache.py``services/reject_dataset_cache.py``app.py`
- **Frontend** (8 new + 3 modified): 新建 `admin-performance/` 目錄index.html、main.js、App.vue、style.css、4 個組件含 TrendChart修改 `vite.config.js``package.json``routeContracts.js`
- **API**: 新增 2 個 endpoint (`/admin/api/performance-detail``/admin/api/performance-history`),既有 5 個 endpoint 不變
- **Rollback**: 舊 Jinja2 模板保留,可透過 `routeContracts.js` 切回 `renderMode: 'external'`

View File

@@ -0,0 +1,100 @@
## ADDED Requirements
### Requirement: Vue 3 SPA page replaces Jinja2 template
The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route.
#### Scenario: Page loads as Vue SPA
- **WHEN** user navigates to `/admin/performance`
- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template)
#### Scenario: Portal-shell integration
- **WHEN** the portal-shell renders `/admin/performance`
- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe)
### Requirement: Status cards display system health
The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value.
#### Scenario: All systems healthy
- **WHEN** all backend systems report healthy status via `/admin/api/system-status`
- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values
#### Scenario: Redis disabled
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message
### Requirement: Query performance panel with ECharts
The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation.
#### Scenario: Metrics loaded successfully
- **WHEN** `/admin/api/metrics` returns valid performance data
- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution
#### Scenario: No metrics data
- **WHEN** `/admin/api/metrics` returns empty or null metrics
- **THEN** the panel SHALL display placeholder text indicating no data available
### Requirement: Redis cache detail panel
The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table.
#### Scenario: Redis active with data
- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts
- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count
#### Scenario: Redis disabled
- **WHEN** Redis is disabled
- **THEN** the Redis detail panel SHALL display a disabled state message without errors
### Requirement: Memory cache panel
The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads).
#### Scenario: Multiple caches registered
- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries
- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description
#### Scenario: Route cache telemetry
- **WHEN** `/admin/api/performance-detail` returns route_cache data
- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads
### Requirement: Connection pool panel
The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count.
#### Scenario: Pool under normal load
- **WHEN** pool saturation is below 80%
- **THEN** the GaugeBar SHALL display in a normal color (green/blue)
#### Scenario: Pool near saturation
- **WHEN** pool saturation exceeds 80%
- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red)
### Requirement: Worker control panel
The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal.
#### Scenario: Restart worker
- **WHEN** user clicks the restart button and confirms in the modal
- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result
#### Scenario: Restart during cooldown
- **WHEN** worker is in cooldown period
- **THEN** the restart button SHALL be disabled with a cooldown indicator
### Requirement: System logs panel with filtering and pagination
The dashboard SHALL display system logs with level filtering, text search, and pagination controls.
#### Scenario: Filter by log level
- **WHEN** user selects a specific log level filter
- **THEN** only logs matching that level SHALL be displayed
#### Scenario: Paginate logs
- **WHEN** logs exceed the page size
- **THEN** pagination controls SHALL allow navigating between pages
### Requirement: Auto-refresh with toggle
The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
#### Scenario: Auto-refresh enabled
- **WHEN** auto-refresh is enabled (default)
- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch
#### Scenario: Manual refresh
- **WHEN** user clicks the manual refresh button
- **THEN** all panels SHALL immediately refresh their data

View File

@@ -0,0 +1,56 @@
## ADDED Requirements
### Requirement: ProcessLevelCache stats method
Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`.
#### Scenario: Stats on active cache
- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s)
- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}`
#### Scenario: Stats with expired entries
- **WHEN** `stats()` is called and some entries have exceeded TTL
- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl`
#### Scenario: Thread safety
- **WHEN** `stats()` is called concurrently with cache writes
- **THEN** it SHALL acquire the cache lock and return consistent data without races
### Requirement: ProcessLevelCache global registry
The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`.
#### Scenario: Register and retrieve all caches
- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called
- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches
#### Scenario: Cache not registered
- **WHEN** a service's ProcessLevelCache is not registered
- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output
### Requirement: Performance detail API endpoint
The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`.
#### Scenario: All systems available
- **WHEN** the API is called and all subsystems are healthy
- **THEN** it SHALL return all 5 sections with current telemetry data
#### Scenario: Redis disabled
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally
### Requirement: Redis namespace key distribution
The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`.
#### Scenario: Keys exist across namespaces
- **WHEN** Redis contains keys across multiple namespaces
- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count`
#### Scenario: SCAN safety
- **WHEN** scanning Redis keys
- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis
### Requirement: Route cache telemetry in performance detail
The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`.
#### Scenario: LayeredCache active
- **WHEN** route cache is in layered mode
- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry

View File

@@ -0,0 +1,27 @@
## ADDED Requirements
### Requirement: Connection pool status in performance detail
The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`.
#### Scenario: Pool status retrieved
- **WHEN** the API is called
- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values
#### Scenario: Saturation calculation
- **WHEN** the pool has 8 checked_out connections and max_capacity is 30
- **THEN** saturation SHALL be reported as approximately 26.7%
### Requirement: Direct Oracle connection counter
The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection.
#### Scenario: Counter increments on direct connection
- **WHEN** `get_db_connection()` successfully creates a connection
- **THEN** the direct connection counter SHALL increment by 1
#### Scenario: Counter in performance detail
- **WHEN** the performance-detail API is called
- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID)
#### Scenario: Counter is per-worker
- **WHEN** multiple gunicorn workers are running
- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker

View File

@@ -0,0 +1,65 @@
## ADDED Requirements
### Requirement: SQLite metrics history store
The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`.
#### Scenario: Write and query snapshots
- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics
- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID
#### Scenario: Query by time range
- **WHEN** `query_snapshots(minutes=30)` is called
- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending
#### Scenario: Retention cleanup
- **WHEN** `cleanup()` is called
- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000)
#### Scenario: Thread safety
- **WHEN** multiple threads write snapshots concurrently
- **THEN** the write lock SHALL serialize writes and prevent database corruption
### Requirement: Background metrics collector
The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var).
#### Scenario: Automatic collection
- **WHEN** the collector is started via `start_metrics_history(app)`
- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store
#### Scenario: Graceful shutdown
- **WHEN** `stop_metrics_history()` is called
- **THEN** the collector thread SHALL stop within one interval period
#### Scenario: Subsystem unavailability
- **WHEN** a subsystem (e.g., Redis) is unavailable during collection
- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics
### Requirement: Performance history API endpoint
The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots.
#### Scenario: Query with time range
- **WHEN** the API is called with `?minutes=30`
- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}`
#### Scenario: Time range bounds
- **WHEN** `minutes` is less than 1 or greater than 180
- **THEN** it SHALL be clamped to the range [1, 180]
#### Scenario: Admin authentication
- **WHEN** the API is called without admin authentication
- **THEN** it SHALL be rejected by the `@admin_required` decorator
### Requirement: Frontend trend charts
The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts.
#### Scenario: Trend charts with data
- **WHEN** historical snapshots contain more than 1 data point
- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates
#### Scenario: Trend charts without data
- **WHEN** historical snapshots are empty or contain only 1 data point
- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`)
#### Scenario: Auto-refresh
- **WHEN** the dashboard auto-refreshes
- **THEN** historical data SHALL also be refreshed alongside real-time metrics

View File

@@ -0,0 +1,80 @@
## 1. Backend — Cache Telemetry Infrastructure
- [x] 1.1 Add `stats()` method to `ProcessLevelCache` in `core/cache.py` (returns entries/max_size/ttl_seconds with lock)
- [x] 1.2 Add `_PROCESS_CACHE_REGISTRY`, `register_process_cache()`, and `get_all_process_cache_stats()` to `core/cache.py`
- [x] 1.3 Register `_wip_df_cache` in `core/cache.py`
- [x] 1.4 Add `stats()` + `register_process_cache()` to `services/resource_cache.py`
- [x] 1.5 Add `stats()` + `register_process_cache()` to `services/realtime_equipment_cache.py`
- [x] 1.6 Add `register_process_cache()` to `services/reject_dataset_cache.py`
## 2. Backend — Direct Connection Counter
- [x] 2.1 Add `_DIRECT_CONN_COUNTER`, `_DIRECT_CONN_LOCK`, and `get_direct_connection_count()` to `core/database.py`
- [x] 2.2 Increment counter in `get_db_connection()` and `read_sql_df_slow()` after successful connection creation
## 3. Backend — Performance Detail API
- [x] 3.1 Add `GET /admin/api/performance-detail` endpoint in `routes/admin_routes.py` returning redis, process_caches, route_cache, db_pool, and direct_connections sections
- [x] 3.2 Implement Redis INFO + SCAN namespace key distribution (data, route_cache, equipment_status, reject_dataset, meta, lock, scrap_exclusion) with graceful degradation when Redis is disabled
## 4. Frontend — Page Scaffolding
- [x] 4.1 Create `frontend/src/admin-performance/index.html` and `main.js` (standard Vue SPA entry)
- [x] 4.2 Register `admin-performance` entry in `vite.config.js`
- [x] 4.3 Add `cp` command for `admin-performance.html` in `package.json` build script
## 5. Frontend — Reusable Components
- [x] 5.1 Create `GaugeBar.vue` — horizontal gauge bar with label, value, max, and color threshold props
- [x] 5.2 Create `StatCard.vue` — mini card with numeric value, label, and optional unit/icon
- [x] 5.3 Create `StatusDot.vue` — colored dot indicator (healthy/degraded/error/disabled) with label
## 6. Frontend — App.vue Main Dashboard
- [x] 6.1 Implement data fetching layer: `loadSystemStatus()`, `loadMetrics()`, `loadPerformanceDetail()`, `loadLogs()`, `loadWorkerStatus()` with `Promise.all` parallel fetch and `useAutoRefresh` (30s)
- [x] 6.2 Build header section with gradient background, title, auto-refresh toggle, and manual refresh button
- [x] 6.3 Build status cards section (Database / Redis / Circuit Breaker / Worker PID) using StatusDot
- [x] 6.4 Build query performance panel with P50/P95/P99 stat cards and ECharts latency distribution chart
- [x] 6.5 Build Redis cache detail panel with memory GaugeBar, hit rate, client count, peak memory, and namespace key distribution table
- [x] 6.6 Build memory cache panel with ProcessLevelCache grid cards (entries/max gauge + TTL) and route cache telemetry (L1/L2 hit rate, miss rate, total reads)
- [x] 6.7 Build connection pool panel with saturation GaugeBar and stat card grid (checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, direct connections)
- [x] 6.8 Build worker control panel with PID/uptime/cooldown display, restart button, and confirmation modal
- [x] 6.9 Build system logs panel with level filter, text search, pagination, and log clearing
- [x] 6.10 Create `style.css` with all panel, grid, gauge, card, and responsive layout styles
## 7. Route Integration
- [x] 7.1 Change `/admin/performance` route handler in `admin_routes.py` from `render_template` to `send_from_directory` serving the Vue SPA
- [x] 7.2 Update `routeContracts.js`: change renderMode to `'native'`, rollbackStrategy to `'fallback_to_legacy_route'`, compatibilityPolicy to `'redirect_to_shell_when_spa_enabled'`
## 8. Verification (Phase 1)
- [x] 8.1 Run `cd frontend && npx vite build` — confirm no compilation errors and `admin-performance.html` is produced
- [x] 8.2 Verify all dashboard panels render correctly with live data after service restart
## 9. Backend — Metrics History Store
- [x] 9.1 Create `core/metrics_history.py` with `MetricsHistoryStore` class (SQLite schema, thread-local connections, write_lock, write_snapshot, query_snapshots, cleanup)
- [x] 9.2 Add `MetricsHistoryCollector` class (daemon thread, configurable interval, collect pool/redis/route_cache/latency)
- [x] 9.3 Add module-level `get_metrics_history_store()`, `start_metrics_history(app)`, `stop_metrics_history()` functions
## 10. Backend — Lifecycle Integration
- [x] 10.1 Call `start_metrics_history(app)` in `app.py` after other background services
- [x] 10.2 Call `stop_metrics_history()` in `_shutdown_runtime_resources()` in `app.py`
## 11. Backend — Performance History API
- [x] 11.1 Add `GET /admin/api/performance-history` endpoint in `admin_routes.py` (minutes param, clamped 1-180, returns snapshots array)
## 12. Frontend — Trend Charts
- [x] 12.1 Create `TrendChart.vue` component using vue-echarts VChart (line/area chart, dual yAxis support, time labels, autoresize)
- [x] 12.2 Add `loadPerformanceHistory()` fetch to `App.vue` and integrate into `refreshAll()`
- [x] 12.3 Add 4 TrendChart panels to `App.vue` template (pool saturation, query latency, Redis memory, cache hit rates)
- [x] 12.4 Add trend chart styles to `style.css`
## 13. Verification (Phase 2)
- [x] 13.1 Run `cd frontend && npm run build` — confirm no compilation errors
- [x] 13.2 Verify trend charts render with historical data after service restart + 60s collection

View File

@@ -0,0 +1,100 @@
## ADDED Requirements
### Requirement: Vue 3 SPA page replaces Jinja2 template
The `/admin/performance` route SHALL serve a Vue 3 SPA page built by Vite, replacing the existing Jinja2 server-rendered template. The SPA SHALL be registered as a Vite entry point and integrated into the portal-shell navigation as a `renderMode: 'native'` route.
#### Scenario: Page loads as Vue SPA
- **WHEN** user navigates to `/admin/performance`
- **THEN** the server SHALL return the Vite-built `admin-performance.html` static file (not a Jinja2 rendered template)
#### Scenario: Portal-shell integration
- **WHEN** the portal-shell renders `/admin/performance`
- **THEN** it SHALL load the page as a native Vue SPA (not an external iframe)
### Requirement: Status cards display system health
The dashboard SHALL display 4 status cards in a horizontal grid: Database, Redis, Circuit Breaker, and Worker PID. Each card SHALL show a StatusDot indicator (healthy/degraded/error/disabled) with the current status value.
#### Scenario: All systems healthy
- **WHEN** all backend systems report healthy status via `/admin/api/system-status`
- **THEN** all 4 status cards SHALL display green StatusDot indicators with their respective values
#### Scenario: Redis disabled
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
- **THEN** the Redis status card SHALL display a disabled StatusDot indicator and the Redis cache panel SHALL show a graceful degradation message
### Requirement: Query performance panel with ECharts
The dashboard SHALL display query performance metrics (P50, P95, P99 latencies, total queries, slow queries) and an ECharts latency distribution chart, replacing the existing Chart.js implementation.
#### Scenario: Metrics loaded successfully
- **WHEN** `/admin/api/metrics` returns valid performance data
- **THEN** the panel SHALL display P50/P95/P99 latency values and render an ECharts bar chart showing latency distribution
#### Scenario: No metrics data
- **WHEN** `/admin/api/metrics` returns empty or null metrics
- **THEN** the panel SHALL display placeholder text indicating no data available
### Requirement: Redis cache detail panel
The dashboard SHALL display a Redis cache detail panel showing memory usage (as a GaugeBar), connected clients, hit rate percentage, peak memory, and a namespace key distribution table.
#### Scenario: Redis active with data
- **WHEN** `/admin/api/performance-detail` returns Redis data with namespace key counts
- **THEN** the panel SHALL display a memory GaugeBar, hit rate, client count, and a table listing each namespace with its key count
#### Scenario: Redis disabled
- **WHEN** Redis is disabled
- **THEN** the Redis detail panel SHALL display a disabled state message without errors
### Requirement: Memory cache panel
The dashboard SHALL display ProcessLevelCache statistics as grid cards (showing entries/max_size as a mini gauge and TTL) plus Route Cache telemetry (L1 hit rate, L2 hit rate, miss rate, total reads).
#### Scenario: Multiple caches registered
- **WHEN** `/admin/api/performance-detail` returns process_caches with multiple entries
- **THEN** the panel SHALL render one card per cache instance showing entries, max_size, TTL, and description
#### Scenario: Route cache telemetry
- **WHEN** `/admin/api/performance-detail` returns route_cache data
- **THEN** the panel SHALL display L1 hit rate, L2 hit rate, miss rate, and total reads
### Requirement: Connection pool panel
The dashboard SHALL display connection pool saturation as a GaugeBar and stat cards showing checked_out, checked_in, overflow, max_capacity, pool_size, pool_recycle, pool_timeout, and direct connection count.
#### Scenario: Pool under normal load
- **WHEN** pool saturation is below 80%
- **THEN** the GaugeBar SHALL display in a normal color (green/blue)
#### Scenario: Pool near saturation
- **WHEN** pool saturation exceeds 80%
- **THEN** the GaugeBar SHALL display in a warning color (yellow/orange/red)
### Requirement: Worker control panel
The dashboard SHALL display worker PID, uptime, cooldown status, and provide a restart button with a confirmation modal.
#### Scenario: Restart worker
- **WHEN** user clicks the restart button and confirms in the modal
- **THEN** the system SHALL POST to `/admin/api/worker/restart` and display the result
#### Scenario: Restart during cooldown
- **WHEN** worker is in cooldown period
- **THEN** the restart button SHALL be disabled with a cooldown indicator
### Requirement: System logs panel with filtering and pagination
The dashboard SHALL display system logs with level filtering, text search, and pagination controls.
#### Scenario: Filter by log level
- **WHEN** user selects a specific log level filter
- **THEN** only logs matching that level SHALL be displayed
#### Scenario: Paginate logs
- **WHEN** logs exceed the page size
- **THEN** pagination controls SHALL allow navigating between pages
### Requirement: Auto-refresh with toggle
The dashboard SHALL auto-refresh all panels every 30 seconds using `useAutoRefresh`. The user SHALL be able to toggle auto-refresh on/off and manually trigger a refresh.
#### Scenario: Auto-refresh enabled
- **WHEN** auto-refresh is enabled (default)
- **THEN** all panels SHALL refresh their data every 30 seconds via `Promise.all` parallel fetch
#### Scenario: Manual refresh
- **WHEN** user clicks the manual refresh button
- **THEN** all panels SHALL immediately refresh their data

View File

@@ -0,0 +1,56 @@
## ADDED Requirements
### Requirement: ProcessLevelCache stats method
Every `ProcessLevelCache` instance SHALL expose a `stats()` method that returns a dict containing `entries` (live entries count), `max_size`, and `ttl_seconds`.
#### Scenario: Stats on active cache
- **WHEN** `stats()` is called on a ProcessLevelCache with 5 live entries (max_size=32, ttl=30s)
- **THEN** it SHALL return `{"entries": 5, "max_size": 32, "ttl_seconds": 30}`
#### Scenario: Stats with expired entries
- **WHEN** `stats()` is called and some entries have exceeded TTL
- **THEN** `entries` SHALL only count entries where `now - timestamp <= ttl`
#### Scenario: Thread safety
- **WHEN** `stats()` is called concurrently with cache writes
- **THEN** it SHALL acquire the cache lock and return consistent data without races
### Requirement: ProcessLevelCache global registry
The system SHALL maintain a module-level registry in `core/cache.py` that maps cache names to `(description, instance)` tuples. Services SHALL register their cache instances at module load time via `register_process_cache(name, instance, description)`.
#### Scenario: Register and retrieve all caches
- **WHEN** multiple services register their caches and `get_all_process_cache_stats()` is called
- **THEN** it SHALL return a dict of `{name: {entries, max_size, ttl_seconds, description}}` for all registered caches
#### Scenario: Cache not registered
- **WHEN** a service's ProcessLevelCache is not registered
- **THEN** it SHALL NOT appear in `get_all_process_cache_stats()` output
### Requirement: Performance detail API endpoint
The system SHALL expose `GET /admin/api/performance-detail` that returns a JSON object with sections: `redis`, `process_caches`, `route_cache`, `db_pool`, and `direct_connections`.
#### Scenario: All systems available
- **WHEN** the API is called and all subsystems are healthy
- **THEN** it SHALL return all 5 sections with current telemetry data
#### Scenario: Redis disabled
- **WHEN** Redis is disabled (`REDIS_ENABLED=false`)
- **THEN** the `redis` section SHALL be `null` or contain `{"enabled": false}`, and other sections SHALL still return normally
### Requirement: Redis namespace key distribution
The performance-detail API SHALL scan Redis keys by namespace prefix and return key counts per namespace. Namespaces SHALL include: `data`, `route_cache`, `equipment_status`, `reject_dataset`, `meta`, `lock`, `scrap_exclusion`.
#### Scenario: Keys exist across namespaces
- **WHEN** Redis contains keys across multiple namespaces
- **THEN** the `redis.namespaces` array SHALL list each namespace with its `name` and `key_count`
#### Scenario: SCAN safety
- **WHEN** scanning Redis keys
- **THEN** the system SHALL use `SCAN` (not `KEYS`) to avoid blocking Redis
### Requirement: Route cache telemetry in performance detail
The performance-detail API SHALL include route cache telemetry from `get_route_cache_status()`, providing `mode`, `l1_size`, `l1_hit_rate`, `l2_hit_rate`, `miss_rate`, and `reads_total`.
#### Scenario: LayeredCache active
- **WHEN** route cache is in layered mode
- **THEN** the `route_cache` section SHALL include L1 and L2 hit rates from telemetry

View File

@@ -0,0 +1,27 @@
## ADDED Requirements
### Requirement: Connection pool status in performance detail
The performance-detail API SHALL include `db_pool` section with `status` (checked_out, checked_in, overflow, max_capacity, saturation) from `get_pool_status()` and `config` (pool_size, max_overflow, pool_timeout, pool_recycle) from `get_pool_runtime_config()`.
#### Scenario: Pool status retrieved
- **WHEN** the API is called
- **THEN** `db_pool.status` SHALL contain current pool utilization metrics and `db_pool.config` SHALL contain the pool configuration values
#### Scenario: Saturation calculation
- **WHEN** the pool has 8 checked_out connections and max_capacity is 30
- **THEN** saturation SHALL be reported as approximately 26.7%
### Requirement: Direct Oracle connection counter
The system SHALL maintain a thread-safe monotonic counter in `database.py` that increments each time `get_db_connection()` or `read_sql_df_slow()` successfully creates a direct (non-pooled) Oracle connection.
#### Scenario: Counter increments on direct connection
- **WHEN** `get_db_connection()` successfully creates a connection
- **THEN** the direct connection counter SHALL increment by 1
#### Scenario: Counter in performance detail
- **WHEN** the performance-detail API is called
- **THEN** `direct_connections` SHALL contain `total_since_start` (counter value) and `worker_pid` (current process PID)
#### Scenario: Counter is per-worker
- **WHEN** multiple gunicorn workers are running
- **THEN** each worker SHALL maintain its own independent counter, and the API SHALL return the counter for the responding worker

View File

@@ -0,0 +1,65 @@
## ADDED Requirements
### Requirement: SQLite metrics history store
The system SHALL provide a `MetricsHistoryStore` class in `core/metrics_history.py` that persists metrics snapshots to a SQLite database (`logs/metrics_history.sqlite` by default). The store SHALL use thread-local connections and a write lock, following the `LogStore` pattern in `core/log_store.py`.
#### Scenario: Write and query snapshots
- **WHEN** `write_snapshot(data)` is called with pool/redis/route_cache/latency metrics
- **THEN** a row SHALL be inserted into `metrics_snapshots` with the current ISO 8601 timestamp and worker PID
#### Scenario: Query by time range
- **WHEN** `query_snapshots(minutes=30)` is called
- **THEN** it SHALL return all rows from the last 30 minutes, ordered by timestamp ascending
#### Scenario: Retention cleanup
- **WHEN** `cleanup()` is called
- **THEN** rows older than `METRICS_HISTORY_RETENTION_DAYS` (default 3) SHALL be deleted, and total rows SHALL be capped at `METRICS_HISTORY_MAX_ROWS` (default 50000)
#### Scenario: Thread safety
- **WHEN** multiple threads write snapshots concurrently
- **THEN** the write lock SHALL serialize writes and prevent database corruption
### Requirement: Background metrics collector
The system SHALL provide a `MetricsHistoryCollector` class that runs a daemon thread collecting metrics snapshots at a configurable interval (default 30 seconds, via `METRICS_HISTORY_INTERVAL` env var).
#### Scenario: Automatic collection
- **WHEN** the collector is started via `start_metrics_history(app)`
- **THEN** it SHALL collect pool status, Redis info, route cache status, and query latency metrics every interval and write them to the store
#### Scenario: Graceful shutdown
- **WHEN** `stop_metrics_history()` is called
- **THEN** the collector thread SHALL stop within one interval period
#### Scenario: Subsystem unavailability
- **WHEN** a subsystem (e.g., Redis) is unavailable during collection
- **THEN** the collector SHALL write null/0 for those fields and continue collecting other metrics
### Requirement: Performance history API endpoint
The system SHALL expose `GET /admin/api/performance-history` that returns historical metrics snapshots.
#### Scenario: Query with time range
- **WHEN** the API is called with `?minutes=30`
- **THEN** it SHALL return `{"success": true, "data": {"snapshots": [...], "count": N}}`
#### Scenario: Time range bounds
- **WHEN** `minutes` is less than 1 or greater than 180
- **THEN** it SHALL be clamped to the range [1, 180]
#### Scenario: Admin authentication
- **WHEN** the API is called without admin authentication
- **THEN** it SHALL be rejected by the `@admin_required` decorator
### Requirement: Frontend trend charts
The system SHALL display 4 trend chart panels in the admin performance dashboard using vue-echarts VChart line/area charts.
#### Scenario: Trend charts with data
- **WHEN** historical snapshots contain more than 1 data point
- **THEN** the dashboard SHALL display trend charts for: connection pool saturation, query latency (P50/P95/P99), Redis memory, and cache hit rates
#### Scenario: Trend charts without data
- **WHEN** historical snapshots are empty or contain only 1 data point
- **THEN** the trend charts SHALL NOT be displayed (hidden via `v-if`)
#### Scenario: Auto-refresh
- **WHEN** the dashboard auto-refreshes
- **THEN** historical data SHALL also be refreshed alongside real-time metrics

View File

@@ -295,6 +295,12 @@ def _shutdown_runtime_resources() -> None:
except Exception as exc: except Exception as exc:
logger.warning("Error stopping scrap exclusion cache worker: %s", exc) logger.warning("Error stopping scrap exclusion cache worker: %s", exc)
try:
from mes_dashboard.core.metrics_history import stop_metrics_history
stop_metrics_history()
except Exception as exc:
logger.warning("Error stopping metrics history: %s", exc)
try: try:
close_redis() close_redis()
except Exception as exc: except Exception as exc:
@@ -390,6 +396,8 @@ def create_app(config_name: str | None = None) -> Flask:
start_cache_updater() # Start Redis cache updater start_cache_updater() # Start Redis cache updater
init_realtime_equipment_cache(app) # Start realtime equipment status cache init_realtime_equipment_cache(app) # Start realtime equipment status cache
init_scrap_reason_exclusion_cache(app) # Start exclusion-policy cache sync init_scrap_reason_exclusion_cache(app) # Start exclusion-policy cache sync
from mes_dashboard.core.metrics_history import start_metrics_history
start_metrics_history(app) # Start metrics history collector
_register_shutdown_hooks(app) _register_shutdown_hooks(app)
# Register API routes # Register API routes

View File

@@ -95,6 +95,34 @@ class ProcessLevelCache:
with self._lock: with self._lock:
self._cache.clear() self._cache.clear()
def stats(self) -> dict:
"""Return live cache statistics for telemetry."""
with self._lock:
now = time.time()
live = sum(1 for _, (_, ts) in self._cache.items() if now - ts <= self._ttl)
return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
# ============================================================
# Process-Level Cache Registry (for admin telemetry)
# ============================================================
_PROCESS_CACHE_REGISTRY: dict[str, tuple[str, Any]] = {}
def register_process_cache(name: str, cache_instance: Any, description: str = "") -> None:
"""Register a ProcessLevelCache instance for admin telemetry."""
_PROCESS_CACHE_REGISTRY[name] = (description, cache_instance)
def get_all_process_cache_stats() -> dict[str, dict]:
"""Collect stats from all registered ProcessLevelCache instances."""
return {
name: {**inst.stats(), "description": desc}
for name, (desc, inst) in _PROCESS_CACHE_REGISTRY.items()
if callable(getattr(inst, "stats", None))
}
def _resolve_cache_max_size(env_name: str, default: int) -> int: def _resolve_cache_max_size(env_name: str, default: int) -> int:
value = os.getenv(env_name) value = os.getenv(env_name)
@@ -116,6 +144,7 @@ _wip_df_cache = ProcessLevelCache(
ttl_seconds=30, ttl_seconds=30,
max_size=WIP_PROCESS_CACHE_MAX_SIZE, max_size=WIP_PROCESS_CACHE_MAX_SIZE,
) )
register_process_cache("wip_dataframe", _wip_df_cache, "WIP DataFrame (L1, 30s)")
_wip_parse_lock = threading.Lock() _wip_parse_lock = threading.Lock()
# ============================================================ # ============================================================

View File

@@ -416,6 +416,14 @@ def dispose_engine():
# Direct Connection Helpers # Direct Connection Helpers
# ============================================================ # ============================================================
_DIRECT_CONN_COUNTER = 0
_DIRECT_CONN_LOCK = threading.Lock()
def get_direct_connection_count() -> int:
"""Return total direct (non-pooled) connections since worker start."""
return _DIRECT_CONN_COUNTER
def get_db_connection(): def get_db_connection():
"""Create a direct oracledb connection. """Create a direct oracledb connection.
@@ -432,6 +440,9 @@ def get_db_connection():
retry_delay=runtime["retry_delay"], retry_delay=runtime["retry_delay"],
) )
conn.call_timeout = runtime["call_timeout_ms"] conn.call_timeout = runtime["call_timeout_ms"]
with _DIRECT_CONN_LOCK:
global _DIRECT_CONN_COUNTER
_DIRECT_CONN_COUNTER += 1
logger.debug( logger.debug(
"Direct oracledb connection established (call_timeout_ms=%s)", "Direct oracledb connection established (call_timeout_ms=%s)",
runtime["call_timeout_ms"], runtime["call_timeout_ms"],
@@ -591,6 +602,9 @@ def read_sql_df_slow(
retry_delay=runtime["retry_delay"], retry_delay=runtime["retry_delay"],
) )
conn.call_timeout = timeout_ms conn.call_timeout = timeout_ms
with _DIRECT_CONN_LOCK:
global _DIRECT_CONN_COUNTER
_DIRECT_CONN_COUNTER += 1
logger.debug( logger.debug(
"Slow-query connection established (call_timeout_ms=%s)", timeout_ms "Slow-query connection established (call_timeout_ms=%s)", timeout_ms
) )

View File

@@ -0,0 +1,369 @@
# -*- coding: utf-8 -*-
"""SQLite-based metrics history store for admin performance dashboard.
Periodically snapshots system metrics (pool, redis, cache, latency)
into a SQLite database for historical trend visualization.
Follows the LogStore pattern from core/log_store.py.
"""
from __future__ import annotations
import logging
import os
import sqlite3
import threading
import time
from contextlib import contextmanager
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional
logger = logging.getLogger('mes_dashboard.metrics_history')
# ============================================================
# Configuration
# ============================================================
# All settings are read once at import time from environment variables.
# Path of the SQLite database file (relative paths resolve against the
# process working directory).
METRICS_HISTORY_PATH = os.getenv(
    'METRICS_HISTORY_PATH',
    'logs/metrics_history.sqlite',
)
# Seconds between collector snapshots.
METRICS_HISTORY_INTERVAL = int(os.getenv('METRICS_HISTORY_INTERVAL', '30'))
# Snapshots older than this many days are purged by MetricsHistoryStore.cleanup().
METRICS_HISTORY_RETENTION_DAYS = int(os.getenv('METRICS_HISTORY_RETENTION_DAYS', '3'))
# Hard cap on total rows kept after the retention-based deletion pass.
METRICS_HISTORY_MAX_ROWS = int(os.getenv('METRICS_HISTORY_MAX_ROWS', '50000'))
# ============================================================
# Database Schema
# ============================================================
# One row per snapshot.  All metric columns are nullable so a partially
# unavailable subsystem (e.g. Redis down) still yields a row.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS metrics_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
ts TEXT NOT NULL,
worker_pid INTEGER NOT NULL,
pool_saturation REAL,
pool_checked_out INTEGER,
pool_checked_in INTEGER,
pool_overflow INTEGER,
pool_max_capacity INTEGER,
redis_used_memory INTEGER,
redis_hit_rate REAL,
rc_l1_hit_rate REAL,
rc_l2_hit_rate REAL,
rc_miss_rate REAL,
latency_p50_ms REAL,
latency_p95_ms REAL,
latency_p99_ms REAL,
latency_count INTEGER
);
"""
# Index on the timestamp column: both query_snapshots() and cleanup()
# filter/order by ts.
CREATE_INDEX_SQL = (
    "CREATE INDEX IF NOT EXISTS idx_metrics_ts ON metrics_snapshots(ts);"
)
# Column order mirrors the INSERT in write_snapshot().
# NOTE(review): not referenced by the SQL in this module — presumably kept
# for consumers building result schemas; confirm before removing.
COLUMNS = [
    "ts", "worker_pid",
    "pool_saturation", "pool_checked_out", "pool_checked_in",
    "pool_overflow", "pool_max_capacity",
    "redis_used_memory", "redis_hit_rate",
    "rc_l1_hit_rate", "rc_l2_hit_rate", "rc_miss_rate",
    "latency_p50_ms", "latency_p95_ms", "latency_p99_ms", "latency_count",
]
# ============================================================
# Metrics History Store
# ============================================================
class MetricsHistoryStore:
    """SQLite-based metrics history store (follows LogStore pattern).

    Persists one row per metrics snapshot into the ``metrics_snapshots``
    table.  Connections are cached per thread (``threading.local``) while a
    single write lock serializes all mutating statements, so the collector
    thread and request handlers can use the store concurrently.
    """

    def __init__(self, db_path: str = METRICS_HISTORY_PATH):
        """Bind the store to *db_path*; no I/O happens until initialize()."""
        self.db_path = db_path
        # Per-thread sqlite3 connection cache (see _get_connection).
        self._local = threading.local()
        # Serializes INSERT/DELETE across threads.
        self._write_lock = threading.Lock()
        # Flipped by initialize(); guards repeated schema creation.
        self._initialized = False

    def initialize(self) -> None:
        """Create the database directory, table, and index (idempotent)."""
        if self._initialized:
            return
        db_dir = Path(self.db_path).parent
        db_dir.mkdir(parents=True, exist_ok=True)
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(CREATE_TABLE_SQL)
            cursor.execute(CREATE_INDEX_SQL)
            conn.commit()
        self._initialized = True
        logger.info("Metrics history store initialized at %s", self.db_path)

    @contextmanager
    def _get_connection(self) -> Generator[sqlite3.Connection, None, None]:
        """Yield this thread's cached connection, creating it on demand.

        On any sqlite3 error the cached connection is closed and dropped so
        the next call starts from a fresh connection; the error re-raises.
        """
        if not hasattr(self._local, 'connection') or self._local.connection is None:
            # check_same_thread=False is safe here because each thread only
            # ever sees its own thread-local connection.
            self._local.connection = sqlite3.connect(
                self.db_path, timeout=10.0, check_same_thread=False,
            )
            # Row factory lets query_snapshots() build dicts from rows.
            self._local.connection.row_factory = sqlite3.Row
        try:
            yield self._local.connection
        except sqlite3.Error as exc:
            logger.error("Metrics history DB error: %s", exc)
            try:
                self._local.connection.close()
            except Exception:
                pass
            self._local.connection = None
            raise

    def write_snapshot(self, data: Dict[str, Any]) -> bool:
        """Insert one snapshot row; return True on success, False on failure.

        *data* may carry "pool", "redis", "route_cache" and "latency"
        sub-dicts; absent sections or keys produce NULL columns.  The
        timestamp is local-time ISO 8601 and the row records this worker's
        PID.  Failures are logged at debug level only (best-effort write).
        """
        if not self._initialized:
            self.initialize()
        ts = datetime.now().isoformat()
        pid = os.getpid()
        # `or {}` also normalizes explicit None sections.
        pool = data.get("pool") or {}
        redis = data.get("redis") or {}
        rc = data.get("route_cache") or {}
        lat = data.get("latency") or {}
        try:
            with self._write_lock:
                with self._get_connection() as conn:
                    conn.execute(
                        """
                        INSERT INTO metrics_snapshots
                        (ts, worker_pid,
                        pool_saturation, pool_checked_out, pool_checked_in,
                        pool_overflow, pool_max_capacity,
                        redis_used_memory, redis_hit_rate,
                        rc_l1_hit_rate, rc_l2_hit_rate, rc_miss_rate,
                        latency_p50_ms, latency_p95_ms, latency_p99_ms, latency_count)
                        VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                        """,
                        (
                            ts, pid,
                            pool.get("saturation"),
                            pool.get("checked_out"),
                            pool.get("checked_in"),
                            pool.get("overflow"),
                            pool.get("max_capacity"),
                            redis.get("used_memory"),
                            redis.get("hit_rate"),
                            rc.get("l1_hit_rate"),
                            rc.get("l2_hit_rate"),
                            rc.get("miss_rate"),
                            lat.get("p50_ms"),
                            lat.get("p95_ms"),
                            lat.get("p99_ms"),
                            lat.get("count"),
                        ),
                    )
                    conn.commit()
            return True
        except Exception as exc:
            logger.debug("Failed to write metrics snapshot: %s", exc)
            return False

    def query_snapshots(self, minutes: int = 30) -> List[Dict[str, Any]]:
        """Return snapshots from the last *minutes* minutes, oldest first.

        Returns an empty list on any database error — the read path never
        raises to callers.
        """
        if not self._initialized:
            self.initialize()
        # ISO 8601 strings compare lexicographically in chronological order,
        # so a plain string comparison on ts works as a time-range filter.
        cutoff = (datetime.now() - timedelta(minutes=minutes)).isoformat()
        try:
            with self._get_connection() as conn:
                cursor = conn.execute(
                    "SELECT * FROM metrics_snapshots WHERE ts >= ? ORDER BY ts ASC",
                    (cutoff,),
                )
                return [dict(row) for row in cursor.fetchall()]
        except Exception as exc:
            logger.error("Failed to query metrics snapshots: %s", exc)
            return []

    def cleanup(self) -> int:
        """Purge old snapshots; return the number of rows deleted.

        Two passes: (1) delete rows older than
        METRICS_HISTORY_RETENTION_DAYS, then (2) if more than
        METRICS_HISTORY_MAX_ROWS remain, delete the oldest excess rows.
        No-op (returns 0) when the store was never initialized.
        """
        if not self._initialized:
            return 0
        deleted = 0
        try:
            with self._write_lock:
                with self._get_connection() as conn:
                    cutoff = (
                        datetime.now() - timedelta(days=METRICS_HISTORY_RETENTION_DAYS)
                    ).isoformat()
                    cursor = conn.execute(
                        "DELETE FROM metrics_snapshots WHERE ts < ?", (cutoff,),
                    )
                    deleted += cursor.rowcount
                    row = conn.execute(
                        "SELECT COUNT(*) FROM metrics_snapshots",
                    ).fetchone()
                    count = row[0] if row else 0
                    if count > METRICS_HISTORY_MAX_ROWS:
                        excess = count - METRICS_HISTORY_MAX_ROWS
                        cursor = conn.execute(
                            """
                            DELETE FROM metrics_snapshots WHERE id IN (
                            SELECT id FROM metrics_snapshots ORDER BY ts ASC LIMIT ?
                            )
                            """,
                            (excess,),
                        )
                        deleted += cursor.rowcount
                    conn.commit()
            if deleted > 0:
                logger.info("Cleaned up %d metrics history rows", deleted)
        except Exception as exc:
            logger.error("Failed to cleanup metrics history: %s", exc)
        return deleted
# ============================================================
# Background Collector
# ============================================================
class MetricsHistoryCollector:
    """Daemon thread that snapshots metrics at a fixed interval.

    Collects connection-pool, Redis, route-cache, and query-latency metrics
    and writes them to a :class:`MetricsHistoryStore`.  Each subsystem is
    sampled inside its own try/except, so one unavailable dependency
    degrades to an empty section instead of aborting the whole snapshot.
    """

    # Run store cleanup once every N collection cycles (~50 min at 30s).
    _CLEANUP_EVERY = 100

    def __init__(
        self,
        app: Any = None,
        store: Optional[MetricsHistoryStore] = None,
        interval: int = METRICS_HISTORY_INTERVAL,
    ):
        """Create a collector.

        Args:
            app: Optional Flask app; when given, route-cache metrics are
                collected inside its application context.
            store: Target store; defaults to the process-global store.
            interval: Seconds between snapshots.
        """
        self._app = app
        self._store = store or get_metrics_history_store()
        self.interval = interval
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._cleanup_counter = 0

    def start(self) -> None:
        """Start the daemon thread; no-op if it is already running."""
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._thread = threading.Thread(
            target=self._run, daemon=True, name="metrics-history-collector",
        )
        self._thread.start()
        logger.info(
            "Metrics history collector started (interval=%ds)", self.interval,
        )

    def stop(self) -> None:
        """Signal the thread to stop and wait briefly for it to exit."""
        if self._thread and self._thread.is_alive():
            self._stop_event.set()
            self._thread.join(timeout=5)
            logger.info("Metrics history collector stopped")

    def _run(self) -> None:
        """Thread body: collect immediately, then once per interval.

        Event.wait() doubles as the sleep, so stop() interrupts the pause
        and the thread exits within one interval.
        """
        self._collect_snapshot()
        while not self._stop_event.wait(self.interval):
            self._collect_snapshot()
            # Periodic retention cleanup, amortized across cycles.
            self._cleanup_counter += 1
            if self._cleanup_counter >= self._CLEANUP_EVERY:
                self._cleanup_counter = 0
                self._store.cleanup()

    def _collect_snapshot(self) -> None:
        """Collect one snapshot from all subsystems and persist it."""
        try:
            data: Dict[str, Any] = {}
            # --- Connection pool ---
            try:
                from mes_dashboard.core.database import get_pool_status
                data["pool"] = get_pool_status()
            except Exception:
                data["pool"] = {}
            # --- Redis memory / hit rate ---
            try:
                from mes_dashboard.core.redis_client import (
                    get_redis_client,
                    REDIS_ENABLED,
                )
                data["redis"] = {}
                if REDIS_ENABLED:
                    client = get_redis_client()
                    if client is not None:
                        info = client.info(section="memory")
                        stats_info = client.info(section="stats")
                        hits = int(stats_info.get("keyspace_hits", 0))
                        misses = int(stats_info.get("keyspace_misses", 0))
                        total = hits + misses
                        data["redis"] = {
                            "used_memory": info.get("used_memory", 0),
                            "hit_rate": round(hits / total, 4) if total > 0 else 0,
                        }
            except Exception:
                data["redis"] = {}
            # --- Route cache telemetry ---
            try:
                # Import hoisted out of both branches (was duplicated).
                from mes_dashboard.routes.health_routes import (
                    get_route_cache_status,
                )
                # An app context is only entered when a Flask app was given.
                if self._app:
                    with self._app.app_context():
                        rc = get_route_cache_status()
                else:
                    rc = get_route_cache_status()
                data["route_cache"] = {
                    "l1_hit_rate": rc.get("l1_hit_rate"),
                    "l2_hit_rate": rc.get("l2_hit_rate"),
                    "miss_rate": rc.get("miss_rate"),
                }
            except Exception:
                data["route_cache"] = {}
            # --- Query latency percentiles ---
            try:
                from mes_dashboard.core.metrics import get_metrics_summary
                summary = get_metrics_summary()
                data["latency"] = {
                    "p50_ms": summary.get("p50_ms", 0),
                    "p95_ms": summary.get("p95_ms", 0),
                    "p99_ms": summary.get("p99_ms", 0),
                    "count": summary.get("count", 0),
                }
            except Exception:
                data["latency"] = {}
            self._store.write_snapshot(data)
        except Exception as exc:
            # Best-effort telemetry: never let a snapshot failure escape
            # into the daemon thread.
            logger.debug("Metrics snapshot collection failed: %s", exc)
# ============================================================
# Global Instance & Lifecycle
# ============================================================
# Process-global singletons: one store and at most one collector per worker.
_STORE: Optional[MetricsHistoryStore] = None
_COLLECTOR: Optional[MetricsHistoryCollector] = None


def get_metrics_history_store() -> MetricsHistoryStore:
    """Return the shared MetricsHistoryStore, creating it on first use.

    The global is assigned only after initialize() succeeds, so a failed
    initialization is retried on the next call instead of caching a
    half-constructed store.
    """
    global _STORE
    if _STORE is None:
        store = MetricsHistoryStore()
        store.initialize()
        _STORE = store
    return _STORE
def start_metrics_history(app: Any = None) -> None:
    """Start the global metrics history collector.

    Safe to call more than once (e.g. when the app factory runs repeatedly
    in tests): an already-present collector is stopped before being
    replaced, so repeated calls do not leak daemon threads.

    Args:
        app: Optional Flask app passed through to the collector so
            route-cache metrics can be read inside its app context.
    """
    global _COLLECTOR
    if _COLLECTOR is not None:
        # Stop the previous collector before replacing it.
        _COLLECTOR.stop()
    store = get_metrics_history_store()
    _COLLECTOR = MetricsHistoryCollector(app=app, store=store)
    _COLLECTOR.start()
def stop_metrics_history() -> None:
    """Stop and discard the global collector; no-op when none is running."""
    global _COLLECTOR
    if _COLLECTOR is None:
        return
    _COLLECTOR.stop()
    _COLLECTOR = None

View File

@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""Admin routes for page management and performance monitoring.""" """Admin routes for page management and performance monitoring."""
from __future__ import annotations from __future__ import annotations
import json import json
import logging import logging
import os import os
@@ -10,8 +10,8 @@ import time
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from flask import Blueprint, g, jsonify, render_template, request from flask import Blueprint, current_app, g, jsonify, render_template, request, send_from_directory
from mes_dashboard.core.permissions import admin_required from mes_dashboard.core.permissions import admin_required
from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS
@@ -42,14 +42,14 @@ from mes_dashboard.services.page_registry import (
set_page_status, set_page_status,
update_drawer, update_drawer,
) )
admin_bp = Blueprint("admin", __name__, url_prefix="/admin") admin_bp = Blueprint("admin", __name__, url_prefix="/admin")
logger = logging.getLogger("mes_dashboard.admin") logger = logging.getLogger("mes_dashboard.admin")
# ============================================================ # ============================================================
# Worker Restart Configuration # Worker Restart Configuration
# ============================================================ # ============================================================
_RUNTIME_CONTRACT = load_runtime_contract() _RUNTIME_CONTRACT = load_runtime_contract()
WATCHDOG_RUNTIME_DIR = _RUNTIME_CONTRACT["watchdog_runtime_dir"] WATCHDOG_RUNTIME_DIR = _RUNTIME_CONTRACT["watchdog_runtime_dir"]
RESTART_FLAG_PATH = _RUNTIME_CONTRACT["watchdog_restart_flag"] RESTART_FLAG_PATH = _RUNTIME_CONTRACT["watchdog_restart_flag"]
@@ -57,24 +57,28 @@ RESTART_STATE_PATH = _RUNTIME_CONTRACT["watchdog_state_file"]
WATCHDOG_PID_PATH = _RUNTIME_CONTRACT["watchdog_pid_file"] WATCHDOG_PID_PATH = _RUNTIME_CONTRACT["watchdog_pid_file"]
GUNICORN_BIND = _RUNTIME_CONTRACT["gunicorn_bind"] GUNICORN_BIND = _RUNTIME_CONTRACT["gunicorn_bind"]
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT["version"] RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT["version"]
# Track last restart request time (in-memory for this worker) # Track last restart request time (in-memory for this worker)
_last_restart_request: float = 0.0 _last_restart_request: float = 0.0
# ============================================================ # ============================================================
# Performance Monitoring Routes # Performance Monitoring Routes
# ============================================================ # ============================================================
@admin_bp.route("/performance") @admin_bp.route("/performance")
@admin_required @admin_required
def performance(): def performance():
"""Performance monitoring dashboard.""" """Performance monitoring dashboard (Vue SPA)."""
return render_template("admin/performance.html") dist_dir = os.path.join(current_app.static_folder or "", "dist")
dist_html = os.path.join(dist_dir, "admin-performance.html")
if os.path.exists(dist_html):
@admin_bp.route("/api/system-status", methods=["GET"]) return send_from_directory(dist_dir, "admin-performance.html")
@admin_required return render_template("admin/performance.html")
@admin_bp.route("/api/system-status", methods=["GET"])
@admin_required
def api_system_status(): def api_system_status():
"""API: Get system status for performance dashboard.""" """API: Get system status for performance dashboard."""
from mes_dashboard.core.database import get_pool_runtime_config, get_pool_status from mes_dashboard.core.database import get_pool_runtime_config, get_pool_status
@@ -85,15 +89,15 @@ def api_system_status():
check_redis, check_redis,
get_route_cache_status, get_route_cache_status,
) )
# Database status # Database status
db_status, db_error = check_database() db_status, db_error = check_database()
# Redis status # Redis status
redis_status = 'disabled' redis_status = 'disabled'
if REDIS_ENABLED: if REDIS_ENABLED:
redis_status, _ = check_redis() redis_status, _ = check_redis()
# Circuit breaker status # Circuit breaker status
circuit_breaker = get_circuit_breaker_status() circuit_breaker = get_circuit_breaker_status()
route_cache = get_route_cache_status() route_cache = get_route_cache_status()
@@ -135,26 +139,26 @@ def api_system_status():
thresholds=thresholds, thresholds=thresholds,
) )
runtime_contract = build_runtime_contract_diagnostics(strict=False) runtime_contract = build_runtime_contract_diagnostics(strict=False)
# Cache status # Cache status
from mes_dashboard.routes.health_routes import ( from mes_dashboard.routes.health_routes import (
get_cache_status, get_cache_status,
get_resource_cache_status, get_resource_cache_status,
get_equipment_status_cache_status get_equipment_status_cache_status
) )
return jsonify({ return jsonify({
"success": True, "success": True,
"data": { "data": {
"database": { "database": {
"status": db_status, "status": db_status,
"error": db_error "error": db_error
}, },
"redis": { "redis": {
"status": redis_status, "status": redis_status,
"enabled": REDIS_ENABLED "enabled": REDIS_ENABLED
}, },
"circuit_breaker": circuit_breaker, "circuit_breaker": circuit_breaker,
"cache": { "cache": {
"wip": get_cache_status(), "wip": get_cache_status(),
"resource": get_resource_cache_status(), "resource": get_resource_cache_status(),
@@ -186,134 +190,265 @@ def api_system_status():
"worker_pid": os.getpid() "worker_pid": os.getpid()
} }
}) })
@admin_bp.route("/api/metrics", methods=["GET"]) @admin_bp.route("/api/metrics", methods=["GET"])
@admin_required @admin_required
def api_metrics(): def api_metrics():
"""API: Get performance metrics for dashboard.""" """API: Get performance metrics for dashboard."""
from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics
summary = get_metrics_summary() summary = get_metrics_summary()
metrics = get_query_metrics() metrics = get_query_metrics()
return jsonify({ return jsonify({
"success": True, "success": True,
"data": { "data": {
"p50_ms": summary.get("p50_ms"), "p50_ms": summary.get("p50_ms"),
"p95_ms": summary.get("p95_ms"), "p95_ms": summary.get("p95_ms"),
"p99_ms": summary.get("p99_ms"), "p99_ms": summary.get("p99_ms"),
"count": summary.get("count"), "count": summary.get("count"),
"slow_count": summary.get("slow_count"), "slow_count": summary.get("slow_count"),
"slow_rate": summary.get("slow_rate"), "slow_rate": summary.get("slow_rate"),
"worker_pid": summary.get("worker_pid"), "worker_pid": summary.get("worker_pid"),
"collected_at": summary.get("collected_at"), "collected_at": summary.get("collected_at"),
# Include latency distribution for charts # Include latency distribution for charts
"latencies": metrics.get_latencies()[-100:] # Last 100 for chart "latencies": metrics.get_latencies()[-100:] # Last 100 for chart
} }
}) })
@admin_bp.route("/api/logs", methods=["GET"]) @admin_bp.route("/api/logs", methods=["GET"])
@admin_required @admin_required
def api_logs(): def api_logs():
"""API: Get recent logs from SQLite log store.""" """API: Get recent logs from SQLite log store."""
from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
if not LOG_STORE_ENABLED: if not LOG_STORE_ENABLED:
return jsonify({ return jsonify({
"success": True, "success": True,
"data": { "data": {
"logs": [], "logs": [],
"enabled": False, "enabled": False,
"total": 0 "total": 0
} }
}) })
# Query parameters # Query parameters
level = request.args.get("level") level = request.args.get("level")
q = request.args.get("q") q = request.args.get("q")
limit = request.args.get("limit", 50, type=int) limit = request.args.get("limit", 50, type=int)
offset = request.args.get("offset", 0, type=int) offset = request.args.get("offset", 0, type=int)
since = request.args.get("since") since = request.args.get("since")
log_store = get_log_store() log_store = get_log_store()
# Get total count for pagination # Get total count for pagination
total = log_store.count_logs(level=level, q=q, since=since) total = log_store.count_logs(level=level, q=q, since=since)
# Get paginated logs # Get paginated logs
logs = log_store.query_logs( logs = log_store.query_logs(
level=level, level=level,
q=q, q=q,
limit=min(limit, 100), # Cap at 100 per page limit=min(limit, 100), # Cap at 100 per page
offset=offset, offset=offset,
since=since since=since
) )
return jsonify({ return jsonify({
"success": True, "success": True,
"data": { "data": {
"logs": logs, "logs": logs,
"count": len(logs), "count": len(logs),
"total": total, "total": total,
"enabled": True, "enabled": True,
"stats": log_store.get_stats() "stats": log_store.get_stats()
} }
}) })
@admin_bp.route("/api/performance-detail", methods=["GET"])
@admin_required
def api_performance_detail():
    """API: Get detailed performance telemetry for admin dashboard.

    Returns redis, process_caches, route_cache, db_pool, and
    direct_connections sections in a single response.  Each collector
    degrades independently: a failure is reported as an
    ``{"error": ...}`` payload for that section instead of failing the
    whole endpoint.
    """
    # Imported lazily so these modules are not loaded at
    # blueprint-registration time.
    from mes_dashboard.core.cache import get_all_process_cache_stats
    from mes_dashboard.core.database import (
        get_direct_connection_count,
        get_pool_runtime_config,
        get_pool_status,
    )
    from mes_dashboard.core.redis_client import (
        get_redis_client,
        REDIS_ENABLED,
        REDIS_KEY_PREFIX,
    )
    from mes_dashboard.routes.health_routes import get_route_cache_status

    # ---- Redis detail ----
    redis_detail = None
    if REDIS_ENABLED:
        client = get_redis_client()
        if client is not None:
            try:
                info = client.info(section="memory")
                stats_info = client.info(section="stats")
                clients_info = client.info(section="clients")
                hits = int(stats_info.get("keyspace_hits", 0))
                misses = int(stats_info.get("keyspace_misses", 0))
                total = hits + misses
                # Keep hit_rate a float even before any traffic, so the
                # JSON field type is stable for chart consumers.
                hit_rate = round(hits / total, 4) if total > 0 else 0.0

                # Count keys per namespace with cursor-based SCAN
                # (non-blocking, unlike KEYS).
                namespace_prefixes = [
                    "data", "route_cache", "equipment_status",
                    "reject_dataset", "meta", "lock", "scrap_exclusion",
                ]
                namespaces = []
                for ns in namespace_prefixes:
                    pattern = f"{REDIS_KEY_PREFIX}:{ns}*"
                    count = 0
                    cursor = 0
                    while True:
                        cursor, keys = client.scan(cursor=cursor, match=pattern, count=100)
                        count += len(keys)
                        if cursor == 0:
                            break
                    namespaces.append({"name": ns, "key_count": count})
                redis_detail = {
                    "used_memory_human": info.get("used_memory_human", "N/A"),
                    "used_memory": info.get("used_memory", 0),
                    "peak_memory_human": info.get("used_memory_peak_human", "N/A"),
                    "peak_memory": info.get("used_memory_peak", 0),
                    "maxmemory_human": info.get("maxmemory_human", "N/A"),
                    "maxmemory": info.get("maxmemory", 0),
                    "connected_clients": clients_info.get("connected_clients", 0),
                    "hit_rate": hit_rate,
                    "keyspace_hits": hits,
                    "keyspace_misses": misses,
                    "namespaces": namespaces,
                }
            except Exception as exc:
                logger.warning("Failed to collect Redis detail: %s", exc)
                redis_detail = {"error": str(exc)}

    # ---- Process caches ----
    process_caches = get_all_process_cache_stats()

    # ---- Route cache ----
    route_cache = get_route_cache_status()

    # ---- DB pool ----
    db_pool = None
    try:
        pool_status = get_pool_status()
        pool_config = get_pool_runtime_config()
        db_pool = {
            "status": pool_status,
            "config": {
                "pool_size": pool_config.get("pool_size"),
                "max_overflow": pool_config.get("max_overflow"),
                "pool_timeout": pool_config.get("pool_timeout"),
                "pool_recycle": pool_config.get("pool_recycle"),
            },
        }
    except Exception as exc:
        logger.warning("Failed to collect DB pool status: %s", exc)
        db_pool = {"error": str(exc)}

    # ---- Direct connections ----
    # Per-worker counter: the PID identifies which gunicorn/uwsgi worker
    # served this request.
    direct_connections = {
        "total_since_start": get_direct_connection_count(),
        "worker_pid": os.getpid(),
    }

    return jsonify({
        "success": True,
        "data": {
            "redis": redis_detail,
            "process_caches": process_caches,
            "route_cache": route_cache,
            "db_pool": db_pool,
            "direct_connections": direct_connections,
        },
    })
@admin_bp.route("/api/performance-history", methods=["GET"])
@admin_required
def api_performance_history():
    """API: Get historical metrics snapshots for trend charts."""
    from mes_dashboard.core.metrics_history import get_metrics_history_store

    # Clamp the requested lookback window to the supported [1, 180] range.
    requested = request.args.get("minutes", 30, type=int)
    window_minutes = min(max(requested, 1), 180)

    snapshots = get_metrics_history_store().query_snapshots(minutes=window_minutes)
    return jsonify({
        "success": True,
        "data": {
            "snapshots": snapshots,
            "count": len(snapshots),
        },
    })
@admin_bp.route("/api/logs/cleanup", methods=["POST"])
@admin_required
def api_logs_cleanup():
    """API: Manually trigger log cleanup.

    Supports optional parameters:
    - older_than_days: Delete logs older than N days (default: use configured retention)
    - keep_count: Keep only the most recent N logs (optional)

    NOTE(review): the parameters above are documented but never read from
    the request -- ``cleanup_old_logs()`` is called with no arguments, so
    only the store's configured retention applies.  Confirm intent with
    the owner before relying on them.
    """
    from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED

    if not LOG_STORE_ENABLED:
        return jsonify({
            "success": False,
            "error": "Log store is disabled"
        }), 400

    log_store = get_log_store()

    # Snapshot stats before and after so the response can report the delta.
    stats_before = log_store.get_stats()
    deleted = log_store.cleanup_old_logs()
    stats_after = log_store.get_stats()

    user = getattr(g, "username", "unknown")
    # Lazy %-args: the message is only rendered when INFO is enabled.
    logger.info("Log cleanup triggered by %s: deleted %s entries", user, deleted)

    return jsonify({
        "success": True,
        "data": {
            "deleted": deleted,
            "before": {
                "count": stats_before.get("count", 0),
                "size_bytes": stats_before.get("size_bytes", 0)
            },
            "after": {
                "count": stats_after.get("count", 0),
                "size_bytes": stats_after.get("size_bytes", 0)
            }
        }
    })
# ============================================================
# Worker Restart Control Routes
# ============================================================
def _get_restart_state() -> dict:
    """Read worker restart state from file (delegates to load_restart_state)."""
    return load_restart_state(RESTART_STATE_PATH)
@@ -323,14 +458,14 @@ def _iso_from_epoch(ts: float) -> str | None:
if ts <= 0: if ts <= 0:
return None return None
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
def _check_restart_cooldown() -> tuple[bool, float]: def _check_restart_cooldown() -> tuple[bool, float]:
"""Check if restart is in cooldown. """Check if restart is in cooldown.
Returns: Returns:
Tuple of (is_in_cooldown, remaining_seconds). Tuple of (is_in_cooldown, remaining_seconds).
""" """
policy = _get_restart_policy_state() policy = _get_restart_policy_state()
if policy.get("cooldown"): if policy.get("cooldown"):
return True, float(policy.get("cooldown_remaining_seconds") or 0.0) return True, float(policy.get("cooldown_remaining_seconds") or 0.0)
@@ -401,18 +536,18 @@ def _log_restart_audit(event: str, payload: dict[str, Any]) -> None:
**payload, **payload,
} }
logger.info("worker_restart_audit %s", json.dumps(entry, ensure_ascii=False)) logger.info("worker_restart_audit %s", json.dumps(entry, ensure_ascii=False))
@admin_bp.route("/api/worker/restart", methods=["POST"]) @admin_bp.route("/api/worker/restart", methods=["POST"])
@admin_required @admin_required
def api_worker_restart(): def api_worker_restart():
"""API: Request worker restart. """API: Request worker restart.
Writes a restart flag file that the watchdog process monitors. Writes a restart flag file that the watchdog process monitors.
Enforces a 60-second cooldown between restart requests. Enforces a 60-second cooldown between restart requests.
""" """
global _last_restart_request global _last_restart_request
payload = request.get_json(silent=True) or {} payload = request.get_json(silent=True) or {}
manual_override = bool(payload.get("manual_override")) manual_override = bool(payload.get("manual_override"))
override_acknowledged = bool(payload.get("override_acknowledged")) override_acknowledged = bool(payload.get("override_acknowledged"))
@@ -496,10 +631,10 @@ def api_worker_restart():
f"Failed to request restart: {e}", f"Failed to request restart: {e}",
status_code=500 status_code=500
) )
# Update in-memory cooldown # Update in-memory cooldown
_last_restart_request = time.time() _last_restart_request = time.time()
_log_restart_audit( _log_restart_audit(
"restart_request_accepted", "restart_request_accepted",
{ {
@@ -534,10 +669,10 @@ def api_worker_restart():
}, },
} }
}) })
@admin_bp.route("/api/worker/status", methods=["GET"]) @admin_bp.route("/api/worker/status", methods=["GET"])
@admin_required @admin_required
def api_worker_status(): def api_worker_status():
"""API: Get worker status and restart information.""" """API: Get worker status and restart information."""
# Get last restart info # Get last restart info
@@ -555,29 +690,29 @@ def api_worker_status():
cooldown_active=bool(policy_state.get("cooldown")), cooldown_active=bool(policy_state.get("cooldown")),
) )
runtime_contract = build_runtime_contract_diagnostics(strict=False) runtime_contract = build_runtime_contract_diagnostics(strict=False)
# Get worker start time (psutil is optional) # Get worker start time (psutil is optional)
worker_start_time = None worker_start_time = None
try: try:
import psutil import psutil
process = psutil.Process(os.getpid()) process = psutil.Process(os.getpid())
worker_start_time = datetime.fromtimestamp( worker_start_time = datetime.fromtimestamp(
process.create_time() process.create_time()
).isoformat() ).isoformat()
except ImportError: except ImportError:
# psutil not installed, try /proc on Linux # psutil not installed, try /proc on Linux
try: try:
stat_path = f"/proc/{os.getpid()}/stat" stat_path = f"/proc/{os.getpid()}/stat"
with open(stat_path) as f: with open(stat_path) as f:
stat = f.read().split() stat = f.read().split()
# Field 22 is starttime in clock ticks since boot # Field 22 is starttime in clock ticks since boot
# This is a simplified fallback # This is a simplified fallback
pass pass
except Exception: except Exception:
pass pass
except Exception: except Exception:
pass pass
return jsonify({ return jsonify({
"success": True, "success": True,
"data": { "data": {
@@ -628,25 +763,25 @@ def api_worker_status():
"last_restart": { "last_restart": {
"requested_by": last_restart.get("requested_by"), "requested_by": last_restart.get("requested_by"),
"requested_at": last_restart.get("requested_at"), "requested_at": last_restart.get("requested_at"),
"requested_ip": last_restart.get("requested_ip"), "requested_ip": last_restart.get("requested_ip"),
"completed_at": last_restart.get("completed_at"), "completed_at": last_restart.get("completed_at"),
"success": last_restart.get("success") "success": last_restart.get("success")
} }
} }
}) })
# ============================================================ # ============================================================
# Page Management Routes # Page Management Routes
# ============================================================ # ============================================================
@admin_bp.route("/pages")
@admin_required
def pages():
    """Page management interface (renders the admin pages template)."""
    return render_template("admin/pages.html")
@admin_bp.route("/api/pages", methods=["GET"]) @admin_bp.route("/api/pages", methods=["GET"])
@admin_required @admin_required
def api_get_pages(): def api_get_pages():

View File

@@ -14,6 +14,7 @@ from collections import OrderedDict
from datetime import datetime from datetime import datetime
from typing import Any from typing import Any
from mes_dashboard.core.cache import register_process_cache
from mes_dashboard.core.database import read_sql_df from mes_dashboard.core.database import read_sql_df
from mes_dashboard.core.redis_client import ( from mes_dashboard.core.redis_client import (
get_redis_client, get_redis_client,
@@ -92,6 +93,13 @@ class _ProcessLevelCache:
with self._lock: with self._lock:
self._cache.pop(key, None) self._cache.pop(key, None)
def stats(self) -> dict:
    """Return live cache statistics for telemetry.

    Counts only entries whose age is within the TTL, so the reported
    figure matches what a reader would actually get back.
    """
    with self._lock:
        now = time.time()
        # Iterate values directly: the keys are irrelevant to the count
        # (each value is a (payload, timestamp) tuple).
        live = sum(1 for _, ts in self._cache.values() if now - ts <= self._ttl)
        return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
def _resolve_cache_max_size(env_name: str, default: int) -> int: def _resolve_cache_max_size(env_name: str, default: int) -> int:
value = os.getenv(env_name) value = os.getenv(env_name)
@@ -113,6 +121,7 @@ _equipment_status_cache = _ProcessLevelCache(
ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS, ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS,
max_size=EQUIPMENT_PROCESS_CACHE_MAX_SIZE, max_size=EQUIPMENT_PROCESS_CACHE_MAX_SIZE,
) )
register_process_cache("equipment_status", _equipment_status_cache, "Equipment Status (L1, 30s)")
_equipment_status_parse_lock = threading.Lock() _equipment_status_parse_lock = threading.Lock()
_equipment_lookup_lock = threading.Lock() _equipment_lookup_lock = threading.Lock()
_equipment_status_lookup: dict[str, dict[str, Any]] = {} _equipment_status_lookup: dict[str, dict[str, Any]] = {}

View File

@@ -20,7 +20,7 @@ from typing import Any, Dict, List, Optional
import pandas as pd import pandas as pd
from mes_dashboard.core.cache import ProcessLevelCache from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache
from mes_dashboard.core.database import read_sql_df from mes_dashboard.core.database import read_sql_df
from mes_dashboard.core.redis_client import ( from mes_dashboard.core.redis_client import (
REDIS_ENABLED, REDIS_ENABLED,
@@ -55,6 +55,7 @@ _CACHE_MAX_SIZE = 8
_REDIS_NAMESPACE = "reject_dataset" _REDIS_NAMESPACE = "reject_dataset"
_dataset_cache = ProcessLevelCache(ttl_seconds=_CACHE_TTL, max_size=_CACHE_MAX_SIZE) _dataset_cache = ProcessLevelCache(ttl_seconds=_CACHE_TTL, max_size=_CACHE_MAX_SIZE)
register_process_cache("reject_dataset", _dataset_cache, "Reject Dataset (L1, 15min)")
# ============================================================ # ============================================================

View File

@@ -19,6 +19,7 @@ from typing import Any
import pandas as pd import pandas as pd
from mes_dashboard.core.cache import register_process_cache
from mes_dashboard.core.redis_client import ( from mes_dashboard.core.redis_client import (
get_redis_client, get_redis_client,
redis_available, redis_available,
@@ -109,6 +110,13 @@ class _ProcessLevelCache:
with self._lock: with self._lock:
self._cache.pop(key, None) self._cache.pop(key, None)
def stats(self) -> dict:
    """Return live cache statistics for telemetry.

    An entry is "live" when its timestamp is within the TTL window;
    expired-but-unevicted entries are excluded from the count.
    """
    with self._lock:
        now = time.time()
        # .values() instead of .items(): the key is never used, and each
        # value is a (payload, timestamp) tuple.
        live = sum(1 for _, ts in self._cache.values() if now - ts <= self._ttl)
        return {"entries": live, "max_size": self._max_size, "ttl_seconds": self._ttl}
def _resolve_cache_max_size(env_name: str, default: int) -> int: def _resolve_cache_max_size(env_name: str, default: int) -> int:
value = os.getenv(env_name) value = os.getenv(env_name)
@@ -130,6 +138,7 @@ _resource_df_cache = _ProcessLevelCache(
ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS, ttl_seconds=DEFAULT_PROCESS_CACHE_TTL_SECONDS,
max_size=RESOURCE_PROCESS_CACHE_MAX_SIZE, max_size=RESOURCE_PROCESS_CACHE_MAX_SIZE,
) )
register_process_cache("resource", _resource_df_cache, "Resource DataFrame (L1, 30s)")
_resource_parse_lock = threading.Lock() _resource_parse_lock = threading.Lock()
_resource_index_lock = threading.Lock() _resource_index_lock = threading.Lock()
_resource_index: ResourceIndex = { _resource_index: ResourceIndex = {