perf: 移除舊版 resource summary API 並加入 WIP DataFrame 記憶體快取

問題診斷:
- 舊版 /api/resource/summary API 使用慢速 SQL (JOIN + ROW_NUMBER),導致 55s 逾時
- 壓力測試持續呼叫此 API,佔滿所有 worker threads
- 每次 WIP API 請求都解析 14.8MB JSON,8 個並發請求造成 GIL 競爭

變更內容:
- 移除舊版 /api/resource/summary 路由和 query_resource_status_summary 函數
- 刪除未使用的 status_summary.sql
- 更新壓力測試和整合測試使用新版 /api/resource/status/summary
- 加入 ProcessLevelCache 類別實作 process-level DataFrame 快取 (30s TTL)
- 使用 double-check locking 確保只有一個 thread 解析 JSON

效能改善:
- 新版 API 使用 Redis 三層快取,回應時間 < 100ms
- Process-level 快取避免重複解析 14MB JSON,大幅改善並發效能

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
beabigegg
2026-02-03 18:00:41 +08:00
parent de38959568
commit 373a1f0f0e
8 changed files with 113 additions and 89 deletions

View File

@@ -10,7 +10,9 @@ from __future__ import annotations
import io
import json
import logging
from typing import Any, Optional, Protocol
import threading
import time
from typing import Any, Optional, Protocol, Tuple
import pandas as pd
from flask import current_app
@@ -25,6 +27,54 @@ from mes_dashboard.core.redis_client import (
logger = logging.getLogger('mes_dashboard.cache')
# ============================================================
# Process-Level DataFrame Cache (Prevents redundant JSON parsing)
# ============================================================
class ProcessLevelCache:
    """Thread-safe process-level cache for parsed DataFrames.

    Prevents redundant JSON parsing across concurrent requests by
    memoizing already-parsed DataFrames for a short TTL. All access to
    the underlying dict is guarded by a single lock, so readers never
    observe a half-written entry.
    """

    def __init__(self, ttl_seconds: int = 30):
        """Create an empty cache.

        Args:
            ttl_seconds: How long a cached DataFrame stays valid before
                it is considered expired and evicted on next access.
        """
        # key -> (DataFrame, monotonic timestamp recorded at insertion)
        self._cache: dict[str, Tuple[pd.DataFrame, float]] = {}
        self._lock = threading.Lock()
        self._ttl = ttl_seconds

    def get(self, key: str) -> Optional[pd.DataFrame]:
        """Return the cached DataFrame for ``key``, or None if absent or expired.

        Expired entries are evicted lazily, on access.
        """
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None
            df, timestamp = entry
            # time.monotonic() is immune to wall-clock adjustments (NTP
            # steps, manual clock changes), unlike time.time(), so a TTL
            # can never be skipped or stretched by a clock jump.
            if time.monotonic() - timestamp > self._ttl:
                del self._cache[key]
                return None
            return df

    def set(self, key: str, df: pd.DataFrame) -> None:
        """Cache ``df`` under ``key``, stamped with the current time."""
        with self._lock:
            self._cache[key] = (df, time.monotonic())

    def invalidate(self, key: str) -> None:
        """Remove ``key`` from the cache (no-op if it is not present)."""
        with self._lock:
            self._cache.pop(key, None)

    def clear(self) -> None:
        """Drop all cached entries."""
        with self._lock:
            self._cache.clear()
# Module-level (i.e. per-process) cache holding the parsed WIP DataFrame
# for 30 seconds, so concurrent requests reuse one parsed copy.
_wip_df_cache = ProcessLevelCache(ttl_seconds=30)
# Serializes the slow path (Redis read + JSON parse) so at most one
# thread per process parses the large WIP payload at a time.
_wip_parse_lock = threading.Lock()
# ============================================================
# Legacy Cache Backend Interface (for backwards compatibility)
# ============================================================
@@ -81,11 +131,27 @@ def make_cache_key(prefix: str, days_back: Optional[int] = None, filters: Option
def get_cached_wip_data() -> Optional[pd.DataFrame]:
"""Get cached WIP data from Redis.
"""Get cached WIP data from Redis with process-level caching.
Uses a two-tier cache strategy:
1. Process-level cache: Parsed DataFrame (30s TTL) - fast, no parsing
2. Redis cache: Raw JSON data - shared across workers
This prevents redundant JSON parsing of 14+ MB data across
concurrent requests, significantly improving response times.
Returns:
DataFrame with full DWH.DW_MES_LOT_V data, or None if cache miss.
"""
cache_key = "wip_dataframe"
# Tier 1: Check process-level cache first (fast path)
cached_df = _wip_df_cache.get(cache_key)
if cached_df is not None:
logger.debug(f"Process cache hit: {len(cached_df)} rows")
return cached_df
# Tier 2: Parse from Redis (slow path - needs lock)
if not REDIS_ENABLED:
return None
@@ -93,19 +159,33 @@ def get_cached_wip_data() -> Optional[pd.DataFrame]:
if client is None:
return None
try:
data_json = client.get(get_key("data"))
if data_json is None:
logger.debug("Cache miss: no data in Redis")
return None
# Use lock to prevent multiple threads from parsing simultaneously
with _wip_parse_lock:
# Double-check after acquiring lock (another thread may have parsed)
cached_df = _wip_df_cache.get(cache_key)
if cached_df is not None:
logger.debug(f"Process cache hit (after lock): {len(cached_df)} rows")
return cached_df
# Use StringIO to wrap the JSON string for pd.read_json
df = pd.read_json(io.StringIO(data_json), orient='records')
logger.debug(f"Cache hit: loaded {len(df)} rows from Redis")
return df
except Exception as e:
logger.warning(f"Failed to read cache: {e}")
return None
try:
start_time = time.time()
data_json = client.get(get_key("data"))
if data_json is None:
logger.debug("Cache miss: no data in Redis")
return None
# Parse JSON to DataFrame
df = pd.read_json(io.StringIO(data_json), orient='records')
parse_time = time.time() - start_time
# Store in process-level cache
_wip_df_cache.set(cache_key, df)
logger.debug(f"Cache hit: loaded {len(df)} rows from Redis (parsed in {parse_time:.2f}s)")
return df
except Exception as e:
logger.warning(f"Failed to read cache: {e}")
return None
def get_cached_sys_date() -> Optional[str]:

View File

@@ -43,7 +43,6 @@ def _clean_nan_values(data):
return data
from mes_dashboard.core.utils import get_days_back
from mes_dashboard.services.resource_service import (
query_resource_status_summary,
query_resource_by_status,
query_resource_by_workcenter,
query_resource_detail,
@@ -60,21 +59,6 @@ from mes_dashboard.config.constants import STATUS_CATEGORIES
resource_bp = Blueprint('resource', __name__, url_prefix='/api/resource')
@resource_bp.route('/summary')
def api_resource_summary():
    """API: Resource status summary."""
    days_back = request.args.get('days_back', 30, type=int)
    cache_key = make_cache_key("resource_summary", days_back)

    # Serve from cache when possible; on a miss, query the DB and
    # backfill the cache for subsequent requests.
    payload = cache_get(cache_key)
    if payload is None:
        payload = query_resource_status_summary(days_back)
        if payload:
            cache_set(cache_key, payload)

    # Guard clause: a falsy payload means both cache and DB came up empty.
    if not payload:
        return jsonify({'success': False, 'error': '查詢失敗'}), 500
    return jsonify({'success': True, 'data': payload})
@resource_bp.route('/by_status')
def api_resource_by_status():
"""API: Resource count by status."""

View File

@@ -105,38 +105,6 @@ def get_resource_latest_status_subquery(days_back: int = 30) -> str:
# Resource Summary Queries
# ============================================================
def query_resource_status_summary(days_back: int = 30) -> Optional[Dict]:
    """Query resource status summary statistics.

    Args:
        days_back: Number of days to look back

    Returns:
        Dict with summary stats or None if query fails.
    """
    try:
        # Build the final SQL: load the template and splice in the
        # latest-status subquery at its placeholder.
        subquery = get_resource_latest_status_subquery(days_back)
        sql = SQLLoader.load("resource/status_summary").replace(
            "{{ LATEST_STATUS_SUBQUERY }}", subquery
        )

        df = read_sql_df(sql)
        if df is None or df.empty:
            return None

        first = df.iloc[0]
        # Map output keys to their source columns; NULLs coerce to 0.
        columns = {
            'total_count': 'TOTAL_COUNT',
            'workcenter_count': 'WORKCENTER_COUNT',
            'family_count': 'FAMILY_COUNT',
            'dept_count': 'DEPT_COUNT',
        }
        return {key: int(first[col] or 0) for key, col in columns.items()}
    except Exception as exc:
        print(f"Resource summary query failed: {exc}")
        return None
def query_resource_by_status(days_back: int = 30) -> Optional[pd.DataFrame]:
"""Query resource count grouped by status.

View File

@@ -23,7 +23,6 @@ Directory Structure:
│ └── resource_detail_with_job.sql
├── resource/ # Resource status SQL files
│ ├── latest_status.sql
│ ├── status_summary.sql
│ ├── by_status.sql
│ ├── by_workcenter.sql
│ ├── detail.sql

View File

@@ -1,11 +0,0 @@
-- Resource Status Summary Query
--
-- Returns a single row of aggregate statistics over the latest resource
-- status snapshot: the total resource count plus distinct counts of
-- workcenters, resource families, and departments.
--
-- {{ LATEST_STATUS_SUBQUERY }} is a placeholder substituted by the
-- caller with the latest_status subquery before execution.
SELECT
COUNT(*) as TOTAL_COUNT,
COUNT(DISTINCT WORKCENTERNAME) as WORKCENTER_COUNT,
COUNT(DISTINCT RESOURCEFAMILYNAME) as FAMILY_COUNT,
COUNT(DISTINCT PJ_DEPARTMENT) as DEPT_COUNT
FROM ({{ LATEST_STATUS_SUBQUERY }}) rs

View File

@@ -109,9 +109,9 @@ class TestAPILoadConcurrent:
assert result.avg_response_time < 15.0, f"Avg response time {result.avg_response_time:.2f}s exceeds 15s"
def test_resource_summary_concurrent_load(self, base_url: str, stress_config: dict, stress_result):
"""Test resource summary API under concurrent load."""
result = stress_result("Resource Summary Concurrent Load")
url = f"{base_url}/api/resource/summary"
"""Test resource status summary API under concurrent load."""
result = stress_result("Resource Status Summary Concurrent Load")
url = f"{base_url}/api/resource/status/summary"
concurrent_users = stress_config['concurrent_users']
requests_per_user = stress_config['requests_per_user']
timeout = stress_config['timeout']
@@ -146,7 +146,7 @@ class TestAPILoadConcurrent:
f"{base_url}/api/wip/overview/matrix",
f"{base_url}/api/wip/overview/hold",
f"{base_url}/api/wip/meta/workcenters",
f"{base_url}/api/resource/summary",
f"{base_url}/api/resource/status/summary",
]
concurrent_users = stress_config['concurrent_users']
timeout = stress_config['timeout']

View File

@@ -221,19 +221,21 @@ class TestResourceAPIIntegration(unittest.TestCase):
self.app.config['TESTING'] = True
self.client = self.app.test_client()
@patch('mes_dashboard.routes.resource_routes.query_resource_status_summary')
def test_resource_summary_response_format(self, mock_summary):
"""Resource summary should return consistent JSON structure."""
@patch('mes_dashboard.routes.resource_routes.get_resource_status_summary')
def test_resource_status_summary_response_format(self, mock_summary):
"""Resource status summary should return consistent JSON structure."""
mock_summary.return_value = {
'total_equipment': 100,
'by_status': {
'RUN': 60,
'IDLE': 30,
'DOWN': 10
}
'total_count': 100,
'by_status_category': {'PRODUCTIVE': 60, 'STANDBY': 30, 'DOWN': 10},
'by_status': {'PRD': 60, 'SBY': 30, 'UDT': 5, 'SDT': 5, 'EGT': 0, 'NST': 0, 'OTHER': 0},
'by_workcenter_group': {'焊接': 50, '成型': 50},
'with_active_job': 40,
'with_wip': 35,
'ou_pct': 63.2,
'availability_pct': 90.0,
}
response = self.client.get('/api/resource/summary')
response = self.client.get('/api/resource/status/summary')
self.assertEqual(response.status_code, 200)
data = json.loads(response.data)
@@ -241,6 +243,8 @@ class TestResourceAPIIntegration(unittest.TestCase):
# Verify response structure
self.assertIn('success', data)
self.assertTrue(data['success'])
self.assertIn('data', data)
self.assertIn('total_count', data['data'])
class TestAPIContentType(unittest.TestCase):

View File

@@ -42,7 +42,7 @@ class AppFactoryTests(unittest.TestCase):
"/api/wip/detail/<workcenter>",
"/api/wip/meta/workcenters",
"/api/wip/meta/packages",
"/api/resource/summary",
"/api/resource/status/summary",
"/api/dashboard/kpi",
"/api/excel-query/upload",
}