perf: 移除舊版 resource summary API 並加入 WIP DataFrame 記憶體快取

問題診斷:
- 舊版 /api/resource/summary API 使用慢速 SQL (JOIN + ROW_NUMBER),導致 55s 逾時
- 壓力測試持續呼叫此 API,佔滿所有 worker threads
- 每次 WIP API 請求都解析 14.8MB JSON,8 個並發請求造成 GIL 競爭

變更內容:
- 移除舊版 /api/resource/summary 路由和 query_resource_status_summary 函數
- 刪除未使用的 status_summary.sql
- 更新壓力測試和整合測試使用新版 /api/resource/status/summary
- 加入 ProcessLevelCache 類別實作 process-level DataFrame 快取 (30s TTL)
- 使用 double-check locking 確保只有一個 thread 解析 JSON

效能改善:
- 新版 API 使用 Redis 三層快取,回應時間 < 100ms
- Process-level 快取避免重複解析 14MB JSON,大幅改善並發效能

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
beabigegg
2026-02-03 18:00:41 +08:00
parent de38959568
commit 373a1f0f0e
8 changed files with 113 additions and 89 deletions

View File

@@ -10,7 +10,9 @@ from __future__ import annotations
import io
import json
import logging
from typing import Any, Optional, Protocol
import threading
import time
from typing import Any, Optional, Protocol, Tuple
import pandas as pd
from flask import current_app
@@ -25,6 +27,54 @@ from mes_dashboard.core.redis_client import (
logger = logging.getLogger('mes_dashboard.cache')
# ============================================================
# Process-Level DataFrame Cache (Prevents redundant JSON parsing)
# ============================================================
class ProcessLevelCache:
    """Thread-safe process-level cache for parsed DataFrames.

    Prevents redundant JSON parsing across concurrent requests by
    memoizing already-parsed DataFrames for a short TTL. All access to
    the underlying dict is guarded by a single lock, so readers never
    observe a half-written entry.
    """

    def __init__(self, ttl_seconds: int = 30):
        """Create an empty cache.

        Args:
            ttl_seconds: How long a cached DataFrame stays valid before
                it is considered expired and evicted on next access.
        """
        # key -> (DataFrame, monotonic timestamp recorded at insertion)
        self._cache: dict[str, Tuple[pd.DataFrame, float]] = {}
        self._lock = threading.Lock()
        self._ttl = ttl_seconds

    def get(self, key: str) -> Optional[pd.DataFrame]:
        """Return the cached DataFrame for ``key``, or None if absent or expired.

        Expired entries are evicted lazily, on access.
        """
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None
            df, timestamp = entry
            # time.monotonic() is immune to wall-clock adjustments (NTP
            # steps, manual clock changes), unlike time.time(), so a TTL
            # can never be skipped or stretched by a clock jump.
            if time.monotonic() - timestamp > self._ttl:
                del self._cache[key]
                return None
            return df

    def set(self, key: str, df: pd.DataFrame) -> None:
        """Cache ``df`` under ``key``, stamped with the current time."""
        with self._lock:
            self._cache[key] = (df, time.monotonic())

    def invalidate(self, key: str) -> None:
        """Remove ``key`` from the cache (no-op if it is not present)."""
        with self._lock:
            self._cache.pop(key, None)

    def clear(self) -> None:
        """Drop all cached entries."""
        with self._lock:
            self._cache.clear()
# Module-level (i.e. per-process) cache holding the parsed WIP DataFrame
# for 30 seconds, so concurrent requests reuse one parsed copy.
_wip_df_cache = ProcessLevelCache(ttl_seconds=30)
# Serializes the slow path (Redis read + JSON parse) so at most one
# thread per process parses the large WIP payload at a time.
_wip_parse_lock = threading.Lock()
# ============================================================
# Legacy Cache Backend Interface (for backwards compatibility)
# ============================================================
@@ -81,11 +131,27 @@ def make_cache_key(prefix: str, days_back: Optional[int] = None, filters: Option
def get_cached_wip_data() -> Optional[pd.DataFrame]:
"""Get cached WIP data from Redis.
"""Get cached WIP data from Redis with process-level caching.
Uses a two-tier cache strategy:
1. Process-level cache: Parsed DataFrame (30s TTL) - fast, no parsing
2. Redis cache: Raw JSON data - shared across workers
This prevents redundant JSON parsing of 14+ MB data across
concurrent requests, significantly improving response times.
Returns:
DataFrame with full DWH.DW_MES_LOT_V data, or None if cache miss.
"""
cache_key = "wip_dataframe"
# Tier 1: Check process-level cache first (fast path)
cached_df = _wip_df_cache.get(cache_key)
if cached_df is not None:
logger.debug(f"Process cache hit: {len(cached_df)} rows")
return cached_df
# Tier 2: Parse from Redis (slow path - needs lock)
if not REDIS_ENABLED:
return None
@@ -93,19 +159,33 @@ def get_cached_wip_data() -> Optional[pd.DataFrame]:
if client is None:
return None
try:
data_json = client.get(get_key("data"))
if data_json is None:
logger.debug("Cache miss: no data in Redis")
return None
# Use lock to prevent multiple threads from parsing simultaneously
with _wip_parse_lock:
# Double-check after acquiring lock (another thread may have parsed)
cached_df = _wip_df_cache.get(cache_key)
if cached_df is not None:
logger.debug(f"Process cache hit (after lock): {len(cached_df)} rows")
return cached_df
# Use StringIO to wrap the JSON string for pd.read_json
df = pd.read_json(io.StringIO(data_json), orient='records')
logger.debug(f"Cache hit: loaded {len(df)} rows from Redis")
return df
except Exception as e:
logger.warning(f"Failed to read cache: {e}")
return None
try:
start_time = time.time()
data_json = client.get(get_key("data"))
if data_json is None:
logger.debug("Cache miss: no data in Redis")
return None
# Parse JSON to DataFrame
df = pd.read_json(io.StringIO(data_json), orient='records')
parse_time = time.time() - start_time
# Store in process-level cache
_wip_df_cache.set(cache_key, df)
logger.debug(f"Cache hit: loaded {len(df)} rows from Redis (parsed in {parse_time:.2f}s)")
return df
except Exception as e:
logger.warning(f"Failed to read cache: {e}")
return None
def get_cached_sys_date() -> Optional[str]:

View File

@@ -43,7 +43,6 @@ def _clean_nan_values(data):
return data
from mes_dashboard.core.utils import get_days_back
from mes_dashboard.services.resource_service import (
query_resource_status_summary,
query_resource_by_status,
query_resource_by_workcenter,
query_resource_detail,
@@ -60,21 +59,6 @@ from mes_dashboard.config.constants import STATUS_CATEGORIES
resource_bp = Blueprint('resource', __name__, url_prefix='/api/resource')
@resource_bp.route('/summary')
def api_resource_summary():
    """API: Resource status summary."""
    days_back = request.args.get('days_back', 30, type=int)
    cache_key = make_cache_key("resource_summary", days_back)

    # Serve from cache when possible; on a miss, query the DB and
    # backfill the cache for subsequent requests.
    payload = cache_get(cache_key)
    if payload is None:
        payload = query_resource_status_summary(days_back)
        if payload:
            cache_set(cache_key, payload)

    # Guard clause: a falsy payload means both cache and DB came up empty.
    if not payload:
        return jsonify({'success': False, 'error': '查詢失敗'}), 500
    return jsonify({'success': True, 'data': payload})
@resource_bp.route('/by_status')
def api_resource_by_status():
"""API: Resource count by status."""

View File

@@ -105,38 +105,6 @@ def get_resource_latest_status_subquery(days_back: int = 30) -> str:
# Resource Summary Queries
# ============================================================
def query_resource_status_summary(days_back: int = 30) -> Optional[Dict]:
    """Query resource status summary statistics.

    Args:
        days_back: Number of days to look back

    Returns:
        Dict with summary stats or None if query fails.
    """
    try:
        # Build the final SQL: load the template and splice in the
        # latest-status subquery at its placeholder.
        subquery = get_resource_latest_status_subquery(days_back)
        sql = SQLLoader.load("resource/status_summary").replace(
            "{{ LATEST_STATUS_SUBQUERY }}", subquery
        )

        df = read_sql_df(sql)
        if df is None or df.empty:
            return None

        first = df.iloc[0]
        # Map output keys to their source columns; NULLs coerce to 0.
        columns = {
            'total_count': 'TOTAL_COUNT',
            'workcenter_count': 'WORKCENTER_COUNT',
            'family_count': 'FAMILY_COUNT',
            'dept_count': 'DEPT_COUNT',
        }
        return {key: int(first[col] or 0) for key, col in columns.items()}
    except Exception as exc:
        print(f"Resource summary query failed: {exc}")
        return None
def query_resource_by_status(days_back: int = 30) -> Optional[pd.DataFrame]:
"""Query resource count grouped by status.

View File

@@ -23,7 +23,6 @@ Directory Structure:
│ └── resource_detail_with_job.sql
├── resource/ # Resource status SQL files
│ ├── latest_status.sql
│ ├── status_summary.sql
│ ├── by_status.sql
│ ├── by_workcenter.sql
│ ├── detail.sql

View File

@@ -1,11 +0,0 @@
-- Resource Status Summary Query
--
-- Returns a single row of aggregate statistics over the latest resource
-- status snapshot: the total resource count plus distinct counts of
-- workcenters, resource families, and departments.
--
-- {{ LATEST_STATUS_SUBQUERY }} is a placeholder substituted by the
-- caller with the latest_status subquery before execution.
SELECT
COUNT(*) as TOTAL_COUNT,
COUNT(DISTINCT WORKCENTERNAME) as WORKCENTER_COUNT,
COUNT(DISTINCT RESOURCEFAMILYNAME) as FAMILY_COUNT,
COUNT(DISTINCT PJ_DEPARTMENT) as DEPT_COUNT
FROM ({{ LATEST_STATUS_SUBQUERY }}) rs

View File

@@ -109,9 +109,9 @@ class TestAPILoadConcurrent:
assert result.avg_response_time < 15.0, f"Avg response time {result.avg_response_time:.2f}s exceeds 15s"
def test_resource_summary_concurrent_load(self, base_url: str, stress_config: dict, stress_result):
"""Test resource summary API under concurrent load."""
result = stress_result("Resource Summary Concurrent Load")
url = f"{base_url}/api/resource/summary"
"""Test resource status summary API under concurrent load."""
result = stress_result("Resource Status Summary Concurrent Load")
url = f"{base_url}/api/resource/status/summary"
concurrent_users = stress_config['concurrent_users']
requests_per_user = stress_config['requests_per_user']
timeout = stress_config['timeout']
@@ -146,7 +146,7 @@ class TestAPILoadConcurrent:
f"{base_url}/api/wip/overview/matrix",
f"{base_url}/api/wip/overview/hold",
f"{base_url}/api/wip/meta/workcenters",
f"{base_url}/api/resource/summary",
f"{base_url}/api/resource/status/summary",
]
concurrent_users = stress_config['concurrent_users']
timeout = stress_config['timeout']

View File

@@ -221,19 +221,21 @@ class TestResourceAPIIntegration(unittest.TestCase):
self.app.config['TESTING'] = True
self.client = self.app.test_client()
@patch('mes_dashboard.routes.resource_routes.query_resource_status_summary')
def test_resource_summary_response_format(self, mock_summary):
"""Resource summary should return consistent JSON structure."""
@patch('mes_dashboard.routes.resource_routes.get_resource_status_summary')
def test_resource_status_summary_response_format(self, mock_summary):
"""Resource status summary should return consistent JSON structure."""
mock_summary.return_value = {
'total_equipment': 100,
'by_status': {
'RUN': 60,
'IDLE': 30,
'DOWN': 10
}
'total_count': 100,
'by_status_category': {'PRODUCTIVE': 60, 'STANDBY': 30, 'DOWN': 10},
'by_status': {'PRD': 60, 'SBY': 30, 'UDT': 5, 'SDT': 5, 'EGT': 0, 'NST': 0, 'OTHER': 0},
'by_workcenter_group': {'焊接': 50, '成型': 50},
'with_active_job': 40,
'with_wip': 35,
'ou_pct': 63.2,
'availability_pct': 90.0,
}
response = self.client.get('/api/resource/summary')
response = self.client.get('/api/resource/status/summary')
self.assertEqual(response.status_code, 200)
data = json.loads(response.data)
@@ -241,6 +243,8 @@ class TestResourceAPIIntegration(unittest.TestCase):
# Verify response structure
self.assertIn('success', data)
self.assertTrue(data['success'])
self.assertIn('data', data)
self.assertIn('total_count', data['data'])
class TestAPIContentType(unittest.TestCase):

View File

@@ -42,7 +42,7 @@ class AppFactoryTests(unittest.TestCase):
"/api/wip/detail/<workcenter>",
"/api/wip/meta/workcenters",
"/api/wip/meta/packages",
"/api/resource/summary",
"/api/resource/status/summary",
"/api/dashboard/kpi",
"/api/excel-query/upload",
}