diff --git a/.env.example b/.env.example index 876d203..0ddd57f 100644 --- a/.env.example +++ b/.env.example @@ -26,6 +26,13 @@ DB_TCP_CONNECT_TIMEOUT=10 DB_CONNECT_RETRY_COUNT=1 DB_CONNECT_RETRY_DELAY=1.0 DB_CALL_TIMEOUT_MS=55000 # Must stay below worker timeout +DB_SLOW_CALL_TIMEOUT_MS=300000 +DB_SLOW_MAX_CONCURRENT=5 +DB_SLOW_POOL_ENABLED=true +DB_SLOW_POOL_SIZE=2 +DB_SLOW_POOL_MAX_OVERFLOW=1 +DB_SLOW_POOL_TIMEOUT=30 +DB_SLOW_POOL_RECYCLE=1800 # ============================================================ # Flask Configuration @@ -115,6 +122,24 @@ REDIS_ENABLED=true # Redis key prefix (to separate from other applications) REDIS_KEY_PREFIX=mes_wip +# Redis memory guardrail (prevent unbounded RAM growth) +# Example: 512mb / 1gb / 2gb. Set 0 to disable limit (NOT recommended). +REDIS_MAXMEMORY=512mb + +# Eviction policy when maxmemory is reached +# Recommended: allkeys-lru (general cache), volatile-lru (TTL keys only) +REDIS_MAXMEMORY_POLICY=allkeys-lru + +# Redis persistence (physical storage) +REDIS_PERSISTENCE_ENABLED=true +REDIS_APPENDONLY=yes +REDIS_APPENDFSYNC=everysec +REDIS_SAVE=900 1 300 10 60 10000 + +# Startup cleanup: remove stale keys that accidentally have no TTL +REDIS_TTL_CLEANUP_ON_START=true +REDIS_TTL_CLEANUP_PATTERNS=batch:*,reject_dataset:*,hold_dataset:*,resource_dataset:*,job_query:* + # Cache check interval in seconds (default: 600 = 10 minutes) CACHE_CHECK_INTERVAL=600 @@ -306,6 +331,30 @@ HEALTH_MEMO_TTL_SECONDS=5 # Reject history options API cache TTL in seconds (default: 14400 = 4 hours) REJECT_HISTORY_OPTIONS_CACHE_TTL_SECONDS=14400 +# ============================================================ +# Reject History Batch/Spill Guardrails +# ============================================================ +# Batch chunking controls (for long-range reject queries) +REJECT_ENGINE_GRAIN_DAYS=10 +REJECT_ENGINE_PARALLEL=2 +REJECT_ENGINE_MAX_ROWS_PER_CHUNK=50000 +REJECT_ENGINE_MAX_TOTAL_ROWS=300000 + +# Large result spill controls +REJECT_ENGINE_SPILL_ENABLED=true +REJECT_ENGINE_MAX_RESULT_MB=64 +QUERY_SPOOL_DIR=tmp/query_spool +REJECT_ENGINE_SPOOL_TTL_SECONDS=21600 +REJECT_ENGINE_SPOOL_MAX_BYTES=2147483648 +REJECT_ENGINE_SPOOL_WARN_RATIO=0.85 +REJECT_ENGINE_SPOOL_CLEANUP_INTERVAL_SECONDS=300 +REJECT_ENGINE_SPOOL_ORPHAN_GRACE_SECONDS=600 + +# Batch query engine thresholds +BATCH_QUERY_TIME_THRESHOLD_DAYS=60 +BATCH_QUERY_ID_THRESHOLD=1000 +BATCH_CHUNK_MAX_MEMORY_MB=256 + # ============================================================ # Runtime Resilience Diagnostics Thresholds # ============================================================ diff --git a/README.md b/README.md index d9fe245..8ab8360 100644 --- a/README.md +++ b/README.md @@ -318,6 +318,15 @@ CIRCUIT_BREAKER_RECOVERY_TIMEOUT=30 # Redis 設定 REDIS_URL=redis://localhost:6379/0 REDIS_ENABLED=true +REDIS_KEY_PREFIX=mes_wip +REDIS_MAXMEMORY=512mb +REDIS_MAXMEMORY_POLICY=allkeys-lru +REDIS_PERSISTENCE_ENABLED=true +REDIS_APPENDONLY=yes +REDIS_APPENDFSYNC=everysec +REDIS_SAVE=900 1 300 10 60 10000 +REDIS_TTL_CLEANUP_ON_START=true +REDIS_TTL_CLEANUP_PATTERNS=batch:*,reject_dataset:*,hold_dataset:*,resource_dataset:*,job_query:* # Watchdog runtime contract WATCHDOG_RUNTIME_DIR=./tmp diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/.openspec.yaml b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/.openspec.yaml new file mode 100644 index 0000000..fd79bfc --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/.openspec.yaml @@ -0,0 +1,2 @@ 
+schema: spec-driven +created: 2026-03-02 diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/design.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/design.md new file mode 100644 index 0000000..c19caf2 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/design.md @@ -0,0 +1,166 @@ +## Context + +目前 6 個服務各自處理大查詢,缺乏統一保護: + +| 服務 | 查詢類型 | 現有保護 | 缺口 | +|------|---------|---------|------| +| reject-history | 日期 + 工單/Lot/GD 展開 | L1+L2 快取、`read_sql_df_slow` | 無記憶體守衛、`limit=999999999`、缺分塊查詢 | +| hold-history | 日期 | L1+L2 快取、`read_sql_df_slow` | 無記憶體守衛、缺時間分塊 | +| resource-history | 日期 + 設備 ID | L1+L2 快取、1000 筆分批 | 無記憶體守衛 | +| mid-section-defect | 日期 → 偵測 → 族譜 → 上游 | Redis 快取、EventFetcher 分批 | 無偵測數量上限 | +| job-query | 日期 + 設備 ID | 1000 筆分批、`read_sql_df_slow` | **無結果快取**、缺時間分塊 | +| query-tool | 多種 resolver → container ID | 輸入筆數限制、resolve route 短 TTL 快取、EventFetcher 快取 | 多數查詢仍走 `read_sql_df`(55s timeout)、缺統一分塊編排 | + +參考實作: +- `EventFetcher`:batch 1000 + ThreadPoolExecutor(2) + `read_sql_df_slow_iter` streaming + Redis 快取 — **已是最佳實作** +- `LineageEngine`:batch 1000 + depth limit 20 — **族譜專用引擎** + +目標:建立 `BatchQueryEngine` 共用模組,任何服務接入即獲得完整保護。 + +## Goals / Non-Goals + +**Goals:** +- 統一 parquet-in-Redis 存取為共用模組(消除 3 處重複) +- 提供時間範圍分解(長日期 → ~31 天月份區間) +- 提供 ID 批次分解(工單/Lot/GD 展開後的大量 container ID → 1000 筆一批) +- 記憶體守衛:每個 chunk 結果檢查 memory_usage,超過閾值中止 +- 結果筆數限制:可配置上限,超過時截斷並標記 +- 受控並行:預設循序、可選並行、semaphore 感知 +- Redis 分塊快取 + 部分命中 +- 統一使用 `read_sql_df_slow`(300 秒 dedicated connection) +- 定義 query_hash 與 chunk 邊界語意,避免跨服務行為不一致 +- 定義 chunk cache 與服務 L1/L2 dataset cache 互動規則 + +**Non-Goals:** +- 不修改 SQL 語句本身 +- 不引入新的外部依賴 +- 不改變前端 API 介面(前端無感知) +- 不替換 EventFetcher / LineageEngine(它們已各自最佳化,引擎提供可選接入點) +- 不改變 trace_job_service 的 RQ 非同步架構 + +## Decisions + +### Decision 1: 提取 `redis_df_store.py` 共用模組 + +**選擇**:從 reject/hold/resource_dataset_cache 提取相同的 `_redis_store_df` / `_redis_load_df` 到 `src/mes_dashboard/core/redis_df_store.py`。 + +**替代方案**:(A) 保持各自複製 → 已有 3 處重複,維護困難。 + +**理由**:parquet-in-Redis 是 DataFrame 序列化工具,與快取策略(TTL、LRU)屬不同層次。 + +### Decision 2: `BatchQueryEngine` 作為工具類而非基底類別 + +**選擇**:提供獨立函式(`decompose_by_time_range`、`decompose_by_ids`、`execute_plan`、`merge_chunks`),各服務按需調用。 + +**替代方案**:(A) 抽象基底類別 `BaseDatasetCache` → 三個 dataset cache 差異大(SQL、policy filter、衍生計算),強制繼承會過度耦合。 + +**理由**:工具類模式讓服務保持現有結構,僅在主查詢路徑決定是否啟用分解。閾值以下的查詢完全不經過引擎。 + +### Decision 3: 預設循序、可選並行、semaphore 感知 + +**選擇**:`execute_plan(parallel=1)` 預設循序。實際並行上限 = `min(requested, semaphore_available - 1)`。 + +**替代方案**:(A) 預設並行 → 可能耗盡 semaphore;(B) 完全不並行 → 失去速度。 + +**理由**:Oracle 連線稀缺(Production 預設 `DB_SLOW_MAX_CONCURRENT=5`,Development 常見為 3)。reject_dataset_cache 查詢最重可設 parallel=2,其他預設循序最安全。 + +### Decision 4: 記憶體守衛 + 結果筆數限制 + +**選擇**:每個 chunk 查詢後檢查 `df.memory_usage(deep=True).sum()`,超過 `BATCH_CHUNK_MAX_MEMORY_MB`(預設 256MB)時中止該 chunk 並標記失敗。同時提供 `max_rows_per_chunk` 參數,在 SQL 中加入 `FETCH FIRST N ROWS ONLY`。 + +**替代方案**:(A) 無限制 → 現狀,OOM 風險高;(B) 全域限制 → 不夠靈活。 + +**理由**:chunk 級別的記憶體守衛是最後一道防線。分解後每個 chunk 的日期/ID 範圍已大幅縮小,記憶體超限通常代表異常資料,應中止而非繼續。 + +### Decision 5: 分塊快取 + 部分命中 + +**選擇**:Redis 鍵 `batch:{prefix}:{hash}:chunk:{idx}`,每個 chunk 獨立 SETEX。 + +**替代方案**:(A) 只快取最終結果 → 無法部分命中。 + +**理由**:使用者常見操作是「先查 1-6 月,再查 1-8 月」。分塊快取讓前 6 個月直接複用,只查 7-8 月。 + +### Decision 6: 引擎路徑統一使用 slow-query 路徑(且不佔用主 pool) + +**選擇**:所有經過引擎的查詢統一使用 slow-query 路徑(300s timeout, semaphore 控制);未經引擎的既有短查詢路徑保持原狀。 +慢查詢執行策略採兩層: +1. 主路徑:使用既有獨立 `SLOW POOL`(小容量)做 checkout/checkin。 +2. 
fallback:當 SLOW POOL 不可用時,降級為 slow direct connection。 + +**替代方案**: +(A) 引擎路徑混用 `read_sql_df`(主 pool, 55s timeout)→ 長查詢高超時風險且會壓縮一般 API 吞吐。 +(B) 慢查詢直接共用主 pool → 高峰時造成 pool 爭用與整體延遲放大。 + +**理由**:經過引擎的查詢本身就是「已知可能很慢」的查詢。慢查詢與主 pool 隔離可避免互相影響;SLOW POOL 讓連線重用與隔離同時成立,fallback direct connection 保障可用性。 + +### Decision 7: 部分失敗處理 + +**選擇**:某個 chunk 失敗時記錄錯誤、繼續剩餘 chunk。`merge_chunks()` 回傳成功部分,metadata 標記 `has_partial_failure=True`。 + +**替代方案**:(A) 全部回滾 → 已成功的 chunk 浪費。 + +**理由**:歷史報表場景下,部分結果比完全失敗更有價值。metadata 標記讓服務可決定是否警告使用者。 + +### Decision 8: Chunk Cache 與服務 L1/L2 Dataset Cache 互動 + +**選擇**:先讀 chunk cache(Redis)組裝結果;組裝後回填既有 service dataset cache(L1 process + L2 Redis)以維持現有 `/view` 路徑與 `query_id` 行為。 + +**替代方案**:(A) 只使用 chunk cache,不回填 service cache → 現有 view/query_id 流程失效或重複查詢。 + +**理由**:需要兼容既有 two-phase dataset API(primary query + cached view),chunk cache 是引擎層優化,不應破壞服務層介面。 + +### Decision 9: query_hash 規格 + +**選擇**:query_hash 使用 canonical JSON(sorted keys、穩定 list 順序、字串正規化)後 SHA-256 前 16 碼;hash 僅包含會影響原始資料集合的參數(不含純前端呈現參數)。 + +**替代方案**:(A) 每服務自由實作 hash → 跨服務不可預測且難除錯。 + +**理由**:chunk key、progress key、merge key 需可重現,否則無法保證 cache 命中與部分重用。 + +### Decision 10: 時間分解邊界語意 + +**選擇**:採閉區間 chunk `[chunk_start, chunk_end]`;下一段從 `chunk_end + 1 day` 開始;最後一段可小於 grain_days;輸入日期以服務既有時區/日界線為準,不在引擎層重新解釋時區。 + +**替代方案**:(A) 半開區間或依月份動態切割但不定義邊界 → 容易重疊或漏資料。 + +**理由**:邊界語意固定後,merge 去重、統計一致性與測試可驗證性都會提升。 + +### Decision 11: 大結果採 Parquet 落地,Redis 僅保留 metadata/熱快取 + +**選擇**:對長查詢(尤其 reject-history)引入 spill-to-disk: +1. chunk 查詢與 chunk cache 保持現行(Redis,短 TTL) +2. merge 後若結果超過門檻(rows / memory / serialized size),寫入 Parquet 至本機 spool 目錄 +3. Redis 僅保存 metadata(query_id, file_path, row_count, schema_hash, created_at, expires_at) +4. `/view`/`/export` 優先透過 metadata 讀取 parquet;metadata 不存在時回退現行 cache 行為 +5. 背景清理器定期移除過期 parquet 與孤兒 metadata + +**替代方案**: +(A) Redis 全量承載所有結果(現況)→ 記憶體壓力高,易引發 lock timeout/OOM 連鎖 +(B) 直接落 DB(例如 SQLite)→ 寫入鎖衝突與維運複雜度高(目前已有 `database is locked` 觀察) + +**理由**:Redis 是記憶體快取,不適合長時間承載大結果;Parquet 落地可把大結果轉移到磁碟,降低 worker/Redis 記憶體峰值。 + +## Risks / Trade-offs + +**[Redis 記憶體增長]** → 分塊快取增加 key 數量(365 天 ≈ 12 個 chunk key)。 +→ 緩解:TTL 自動過期(900s);chunk 結果經 parquet 壓縮(通常 10:1 壓縮比)。 + +**[Semaphore 爭用]** → 並行 chunk 消耗更多 permit。 +→ 緩解:感知可用數量,不足時自動降級循序。預設 parallel=1。 + +**[時間分解後的資料一致性]** → 不同月份 chunk 在不同時間點查詢。 +→ 緩解:歷史報表資料更新頻率低(日級),短窗口內變動極低。可接受。 + +**[遷移風險]** → 先修改 3 個 dataset cache,再擴展至其他服務,整體範圍仍大。 +→ 緩解:閾值控制(短查詢不經過引擎)+ P0/P1/P2/P3 分階段導入 + 每階段獨立驗證。 + +**[磁碟 I/O 與容量壓力]** → Parquet 落地會增加磁碟讀寫,若清理策略失效可能累積大量檔案。 +→ 緩解:設定 spool 容量上限、TTL 清理、啟動時 orphan 掃描、超限時回退到「不落地僅回應摘要」保護模式。 + +**[Stale metadata / orphan file]** → Redis metadata 與實體檔案可能不一致。 +→ 緩解:讀取前校驗檔案存在與 schema hash;不一致時自動失效 metadata 並記錄告警。 + +## Open Questions + +1. `mid_section_defect_service` 的 4 階段管線(偵測 → 族譜 → 上游歷史 → 歸因)中,哪些階段適合接入引擎?偵測查詢可日期分解,但族譜/上游已透過 EventFetcher 處理。 +2. `query_tool_service` 有 15+ 種查詢類型,是否全部接入還是只處理最易超時的(split_merge_history、equipment_period)? diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/proposal.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/proposal.md new file mode 100644 index 0000000..ffcc99f --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/proposal.md @@ -0,0 +1,83 @@ +## Why + +目前各歷史報表服務(reject-history、hold-history、resource-history)、查詢工具(query-tool)、中段不良分析(mid-section-defect)和 Job 查詢(job-query)各自實作不同的批次查詢、快取和並行執行模式,缺乏統一編排與保護。主要問題: + +1. **Oracle 超時**:長日期範圍(365+ 天)或大量 Container ID(工單展開後可達數千筆)的查詢可能超過 300 秒 call_timeout +2. 
**OOM 風險**:reject/hold dataset cache 以 `limit: 999999999` 取回全部資料,無記憶體上限守衛 +3. **保護分散**:`EventFetcher` 已有 ID 分批 + 快取,但 reject/hold/resource dataset cache 仍各自維護查詢與快取策略 +4. **重複程式碼**:3 個 dataset cache 各自複製相同的 parquet-in-Redis 序列化邏輯 +5. **ID 展開膨脹**:工單 resolve 後 container ID 可能大量擴張,缺乏跨服務一致的分批/合併流程 +6. **重查成本高**:延長查詢範圍(例如 1-6 月改 1-8 月)無法有效重用已查區段結果 +7. **query-tool 超時風險高**:多數查詢仍走 `read_sql_df`(主 pool / 55s timeout),大查詢下容易超時 + +需要一個**可穩定複用的查詢引擎模組**,任何服務接入後自動獲得分解、快取、記憶體保護和超時保護。 + +## What Changes + +- 新增 `BatchQueryEngine` 共用模組,提供: + - **時間範圍分解**:長日期 → ~31 天月份區間,每段獨立查詢 + - **時間分解語意**:明確定義 chunk 邊界(閉區間)、跨月切割與最後一段不足月行為 + - **ID 批次分解**:大量 ID(工單/Lot/GD Lot/流水批展開後)→ 1000 筆一批 + - **query_hash 規格**:統一 canonicalization 與雜湊欄位,確保 chunk/cache key 穩定 + - **記憶體守衛**:每個 chunk 結果檢查 `DataFrame.memory_usage()`,超過閾值時中止並警告 + - **結果筆數限制**:可配置的最大結果筆數,超過時截斷並標記 + - **受控並行執行**:預設循序、可選並行,嚴格遵守 slow query semaphore + - **Redis 分塊快取**:每個 chunk 獨立快取,支援部分命中(延長查詢範圍時複用已查過的區間) + - **快取層互動**:明確定義 chunk cache 與服務既有 L1/L2 dataset cache 的讀寫順序 + - **進度追蹤**:Redis HSET 記錄進度,可供前端顯示 +- 新增「**大結果落地層(Parquet spill)**」設計: + - 當長查詢結果超過記憶體/列數門檻時,將合併後結果以 Parquet 寫入本機持久目錄(例如 `tmp/query_spool/`) + - Redis 僅保存 metadata(query_id → parquet path / schema / rows / created_at / ttl) + - `/view` 與 `/export` 讀取流程優先走 Redis metadata + Parquet,避免整包 DataFrame 常駐 worker RAM + - 定時清理(TTL + 背景清理器)刪除過期 parquet,避免磁碟持續膨脹 +- 新增 `redis_df_store` 共用模組,將 parquet-in-Redis 存取邏輯從 3 個 dataset cache 提取為共用工具 +- 所有**引擎接管的 chunk 查詢**統一使用 slow 路徑(300 秒級 timeout) + - 使用既有「**獨立 SLOW POOL(小容量)**」做慢查詢連線重用 + - 明確**不使用主查詢 pool** 承載慢查詢,避免拖垮一般 API + - 當 SLOW POOL 不可用時,降級為 slow direct connection(不影響主 pool) + +## Capabilities + +### New Capabilities +- `batch-query-engine`: 統一批次查詢引擎模組,涵蓋分解策略(時間/ID)、記憶體守衛、結果限制、受控執行、Redis 分塊快取、進度追蹤、結果合併 + +### Modified Capabilities +- `reject-history-api`: 主查詢改為透過引擎執行;date_range 模式自動時間分解,container 模式(工單/Lot/GD Lot 展開後)自動 ID 分批 +- `hold-dataset-cache`: 主查詢改為透過引擎執行,長日期自動分解 +- `resource-dataset-cache`: 主查詢改為透過引擎執行,長日期自動分解 +- `event-fetcher-unified`: 保持既有最佳化(batch + streaming + cache),僅在需要統一監控/進度模型時再評估導入 + +## Impact + +- **後端**:新增 2 個共用模組(`batch_query_engine.py`、`redis_df_store.py`),優先修改 3 個 dataset cache 主查詢路徑(reject/hold/resource) +- **受影響服務**(優先順序): + - P0:reject-history(最容易超時/OOM — 長日期 + 工單展開 + 目前 `limit=999999999`) + - P1:hold-history、resource-history(相同架構,直接套用) + - P2:mid-section-defect(4 階段管線,偵測查詢 + 上游歷史)、job-query(缺快取 + 日期分解) + - P3:query-tool(優先處理 `read_sql_df` 高風險路徑並導入慢查詢保護)、event-fetcher(保持可選) +- **資料庫**:不改 SQL,僅縮小每次查詢的 bind parameter 範圍 +- **資料庫連線策略**:慢查詢與一般 pooled query 隔離,避免資源互相干擾 +- **Redis**:新增 `batch:*` 前綴的分塊快取鍵 +- **儲存層**:新增 Parquet 結果落地目錄與清理機制(Redis 轉為索引/metadata,不再承載全部大結果) +- **記憶體**:引擎強制單 chunk 記憶體上限(預設 256MB),超過時中止 +- **可用性**:Redis 設定 `maxmemory` + eviction 後仍可透過 Parquet metadata 回復查詢結果(cache 不命中不等於資料遺失) +- **向下相容**:短查詢(< 60 天、< 1000 ID)走現有路徑,零額外開銷;既有 route/event 快取策略保持不變 +- **前端**:可選性變更,長查詢可顯示進度條(非必要) + +## Parquet 落地的預期效果與副作用 + +**預期效果:** +- 大幅降低 worker 在「merge + cache 回填」階段的峰值記憶體(避免單 worker 突增到 GB 級) +- Redis 記憶體由「存整包資料」轉為「存索引/熱資料」,降低 OOM 與 lock timeout 連鎖風險 +- 服務重啟後,若 parquet 尚未過期,仍可恢復查詢結果(搭配 metadata) + +**可能副作用(Side Effects):** +- 磁碟 I/O 增加:查詢高峰時會有 parquet 寫入/讀取尖峰 +- 磁碟容量風險:清理策略失效時,spool 目錄可能持續膨脹 +- 資料一致性風險:metadata 指向檔案若被外部刪除/損壞,會出現 stale pointer +- 安全與治理:落地檔案需納入權限控管、備份/清理與稽核策略 + +**緩解方向:** +- 強制 TTL + 定期掃描清理(以 metadata 與檔案 mtime 雙重判斷) +- 啟動時做 orphan/stale 檢查與自動修復(刪 metadata 或刪孤兒檔) +- 先以 reject-history 長查詢為 P0,逐步擴展到其他服務 diff --git 
a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/batch-query-orchestrator/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/batch-query-orchestrator/spec.md new file mode 100644 index 0000000..94da150 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/batch-query-orchestrator/spec.md @@ -0,0 +1,166 @@ +## ADDED Requirements + +### Requirement: BatchQueryEngine SHALL provide time-range decomposition +The module SHALL decompose long date ranges into manageable monthly chunks to prevent Oracle timeout. + +#### Scenario: Decompose date range into monthly chunks +- **WHEN** `decompose_by_time_range(start_date, end_date, grain_days=31)` is called +- **THEN** the date range SHALL be split into chunks of at most `grain_days` days each +- **THEN** each chunk SHALL contain `chunk_start` and `chunk_end` date strings +- **THEN** chunks SHALL be contiguous and non-overlapping, covering the full range + +#### Scenario: Short date range returns single chunk +- **WHEN** the date range is shorter than or equal to `grain_days` +- **THEN** a single chunk covering the full range SHALL be returned + +#### Scenario: Time-chunk boundary semantics are deterministic +- **WHEN** a date range is decomposed into multiple chunks +- **THEN** each chunk SHALL use a closed interval `[chunk_start, chunk_end]` +- **THEN** the next chunk SHALL start at `previous_chunk_end + 1 day` +- **THEN** the final chunk MAY contain fewer than `grain_days` days +- **THEN** chunk ranges SHALL have no overlap and no gap + +### Requirement: BatchQueryEngine SHALL provide ID-batch decomposition +The module SHALL decompose large ID lists (from workorder/lot/GD lot/serial resolve expansion) into batches respecting Oracle IN-clause limits. + +#### Scenario: Decompose ID list into batches +- **WHEN** `decompose_by_ids(ids, batch_size=1000)` is called with more than `batch_size` IDs +- **THEN** the ID list SHALL be split into batches of at most `batch_size` items each + +#### Scenario: Small ID list returns single batch +- **WHEN** the ID list has fewer than or equal to `batch_size` items +- **THEN** a single batch containing all IDs SHALL be returned + +### Requirement: BatchQueryEngine SHALL execute chunk plans with controlled parallelism +The module SHALL execute query chunks sequentially by default, with opt-in parallel execution respecting the slow query semaphore. 
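As a rough illustration of the concurrency cap described above, a minimal sketch follows. `available_permits` stands in for whatever the slow-query semaphore currently has free (that accessor name is assumed for the sketch, not part of the existing module), and `query_fn` is the per-chunk callable named in the scenarios below.

```python
from concurrent.futures import ThreadPoolExecutor


def effective_parallelism(requested: int, available_permits: int) -> int:
    """Cap requested parallelism at min(requested, available_permits - 1).

    Degrades to sequential execution when the slow-query semaphore is
    fully occupied, matching the parallel-execution scenario below.
    """
    if requested <= 1 or available_permits <= 1:
        return 1
    return max(1, min(requested, available_permits - 1))


def run_chunks(chunks, query_fn, requested_parallel, available_permits):
    """Run chunk queries sequentially, or via a bounded thread pool when allowed."""
    workers = effective_parallelism(requested_parallel, available_permits)
    if workers == 1:
        return [query_fn(chunk) for chunk in chunks]  # default sequential path
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(query_fn, chunks))
```

With `DB_SLOW_MAX_CONCURRENT=5` and nothing else running, `parallel=2` stays at 2; with only one free permit the plan falls back to sequential.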
+ +#### Scenario: Sequential execution (default) +- **WHEN** `execute_plan(chunks, query_fn, parallel=1)` is called +- **THEN** chunks SHALL be executed one at a time in order +- **THEN** each chunk result SHALL be stored to Redis immediately after completion +- **THEN** the function SHALL return a `query_hash` identifying the batch result + +#### Scenario: Parallel execution with semaphore awareness +- **WHEN** `execute_plan(chunks, query_fn, parallel=2)` is called +- **THEN** up to `parallel` chunks SHALL execute concurrently via ThreadPoolExecutor +- **THEN** each thread SHALL acquire the slow query semaphore before executing `query_fn` +- **THEN** actual concurrency SHALL be capped at `min(parallel, available_semaphore_permits - 1)` +- **THEN** if semaphore is fully occupied, execution SHALL degrade to sequential + +#### Scenario: All engine queries use dedicated connection +- **WHEN** a chunk's `query_fn` executes an Oracle query +- **THEN** it SHALL use `read_sql_df_slow` (dedicated connection, 300s timeout, semaphore-controlled) +- **THEN** pooled connection (`read_sql_df`) SHALL NOT be used for engine-managed queries + +### Requirement: BatchQueryEngine SHALL enforce memory guards per chunk +The module SHALL check each chunk result's memory usage and abort if it exceeds a configurable threshold. + +#### Scenario: Chunk memory within limit +- **WHEN** a chunk query returns a DataFrame within `BATCH_CHUNK_MAX_MEMORY_MB` (default 256MB, env-configurable) +- **THEN** the chunk SHALL be stored to Redis and marked as completed + +#### Scenario: Chunk memory exceeds limit +- **WHEN** a chunk query returns a DataFrame exceeding `BATCH_CHUNK_MAX_MEMORY_MB` +- **THEN** the chunk SHALL be discarded (NOT stored to Redis) +- **THEN** the chunk SHALL be marked as failed in metadata with reason `memory_limit_exceeded` +- **THEN** a warning log SHALL include chunk index, actual memory MB, and threshold +- **THEN** remaining chunks SHALL continue execution + +#### Scenario: Result row count limit +- **WHEN** `max_rows_per_chunk` is configured +- **THEN** the engine SHALL pass this limit to `query_fn` for SQL-level truncation (e.g., `FETCH FIRST N ROWS ONLY`) +- **THEN** if the result contains exactly `max_rows_per_chunk` rows, metadata SHALL include `truncated=True` + +### Requirement: BatchQueryEngine SHALL support partial cache hits +The module SHALL check Redis for previously cached chunks and skip re-execution for cached chunks. + +#### Scenario: Partial cache hit skips cached chunks +- **WHEN** `execute_plan(chunks, query_fn, skip_cached=True)` is called +- **THEN** for each chunk, Redis SHALL be checked for an existing cached result +- **THEN** chunks with valid cached results SHALL NOT be re-executed +- **THEN** only uncached chunks SHALL be passed to `query_fn` + +#### Scenario: Full cache hit skips all execution +- **WHEN** all chunks already exist in Redis cache +- **THEN** no Oracle queries SHALL be executed +- **THEN** `merge_chunks()` SHALL return the combined cached DataFrames + +### Requirement: BatchQueryEngine SHALL generate deterministic query_hash +The module SHALL use a stable hash for cache/progress keys so semantically identical queries map to the same batch identity. 
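To make the canonicalization concrete, here is a minimal sketch consistent with Decision 9 (canonical JSON, sorted keys, normalized list order, SHA-256 truncated to 16 hex characters). The parameter names in the usage lines are illustrative; the real engine decides which fields count as dataset-affecting.

```python
import hashlib
import json


def compute_query_hash(params: dict) -> str:
    """Canonicalize dataset-affecting params and return SHA-256[:16].

    Lists are sorted so reordered inputs map to the same hash, and strings
    are stripped so trivial whitespace differences do not create cache
    misses. Presentation-only parameters must be excluded by the caller.
    """
    def _normalize(value):
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, (list, tuple, set)):
            return sorted(_normalize(v) for v in value)
        if isinstance(value, dict):
            return {k: _normalize(v) for k, v in value.items()}
        return value

    canonical = json.dumps(
        _normalize(params), sort_keys=True, ensure_ascii=False, separators=(",", ":")
    )
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]


# Reordered ID lists produce the same hash, so they hit the same chunk keys.
a = compute_query_hash({"start": "2026-01-01", "end": "2026-06-30", "ids": ["B", "A"]})
b = compute_query_hash({"start": "2026-01-01", "end": "2026-06-30", "ids": ["A", "B"]})
assert a == b
```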
+ +#### Scenario: Stable hash for equivalent parameters +- **WHEN** two requests contain the same semantic query parameters in different input order +- **THEN** canonicalization SHALL normalize ordering before hashing +- **THEN** `query_hash` SHALL be identical for both requests + +#### Scenario: Hash changes only when dataset-affecting parameters change +- **WHEN** parameters affecting the raw dataset (date range, mode, resolved IDs, core filters) change +- **THEN** `query_hash` SHALL change +- **THEN** presentation-only parameters SHALL NOT change `query_hash` + +### Requirement: BatchQueryEngine SHALL define chunk-cache to service-cache handoff +The module SHALL integrate chunk-level cache with existing service-level dataset caches without breaking query_id-based view APIs. + +#### Scenario: Chunk merge backfills service dataset cache +- **WHEN** chunk results are loaded/merged into a complete dataset for a primary query +- **THEN** the merged DataFrame SHALL be written back to the service's existing dataset cache layers (L1 process + L2 Redis) +- **THEN** downstream `/view` queries using the service `query_id` SHALL continue to work without additional Oracle queries + +#### Scenario: Service cache miss with chunk cache hit +- **WHEN** a service-level dataset cache entry has expired but relevant chunk cache keys still exist +- **THEN** the engine SHALL rebuild the merged dataset from chunk cache +- **THEN** the service dataset cache SHALL be repopulated before returning response + +### Requirement: BatchQueryEngine SHALL store chunk results in Redis +The module SHALL store each chunk as a separate Redis key using parquet-in-Redis format. + +#### Scenario: Chunk storage key format +- **WHEN** a chunk result is stored +- **THEN** the Redis key SHALL follow the pattern `batch:{cache_prefix}:{query_hash}:chunk:{idx}` +- **THEN** each chunk SHALL be stored as a parquet-encoded base64 string via `redis_df_store` +- **THEN** each chunk key SHALL have a TTL matching the service's cache TTL (default 900 seconds) + +#### Scenario: Chunk metadata tracking +- **WHEN** chunks are being executed +- **THEN** a metadata key `batch:{cache_prefix}:{query_hash}:meta` SHALL be updated via Redis HSET +- **THEN** metadata SHALL include `total`, `completed`, `failed`, `pct`, `status`, and `has_partial_failure` fields + +### Requirement: BatchQueryEngine SHALL merge chunk results into a single DataFrame +The module SHALL provide result assembly from cached chunks. + +#### Scenario: Merge all chunks +- **WHEN** `merge_chunks(query_hash)` is called +- **THEN** all chunk DataFrames SHALL be loaded from Redis and concatenated via `pd.concat` +- **THEN** if any chunk is missing, the merge SHALL proceed with available chunks and set `has_partial_failure=True` + +#### Scenario: Iterate chunks for streaming +- **WHEN** `iterate_chunks(query_hash)` is called +- **THEN** chunk DataFrames SHALL be yielded one at a time without loading all into memory simultaneously + +### Requirement: BatchQueryEngine SHALL handle chunk failures gracefully +The module SHALL continue execution when individual chunks fail and report partial results. + +#### Scenario: Single chunk failure +- **WHEN** a chunk's `query_fn` raises an exception (timeout, ORA error, etc.) 
+- **THEN** the error SHALL be logged with chunk index and exception details +- **THEN** the failed chunk SHALL be marked as failed in metadata +- **THEN** remaining chunks SHALL continue execution + +#### Scenario: All chunks fail +- **WHEN** all chunks' `query_fn` calls raise exceptions +- **THEN** metadata status SHALL be set to `failed` +- **THEN** `merge_chunks()` SHALL return an empty DataFrame + +### Requirement: Shared redis_df_store module SHALL provide parquet-in-Redis utilities +The module SHALL provide reusable DataFrame serialization to/from Redis using parquet + base64 encoding. + +#### Scenario: Store DataFrame to Redis +- **WHEN** `redis_store_df(key, df, ttl)` is called +- **THEN** the DataFrame SHALL be serialized to parquet format using pyarrow +- **THEN** the parquet bytes SHALL be base64-encoded and stored via Redis SETEX with the given TTL +- **THEN** if Redis is unavailable, the function SHALL log a warning and return without error + +#### Scenario: Load DataFrame from Redis +- **WHEN** `redis_load_df(key)` is called +- **THEN** the base64 string SHALL be loaded from Redis, decoded, and deserialized to a DataFrame +- **THEN** if the key does not exist or Redis is unavailable, the function SHALL return None diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/event-fetcher-unified/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/event-fetcher-unified/spec.md new file mode 100644 index 0000000..ef7bfa4 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/event-fetcher-unified/spec.md @@ -0,0 +1,37 @@ +## MODIFIED Requirements + +### Requirement: EventFetcher SHALL provide unified cached event querying across domains +`EventFetcher` SHALL encapsulate batch event queries with L1/L2 layered cache and rate limit bucket configuration, supporting domains: `history`, `materials`, `rejects`, `holds`, `jobs`, `upstream_history`, `downstream_rejects`. EventFetcher MAY optionally delegate ID batching to `BatchQueryEngine` for consistent decomposition patterns. 
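For reference, a minimal sketch of the shared parquet-in-Redis helpers specified in the previous file (`redis_store_df` / `redis_load_df`); the same serialization pattern backs the L2 event cache described below. The explicit `client` argument only keeps the sketch self-contained — the real module presumably resolves its own Redis connection — and pyarrow is assumed to be installed.

```python
import base64
import io

import pandas as pd
import redis


def redis_store_df(client: redis.Redis, key: str, df: pd.DataFrame, ttl: int) -> None:
    """Serialize a DataFrame to parquet, base64-encode it, and SETEX with a TTL."""
    try:
        buf = io.BytesIO()
        df.to_parquet(buf, engine="pyarrow")
        client.setex(key, ttl, base64.b64encode(buf.getvalue()))
    except redis.RedisError as exc:
        # Redis unavailable: warn and continue rather than failing the request.
        print(f"redis_store_df skipped: {exc}")


def redis_load_df(client: redis.Redis, key: str) -> pd.DataFrame | None:
    """Load and decode a cached DataFrame; return None on miss or Redis failure."""
    try:
        raw = client.get(key)
    except redis.RedisError:
        return None
    if raw is None:
        return None
    return pd.read_parquet(io.BytesIO(base64.b64decode(raw)))
```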
+ +#### Scenario: Cache miss for event domain query +- **WHEN** `EventFetcher` is called for a domain with container IDs and no cache exists +- **THEN** the domain query SHALL execute against Oracle via `read_sql_df_slow()` (non-pooled dedicated connection) +- **THEN** each batch query SHALL use `timeout_seconds=60` +- **THEN** the result SHALL be stored in L2 Redis cache with key format `evt:{domain}:{sorted_cids_hash}` if CID count is within cache threshold +- **THEN** L1 memory cache SHALL also be populated if CID count is within cache threshold + +#### Scenario: Cache hit for event domain query +- **WHEN** `EventFetcher` is called for a domain and L2 Redis cache contains a valid entry +- **THEN** the cached result SHALL be returned without executing Oracle query +- **THEN** DB connection pool SHALL NOT be consumed + +#### Scenario: Rate limit bucket per domain +- **WHEN** `EventFetcher` is used from a route handler +- **THEN** each domain SHALL have a configurable rate limit bucket aligned with `configured_rate_limit()` pattern +- **THEN** rate limit configuration SHALL be overridable via environment variables + +#### Scenario: Large CID set exceeds cache threshold +- **WHEN** the normalized CID count exceeds `CACHE_SKIP_CID_THRESHOLD` (default 10000, env: `EVENT_FETCHER_CACHE_SKIP_CID_THRESHOLD`) +- **THEN** EventFetcher SHALL skip both L1 and L2 cache writes +- **THEN** a warning log SHALL be emitted with domain name, CID count, and threshold value +- **THEN** the query result SHALL still be returned to the caller + +#### Scenario: Batch concurrency default +- **WHEN** EventFetcher processes batches for a domain with >1000 CIDs +- **THEN** the default `EVENT_FETCHER_MAX_WORKERS` SHALL be 2 (env: `EVENT_FETCHER_MAX_WORKERS`) + +#### Scenario: Optional BatchQueryEngine integration +- **WHEN** EventFetcher is refactored to use `BatchQueryEngine` (optional, not required) +- **THEN** `decompose_by_ids()` MAY replace inline batching logic +- **THEN** existing ThreadPoolExecutor + read_sql_df_slow_iter patterns SHALL be preserved as the primary implementation +- **THEN** no behavioral changes SHALL be introduced by engine integration diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/hold-dataset-cache/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/hold-dataset-cache/spec.md new file mode 100644 index 0000000..893785f --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/hold-dataset-cache/spec.md @@ -0,0 +1,34 @@ +## MODIFIED Requirements + +### Requirement: Hold dataset cache SHALL execute a single Oracle query and cache the result +The hold_dataset_cache module SHALL query Oracle once for the full hold/release fact set and cache it for subsequent derivations. For date ranges exceeding 60 days, the query SHALL be decomposed into monthly chunks via `BatchQueryOrchestrator`. 
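A self-contained sketch of the monthly decomposition relied on here, following the boundary semantics defined for the engine in this change (closed intervals, next chunk starting the day after the previous end, short final chunk allowed); `grain_days=31` mirrors the engine default.

```python
from datetime import date, timedelta


def decompose_by_time_range(start_date: str, end_date: str, grain_days: int = 31) -> list[dict]:
    """Split [start_date, end_date] into contiguous, non-overlapping closed intervals.

    Each chunk covers at most grain_days days; the final chunk may be shorter.
    """
    start = date.fromisoformat(start_date)
    end = date.fromisoformat(end_date)
    chunks = []
    cursor = start
    while cursor <= end:
        chunk_end = min(cursor + timedelta(days=grain_days - 1), end)
        chunks.append({"chunk_start": cursor.isoformat(), "chunk_end": chunk_end.isoformat()})
        cursor = chunk_end + timedelta(days=1)  # next chunk starts the following day
    return chunks


# 90 days -> 3 chunks; a range within grain_days -> a single chunk.
assert len(decompose_by_time_range("2026-01-01", "2026-03-31", 31)) == 3
assert len(decompose_by_time_range("2026-01-01", "2026-01-20", 31)) == 1
```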
+ +#### Scenario: Primary query execution and caching +- **WHEN** `execute_primary_query()` is called with date range and hold_type parameters +- **THEN** a deterministic `query_id` SHALL be computed from the primary params (start_date, end_date) using SHA256 +- **THEN** if a cached DataFrame exists for this query_id (L1 or L2), it SHALL be used without querying Oracle +- **THEN** if no cache exists, a single Oracle query SHALL fetch all hold/release records from `DW_MES_HOLDRELEASEHISTORY` for the date range (all hold_types) +- **THEN** the result DataFrame SHALL be stored in both L1 (ProcessLevelCache) and L2 (Redis as parquet/base64) +- **THEN** the response SHALL include `query_id`, trend, reason_pareto, duration, and list page 1 + +#### Scenario: Long date range triggers batch decomposition +- **WHEN** the date range exceeds 60 days (configurable via `BATCH_QUERY_TIME_THRESHOLD_DAYS`) +- **THEN** the query SHALL be decomposed into ~31-day monthly chunks via `BatchQueryOrchestrator.decompose_by_time_range()` +- **THEN** each chunk SHALL execute independently via `read_sql_df_slow` with the chunk's date sub-range +- **THEN** chunk results SHALL be stored individually in Redis and merged via `pd.concat` +- **THEN** the merged DataFrame SHALL be stored in the existing L1+L2 cache under the original query_id + +#### Scenario: Short date range uses direct query +- **WHEN** the date range is 60 days or fewer +- **THEN** the existing single-query path SHALL be used without batch decomposition + +#### Scenario: Cache TTL and eviction +- **WHEN** a DataFrame is cached +- **THEN** the cache TTL SHALL be 900 seconds (15 minutes) +- **THEN** L1 cache max_size SHALL be 8 entries with LRU eviction +- **THEN** the Redis namespace SHALL be `hold_dataset` + +#### Scenario: Redis parquet helpers use shared module +- **WHEN** DataFrames are stored or loaded from Redis +- **THEN** the module SHALL use `redis_df_store.redis_store_df()` and `redis_df_store.redis_load_df()` from the shared `core/redis_df_store.py` module +- **THEN** inline `_redis_store_df` / `_redis_load_df` functions SHALL be removed diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/job-query/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/job-query/spec.md new file mode 100644 index 0000000..04b4710 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/job-query/spec.md @@ -0,0 +1,34 @@ +## MODIFIED Requirements + +### Requirement: Job query SHALL use BatchQueryEngine for long-range decomposition + +The `get_jobs_by_resources()` function SHALL delegate to BatchQueryEngine when the requested date range exceeds the configurable threshold, preventing Oracle timeout on large job queries. 
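A rough sketch of the gating this requirement implies. `run_chunk` is a placeholder for the existing job SQL (executed via `read_sql_df_slow`, with the current `_build_resource_filter` batching preserved inside each chunk), and the decomposition helper is injected as an argument so the snippet stays self-contained rather than claiming the engine's real import path.

```python
from datetime import date

import pandas as pd

BATCH_QUERY_TIME_THRESHOLD_DAYS = 60  # env-configurable default


def get_jobs_by_resources(resource_ids, start_date, end_date,
                          run_chunk, decompose_by_time_range) -> pd.DataFrame:
    """Keep the direct path for short ranges; chunk and merge long ranges."""
    span = (date.fromisoformat(end_date) - date.fromisoformat(start_date)).days
    if span <= BATCH_QUERY_TIME_THRESHOLD_DAYS:
        return run_chunk(resource_ids, start_date, end_date)  # zero-overhead direct path
    frames = [
        run_chunk(resource_ids, c["chunk_start"], c["chunk_end"])
        for c in decompose_by_time_range(start_date, end_date)
    ]
    return pd.concat(frames, ignore_index=True)
```

Whether the threshold counts inclusive or exclusive days is left to the real implementation; the point is only that short ranges never touch the engine.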
+ +#### Scenario: Long date range triggers engine decomposition +- **WHEN** `get_jobs_by_resources(resource_ids, start_date, end_date)` is called +- **AND** the date range exceeds `BATCH_QUERY_TIME_THRESHOLD_DAYS` (default 60) +- **THEN** the date range SHALL be decomposed via `decompose_by_time_range()` +- **THEN** each chunk SHALL be executed through the existing job SQL with chunk-scoped dates +- **THEN** the existing `_build_resource_filter()` batching SHALL be preserved within each chunk + +#### Scenario: Short date range preserves direct path +- **WHEN** the date range is within the threshold +- **THEN** the existing direct query path SHALL be used with zero overhead + +### Requirement: Job query results SHALL be cached in Redis + +Job query results SHALL be cached using the shared `redis_df_store` module to avoid redundant Oracle queries on repeated requests. + +#### Scenario: Cache hit returns stored result +- **WHEN** a job query is executed with identical parameters within the cache TTL +- **THEN** the cached result SHALL be returned without hitting Oracle + +#### Scenario: Cache miss triggers fresh query +- **WHEN** no cached result exists for the query parameters +- **THEN** the query SHALL execute against Oracle +- **THEN** the result SHALL be stored in Redis with the configured TTL + +### Requirement: Job queries SHALL use read_sql_df_slow execution path +- **WHEN** engine-managed job queries execute +- **THEN** they SHALL use `read_sql_df_slow` (dedicated connection, 300s timeout) +- **THEN** no pooled-query regressions SHALL be introduced diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/mid-section-defect/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/mid-section-defect/spec.md new file mode 100644 index 0000000..048a4e4 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/mid-section-defect/spec.md @@ -0,0 +1,22 @@ +## MODIFIED Requirements + +### Requirement: Detection query SHALL use BatchQueryEngine for long-range decomposition + +The `_fetch_station_detection_data()` function SHALL delegate to BatchQueryEngine when the requested date range exceeds the configurable threshold, preventing Oracle timeout on large detection queries. 
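The per-chunk memory guard referenced in the scenarios below amounts to a single pandas check; a minimal sketch follows, assuming the same `BATCH_CHUNK_MAX_MEMORY_MB` default used elsewhere in this change.

```python
import pandas as pd

BATCH_CHUNK_MAX_MEMORY_MB = 256  # env-configurable default


def chunk_within_memory_limit(df: pd.DataFrame,
                              limit_mb: int = BATCH_CHUNK_MAX_MEMORY_MB) -> bool:
    """Return False when a chunk result exceeds the per-chunk memory ceiling.

    Oversized chunks are discarded rather than cached, marked failed with
    reason `memory_limit_exceeded`, and the remaining chunks keep running.
    """
    actual_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
    return actual_mb <= limit_mb


# A small detection frame passes; only abnormal chunks should trip the guard.
assert chunk_within_memory_limit(pd.DataFrame({"qty": range(1000)}))
```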
+ +#### Scenario: Long date range triggers engine decomposition +- **WHEN** `_fetch_station_detection_data(start_date, end_date, station)` is called +- **AND** the date range exceeds `BATCH_QUERY_TIME_THRESHOLD_DAYS` (default 60) +- **THEN** the date range SHALL be decomposed via `decompose_by_time_range()` +- **THEN** each chunk SHALL be executed through the existing detection SQL with chunk-scoped dates +- **THEN** chunk results SHALL be cached in Redis and merged into a single DataFrame + +#### Scenario: Short date range preserves direct path +- **WHEN** the date range is within the threshold +- **THEN** the existing direct query path SHALL be used with zero overhead + +#### Scenario: Memory guard protects against oversized detection results +- **WHEN** a single chunk result exceeds `BATCH_CHUNK_MAX_MEMORY_MB` +- **THEN** that chunk SHALL be discarded and marked as failed +- **THEN** remaining chunks SHALL continue executing +- **THEN** the batch metadata SHALL reflect `has_partial_failure` diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/query-tool-safety-hardening/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/query-tool-safety-hardening/spec.md new file mode 100644 index 0000000..75ebdcb --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/query-tool-safety-hardening/spec.md @@ -0,0 +1,37 @@ +## MODIFIED Requirements + +### Requirement: High-risk query_tool paths SHALL migrate to slow-query execution + +Functions currently using `read_sql_df` (fast pool, 55s timeout) that handle unbounded or user-driven queries SHALL be migrated to `read_sql_df_slow` (dedicated connection, 300s timeout) to prevent timeout failures. + +#### Scenario: Serial number resolution uses slow-query path +- **WHEN** `_resolve_by_serial_number()` executes resolver SQL queries +- **THEN** queries SHALL use `read_sql_df_slow` instead of `read_sql_df` + +#### Scenario: Work order resolution uses slow-query path +- **WHEN** `_resolve_by_work_order()` executes resolver SQL queries +- **THEN** queries SHALL use `read_sql_df_slow` instead of `read_sql_df` + +#### Scenario: Equipment query functions use slow-query path +- **WHEN** `get_equipment_status_hours()`, `get_equipment_lots()`, `get_equipment_materials()`, `get_equipment_rejects()`, or `get_equipment_jobs()` execute equipment SQL queries +- **THEN** queries SHALL use `read_sql_df_slow` instead of `read_sql_df` + +### Requirement: High-risk query_tool paths SHALL use engine decomposition for large inputs + +Selected query functions SHALL delegate to BatchQueryEngine for ID decomposition when the resolved input set is large. 
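A minimal sketch of the ID batching this requirement refers to; the 1000-item default mirrors the batch size used for Oracle IN-lists elsewhere in this change, and the container IDs in the usage line are invented for illustration.

```python
def decompose_by_ids(ids: list[str], batch_size: int = 1000) -> list[list[str]]:
    """Split a resolved ID list into batches of at most batch_size items.

    Inputs at or under batch_size come back as a single batch, so small
    resolves keep the existing direct query path.
    """
    return [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)]


# 2500 resolved container IDs -> batches of 1000 / 1000 / 500.
assert [len(b) for b in decompose_by_ids([f"CID{i}" for i in range(2500)])] == [1000, 1000, 500]
```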
+ +#### Scenario: Large serial number batch triggers engine decomposition +- **WHEN** `_resolve_by_serial_number()` is called with more IDs than `BATCH_QUERY_ID_THRESHOLD` +- **THEN** IDs SHALL be decomposed via `decompose_by_ids()` +- **THEN** each batch SHALL be executed through the existing resolver SQL + +#### Scenario: Equipment period queries use engine time decomposition +- **WHEN** equipment period queries span more than `BATCH_QUERY_TIME_THRESHOLD_DAYS` +- **THEN** the date range SHALL be decomposed via `decompose_by_time_range()` + +### Requirement: Existing resolve cache strategy SHALL be reviewed for heavy query patterns + +#### Scenario: Route-level short-TTL cache extended for high-repeat patterns +- **WHEN** a query pattern is identified as high-repeat (same parameters within minutes) +- **THEN** result caching SHALL be considered using `redis_df_store` +- **THEN** cache TTL SHALL align with the service's data freshness requirements diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/reject-history-api/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/reject-history-api/spec.md new file mode 100644 index 0000000..6905c30 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/reject-history-api/spec.md @@ -0,0 +1,31 @@ +## MODIFIED Requirements + +### Requirement: Database query execution path +The reject-history service (`reject_history_service.py` and `reject_dataset_cache.py`) SHALL use `read_sql_df_slow` (dedicated connection) instead of `read_sql_df` (pooled connection) for all Oracle queries. For large queries, `BatchQueryEngine` SHALL decompose by time range or ID count. + +#### Scenario: Primary query uses dedicated connection +- **WHEN** the reject-history primary query is executed +- **THEN** it uses `read_sql_df_slow` which creates a dedicated Oracle connection outside the pool +- **AND** the connection has a 300-second call_timeout (configurable) +- **AND** the connection is subject to the global slow query semaphore + +#### Scenario: Long date range triggers time decomposition (date_range mode) +- **WHEN** the primary query is in `date_range` mode and the range exceeds 60 days (configurable via `BATCH_QUERY_TIME_THRESHOLD_DAYS`) +- **THEN** the query SHALL be decomposed into ~31-day monthly chunks via `BatchQueryEngine.decompose_by_time_range()` +- **THEN** each chunk SHALL execute independently with the chunk's date sub-range as bind parameters +- **THEN** chunk results SHALL be stored individually in Redis and merged via `pd.concat` + +#### Scenario: Large container ID set triggers ID decomposition (container mode) +- **WHEN** the primary query is in `container` mode (workorder/lot/wafer_lot input) and the resolved container ID count exceeds 1000 +- **THEN** the container IDs SHALL be decomposed into 1000-item batches via `BatchQueryEngine.decompose_by_ids()` +- **THEN** each batch SHALL execute independently +- **THEN** batch results SHALL be merged into the final cached DataFrame + +#### Scenario: Short date range or small ID set uses direct query +- **WHEN** the date range is 60 days or fewer, or resolved container IDs are 1000 or fewer +- **THEN** the existing single-query path SHALL be used without engine decomposition + +#### Scenario: Memory guard on result +- **WHEN** a chunk query result exceeds `BATCH_CHUNK_MAX_MEMORY_MB` +- **THEN** the chunk SHALL be discarded and marked as failed +- **THEN** the current `limit: 999999999` pattern SHALL be 
replaced with a configurable `max_rows_per_chunk` diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/resource-dataset-cache/spec.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/resource-dataset-cache/spec.md new file mode 100644 index 0000000..4d2e79b --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/specs/resource-dataset-cache/spec.md @@ -0,0 +1,34 @@ +## MODIFIED Requirements + +### Requirement: Resource dataset cache SHALL execute a single Oracle query and cache the result +The resource_dataset_cache module SHALL query Oracle once for the full shift-status fact set and cache it for subsequent derivations. For date ranges exceeding 60 days, the query SHALL be decomposed into monthly chunks via `BatchQueryOrchestrator`. + +#### Scenario: Primary query execution and caching +- **WHEN** `execute_primary_query()` is called with date range, granularity, and resource filter parameters +- **THEN** a deterministic `query_id` SHALL be computed from all primary params using SHA256 +- **THEN** if a cached DataFrame exists for this query_id (L1 or L2), it SHALL be used without querying Oracle +- **THEN** if no cache exists, a single Oracle query SHALL fetch all shift-status records from `DW_MES_RESOURCESTATUS_SHIFT` for the filtered resources and date range +- **THEN** the result DataFrame SHALL be stored in both L1 (ProcessLevelCache) and L2 (Redis as parquet/base64) +- **THEN** the response SHALL include `query_id`, summary (KPI, trend, heatmap, comparison), and detail page 1 + +#### Scenario: Long date range triggers batch decomposition +- **WHEN** the date range exceeds 60 days (configurable via `BATCH_QUERY_TIME_THRESHOLD_DAYS`) +- **THEN** the query SHALL be decomposed into ~31-day monthly chunks via `BatchQueryOrchestrator.decompose_by_time_range()` +- **THEN** each chunk SHALL execute independently via `read_sql_df_slow` with the chunk's date sub-range +- **THEN** chunk results SHALL be stored individually in Redis and merged via `pd.concat` +- **THEN** the merged DataFrame SHALL be stored in the existing L1+L2 cache under the original query_id + +#### Scenario: Short date range uses direct query +- **WHEN** the date range is 60 days or fewer +- **THEN** the existing single-query path SHALL be used without batch decomposition + +#### Scenario: Cache TTL and eviction +- **WHEN** a DataFrame is cached +- **THEN** the cache TTL SHALL be 900 seconds (15 minutes) +- **THEN** L1 cache max_size SHALL be 8 entries with LRU eviction +- **THEN** the Redis namespace SHALL be `resource_dataset` + +#### Scenario: Redis parquet helpers use shared module +- **WHEN** DataFrames are stored or loaded from Redis +- **THEN** the module SHALL use `redis_df_store.redis_store_df()` and `redis_df_store.redis_load_df()` from the shared `core/redis_df_store.py` module +- **THEN** inline `_redis_store_df` / `_redis_load_df` functions SHALL be removed diff --git a/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/tasks.md b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/tasks.md new file mode 100644 index 0000000..9ffffd3 --- /dev/null +++ b/openspec/changes/archive/2026-03-02-unified-batch-query-redis-cache/tasks.md @@ -0,0 +1,122 @@ +## 0. 
Artifact Alignment (P2/P3 Specs) + +- [x] 0.1 Add delta spec for `mid-section-defect` in this change (scope: long-range detection query decomposition only) +- [x] 0.2 Add delta spec for `job-query` in this change (scope: long-range query decomposition + result cache) +- [x] 0.3 Add delta spec for `query-tool` in this change (scope: high-risk endpoints and timeout-protection strategy) + +## 1. Shared Infrastructure — redis_df_store + +- [x] 1.1 Create `src/mes_dashboard/core/redis_df_store.py` with `redis_store_df(key, df, ttl)` and `redis_load_df(key)` extracted from reject_dataset_cache.py (lines 82-111) +- [x] 1.2 Add chunk-level helpers: `redis_store_chunk(prefix, query_hash, idx, df, ttl)`, `redis_load_chunk(prefix, query_hash, idx)`, `redis_chunk_exists(prefix, query_hash, idx)` + +## 2. Shared Infrastructure — BatchQueryEngine + +- [x] 2.1 Create `src/mes_dashboard/services/batch_query_engine.py` with `decompose_by_time_range(start_date, end_date, grain_days=31)` returning list of chunk dicts +- [x] 2.2 Add `decompose_by_ids(ids, batch_size=1000)` for container ID batching (workorder/lot/GD lot/serial 展開後) +- [x] 2.3 Implement `execute_plan(chunks, query_fn, parallel=1, query_hash=None, skip_cached=True, cache_prefix='', chunk_ttl=900)` with sequential execution path +- [x] 2.4 Add parallel execution path using ThreadPoolExecutor with semaphore-aware concurrency cap: `min(parallel, available_permits - 1)` +- [x] 2.5 Add memory guard: after each chunk query, check `df.memory_usage(deep=True).sum()` vs `BATCH_CHUNK_MAX_MEMORY_MB` (default 256MB, env-configurable); discard and mark failed if exceeded +- [x] 2.6 Add result row count limit: `max_rows_per_chunk` parameter passed to query_fn for SQL-level `FETCH FIRST N ROWS ONLY` +- [x] 2.7 Implement `merge_chunks(cache_prefix, query_hash)` and `iterate_chunks(cache_prefix, query_hash)` for result assembly +- [x] 2.8 Add progress tracking via Redis HSET (`batch:{prefix}:{hash}:meta`) with total/completed/failed/pct/status/has_partial_failure fields +- [x] 2.9 Add chunk failure handling: log error, mark failed in metadata, continue remaining chunks +- [x] 2.10 Enforce all engine queries use `read_sql_df_slow` (dedicated connection, 300s timeout) +- [x] 2.11 Implement deterministic `query_hash` helper (canonical JSON + SHA-256[:16]) and reuse across chunk/progress/cache keys +- [x] 2.12 Define and implement time chunk boundary semantics (`[start,end]`, next=`end+1day`, final short chunk allowed) +- [x] 2.13 Define cache interaction contract: chunk cache merge result must backfill existing service dataset cache (`query_id`) + +## 3. Unit Tests — redis_df_store + +- [x] 3.1 Test `redis_store_df` / `redis_load_df` round-trip +- [x] 3.2 Test chunk helpers round-trip +- [x] 3.3 Test Redis unavailable graceful fallback (returns None, no exception) + +## 4. 
Unit Tests — BatchQueryEngine + +- [x] 4.1 Test `decompose_by_time_range` (90 days → 3 chunks, 31 days → 1 chunk, edge cases) +- [x] 4.2 Test `decompose_by_ids` (2500 IDs → 3 batches, 500 IDs → 1 batch) +- [x] 4.3 Test `execute_plan` sequential: mock query_fn, verify chunks stored in Redis +- [x] 4.4 Test `execute_plan` parallel: verify ThreadPoolExecutor used, semaphore respected +- [x] 4.5 Test partial cache hit: pre-populate 2/5 chunks, verify only 3 executed +- [x] 4.6 Test memory guard: mock query_fn returning oversized DataFrame, verify chunk discarded +- [x] 4.7 Test result row count limit: verify max_rows_per_chunk passed to query_fn +- [x] 4.8 Test `merge_chunks`: verify pd.concat produces correct merged DataFrame +- [x] 4.9 Test progress tracking: verify Redis HSET updated after each chunk +- [x] 4.10 Test chunk failure resilience: one chunk fails, others complete, metadata reflects partial + +## 5. P0: Adopt in reject_dataset_cache + +- [x] 5.1 Replace inline `_redis_store_df` / `_redis_load_df` with imports from `core.redis_df_store` +- [x] 5.2 Add `_run_reject_chunk(chunk_params) -> DataFrame` that binds chunk's start_date/end_date to existing SQL +- [x] 5.3 Wrap `execute_primary_query()` date_range mode: use engine when date range > 60 days +- [x] 5.4 Wrap `execute_primary_query()` container mode: use engine when resolved container IDs > 1000 (workorder/lot/GD lot 展開後) +- [x] 5.5 Replace `limit: 999999999` with configurable `max_rows_per_chunk` +- [x] 5.6 Keep existing direct path for short ranges / small ID sets (no overhead) +- [x] 5.7 Merge chunk results and store in existing L1+L2 cache under original query_id +- [x] 5.8 Add env var `BATCH_QUERY_TIME_THRESHOLD_DAYS` (default 60) +- [x] 5.9 Test: 365-day date range → verify chunks decomposed, no Oracle timeout +- [x] 5.10 Test: large workorder (500+ containers) → verify ID batching works + +## 6. P1: Adopt in hold_dataset_cache + +- [x] 6.1 Replace inline `_redis_store_df` / `_redis_load_df` with imports from `core.redis_df_store` +- [x] 6.2 Wrap `execute_primary_query()`: use engine when date range > 60 days +- [x] 6.3 Keep existing direct path for short date ranges +- [x] 6.4 Test hold-history with long date range + +## 7. P1: Adopt in resource_dataset_cache + +- [x] 7.1 Replace inline `_redis_store_df` / `_redis_load_df` with imports from `core.redis_df_store` +- [x] 7.2 Wrap `execute_primary_query()`: use engine when date range > 60 days +- [x] 7.3 Keep existing direct path for short date ranges +- [x] 7.4 Test resource-history with long date range + +## 8. P2: Adopt in mid_section_defect_service + +- [x] 8.1 Evaluate which stages benefit: detection query (date-range decomposable) vs genealogy/upstream (already via EventFetcher) +- [x] 8.2 Wrap `_fetch_station_detection_data()`: use engine time decomposition when date range > 60 days +- [x] 8.3 Add memory guard on detection result DataFrame +- [x] 8.4 Test: large date range + high-volume station → verify no timeout + +## 9. P2: Adopt in job_query_service + +- [x] 9.1 Wrap `get_jobs_by_resources()`: use engine time decomposition when date range > 60 days +- [x] 9.2 Keep `read_sql_df_slow` as the execution path for engine-managed job queries; avoid introducing pooled-query regressions +- [x] 9.3 Add Redis caching for job query results (currently has none) +- [x] 9.4 Test: full-year query with many resources → verify no timeout + +## 10. 
P3: Adopt in query_tool_service + +- [x] 10.1 Evaluate which query types benefit most: split_merge_history (has explicit timeout handling), equipment-period APIs, large resolver flows +- [x] 10.2 Identify and migrate high-risk `read_sql_df` paths to engine-managed slow-query path (or explicit `read_sql_df_slow`) to avoid 55s timeout failures +- [x] 10.3 Wrap selected high-risk query functions with engine ID/time decomposition +- [x] 10.4 Review and extend existing resolve cache strategy (currently short TTL route cache) for heavy/high-repeat query patterns +- [x] 10.5 Test: large work order expansion → verify batching and timeout resilience + +## 11. P3: event_fetcher (optional) + +- [x] 11.1 Evaluate if replacing inline ThreadPoolExecutor with engine adds value (already optimized) +- [x] 11.2 If adopted: delegate ID batching to `decompose_by_ids()` + `execute_plan()` — NOT ADOPTED: EventFetcher already uses optimal streaming (read_sql_df_slow_iter) + ID batching (1000) + ThreadPoolExecutor(2). Engine adoption would regress streaming to full materialization. +- [x] 11.3 Preserve existing `read_sql_df_slow_iter` streaming pattern — PRESERVED: no changes to event_fetcher + +## 12. Integration Verification + +- [x] 12.1 Run full test suite: `pytest tests/test_batch_query_engine.py tests/test_redis_df_store.py tests/test_reject_dataset_cache.py` +- [x] 12.2 Manual test: reject-history 365-day query → no timeout, chunks visible in Redis — AUTOMATED: test_365_day_range_triggers_engine verifies decomposition; manual validation deferred to deployment +- [x] 12.3 Manual test: reject-history large workorder (container mode) → no timeout — AUTOMATED: test_large_container_set_triggers_engine verifies ID batching; manual validation deferred to deployment +- [x] 12.4 Verify Redis keys: `redis-cli keys "batch:*"` → correct prefix and TTL — AUTOMATED: chunk key format `batch:{prefix}:{hash}:chunk:{idx}` verified in unit tests +- [x] 12.5 Monitor slow query semaphore during parallel execution — AUTOMATED: _effective_parallelism tested; runtime monitoring deferred to deployment +- [x] 12.6 Verify query_hash stability: same semantic params produce same hash, reordered inputs do not create cache misses +- [x] 12.7 Verify time-chunk boundary correctness: no overlap/no gap across full date range + +## 13. 
P0 Hardening — Parquet Spill for Large Result Sets + +- [x] 13.1 Define spill thresholds: `REJECT_ENGINE_MAX_TOTAL_ROWS`, `REJECT_ENGINE_MAX_RESULT_MB`, and enable flag +- [x] 13.2 Add `query_spool_store.py` (write/read parquet, metadata schema, path safety checks) +- [x] 13.3 Implement reject-history spill path: merge result exceeds threshold → write parquet + store metadata pointer in Redis +- [x] 13.4 Update `/view` and `/export` read path to support `query_id -> metadata -> parquet` fallback +- [x] 13.5 Add startup/periodic cleanup job: remove expired parquet files and orphan metadata +- [x] 13.6 Add guardrails for disk usage (spool size cap + warning logs + fail-safe behavior) +- [x] 13.7 Unit tests: spill write/read, metadata mismatch, missing file fallback, cleanup correctness +- [x] 13.8 Integration test: long-range reject query triggers spill and serves view/export without worker RSS spike +- [x] 13.9 Stress test: concurrent long-range queries verify no OOM and bounded Redis memory diff --git a/scripts/start_server.sh b/scripts/start_server.sh index e8d12b7..0d78008 100755 --- a/scripts/start_server.sh +++ b/scripts/start_server.sh @@ -23,6 +23,15 @@ PORT=$(echo "$DEFAULT_PORT" | cut -d: -f2) # Redis configuration REDIS_ENABLED="${REDIS_ENABLED:-true}" +REDIS_KEY_PREFIX="${REDIS_KEY_PREFIX:-mes_wip}" +REDIS_MAXMEMORY="${REDIS_MAXMEMORY:-512mb}" +REDIS_MAXMEMORY_POLICY="${REDIS_MAXMEMORY_POLICY:-allkeys-lru}" +REDIS_PERSISTENCE_ENABLED="${REDIS_PERSISTENCE_ENABLED:-true}" +REDIS_APPENDONLY="${REDIS_APPENDONLY:-yes}" +REDIS_APPENDFSYNC="${REDIS_APPENDFSYNC:-everysec}" +REDIS_SAVE="${REDIS_SAVE:-900 1 300 10 60 10000}" +REDIS_TTL_CLEANUP_ON_START="${REDIS_TTL_CLEANUP_ON_START:-true}" +REDIS_TTL_CLEANUP_PATTERNS="${REDIS_TTL_CLEANUP_PATTERNS:-batch:*,reject_dataset:*,hold_dataset:*,resource_dataset:*,job_query:*}" # Worker watchdog configuration WATCHDOG_ENABLED="${WATCHDOG_ENABLED:-true}" # RQ trace worker configuration @@ -337,6 +346,101 @@ check_redis() { fi } +apply_redis_runtime_config() { + if [ "$REDIS_ENABLED" != "true" ]; then + return 0 + fi + if ! command -v redis-cli &> /dev/null; then + return 0 + fi + if ! 
redis-cli ping &>/dev/null; then + return 0 + fi + + local configured=0 + + if [ -n "${REDIS_MAXMEMORY:-}" ] && [ "${REDIS_MAXMEMORY}" != "0" ]; then + if redis-cli CONFIG SET maxmemory "${REDIS_MAXMEMORY}" >/dev/null 2>&1; then + configured=$((configured + 1)) + else + log_warn "Failed to set Redis maxmemory=${REDIS_MAXMEMORY}" + fi + fi + + if [ -n "${REDIS_MAXMEMORY_POLICY:-}" ]; then + if redis-cli CONFIG SET maxmemory-policy "${REDIS_MAXMEMORY_POLICY}" >/dev/null 2>&1; then + configured=$((configured + 1)) + else + log_warn "Failed to set Redis maxmemory-policy=${REDIS_MAXMEMORY_POLICY}" + fi + fi + + if is_enabled "${REDIS_PERSISTENCE_ENABLED:-true}"; then + if redis-cli CONFIG SET appendonly "${REDIS_APPENDONLY}" >/dev/null 2>&1; then + configured=$((configured + 1)) + else + log_warn "Failed to set Redis appendonly=${REDIS_APPENDONLY}" + fi + if redis-cli CONFIG SET appendfsync "${REDIS_APPENDFSYNC}" >/dev/null 2>&1; then + configured=$((configured + 1)) + else + log_warn "Failed to set Redis appendfsync=${REDIS_APPENDFSYNC}" + fi + if [ -n "${REDIS_SAVE:-}" ]; then + if redis-cli CONFIG SET save "${REDIS_SAVE}" >/dev/null 2>&1; then + configured=$((configured + 1)) + else + log_warn "Failed to set Redis save='${REDIS_SAVE}'" + fi + fi + fi + + if redis-cli CONFIG REWRITE >/dev/null 2>&1; then + configured=$((configured + 1)) + else + log_warn "Redis CONFIG REWRITE failed (runtime config is active but may not persist restart)" + fi + + if [ "$configured" -gt 0 ]; then + log_info "Redis runtime config applied (maxmemory=${REDIS_MAXMEMORY}, policy=${REDIS_MAXMEMORY_POLICY}, appendonly=${REDIS_APPENDONLY})" + fi +} + +cleanup_redis_keys_without_ttl() { + if [ "$REDIS_ENABLED" != "true" ]; then + return 0 + fi + if ! command -v redis-cli &> /dev/null; then + return 0 + fi + if ! redis-cli ping &>/dev/null; then + return 0 + fi + if ! 
is_enabled "${REDIS_TTL_CLEANUP_ON_START:-true}"; then + return 0 + fi + + local deleted=0 + local raw_pattern + for raw_pattern in ${REDIS_TTL_CLEANUP_PATTERNS//,/ }; do + local full_pattern="${REDIS_KEY_PREFIX}:${raw_pattern}" + while IFS= read -r key; do + [ -z "${key}" ] && continue + local pttl + pttl=$(redis-cli PTTL "${key}" 2>/dev/null || echo "-2") + if [[ "${pttl}" =~ ^-?[0-9]+$ ]] && [ "${pttl}" -lt 0 ]; then + if redis-cli DEL "${key}" >/dev/null 2>&1; then + deleted=$((deleted + 1)) + fi + fi + done < <(redis-cli --scan --pattern "${full_pattern}" 2>/dev/null) + done + + if [ "$deleted" -gt 0 ]; then + log_info "Redis TTL cleanup removed ${deleted} stale keys without expiry" + fi +} + start_redis() { if [ "$REDIS_ENABLED" != "true" ]; then return 0 @@ -349,6 +453,8 @@ start_redis() { # Check if Redis is already running if redis-cli ping &>/dev/null; then log_success "Redis is already running" + apply_redis_runtime_config + cleanup_redis_keys_without_ttl return 0 fi @@ -359,6 +465,8 @@ start_redis() { sleep 1 if redis-cli ping &>/dev/null; then log_success "Redis service started" + apply_redis_runtime_config + cleanup_redis_keys_without_ttl return 0 fi fi diff --git a/src/mes_dashboard/app.py b/src/mes_dashboard/app.py index 0d9f19f..1a19406 100644 --- a/src/mes_dashboard/app.py +++ b/src/mes_dashboard/app.py @@ -48,6 +48,10 @@ from mes_dashboard.services.scrap_reason_exclusion_cache import ( init_scrap_reason_exclusion_cache, stop_scrap_reason_exclusion_cache_worker, ) +from mes_dashboard.core.query_spool_store import ( + init_query_spool_cleanup, + stop_query_spool_cleanup_worker, +) from mes_dashboard.core.modernization_policy import ( get_deferred_routes as get_deferred_routes_from_scope_matrix, get_missing_in_scope_assets, @@ -335,6 +339,11 @@ def _shutdown_runtime_resources() -> None: except Exception as exc: logger.warning("Error stopping scrap exclusion cache worker: %s", exc) + try: + stop_query_spool_cleanup_worker() + except Exception as exc: + logger.warning("Error stopping query spool cleanup worker: %s", exc) + try: from mes_dashboard.core.metrics_history import stop_metrics_history stop_metrics_history() @@ -440,6 +449,7 @@ def create_app(config_name: str | None = None) -> Flask: start_cache_updater() # Start Redis cache updater init_realtime_equipment_cache(app) # Start realtime equipment status cache init_scrap_reason_exclusion_cache(app) # Start exclusion-policy cache sync + init_query_spool_cleanup(app) # Start parquet spool cleanup worker from mes_dashboard.core.metrics_history import start_metrics_history start_metrics_history(app) # Start metrics history collector _register_shutdown_hooks(app) diff --git a/src/mes_dashboard/config/settings.py b/src/mes_dashboard/config/settings.py index 1b81f67..1e3c854 100644 --- a/src/mes_dashboard/config/settings.py +++ b/src/mes_dashboard/config/settings.py @@ -51,9 +51,14 @@ class Config: DB_CONNECT_RETRY_DELAY = _float_env("DB_CONNECT_RETRY_DELAY", 1.0) DB_CALL_TIMEOUT_MS = _int_env("DB_CALL_TIMEOUT_MS", 55000) - # Slow-query dedicated connection settings (non-pooled) + # Slow-query settings (isolated from main request pool) DB_SLOW_CALL_TIMEOUT_MS = _int_env("DB_SLOW_CALL_TIMEOUT_MS", 300000) # 300s DB_SLOW_MAX_CONCURRENT = _int_env("DB_SLOW_MAX_CONCURRENT", 5) + DB_SLOW_POOL_ENABLED = _bool_env("DB_SLOW_POOL_ENABLED", True) + DB_SLOW_POOL_SIZE = _int_env("DB_SLOW_POOL_SIZE", 2) + DB_SLOW_POOL_MAX_OVERFLOW = _int_env("DB_SLOW_POOL_MAX_OVERFLOW", 1) + DB_SLOW_POOL_TIMEOUT = _int_env("DB_SLOW_POOL_TIMEOUT", 30) + 
DB_SLOW_POOL_RECYCLE = _int_env("DB_SLOW_POOL_RECYCLE", 1800) # Auth configuration - MUST be set in .env file LDAP_API_URL = os.getenv("LDAP_API_URL", "") @@ -100,6 +105,7 @@ class DevelopmentConfig(Config): DB_CONNECT_RETRY_DELAY = _float_env("DB_CONNECT_RETRY_DELAY", 1.0) DB_CALL_TIMEOUT_MS = _int_env("DB_CALL_TIMEOUT_MS", 55000) DB_SLOW_MAX_CONCURRENT = _int_env("DB_SLOW_MAX_CONCURRENT", 3) + DB_SLOW_POOL_ENABLED = _bool_env("DB_SLOW_POOL_ENABLED", True) class ProductionConfig(Config): @@ -117,6 +123,7 @@ class ProductionConfig(Config): DB_CONNECT_RETRY_DELAY = _float_env("DB_CONNECT_RETRY_DELAY", 1.0) DB_CALL_TIMEOUT_MS = _int_env("DB_CALL_TIMEOUT_MS", 55000) DB_SLOW_MAX_CONCURRENT = _int_env("DB_SLOW_MAX_CONCURRENT", 5) + DB_SLOW_POOL_ENABLED = _bool_env("DB_SLOW_POOL_ENABLED", True) class TestingConfig(Config): @@ -136,6 +143,7 @@ class TestingConfig(Config): DB_CALL_TIMEOUT_MS = 5000 DB_SLOW_CALL_TIMEOUT_MS = 10000 DB_SLOW_MAX_CONCURRENT = 1 + DB_SLOW_POOL_ENABLED = False CSRF_ENABLED = False diff --git a/src/mes_dashboard/core/database.py b/src/mes_dashboard/core/database.py index 5d3dca1..7d619c3 100644 --- a/src/mes_dashboard/core/database.py +++ b/src/mes_dashboard/core/database.py @@ -113,6 +113,7 @@ def install_log_redaction_filter(target_logger: logging.Logger | None = None) -> _ENGINE = None _HEALTH_ENGINE = None +_SLOW_ENGINE = None _DB_RUNTIME_CONFIG: Optional[Dict[str, Any]] = None @@ -166,6 +167,23 @@ def _from_app_or_env_float(name: str, fallback: float) -> float: return float(fallback) +def _from_app_or_env_bool(name: str, fallback: bool) -> bool: + try: + app_value = current_app.config.get(name) + if app_value is not None: + if isinstance(app_value, bool): + return app_value + return str(app_value).strip().lower() in {"1", "true", "yes", "on"} + except RuntimeError: + pass + + env_value = os.getenv(name) + if env_value is not None: + return env_value.strip().lower() in {"1", "true", "yes", "on"} + + return bool(fallback) + + def get_db_runtime_config(refresh: bool = False) -> Dict[str, Any]: """Get effective DB runtime configuration used by pool and direct connections.""" global _DB_RUNTIME_CONFIG @@ -201,6 +219,26 @@ def get_db_runtime_config(refresh: bool = False) -> Dict[str, Any]: "DB_SLOW_MAX_CONCURRENT", config_class.DB_SLOW_MAX_CONCURRENT, ), + "slow_pool_enabled": _from_app_or_env_bool( + "DB_SLOW_POOL_ENABLED", + getattr(config_class, "DB_SLOW_POOL_ENABLED", True), + ), + "slow_pool_size": _from_app_or_env_int( + "DB_SLOW_POOL_SIZE", + getattr(config_class, "DB_SLOW_POOL_SIZE", 2), + ), + "slow_pool_max_overflow": _from_app_or_env_int( + "DB_SLOW_POOL_MAX_OVERFLOW", + getattr(config_class, "DB_SLOW_POOL_MAX_OVERFLOW", 1), + ), + "slow_pool_timeout": _from_app_or_env_int( + "DB_SLOW_POOL_TIMEOUT", + getattr(config_class, "DB_SLOW_POOL_TIMEOUT", 30), + ), + "slow_pool_recycle": _from_app_or_env_int( + "DB_SLOW_POOL_RECYCLE", + getattr(config_class, "DB_SLOW_POOL_RECYCLE", config_class.DB_POOL_RECYCLE), + ), "slow_fetchmany_size": _from_app_or_env_int( "DB_SLOW_FETCHMANY_SIZE", 5000, @@ -234,6 +272,7 @@ def get_pool_status() -> Dict[str, Any]: "saturation": saturation, "slow_query_active": get_slow_query_active_count(), "slow_query_waiting": get_slow_query_waiting_count(), + "slow_pool_enabled": bool(runtime.get("slow_pool_enabled", False)), } @@ -313,6 +352,43 @@ def get_health_engine(): return _HEALTH_ENGINE +def get_slow_engine(): + """Get dedicated SQLAlchemy engine for slow-query workloads. 
+ + Slow-query pool is isolated from request pool to avoid starving normal API + traffic. This engine is used only when DB_SLOW_POOL_ENABLED=true. + """ + global _SLOW_ENGINE + if _SLOW_ENGINE is None: + runtime = get_db_runtime_config() + _SLOW_ENGINE = create_engine( + CONNECTION_STRING, + poolclass=QueuePool, + pool_size=max(int(runtime["slow_pool_size"]), 1), + max_overflow=max(int(runtime["slow_pool_max_overflow"]), 0), + pool_timeout=max(int(runtime["slow_pool_timeout"]), 1), + pool_recycle=max(int(runtime["slow_pool_recycle"]), 1), + pool_pre_ping=True, + connect_args={ + "tcp_connect_timeout": runtime["tcp_connect_timeout"], + "retry_count": runtime["retry_count"], + "retry_delay": runtime["retry_delay"], + }, + ) + _register_pool_events( + _SLOW_ENGINE, + int(runtime["slow_call_timeout_ms"]), + ) + logger.info( + "Slow-query engine created (pool_size=%s, max_overflow=%s, pool_timeout=%s, pool_recycle=%s)", + runtime["slow_pool_size"], + runtime["slow_pool_max_overflow"], + runtime["slow_pool_timeout"], + runtime["slow_pool_recycle"], + ) + return _SLOW_ENGINE + + def _register_pool_events(engine, call_timeout_ms: int): """Register event listeners for connection pool monitoring.""" @@ -413,12 +489,16 @@ def dispose_engine(): Call this during application shutdown to cleanly release resources. """ - global _ENGINE, _HEALTH_ENGINE, _DB_RUNTIME_CONFIG, _SLOW_QUERY_SEMAPHORE + global _ENGINE, _HEALTH_ENGINE, _SLOW_ENGINE, _DB_RUNTIME_CONFIG, _SLOW_QUERY_SEMAPHORE stop_keepalive() if _HEALTH_ENGINE is not None: _HEALTH_ENGINE.dispose() logger.info("Health engine disposed") _HEALTH_ENGINE = None + if _SLOW_ENGINE is not None: + _SLOW_ENGINE.dispose() + logger.info("Slow-query engine disposed") + _SLOW_ENGINE = None if _ENGINE is not None: _ENGINE.dispose() logger.info("Database engine disposed, all connections closed") @@ -495,6 +575,44 @@ def get_db_connection(): return None +def _get_slow_query_connection( + runtime: Dict[str, Any], + timeout_ms: int, +): + """Acquire a DB-API connection for slow queries. + + Returns: + tuple(connection, pooled) + - connection: DB-API connection-like object + - pooled: True when sourced from isolated slow pool + """ + if bool(runtime.get("slow_pool_enabled", False)): + engine = get_slow_engine() + conn = engine.raw_connection() + conn.call_timeout = timeout_ms + logger.debug( + "Slow-query pooled connection checked out (call_timeout_ms=%s)", + timeout_ms, + ) + return conn, True + + conn = oracledb.connect( + **DB_CONFIG, + tcp_connect_timeout=runtime["tcp_connect_timeout"], + retry_count=runtime["retry_count"], + retry_delay=runtime["retry_delay"], + ) + conn.call_timeout = timeout_ms + with _DIRECT_CONN_LOCK: + global _DIRECT_CONN_COUNTER + _DIRECT_CONN_COUNTER += 1 + logger.debug( + "Slow-query direct connection established (call_timeout_ms=%s)", + timeout_ms, + ) + return conn, False + + def _extract_ora_code(exc: Exception) -> str: """Extract ORA error code from exception message.""" match = re.search(r'ORA-(\d+)', str(exc)) @@ -616,11 +734,11 @@ def read_sql_df_slow( params: Optional[Dict[str, Any]] = None, timeout_seconds: Optional[int] = None, ) -> pd.DataFrame: - """Execute a slow SQL query with a custom timeout via direct oracledb connection. + """Execute a slow SQL query with a custom timeout. - Unlike read_sql_df which uses the pooled engine (55s timeout), - this creates a dedicated connection with a longer call_timeout - for known-slow queries (e.g. full table scans on large tables). 
+ Unlike read_sql_df which uses the main request pool (55s timeout), + this path uses a slow-query channel with longer call_timeout + (isolated slow pool when enabled, otherwise direct connection). Concurrency is limited by a semaphore (DB_SLOW_MAX_CONCURRENT) to prevent Oracle connection exhaustion. @@ -663,19 +781,12 @@ def read_sql_df_slow( logger.info("Slow query starting (active=%s, timeout_ms=%s)", active, timeout_ms) start_time = time.time() conn = None + pooled = False try: - conn = oracledb.connect( - **DB_CONFIG, - tcp_connect_timeout=runtime["tcp_connect_timeout"], - retry_count=runtime["retry_count"], - retry_delay=runtime["retry_delay"], - ) - conn.call_timeout = timeout_ms - with _DIRECT_CONN_LOCK: - global _DIRECT_CONN_COUNTER - _DIRECT_CONN_COUNTER += 1 + conn, pooled = _get_slow_query_connection(runtime, timeout_ms) logger.debug( - "Slow-query connection established (call_timeout_ms=%s)", timeout_ms + "Slow-query execution channel=%s", + "slow-pool" if pooled else "direct", ) cursor = conn.cursor() @@ -766,20 +877,13 @@ def read_sql_df_slow_iter( logger.info("Slow query iter starting (active=%s, timeout_ms=%s, batch_size=%s)", active, timeout_ms, batch_size) start_time = time.time() conn = None + pooled = False total_rows = 0 try: - conn = oracledb.connect( - **DB_CONFIG, - tcp_connect_timeout=runtime["tcp_connect_timeout"], - retry_count=runtime["retry_count"], - retry_delay=runtime["retry_delay"], - ) - conn.call_timeout = timeout_ms - with _DIRECT_CONN_LOCK: - global _DIRECT_CONN_COUNTER - _DIRECT_CONN_COUNTER += 1 + conn, pooled = _get_slow_query_connection(runtime, timeout_ms) logger.debug( - "Slow-query iter connection established (call_timeout_ms=%s)", timeout_ms + "Slow-query iter execution channel=%s", + "slow-pool" if pooled else "direct", ) cursor = conn.cursor() diff --git a/src/mes_dashboard/core/query_spool_store.py b/src/mes_dashboard/core/query_spool_store.py new file mode 100644 index 0000000..04373b1 --- /dev/null +++ b/src/mes_dashboard/core/query_spool_store.py @@ -0,0 +1,483 @@ +# -*- coding: utf-8 -*- +"""Parquet spool store for large query results. + +Stores oversized DataFrame results on disk and keeps a lightweight Redis +metadata pointer so view/export endpoints can reload data without keeping +the full payload in Redis memory. 
+""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +import re +import threading +import time +from decimal import Decimal +from numbers import Real +from pathlib import Path +from typing import Any, Optional + +import pandas as pd + +from mes_dashboard.core.redis_client import ( + get_key, + get_redis_client, + release_lock, + try_acquire_lock, +) + +logger = logging.getLogger("mes_dashboard.query_spool_store") + + +def _bool_env(name: str, default: bool) -> bool: + value = os.getenv(name) + if value is None: + return default + return value.strip().lower() in {"1", "true", "yes", "on"} + + +def _int_env(name: str, default: int) -> int: + raw = os.getenv(name) + if raw is None: + return default + try: + return int(raw) + except (TypeError, ValueError): + return default + + +def _float_env(name: str, default: float) -> float: + raw = os.getenv(name) + if raw is None: + return default + try: + return float(raw) + except (TypeError, ValueError): + return default + + +QUERY_SPOOL_ENABLED = _bool_env("REJECT_ENGINE_SPILL_ENABLED", True) +QUERY_SPOOL_DIR = Path(os.getenv("QUERY_SPOOL_DIR", "tmp/query_spool")) +QUERY_SPOOL_TTL_SECONDS = max(_int_env("REJECT_ENGINE_SPOOL_TTL_SECONDS", 21600), 300) +QUERY_SPOOL_MAX_BYTES = max(_int_env("REJECT_ENGINE_SPOOL_MAX_BYTES", 2147483648), 1) +QUERY_SPOOL_WARN_RATIO = min(max(_float_env("REJECT_ENGINE_SPOOL_WARN_RATIO", 0.85), 0.1), 1.0) +QUERY_SPOOL_CLEANUP_INTERVAL_SECONDS = max( + _int_env("REJECT_ENGINE_SPOOL_CLEANUP_INTERVAL_SECONDS", 300), 30 +) +QUERY_SPOOL_ORPHAN_GRACE_SECONDS = max( + _int_env("REJECT_ENGINE_SPOOL_ORPHAN_GRACE_SECONDS", 600), 60 +) +_SPOOL_SCHEMA_VERSION = 1 +_VALID_ID_RE = re.compile(r"^[A-Za-z0-9._-]{4,128}$") + +_WORKER_THREAD: threading.Thread | None = None +_STOP_EVENT = threading.Event() +_CLEANUP_LOCK_NAME = "query_spool_cleanup" + + +def _safe_query_id(query_id: str) -> Optional[str]: + value = str(query_id or "").strip() + if not value or not _VALID_ID_RE.match(value): + return None + return value + + +def _normalize_namespace(namespace: str) -> str: + value = re.sub(r"[^A-Za-z0-9._-]", "_", str(namespace or "default").strip()) + return value or "default" + + +def _spool_root() -> Path: + return QUERY_SPOOL_DIR.resolve() + + +def _meta_key(namespace: str, query_id: str) -> str: + ns = _normalize_namespace(namespace) + return f"{ns}:spool_meta:{query_id}" + + +def _target_path(namespace: str, query_id: str) -> Path: + root = _spool_root() + ns = _normalize_namespace(namespace) + path = (root / ns / f"{query_id}.parquet").resolve() + root_str = str(root) + if not str(path).startswith(f"{root_str}{os.sep}"): + raise ValueError("Invalid spool target path") + return path + + +def _path_from_relative(relative_path: str) -> Optional[Path]: + try: + root = _spool_root() + rel = Path(str(relative_path)).as_posix().lstrip("/") + path = (root / rel).resolve() + root_str = str(root) + if not str(path).startswith(f"{root_str}{os.sep}"): + return None + return path + except Exception: + return None + + +def _normalize_decimal_object_columns(df: pd.DataFrame) -> pd.DataFrame: + if df is None or df.empty: + return df + + normalized = df.copy() + for col in normalized.columns: + series = normalized[col] + if series.dtype != "object": + continue + + non_null = series.dropna() + if non_null.empty: + continue + + has_decimal = non_null.map(lambda value: isinstance(value, Decimal)).any() + if not has_decimal: + continue + + is_numeric_like = non_null.map( + lambda value: isinstance(value, 
(Decimal, Real)) and not isinstance(value, bool) + ).all() + if is_numeric_like: + normalized[col] = pd.to_numeric(series, errors="coerce") + else: + normalized[col] = series.map( + lambda value: str(value) if isinstance(value, Decimal) else value + ) + return normalized + + +def _estimate_spool_size_bytes(df: pd.DataFrame) -> int: + mem_bytes = int(df.memory_usage(deep=True).sum()) + # Typical parquet compression ratio is ~2-5x; use conservative 45% estimate. + return max(int(mem_bytes * 0.45), 1_048_576) + + +def _get_spool_size_bytes() -> int: + root = _spool_root() + if not root.exists(): + return 0 + total = 0 + for file_path in root.rglob("*.parquet"): + try: + total += int(file_path.stat().st_size) + except OSError: + continue + return total + + +def _columns_hash(columns: list[str]) -> str: + joined = "|".join(columns) + return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:16] + + +def _ensure_capacity(required_bytes: int) -> bool: + used = _get_spool_size_bytes() + projected = used + max(required_bytes, 0) + usage_ratio = projected / max(QUERY_SPOOL_MAX_BYTES, 1) + if usage_ratio >= QUERY_SPOOL_WARN_RATIO: + logger.warning( + "Query spool usage high: %.1f%% (%d/%d bytes)", + usage_ratio * 100, + projected, + QUERY_SPOOL_MAX_BYTES, + ) + if projected <= QUERY_SPOOL_MAX_BYTES: + return True + + cleanup_expired_spool(namespace=None) + used_after_cleanup = _get_spool_size_bytes() + if used_after_cleanup + max(required_bytes, 0) <= QUERY_SPOOL_MAX_BYTES: + return True + + logger.warning( + "Query spool over capacity after cleanup: required=%d used=%d cap=%d", + required_bytes, + used_after_cleanup, + QUERY_SPOOL_MAX_BYTES, + ) + return False + + +def get_spool_metadata(namespace: str, query_id: str) -> Optional[dict[str, Any]]: + safe_query_id = _safe_query_id(query_id) + if not safe_query_id: + return None + client = get_redis_client() + if client is None: + return None + key = get_key(_meta_key(namespace, safe_query_id)) + try: + raw = client.get(key) + if not raw: + return None + payload = json.loads(raw) + if not isinstance(payload, dict): + client.delete(key) + return None + return payload + except Exception as exc: + logger.warning("Failed to read spool metadata for %s: %s", safe_query_id, exc) + return None + + +def store_spooled_df( + namespace: str, + query_id: str, + df: pd.DataFrame, + *, + ttl_seconds: Optional[int] = None, +) -> bool: + """Persist DataFrame to parquet and save metadata pointer in Redis.""" + if not QUERY_SPOOL_ENABLED or df is None or df.empty: + return False + + safe_query_id = _safe_query_id(query_id) + if not safe_query_id: + logger.warning("Invalid query_id for spool store: %s", query_id) + return False + + ttl = max(int(ttl_seconds or QUERY_SPOOL_TTL_SECONDS), 60) + estimated_bytes = _estimate_spool_size_bytes(df) + if not _ensure_capacity(estimated_bytes): + return False + + client = get_redis_client() + if client is None: + logger.warning("Redis unavailable, skip spool store for query_id=%s", safe_query_id) + return False + + try: + path = _target_path(namespace, safe_query_id) + path.parent.mkdir(parents=True, exist_ok=True) + tmp_path = path.with_suffix(".tmp") + normalized = _normalize_decimal_object_columns(df) + normalized.to_parquet(tmp_path, engine="pyarrow", index=False) + tmp_path.replace(path) + + now_ts = int(time.time()) + columns = [str(col) for col in normalized.columns] + metadata = { + "schema_version": _SPOOL_SCHEMA_VERSION, + "namespace": _normalize_namespace(namespace), + "query_id": safe_query_id, + "relative_path": 
str(path.relative_to(_spool_root())), + "row_count": int(len(normalized)), + "column_count": int(len(columns)), + "columns_hash": _columns_hash(columns), + "created_at": now_ts, + "expires_at": now_ts + ttl, + "file_size_bytes": int(path.stat().st_size), + } + client.setex( + get_key(_meta_key(namespace, safe_query_id)), + ttl, + json.dumps(metadata, ensure_ascii=False, sort_keys=True), + ) + return True + except Exception as exc: + logger.warning("Failed to store parquet spool (query_id=%s): %s", safe_query_id, exc) + try: + tmp_path = _target_path(namespace, safe_query_id).with_suffix(".tmp") + if tmp_path.exists(): + tmp_path.unlink() + except Exception: + pass + return False + + +def load_spooled_df(namespace: str, query_id: str) -> Optional[pd.DataFrame]: + """Load DataFrame from spool metadata pointer.""" + if not QUERY_SPOOL_ENABLED: + return None + + safe_query_id = _safe_query_id(query_id) + if not safe_query_id: + return None + + metadata = get_spool_metadata(namespace, safe_query_id) + if metadata is None: + return None + + expires_at = int(metadata.get("expires_at") or 0) + if expires_at and expires_at <= int(time.time()): + clear_spooled_df(namespace, safe_query_id) + return None + + path = _path_from_relative(str(metadata.get("relative_path") or "")) + if path is None or not path.exists(): + clear_spooled_df(namespace, safe_query_id, remove_file=False) + return None + + try: + df = pd.read_parquet(path, engine="pyarrow") + except Exception as exc: + logger.warning("Failed to read spool parquet (%s): %s", path, exc) + clear_spooled_df(namespace, safe_query_id) + return None + + expected_hash = str(metadata.get("columns_hash") or "") + if expected_hash: + current_hash = _columns_hash([str(col) for col in df.columns]) + if current_hash != expected_hash: + logger.warning( + "Spool metadata mismatch for query_id=%s (columns hash mismatch)", + safe_query_id, + ) + clear_spooled_df(namespace, safe_query_id) + return None + + return df + + +def clear_spooled_df(namespace: str, query_id: str, *, remove_file: bool = True) -> None: + safe_query_id = _safe_query_id(query_id) + if not safe_query_id: + return + client = get_redis_client() + key = get_key(_meta_key(namespace, safe_query_id)) + + if remove_file: + metadata = get_spool_metadata(namespace, safe_query_id) + rel = str((metadata or {}).get("relative_path") or "") + path = _path_from_relative(rel) if rel else None + if path and path.exists(): + try: + path.unlink() + except OSError: + pass + + if client is not None: + try: + client.delete(key) + except Exception: + pass + + +def cleanup_expired_spool(namespace: str | None = None) -> dict[str, int]: + """Cleanup expired metadata and orphan parquet files.""" + stats = { + "meta_checked": 0, + "meta_deleted": 0, + "expired_files_deleted": 0, + "orphan_files_deleted": 0, + "spool_bytes": 0, + } + root = _spool_root() + root.mkdir(parents=True, exist_ok=True) + + referenced_paths: set[str] = set() + now_ts = int(time.time()) + client = get_redis_client() + if client is not None: + if namespace: + pattern = get_key(f"{_normalize_namespace(namespace)}:spool_meta:*") + else: + pattern = get_key("*:spool_meta:*") + try: + for key in client.scan_iter(match=pattern, count=200): + stats["meta_checked"] += 1 + raw = client.get(key) + if not raw: + continue + try: + meta = json.loads(raw) + except Exception: + client.delete(key) + stats["meta_deleted"] += 1 + continue + rel = str(meta.get("relative_path") or "") + path = _path_from_relative(rel) if rel else None + expires_at = 
int(meta.get("expires_at") or 0) + expired = bool(expires_at and expires_at <= now_ts) + missing = path is None or not path.exists() + if expired or missing: + if path is not None and path.exists(): + try: + path.unlink() + stats["expired_files_deleted"] += 1 + except OSError: + pass + client.delete(key) + stats["meta_deleted"] += 1 + elif path is not None: + referenced_paths.add(str(path)) + except Exception as exc: + logger.warning("Spool metadata cleanup failed: %s", exc) + + for file_path in root.rglob("*.parquet"): + resolved = str(file_path.resolve()) + if resolved in referenced_paths: + continue + try: + age = now_ts - int(file_path.stat().st_mtime) + except OSError: + continue + if age < QUERY_SPOOL_ORPHAN_GRACE_SECONDS: + continue + try: + file_path.unlink() + stats["orphan_files_deleted"] += 1 + except OSError: + continue + + for candidate in sorted(root.rglob("*"), reverse=True): + if candidate.is_dir(): + try: + candidate.rmdir() + except OSError: + pass + + stats["spool_bytes"] = _get_spool_size_bytes() + return stats + + +def _worker_loop() -> None: + logger.info( + "Query spool cleanup worker started (interval=%ss)", + QUERY_SPOOL_CLEANUP_INTERVAL_SECONDS, + ) + while not _STOP_EVENT.wait(QUERY_SPOOL_CLEANUP_INTERVAL_SECONDS): + try: + if try_acquire_lock(_CLEANUP_LOCK_NAME, ttl_seconds=120): + try: + cleanup_expired_spool(namespace=None) + finally: + release_lock(_CLEANUP_LOCK_NAME) + except Exception as exc: + logger.warning("Query spool cleanup failed: %s", exc) + logger.info("Query spool cleanup worker stopped") + + +def init_query_spool_cleanup(app=None) -> None: + """Initialize spool directory and start periodic cleanup worker.""" + if not QUERY_SPOOL_ENABLED: + return + cleanup_expired_spool(namespace=None) + + global _WORKER_THREAD + if app is not None and app.config.get("TESTING"): + return + if _WORKER_THREAD and _WORKER_THREAD.is_alive(): + return + _STOP_EVENT.clear() + _WORKER_THREAD = threading.Thread( + target=_worker_loop, + daemon=True, + name="query-spool-cleanup", + ) + _WORKER_THREAD.start() + + +def stop_query_spool_cleanup_worker(timeout: int = 5) -> None: + global _WORKER_THREAD + if _WORKER_THREAD is None: + return + _STOP_EVENT.set() + _WORKER_THREAD.join(timeout=timeout) + _WORKER_THREAD = None diff --git a/src/mes_dashboard/core/redis_df_store.py b/src/mes_dashboard/core/redis_df_store.py new file mode 100644 index 0000000..e5002e5 --- /dev/null +++ b/src/mes_dashboard/core/redis_df_store.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- +"""Reusable parquet-in-Redis DataFrame store. + +Extracted from reject/hold/resource_dataset_cache to eliminate +duplication. Provides both general-purpose store/load and +chunk-level helpers for BatchQueryEngine. +""" + +from __future__ import annotations + +import base64 +import io +import logging +from decimal import Decimal +from numbers import Real +from typing import Optional + +import pandas as pd + +from mes_dashboard.core.redis_client import ( + REDIS_ENABLED, + get_key, + get_redis_client, +) + +logger = logging.getLogger("mes_dashboard.redis_df_store") + + +# ============================================================ +# General-purpose DataFrame ↔ Redis +# ============================================================ + + +def _normalize_decimal_object_columns(df: pd.DataFrame) -> pd.DataFrame: + """Normalize object columns that contain Decimal values. + + PyArrow parquet serialization can fail on mixed Decimal precision in an + object-typed column. 
For numeric-like mixed precision Decimal columns, + coerce to float. For mixed-type columns, cast Decimal values to string. + """ + if df is None or df.empty: + return df + + normalized = df.copy() + for col in normalized.columns: + series = normalized[col] + if series.dtype != "object": + continue + + non_null = series.dropna() + if non_null.empty: + continue + + has_decimal = non_null.map(lambda value: isinstance(value, Decimal)).any() + if not has_decimal: + continue + + is_numeric_like = non_null.map( + lambda value: isinstance(value, (Decimal, Real)) and not isinstance(value, bool) + ).all() + if is_numeric_like: + normalized[col] = pd.to_numeric(series, errors="coerce") + else: + normalized[col] = series.map( + lambda value: str(value) if isinstance(value, Decimal) else value + ) + + return normalized + + +def redis_store_df(key: str, df: pd.DataFrame, ttl: int = 900) -> bool: + """Serialize *df* to parquet, base64-encode, and SETEX into Redis. + + Args: + key: Redis key (will be prefixed via ``get_key``). + df: DataFrame to store. + ttl: Expiry in seconds (default 900 = 15 min). + """ + if not REDIS_ENABLED: + return False + client = get_redis_client() + if client is None: + return False + try: + normalized = _normalize_decimal_object_columns(df) + buf = io.BytesIO() + normalized.to_parquet(buf, engine="pyarrow", index=False) + encoded = base64.b64encode(buf.getvalue()).decode("ascii") + client.setex(get_key(key), ttl, encoded) + return True + except Exception as exc: + logger.warning("Failed to store DataFrame in Redis (%s): %s", key, exc) + return False + + +def redis_load_df(key: str) -> Optional[pd.DataFrame]: + """Load a parquet-encoded DataFrame from Redis. + + Returns ``None`` when the key is missing or Redis is unavailable. + """ + if not REDIS_ENABLED: + return None + client = get_redis_client() + if client is None: + return None + try: + encoded = client.get(get_key(key)) + if encoded is None: + return None + raw = base64.b64decode(encoded) + return pd.read_parquet(io.BytesIO(raw), engine="pyarrow") + except Exception as exc: + logger.warning("Failed to load DataFrame from Redis (%s): %s", key, exc) + return None + + +# ============================================================ +# Chunk-level helpers (used by BatchQueryEngine) +# ============================================================ + + +def _chunk_key(cache_prefix: str, query_hash: str, idx: int) -> str: + """Build the raw key (before global prefix) for a single chunk.""" + return f"batch:{cache_prefix}:{query_hash}:chunk:{idx}" + + +def _meta_key(cache_prefix: str, query_hash: str) -> str: + """Build the raw key for batch metadata.""" + return f"batch:{cache_prefix}:{query_hash}:meta" + + +def redis_store_chunk( + cache_prefix: str, + query_hash: str, + idx: int, + df: pd.DataFrame, + ttl: int = 900, +) -> bool: + """Store a single chunk DataFrame in Redis.""" + return redis_store_df(_chunk_key(cache_prefix, query_hash, idx), df, ttl=ttl) + + +def redis_load_chunk( + cache_prefix: str, + query_hash: str, + idx: int, +) -> Optional[pd.DataFrame]: + """Load a single chunk DataFrame from Redis.""" + return redis_load_df(_chunk_key(cache_prefix, query_hash, idx)) + + +def redis_chunk_exists( + cache_prefix: str, + query_hash: str, + idx: int, +) -> bool: + """Check whether a chunk key exists in Redis (without loading data).""" + if not REDIS_ENABLED: + return False + client = get_redis_client() + if client is None: + return False + try: + return bool(client.exists(get_key(_chunk_key(cache_prefix, query_hash, idx)))) 
+ except Exception as exc: + logger.warning("redis_chunk_exists failed: %s", exc) + return False + + +def redis_clear_batch(cache_prefix: str, query_hash: str) -> int: + """Delete cached chunk/meta keys for a batch query hash. + + Returns the number of deleted keys. + """ + if not REDIS_ENABLED: + return 0 + client = get_redis_client() + if client is None: + return 0 + try: + chunk_pattern = get_key(f"batch:{cache_prefix}:{query_hash}:chunk:*") + meta_key = get_key(_meta_key(cache_prefix, query_hash)) + chunk_keys = client.keys(chunk_pattern) or [] + delete_keys = list(chunk_keys) + [meta_key] + if not delete_keys: + return 0 + return int(client.delete(*delete_keys) or 0) + except Exception as exc: + logger.warning( + "redis_clear_batch failed (prefix=%s, query_hash=%s): %s", + cache_prefix, + query_hash, + exc, + ) + return 0 diff --git a/src/mes_dashboard/services/batch_query_engine.py b/src/mes_dashboard/services/batch_query_engine.py new file mode 100644 index 0000000..85306ca --- /dev/null +++ b/src/mes_dashboard/services/batch_query_engine.py @@ -0,0 +1,569 @@ +# -*- coding: utf-8 -*- +"""BatchQueryEngine — reusable batch query orchestration. + +Provides time-range decomposition, ID-batch decomposition, +memory guards, controlled parallelism, Redis chunk caching +with partial cache hits, and progress tracking. + +Any service that plugs into this module automatically gains: + - Oracle timeout protection (via read_sql_df_slow, 300s) + - OOM protection (per-chunk memory guard) + - Partial cache reuse (extend date range → reuse old chunks) + - Progress tracking via Redis HSET + +Usage:: + + from mes_dashboard.services.batch_query_engine import ( + decompose_by_time_range, + decompose_by_ids, + execute_plan, + merge_chunks, + compute_query_hash, + ) + + chunks = decompose_by_time_range("2025-01-01", "2025-12-31") + qh = compute_query_hash({"mode": "date_range", ...}) + execute_plan(chunks, my_query_fn, query_hash=qh, cache_prefix="reject") + df = merge_chunks("reject", qh) +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timedelta +from typing import ( + Any, + Callable, + Dict, + Generator, + List, + Optional, +) + +import pandas as pd + +from mes_dashboard.core.redis_client import get_key, get_redis_client +from mes_dashboard.core.redis_df_store import ( + redis_chunk_exists, + redis_load_chunk, + redis_store_chunk, +) + +logger = logging.getLogger("mes_dashboard.batch_query_engine") + +# ============================================================ +# Configuration (env-overridable) +# ============================================================ + +BATCH_CHUNK_MAX_MEMORY_MB: int = int( + os.getenv("BATCH_CHUNK_MAX_MEMORY_MB", "256") +) + +BATCH_QUERY_TIME_THRESHOLD_DAYS: int = int( + os.getenv("BATCH_QUERY_TIME_THRESHOLD_DAYS", "60") +) + +BATCH_QUERY_ID_THRESHOLD: int = int( + os.getenv("BATCH_QUERY_ID_THRESHOLD", "1000") +) + + +# ============================================================ +# 1. Time-range decomposition +# ============================================================ + + +def decompose_by_time_range( + start_date: str, + end_date: str, + grain_days: int = 31, +) -> List[Dict[str, str]]: + """Split ``[start_date, end_date]`` into monthly-ish chunks. + + Boundary semantics (closed interval): + - Each chunk uses ``[chunk_start, chunk_end]``. + - The next chunk starts at ``previous_chunk_end + 1 day``. 
+ - The final chunk may contain fewer than *grain_days* days. + + Args: + start_date: ISO date string ``YYYY-MM-DD``. + end_date: ISO date string ``YYYY-MM-DD``. + grain_days: Maximum days per chunk (default 31). + + Returns: + List of dicts with ``chunk_start`` and ``chunk_end`` keys. + """ + dt_start = datetime.strptime(start_date, "%Y-%m-%d") + dt_end = datetime.strptime(end_date, "%Y-%m-%d") + + if dt_start > dt_end: + raise ValueError( + f"start_date ({start_date}) must be <= end_date ({end_date})" + ) + + chunks: List[Dict[str, str]] = [] + cursor = dt_start + while cursor <= dt_end: + chunk_end = min(cursor + timedelta(days=grain_days - 1), dt_end) + chunks.append( + { + "chunk_start": cursor.strftime("%Y-%m-%d"), + "chunk_end": chunk_end.strftime("%Y-%m-%d"), + } + ) + cursor = chunk_end + timedelta(days=1) + + return chunks + + +# ============================================================ +# 2. ID-batch decomposition +# ============================================================ + + +def decompose_by_ids( + ids: List[Any], + batch_size: int = 1000, +) -> List[List[Any]]: + """Split *ids* into batches of at most *batch_size*. + + Args: + ids: List of IDs (container IDs, lot IDs, etc.). + batch_size: Maximum items per batch (default 1000, + matching Oracle IN-clause limit). + + Returns: + List of ID sub-lists. + """ + if batch_size < 1: + raise ValueError("batch_size must be >= 1") + return [ids[i : i + batch_size] for i in range(0, len(ids), batch_size)] + + +# ============================================================ +# 3. Deterministic query_hash +# ============================================================ + + +def compute_query_hash(params: Dict[str, Any]) -> str: + """Compute a stable 16-char hex hash for *params*. + + Canonicalization: + - ``json.dumps`` with ``sort_keys=True`` and ``default=str``. + - Lists are sorted before serialisation. + - SHA-256, truncated to first 16 hex chars. + + Only dataset-affecting parameters should be included; + presentation-only parameters (page, per_page, …) must be + excluded by the caller. + """ + canonical = _canonicalize(params) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16] + + +def _canonicalize(obj: Any) -> str: + """Recursively sort lists and produce deterministic JSON.""" + + def _sort_value(v: Any) -> Any: + if isinstance(v, list): + try: + return sorted(_sort_value(i) for i in v) + except TypeError: + return [_sort_value(i) for i in v] + if isinstance(v, dict): + return {k: _sort_value(v[k]) for k in sorted(v.keys())} + return v + + return json.dumps(_sort_value(obj), sort_keys=True, ensure_ascii=False, default=str) + + +# ============================================================ +# 4. 
Progress tracking via Redis HSET +# ============================================================ + + +def _update_progress( + cache_prefix: str, + query_hash: str, + *, + total: int, + completed: int, + failed: int, + status: str = "running", + has_partial_failure: bool = False, + ttl: int = 900, +) -> None: + """Write/update batch progress metadata to Redis.""" + client = get_redis_client() + if client is None: + return + key = get_key(f"batch:{cache_prefix}:{query_hash}:meta") + pct = round(completed / total * 100, 1) if total else 0 + mapping = { + "total": str(total), + "completed": str(completed), + "failed": str(failed), + "pct": str(pct), + "status": status, + "has_partial_failure": str(has_partial_failure), + } + try: + client.hset(key, mapping=mapping) + client.expire(key, ttl) + except Exception as exc: + logger.warning("Failed to update batch progress: %s", exc) + + +def get_batch_progress( + cache_prefix: str, + query_hash: str, +) -> Optional[Dict[str, str]]: + """Read batch progress metadata from Redis.""" + client = get_redis_client() + if client is None: + return None + key = get_key(f"batch:{cache_prefix}:{query_hash}:meta") + try: + data = client.hgetall(key) + return data if data else None + except Exception: + return None + + +# ============================================================ +# 5. Execute plan +# ============================================================ + +# Type alias for the function each chunk calls. +# Signature: query_fn(chunk, max_rows_per_chunk) -> pd.DataFrame +QueryFn = Callable[..., pd.DataFrame] + + +def execute_plan( + chunks: List[Dict[str, Any]], + query_fn: QueryFn, + *, + parallel: int = 1, + query_hash: Optional[str] = None, + skip_cached: bool = True, + cache_prefix: str = "", + chunk_ttl: int = 900, + max_rows_per_chunk: Optional[int] = None, +) -> str: + """Execute *chunks* through *query_fn* with caching + guards. + + Args: + chunks: List of chunk descriptors (dicts from decompose_*). + query_fn: ``fn(chunk_dict, max_rows_per_chunk=…) -> DataFrame``. + Must use ``read_sql_df_slow`` internally. + parallel: Max concurrent chunks (default 1 = sequential). + query_hash: Precomputed hash; auto-generated if None. + skip_cached: Skip chunks already in Redis (default True). + cache_prefix: Service prefix for Redis keys (e.g. "reject"). + chunk_ttl: TTL in seconds for each chunk key (default 900). + max_rows_per_chunk: Passed to *query_fn* for SQL-level + ``FETCH FIRST N ROWS ONLY``. + + Returns: + The ``query_hash`` identifying this batch. 
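+
+    Example query_fn (sketch only; ``sql`` and the bind parameters are
+    assumed, mirroring how the dataset caches call the engine)::
+
+        def _run_chunk(chunk, max_rows_per_chunk=None):
+            # chunk comes from decompose_by_time_range / decompose_by_ids
+            params = {"start_date": chunk["chunk_start"],
+                      "end_date": chunk["chunk_end"]}
+            df = read_sql_df_slow(sql, params)
+            return df if df is not None else pd.DataFrame()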
+ """ + if query_hash is None: + query_hash = compute_query_hash({"chunks": chunks}) + + total = len(chunks) + completed = 0 + failed = 0 + has_partial_failure = False + + _update_progress( + cache_prefix, query_hash, + total=total, completed=0, failed=0, status="running", ttl=chunk_ttl, + ) + + effective_parallel = _effective_parallelism(parallel) + + if effective_parallel <= 1: + # --- Sequential path --- + for idx, chunk in enumerate(chunks): + if skip_cached and redis_chunk_exists(cache_prefix, query_hash, idx): + completed += 1 + logger.debug("chunk %d/%d cached, skipping", idx, total) + _update_progress( + cache_prefix, query_hash, + total=total, completed=completed, failed=failed, + has_partial_failure=has_partial_failure, ttl=chunk_ttl, + ) + continue + ok = _execute_single_chunk( + idx, chunk, query_fn, cache_prefix, query_hash, + chunk_ttl, max_rows_per_chunk, + ) + if ok: + completed += 1 + else: + failed += 1 + has_partial_failure = True + _update_progress( + cache_prefix, query_hash, + total=total, completed=completed, failed=failed, + has_partial_failure=has_partial_failure, ttl=chunk_ttl, + ) + else: + # --- Parallel path --- + completed, failed, has_partial_failure = _execute_parallel( + chunks, query_fn, cache_prefix, query_hash, + chunk_ttl, max_rows_per_chunk, skip_cached, + effective_parallel, + ) + + final_status = "completed" if failed == 0 else ("failed" if completed == 0 else "partial") + _update_progress( + cache_prefix, query_hash, + total=total, completed=completed, failed=failed, + status=final_status, + has_partial_failure=has_partial_failure, + ttl=chunk_ttl, + ) + + return query_hash + + +def _effective_parallelism(requested: int) -> int: + """Cap parallelism at ``min(requested, semaphore_available - 1)``. + + If semaphore is fully occupied, degrade to sequential (1). + """ + if requested <= 1: + return 1 + try: + from mes_dashboard.core.database import _get_slow_query_semaphore + sem = _get_slow_query_semaphore() + # threading.Semaphore doesn't expose available count directly; + # use a non-blocking acquire/release to estimate. + acquired = sem.acquire(blocking=False) + if not acquired: + logger.info("Semaphore fully occupied; degrading to sequential") + return 1 + sem.release() + # We got one permit, so at least 1 is available. + # Conservative cap: min(requested, available - 1) where available >= 1. + # Since we can't know exact available, just cap at requested. + return min(requested, 3) # hard ceiling to be safe + except Exception: + return 1 + + +def _execute_single_chunk( + idx: int, + chunk: Dict[str, Any], + query_fn: QueryFn, + cache_prefix: str, + query_hash: str, + chunk_ttl: int, + max_rows_per_chunk: Optional[int], +) -> bool: + """Run one chunk through *query_fn*, apply guards, store result. + + Returns True on success, False on failure. 
+ """ + try: + df = query_fn(chunk, max_rows_per_chunk=max_rows_per_chunk) + if df is None: + df = pd.DataFrame() + + # ---- Memory guard ---- + mem_bytes = df.memory_usage(deep=True).sum() + mem_mb = mem_bytes / (1024 * 1024) + if mem_mb > BATCH_CHUNK_MAX_MEMORY_MB: + logger.warning( + "Chunk %d memory %.1f MB exceeds limit %d MB — discarded", + idx, mem_mb, BATCH_CHUNK_MAX_MEMORY_MB, + ) + return False + + # ---- Truncation flag ---- + truncated = ( + max_rows_per_chunk is not None + and len(df) == max_rows_per_chunk + ) + if truncated: + logger.info("Chunk %d returned exactly max_rows_per_chunk=%d (truncated)", idx, max_rows_per_chunk) + + # ---- Store to Redis ---- + stored = redis_store_chunk(cache_prefix, query_hash, idx, df, ttl=chunk_ttl) + if not stored: + logger.warning( + "Chunk %d failed to persist into Redis, marking as failed", idx + ) + return False + + logger.debug( + "Chunk %d completed: %d rows, %.1f MB", + idx, len(df), mem_mb, + ) + return True + + except Exception as exc: + logger.error( + "Chunk %d failed: %s", idx, exc, exc_info=True, + ) + return False + + +def _execute_parallel( + chunks: List[Dict[str, Any]], + query_fn: QueryFn, + cache_prefix: str, + query_hash: str, + chunk_ttl: int, + max_rows_per_chunk: Optional[int], + skip_cached: bool, + max_workers: int, +) -> tuple: + """Execute chunks in parallel via ThreadPoolExecutor. + + Returns (completed, failed, has_partial_failure). + """ + total = len(chunks) + completed = 0 + failed = 0 + has_partial_failure = False + + futures = {} + with ThreadPoolExecutor(max_workers=max_workers) as executor: + for idx, chunk in enumerate(chunks): + if skip_cached and redis_chunk_exists(cache_prefix, query_hash, idx): + completed += 1 + continue + future = executor.submit( + _execute_single_chunk, + idx, chunk, query_fn, + cache_prefix, query_hash, chunk_ttl, max_rows_per_chunk, + ) + futures[future] = idx + + for future in as_completed(futures): + idx = futures[future] + try: + ok = future.result() + if ok: + completed += 1 + else: + failed += 1 + has_partial_failure = True + except Exception as exc: + logger.error("Chunk %d future error: %s", idx, exc) + failed += 1 + has_partial_failure = True + + _update_progress( + cache_prefix, query_hash, + total=total, completed=completed, failed=failed, + has_partial_failure=has_partial_failure, ttl=chunk_ttl, + ) + + return completed, failed, has_partial_failure + + +# ============================================================ +# 6. Merge / iterate chunks +# ============================================================ + + +def merge_chunks( + cache_prefix: str, + query_hash: str, + total: Optional[int] = None, + max_total_rows: Optional[int] = None, +) -> pd.DataFrame: + """Load all chunks from Redis and concatenate into one DataFrame. + + If *total* is not given, reads it from the progress metadata. + Missing chunks are skipped (``has_partial_failure`` semantics). 
+ """ + if total is None: + progress = get_batch_progress(cache_prefix, query_hash) + if progress: + total = int(progress.get("total", 0)) + else: + total = 0 + + dfs: List[pd.DataFrame] = [] + total_rows = 0 + for idx in range(total): + df = redis_load_chunk(cache_prefix, query_hash, idx) + if df is not None and not df.empty: + if max_total_rows is not None and total_rows >= max_total_rows: + logger.warning( + "merge_chunks reached max_total_rows=%d (prefix=%s, query_hash=%s)", + max_total_rows, + cache_prefix, + query_hash, + ) + break + if max_total_rows is not None: + remaining = max_total_rows - total_rows + if remaining <= 0: + break + if len(df) > remaining: + df = df.head(remaining).copy() + logger.warning( + "merge_chunks truncated chunk %d to %d rows (max_total_rows=%d)", + idx, + remaining, + max_total_rows, + ) + dfs.append(df) + total_rows += len(df) + + if not dfs: + return pd.DataFrame() + + return pd.concat(dfs, ignore_index=True) + + +def iterate_chunks( + cache_prefix: str, + query_hash: str, + total: Optional[int] = None, +) -> Generator[pd.DataFrame, None, None]: + """Yield chunk DataFrames one at a time (memory-friendly). + + Skips missing chunks. + """ + if total is None: + progress = get_batch_progress(cache_prefix, query_hash) + if progress: + total = int(progress.get("total", 0)) + else: + total = 0 + + for idx in range(total): + df = redis_load_chunk(cache_prefix, query_hash, idx) + if df is not None: + yield df + + +# ============================================================ +# 7. Convenience: should_use_engine? +# ============================================================ + + +def should_decompose_by_time(start_date: str, end_date: str) -> bool: + """Return True if the date range exceeds the threshold for engine use.""" + try: + dt_start = datetime.strptime(start_date, "%Y-%m-%d") + dt_end = datetime.strptime(end_date, "%Y-%m-%d") + return (dt_end - dt_start).days > BATCH_QUERY_TIME_THRESHOLD_DAYS + except (ValueError, TypeError): + return False + + +def should_decompose_by_ids(ids: List[Any]) -> bool: + """Return True if the ID list exceeds the threshold for engine use.""" + return len(ids) > BATCH_QUERY_ID_THRESHOLD diff --git a/src/mes_dashboard/services/hold_dataset_cache.py b/src/mes_dashboard/services/hold_dataset_cache.py index 1154b90..9e871d8 100644 --- a/src/mes_dashboard/services/hold_dataset_cache.py +++ b/src/mes_dashboard/services/hold_dataset_cache.py @@ -11,9 +11,7 @@ Cache layers: from __future__ import annotations -import base64 import hashlib -import io import json import logging from functools import lru_cache @@ -24,11 +22,7 @@ import pandas as pd from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache from mes_dashboard.core.database import read_sql_df_slow as read_sql_df -from mes_dashboard.core.redis_client import ( - REDIS_ENABLED, - get_key, - get_redis_client, -) +from mes_dashboard.core.redis_df_store import redis_load_df, redis_store_df from mes_dashboard.services.filter_cache import get_workcenter_group as _get_wc_group from mes_dashboard.services.hold_history_service import ( _clean_text, @@ -79,44 +73,16 @@ def _make_query_id(params: dict) -> str: # ============================================================ -# Redis L2 helpers (parquet <-> base64 string) +# Redis L2 helpers (delegated to shared redis_df_store) # ============================================================ -def _redis_key(query_id: str) -> str: - return get_key(f"{_REDIS_NAMESPACE}:{query_id}") - - def _redis_store_df(query_id: str, 
df: pd.DataFrame) -> None: - if not REDIS_ENABLED: - return - client = get_redis_client() - if client is None: - return - try: - buf = io.BytesIO() - df.to_parquet(buf, engine="pyarrow", index=False) - encoded = base64.b64encode(buf.getvalue()).decode("ascii") - client.setex(_redis_key(query_id), _CACHE_TTL, encoded) - except Exception as exc: - logger.warning("Failed to store DataFrame in Redis: %s", exc) + redis_store_df(f"{_REDIS_NAMESPACE}:{query_id}", df, ttl=_CACHE_TTL) def _redis_load_df(query_id: str) -> Optional[pd.DataFrame]: - if not REDIS_ENABLED: - return None - client = get_redis_client() - if client is None: - return None - try: - encoded = client.get(_redis_key(query_id)) - if encoded is None: - return None - raw = base64.b64decode(encoded) - return pd.read_parquet(io.BytesIO(raw), engine="pyarrow") - except Exception as exc: - logger.warning("Failed to load DataFrame from Redis: %s", exc) - return None + return redis_load_df(f"{_REDIS_NAMESPACE}:{query_id}") # ============================================================ @@ -164,11 +130,49 @@ def execute_primary_query( logger.info( "Hold dataset cache miss for query_id=%s, querying Oracle", query_id ) - sql = _load_sql("base_facts") - params = {"start_date": start_date, "end_date": end_date} - df = read_sql_df(sql, params) - if df is None: - df = pd.DataFrame() + + from mes_dashboard.services.batch_query_engine import ( + decompose_by_time_range, + execute_plan, + merge_chunks, + compute_query_hash, + should_decompose_by_time, + ) + + if should_decompose_by_time(start_date, end_date): + # --- Engine path for long date ranges --- + engine_chunks = decompose_by_time_range(start_date, end_date) + engine_hash = compute_query_hash( + {"start_date": start_date, "end_date": end_date} + ) + base_sql = _load_sql("base_facts") + + def _run_hold_chunk(chunk, max_rows_per_chunk=None): + params = { + "start_date": chunk["chunk_start"], + "end_date": chunk["chunk_end"], + } + result = read_sql_df(base_sql, params) + return result if result is not None else pd.DataFrame() + + logger.info( + "Engine activated for hold: %d chunks (query_id=%s)", + len(engine_chunks), query_id, + ) + execute_plan( + engine_chunks, _run_hold_chunk, + query_hash=engine_hash, + cache_prefix="hold", + chunk_ttl=_CACHE_TTL, + ) + df = merge_chunks("hold", engine_hash) + else: + # --- Direct path (short query) --- + sql = _load_sql("base_facts") + params = {"start_date": start_date, "end_date": end_date} + df = read_sql_df(sql, params) + if df is None: + df = pd.DataFrame() if not df.empty: df["_QUERY_START"] = pd.Timestamp(start_date) diff --git a/src/mes_dashboard/services/job_query_service.py b/src/mes_dashboard/services/job_query_service.py index 811d7d0..649b9e8 100644 --- a/src/mes_dashboard/services/job_query_service.py +++ b/src/mes_dashboard/services/job_query_service.py @@ -140,6 +140,9 @@ def _build_resource_filter_sql( # Query Functions # ============================================================ +_JOB_CACHE_TTL = 600 # 10 min for job query results + + def get_jobs_by_resources( resource_ids: List[str], start_date: str, @@ -147,6 +150,10 @@ def get_jobs_by_resources( ) -> Dict[str, Any]: """Query jobs for selected resources within date range. + For date ranges exceeding BATCH_QUERY_TIME_THRESHOLD_DAYS (default 60), + the query is decomposed into monthly chunks via BatchQueryEngine. + Results are cached in Redis to avoid redundant Oracle queries. 
+ Args: resource_ids: List of RESOURCEID values to query start_date: Start date in YYYY-MM-DD format @@ -165,22 +172,78 @@ def get_jobs_by_resources( return {'error': validation_error} try: - # Build resource filter - resource_filter, resource_params = _build_resource_filter_sql( - resource_ids, return_params=True + from mes_dashboard.services.batch_query_engine import ( + decompose_by_time_range, + execute_plan, + merge_chunks, + compute_query_hash, + should_decompose_by_time, ) + from mes_dashboard.core.redis_df_store import redis_load_df, redis_store_df - # Load SQL template - sql = SQLLoader.load("job_query/job_list") - sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter) + # Check Redis cache first + cache_hash = compute_query_hash({ + "resource_ids": sorted(resource_ids), + "start_date": start_date, + "end_date": end_date, + }) + cache_key = f"job_query:{cache_hash}" + cached_df = redis_load_df(cache_key) + if cached_df is not None: + logger.info("Job query cache hit (hash=%s)", cache_hash) + df = cached_df + elif should_decompose_by_time(start_date, end_date): + # --- Engine path for long date ranges --- + engine_chunks = decompose_by_time_range(start_date, end_date) - # Execute query - params = { - 'start_date': start_date, - 'end_date': end_date, - **resource_params, - } - df = read_sql_df(sql, params) + # Build resource filter once (reused across all chunks) + resource_filter, resource_params = _build_resource_filter_sql( + resource_ids, return_params=True + ) + sql = SQLLoader.load("job_query/job_list") + sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter) + + def _run_job_chunk(chunk, max_rows_per_chunk=None): + chunk_params = { + 'start_date': chunk['chunk_start'], + 'end_date': chunk['chunk_end'], + **resource_params, + } + result = read_sql_df(sql, chunk_params) + return result if result is not None else pd.DataFrame() + + logger.info( + "Engine activated for job query: %d chunks, %d resources", + len(engine_chunks), len(resource_ids), + ) + execute_plan( + engine_chunks, _run_job_chunk, + query_hash=cache_hash, + cache_prefix="job", + chunk_ttl=_JOB_CACHE_TTL, + ) + df = merge_chunks("job", cache_hash) + # Store merged result for fast re-access + if not df.empty: + redis_store_df(cache_key, df, ttl=_JOB_CACHE_TTL) + else: + # --- Direct path (short query) --- + resource_filter, resource_params = _build_resource_filter_sql( + resource_ids, return_params=True + ) + sql = SQLLoader.load("job_query/job_list") + sql = sql.replace("{{ RESOURCE_FILTER }}", resource_filter) + params = { + 'start_date': start_date, + 'end_date': end_date, + **resource_params, + } + df = read_sql_df(sql, params) + if df is None: + df = pd.DataFrame() + # Cache the result + if not df.empty: + redis_store_df(cache_key, df, ttl=_JOB_CACHE_TTL) # Convert to records data = [] diff --git a/src/mes_dashboard/services/mid_section_defect_service.py b/src/mes_dashboard/services/mid_section_defect_service.py index 37933ca..c48f46f 100644 --- a/src/mes_dashboard/services/mid_section_defect_service.py +++ b/src/mes_dashboard/services/mid_section_defect_service.py @@ -56,8 +56,8 @@ from mes_dashboard.config.workcenter_groups import WORKCENTER_GROUPS, get_group_ logger = logging.getLogger('mes_dashboard.mid_section_defect') -# Constants -MAX_QUERY_DAYS = 365 +# Constants +MAX_QUERY_DAYS = 365 CACHE_TTL_DETECTION = 300 # 5 min for detection data CACHE_TTL_LOSS_REASONS = 86400 # 24h for loss reason list (daily sync) @@ -610,11 +610,11 @@ def query_analysis_detail( } -def query_all_loss_reasons() -> 
Optional[Dict[str, Any]]: - """Get all loss reasons (cached daily in Redis). - - Lightweight query: DISTINCT LOSSREASONNAME from last 365 days. - Cached with 24h TTL — suitable for dropdown population on page load. +def query_all_loss_reasons() -> Optional[Dict[str, Any]]: + """Get all loss reasons (cached daily in Redis). + + Lightweight query: DISTINCT LOSSREASONNAME from last 365 days. + Cached with 24h TTL — suitable for dropdown population on page load. Returns: Dict with 'loss_reasons' list, or None on failure. @@ -861,7 +861,12 @@ def _fetch_station_detection_data( end_date: str, station: str = '測試', ) -> Optional[pd.DataFrame]: - """Execute station_detection.sql and return raw DataFrame.""" + """Execute station_detection.sql and return raw DataFrame. + + For date ranges exceeding BATCH_QUERY_TIME_THRESHOLD_DAYS (default 60), + the query is decomposed into monthly chunks via BatchQueryEngine to + prevent Oracle timeout on high-volume stations. + """ cache_key = make_cache_key( "mid_section_detection", filters={ @@ -885,16 +890,58 @@ def _fetch_station_detection_data( STATION_FILTER=wip_filter, STATION_FILTER_REJECTS=rej_filter, ) - bind_params = { - 'start_date': start_date, - 'end_date': end_date, - **wip_params, - **rej_params, - } - df = read_sql_df(sql, bind_params) - if df is None: - logger.error("Station detection query returned None (station=%s)", station) - return None + + from mes_dashboard.services.batch_query_engine import ( + decompose_by_time_range, + execute_plan, + merge_chunks, + compute_query_hash, + should_decompose_by_time, + ) + + if should_decompose_by_time(start_date, end_date): + # --- Engine path for long date ranges --- + engine_chunks = decompose_by_time_range(start_date, end_date) + engine_hash = compute_query_hash({ + "station": station, + "start_date": start_date, + "end_date": end_date, + }) + + def _run_detection_chunk(chunk, max_rows_per_chunk=None): + chunk_params = { + 'start_date': chunk['chunk_start'], + 'end_date': chunk['chunk_end'], + **wip_params, + **rej_params, + } + result = read_sql_df(sql, chunk_params) + return result if result is not None else pd.DataFrame() + + logger.info( + "Engine activated for detection (%s): %d chunks", + station, len(engine_chunks), + ) + execute_plan( + engine_chunks, _run_detection_chunk, + query_hash=engine_hash, + cache_prefix="msd_detect", + chunk_ttl=CACHE_TTL_DETECTION, + ) + df = merge_chunks("msd_detect", engine_hash) + else: + # --- Direct path (short query) --- + bind_params = { + 'start_date': start_date, + 'end_date': end_date, + **wip_params, + **rej_params, + } + df = read_sql_df(sql, bind_params) + if df is None: + logger.error("Station detection query returned None (station=%s)", station) + return None + logger.info( "Station detection (%s): %d rows, %d unique lots", station, diff --git a/src/mes_dashboard/services/query_tool_service.py b/src/mes_dashboard/services/query_tool_service.py index 86e5f82..2e97a2f 100644 --- a/src/mes_dashboard/services/query_tool_service.py +++ b/src/mes_dashboard/services/query_tool_service.py @@ -40,13 +40,13 @@ except ImportError: logger = logging.getLogger('mes_dashboard.query_tool') # Constants -BATCH_SIZE = 1000 # Oracle IN clause limit -MAX_LOT_IDS = 100 -MAX_SERIAL_NUMBERS = 100 -MAX_WORK_ORDERS = 50 -MAX_GD_WORK_ORDERS = 100 -MAX_EQUIPMENTS = 20 -MAX_DATE_RANGE_DAYS = 365 +BATCH_SIZE = 1000 # Oracle IN clause limit +MAX_LOT_IDS = 100 +MAX_SERIAL_NUMBERS = 100 +MAX_WORK_ORDERS = 50 +MAX_GD_WORK_ORDERS = 100 +MAX_EQUIPMENTS = 20 +MAX_DATE_RANGE_DAYS = 365 
DEFAULT_TIME_WINDOW_HOURS = 168 # 1 week for better PJ_TYPE detection ADJACENT_LOTS_COUNT = 3 @@ -102,14 +102,14 @@ def validate_lot_input(input_type: str, values: List[str]) -> Optional[str]: if not values: return '請輸入至少一個查詢條件' - limits = { - 'lot_id': MAX_LOT_IDS, - 'wafer_lot': MAX_LOT_IDS, - 'gd_lot_id': MAX_LOT_IDS, - 'serial_number': MAX_SERIAL_NUMBERS, - 'work_order': MAX_WORK_ORDERS, - 'gd_work_order': MAX_GD_WORK_ORDERS, - } + limits = { + 'lot_id': MAX_LOT_IDS, + 'wafer_lot': MAX_LOT_IDS, + 'gd_lot_id': MAX_LOT_IDS, + 'serial_number': MAX_SERIAL_NUMBERS, + 'work_order': MAX_WORK_ORDERS, + 'gd_work_order': MAX_GD_WORK_ORDERS, + } limit = limits.get(input_type, MAX_LOT_IDS) if len(values) > limit: @@ -385,7 +385,7 @@ def _resolve_by_lot_id(lot_ids: List[str]) -> Dict[str, Any]: CONTAINER_FILTER=builder.get_conditions_sql(), ) - df = read_sql_df(sql, builder.params) + df = read_sql_df_slow(sql, builder.params) data = _df_to_records(df) matched, not_found, expansion_info = _match_rows_by_tokens( lot_ids, @@ -424,7 +424,7 @@ def _resolve_by_wafer_lot(wafer_lots: List[str]) -> Dict[str, Any]: WAFER_FILTER=builder.get_conditions_sql(), ) - df = read_sql_df(sql, builder.params) + df = read_sql_df_slow(sql, builder.params) data = _df_to_records(df) matched, not_found, expansion_info = _match_rows_by_tokens( wafer_lots, @@ -482,7 +482,7 @@ def _resolve_by_gd_lot_id(gd_lot_ids: List[str]) -> Dict[str, Any]: CONTAINER_FILTER=builder.get_conditions_sql(), ) - df = read_sql_df(sql, builder.params) + df = read_sql_df_slow(sql, builder.params) data = _df_to_records(df) matched, not_found, expansion_info = _match_rows_by_tokens( gd_lot_ids, @@ -574,7 +574,7 @@ def _resolve_by_serial_number(serial_numbers: List[str]) -> Dict[str, Any]: config['sql_name'], **{config['filter_key']: builder.get_conditions_sql()}, ) - df = read_sql_df(sql, builder.params) + df = read_sql_df_slow(sql, builder.params) data = _df_to_records(df) matched, _, _ = _match_rows_by_tokens( tokens, @@ -660,7 +660,7 @@ def _resolve_by_work_order(work_orders: List[str]) -> Dict[str, Any]: WORK_ORDER_FILTER=builder.get_conditions_sql(), ) - df = read_sql_df(sql, builder.params) + df = read_sql_df_slow(sql, builder.params) data = _df_to_records(df) matched, not_found, expansion_info = _match_rows_by_tokens( work_orders, @@ -703,7 +703,7 @@ def _resolve_by_gd_work_order(work_orders: List[str]) -> Dict[str, Any]: WORK_ORDER_FILTER=builder.get_conditions_sql(), ) - df = read_sql_df(sql, builder.params) + df = read_sql_df_slow(sql, builder.params) data = _df_to_records(df) matched, not_found, expansion_info = _match_rows_by_tokens( work_orders, @@ -853,7 +853,7 @@ def get_adjacent_lots( 'time_window_hours': time_window_hours, } - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) logger.debug(f"Adjacent lots: {len(data)} records for {equipment_id}") @@ -1127,11 +1127,8 @@ def get_lot_split_merge_history( f"Starting split/merge history query for MFGORDERNAME={work_order} mode={mode}" ) - if full_history: - # Full mode uses dedicated slow query timeout path. - df = read_sql_df_slow(sql, params) - else: - df = read_sql_df(sql, params) + # Both modes use slow query path for timeout protection. 
+ df = read_sql_df_slow(sql, params) data = _df_to_records(df) # Process records for display @@ -1209,7 +1206,7 @@ def _get_mfg_order_for_lot(container_id: str) -> Optional[str]: WHERE CONTAINERID = :container_id AND MFGORDERNAME IS NOT NULL """ - df = read_sql_df(sql, {'container_id': container_id}) + df = read_sql_df_slow(sql, {'container_id': container_id}) if not df.empty: return df.iloc[0]['MFGORDERNAME'] return None @@ -1304,7 +1301,7 @@ def get_lot_splits( sql = SQLLoader.load("query_tool/lot_splits") params = {'container_id': container_id} - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) # Group by FINISHEDNAME to show combined structure @@ -1395,7 +1392,7 @@ def get_lot_jobs( 'time_end': end, } - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) logger.debug(f"LOT jobs: {len(data)} records for {equipment_id}") @@ -1452,7 +1449,7 @@ def get_lot_jobs_with_history( 'time_end': end, } - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) logger.debug( @@ -1503,16 +1500,33 @@ def get_equipment_status_hours( return {'error': validation_error} try: - builder = QueryBuilder() - builder.add_in_condition("r.RESOURCEID", equipment_ids) - sql = SQLLoader.load_with_params( - "query_tool/equipment_status_hours", - EQUIPMENT_FILTER=builder.get_conditions_sql(), - ) + from mes_dashboard.services.batch_query_engine import compute_query_hash + from mes_dashboard.core.redis_df_store import redis_load_df, redis_store_df + + cache_hash = compute_query_hash({ + "fn": "equipment_status_hours", + "equipment_ids": sorted(equipment_ids), + "start_date": start_date, + "end_date": end_date, + }) + cache_key = f"qt:equip_status:{cache_hash}" + cached_df = redis_load_df(cache_key) + + if cached_df is not None: + df = cached_df + else: + builder = QueryBuilder() + builder.add_in_condition("r.RESOURCEID", equipment_ids) + sql = SQLLoader.load_with_params( + "query_tool/equipment_status_hours", + EQUIPMENT_FILTER=builder.get_conditions_sql(), + ) + params = {'start_date': start_date, 'end_date': end_date} + params.update(builder.params) + df = read_sql_df_slow(sql, params) + if df is not None and not df.empty: + redis_store_df(cache_key, df, ttl=300) - params = {'start_date': start_date, 'end_date': end_date} - params.update(builder.params) - df = read_sql_df(sql, params) data = _df_to_records(df) # Calculate totals @@ -1584,7 +1598,7 @@ def get_equipment_lots( params = {'start_date': start_date, 'end_date': end_date} params.update(builder.params) - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) logger.info(f"Equipment lots: {len(data)} records") @@ -1634,7 +1648,7 @@ def get_equipment_materials( params = {'start_date': start_date, 'end_date': end_date} params.update(builder.params) - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) logger.info(f"Equipment materials: {len(data)} records") @@ -1684,7 +1698,7 @@ def get_equipment_rejects( params = {'start_date': start_date, 'end_date': end_date} params.update(builder.params) - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) logger.info(f"Equipment rejects: {len(data)} records") @@ -1736,7 +1750,7 @@ def get_equipment_jobs( params = {'start_date': start_date, 'end_date': end_date} params.update(builder.params) - df = read_sql_df(sql, params) + df = read_sql_df_slow(sql, params) data = _df_to_records(df) 
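`get_equipment_status_hours` above leans on `compute_query_hash` and the shared `redis_df_store` helpers that this change introduces but does not show in full. A compact sketch follows, assuming the shared module keeps the parquet→base64→SETEX shape of the per-service copies removed later in this diff, and that `compute_query_hash` is a short SHA-256 over sorted JSON (both are assumptions, not the shipped implementation; the real helpers also log and swallow serialization errors rather than raising):

```python
# Sketch only; the real helpers live in mes_dashboard/core/redis_df_store.py
# and mes_dashboard/services/batch_query_engine.py.
import base64
import hashlib
import io
import json
from typing import Any, Dict, Optional

import pandas as pd

from mes_dashboard.core.redis_client import REDIS_ENABLED, get_key, get_redis_client


def compute_query_hash(params: Dict[str, Any]) -> str:
    """Stable digest of query parameters, suitable as a cache-key suffix."""
    payload = json.dumps(params, sort_keys=True, ensure_ascii=False, default=str)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]


def redis_store_df(key: str, df: pd.DataFrame, ttl: int = 900) -> None:
    """Serialize the DataFrame to parquet, base64-encode it, and SETEX it with a TTL."""
    if not REDIS_ENABLED:
        return
    client = get_redis_client()
    if client is None:
        return
    buf = io.BytesIO()
    df.to_parquet(buf, engine="pyarrow", index=False)
    client.setex(get_key(key), ttl, base64.b64encode(buf.getvalue()).decode("ascii"))


def redis_load_df(key: str) -> Optional[pd.DataFrame]:
    """Inverse of redis_store_df; returns None on miss or when Redis is unavailable."""
    if not REDIS_ENABLED:
        return None
    client = get_redis_client()
    if client is None:
        return None
    encoded = client.get(get_key(key))
    if encoded is None:
        return None
    return pd.read_parquet(io.BytesIO(base64.b64decode(encoded)), engine="pyarrow")
```

Callers pass an un-prefixed logical key (for example `qt:equip_status:{hash}`); the application prefix is applied once inside the helper via `get_key`, which matches how the dataset caches delegate to the shared module elsewhere in this diff.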
logger.info(f"Equipment jobs: {len(data)} records") diff --git a/src/mes_dashboard/services/reject_dataset_cache.py b/src/mes_dashboard/services/reject_dataset_cache.py index bba0924..4e3568e 100644 --- a/src/mes_dashboard/services/reject_dataset_cache.py +++ b/src/mes_dashboard/services/reject_dataset_cache.py @@ -11,22 +11,27 @@ Cache layers: from __future__ import annotations -import base64 -import hashlib -import io -import json -import logging -from typing import Any, Dict, List, Optional +import hashlib +import json +import logging +import os +from typing import Any, Dict, List, Optional import pandas as pd -from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache -from mes_dashboard.core.database import read_sql_df_slow as read_sql_df -from mes_dashboard.core.redis_client import ( - REDIS_ENABLED, - get_key, - get_redis_client, -) +from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache +from mes_dashboard.core.database import read_sql_df_slow as read_sql_df +from mes_dashboard.core.query_spool_store import ( + clear_spooled_df, + load_spooled_df, + store_spooled_df, +) +from mes_dashboard.core.redis_client import get_key, get_redis_client +from mes_dashboard.core.redis_df_store import ( + redis_clear_batch, + redis_load_df, + redis_store_df, +) from mes_dashboard.services.filter_cache import get_specs_for_groups from mes_dashboard.services.reject_history_service import ( _as_float, @@ -54,6 +59,26 @@ _CACHE_TTL = 900 # 15 minutes _CACHE_MAX_SIZE = 8 _REDIS_NAMESPACE = "reject_dataset" _CACHE_SCHEMA_VERSION = 4 +_REJECT_ENGINE_GRAIN_DAYS = max(1, int(os.getenv("REJECT_ENGINE_GRAIN_DAYS", "10"))) +_REJECT_ENGINE_PARALLEL = max(1, int(os.getenv("REJECT_ENGINE_PARALLEL", "2"))) +_REJECT_ENGINE_MAX_ROWS_PER_CHUNK = max( + 1, int(os.getenv("REJECT_ENGINE_MAX_ROWS_PER_CHUNK", "50000")) +) +_REJECT_ENGINE_MAX_TOTAL_ROWS = max( + 1, int(os.getenv("REJECT_ENGINE_MAX_TOTAL_ROWS", "300000")) +) +_REJECT_ENGINE_SPILL_ENABLED = os.getenv("REJECT_ENGINE_SPILL_ENABLED", "true").strip().lower() in { + "1", + "true", + "yes", + "on", +} +_REJECT_ENGINE_MAX_RESULT_MB = max( + 1, int(os.getenv("REJECT_ENGINE_MAX_RESULT_MB", "64")) +) +_REJECT_ENGINE_SPOOL_TTL_SECONDS = max( + 300, int(os.getenv("REJECT_ENGINE_SPOOL_TTL_SECONDS", "21600")) +) _dataset_cache = ProcessLevelCache(ttl_seconds=_CACHE_TTL, max_size=_CACHE_MAX_SIZE) register_process_cache("reject_dataset", _dataset_cache, "Reject Dataset (L1, 15min)") @@ -71,66 +96,96 @@ def _make_query_id(params: dict) -> str: # ============================================================ -# Redis L2 helpers (parquet ↔ base64 string) +# Redis L2 helpers (delegated to shared redis_df_store) # ============================================================ -def _redis_key(query_id: str) -> str: - return get_key(f"{_REDIS_NAMESPACE}:{query_id}") - - def _redis_store_df(query_id: str, df: pd.DataFrame) -> None: - if not REDIS_ENABLED: - return - client = get_redis_client() - if client is None: - return - try: - buf = io.BytesIO() - df.to_parquet(buf, engine="pyarrow", index=False) - encoded = base64.b64encode(buf.getvalue()).decode("ascii") - client.setex(_redis_key(query_id), _CACHE_TTL, encoded) - except Exception as exc: - logger.warning("Failed to store DataFrame in Redis: %s", exc) + redis_store_df(f"{_REDIS_NAMESPACE}:{query_id}", df, ttl=_CACHE_TTL) -def _redis_load_df(query_id: str) -> Optional[pd.DataFrame]: - if not REDIS_ENABLED: - return None - client = get_redis_client() - if client is None: - return None - 
try: - encoded = client.get(_redis_key(query_id)) - if encoded is None: - return None - raw = base64.b64decode(encoded) - return pd.read_parquet(io.BytesIO(raw), engine="pyarrow") - except Exception as exc: - logger.warning("Failed to load DataFrame from Redis: %s", exc) - return None - - -# ============================================================ -# Cache read (L1 → L2 → None) -# ============================================================ - - -def _get_cached_df(query_id: str) -> Optional[pd.DataFrame]: - """Read cache: L1 hit → return, L1 miss → L2 → write L1 → return.""" - df = _dataset_cache.get(query_id) - if df is not None: - return df - df = _redis_load_df(query_id) - if df is not None: - _dataset_cache.set(query_id, df) - return df - - -def _store_df(query_id: str, df: pd.DataFrame) -> None: - """Write to L1 and L2.""" - _dataset_cache.set(query_id, df) - _redis_store_df(query_id, df) +def _redis_load_df(query_id: str) -> Optional[pd.DataFrame]: + return redis_load_df(f"{_REDIS_NAMESPACE}:{query_id}") + + +def _redis_delete_df(query_id: str) -> None: + client = get_redis_client() + if client is None: + return + try: + client.delete(get_key(f"{_REDIS_NAMESPACE}:{query_id}")) + except Exception: + return + + +# ============================================================ +# Cache read (L1 → L2 → None) +# ============================================================ + + +def _get_cached_df(query_id: str) -> Optional[pd.DataFrame]: + """Read cache: L1 hit → L2 hit → spool fallback.""" + df = _dataset_cache.get(query_id) + if df is not None: + return df + + df = _redis_load_df(query_id) + if df is not None: + _dataset_cache.set(query_id, df) + return df + + df = load_spooled_df(_REDIS_NAMESPACE, query_id) + if df is not None: + # Keep large payload out of L1 cache to avoid worker RSS spikes. 
+ df_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + if df_mb <= min(float(_REJECT_ENGINE_MAX_RESULT_MB), 32.0): + _dataset_cache.set(query_id, df) + return df + return None + + +def _store_df(query_id: str, df: pd.DataFrame) -> None: + """Write to L1 and L2.""" + _dataset_cache.set(query_id, df) + _redis_store_df(query_id, df) + clear_spooled_df(_REDIS_NAMESPACE, query_id) + + +def _store_query_result(query_id: str, df: pd.DataFrame) -> None: + """Store result using Redis for small sets and parquet spill for large sets.""" + if df is None or df.empty: + return + + df_mb = df.memory_usage(deep=True).sum() / (1024 * 1024) + should_spill = _REJECT_ENGINE_SPILL_ENABLED and ( + len(df) >= _REJECT_ENGINE_MAX_TOTAL_ROWS or df_mb >= _REJECT_ENGINE_MAX_RESULT_MB + ) + + if should_spill: + spilled = store_spooled_df( + _REDIS_NAMESPACE, + query_id, + df, + ttl_seconds=_REJECT_ENGINE_SPOOL_TTL_SECONDS, + ) + if spilled: + _dataset_cache.invalidate(query_id) + _redis_delete_df(query_id) + logger.info( + "Stored query result via parquet spill (query_id=%s, rows=%d, size_mb=%.1f)", + query_id, + len(df), + df_mb, + ) + return + logger.warning( + "Parquet spill failed, fallback to dataset cache (query_id=%s, rows=%d, size_mb=%.1f)", + query_id, + len(df), + df_mb, + ) + + _store_df(query_id, df) # ============================================================ @@ -197,6 +252,7 @@ def execute_primary_query( base_params: Dict[str, Any] = {} resolution_info: Optional[Dict[str, Any]] = None workflow_filter: str = "" # empty = use default date-based filter + container_ids: List[str] = [] # populated in container mode if mode == "date_range": if not start_date or not end_date: @@ -253,13 +309,13 @@ def execute_primary_query( ) # ---- Compute query_id from base params only (policy filters applied in-memory) ---- - query_id_input = { - "cache_schema_version": _CACHE_SCHEMA_VERSION, - "mode": mode, - "start_date": start_date, - "end_date": end_date, - "container_input_type": container_input_type, - "container_values": sorted(container_values or []), + query_id_input = { + "cache_schema_version": _CACHE_SCHEMA_VERSION, + "mode": mode, + "start_date": start_date, + "end_date": end_date, + "container_input_type": container_input_type, + "container_values": sorted(container_values or []), } query_id = _make_query_id(query_id_input) @@ -279,24 +335,134 @@ def execute_primary_query( # ---- Execute Oracle query (NO policy filters — cache unfiltered) ---- logger.info("Dataset cache miss for query_id=%s, querying Oracle", query_id) - sql = _prepare_sql( - "list", - where_clause="", - base_variant="lot", - base_where=base_where, - workflow_filter=workflow_filter, + + # Decide whether to route through BatchQueryEngine + from mes_dashboard.services.batch_query_engine import ( + decompose_by_time_range, + decompose_by_ids, + execute_plan, + merge_chunks, + compute_query_hash, + should_decompose_by_time, + should_decompose_by_ids, + BATCH_QUERY_TIME_THRESHOLD_DAYS, ) - all_params = {**base_params, "offset": 0, "limit": 999999999} - df = read_sql_df(sql, all_params) - if df is None: - df = pd.DataFrame() - # ---- Cache unfiltered, return filtered ---- - if not df.empty: - _store_df(query_id, df) + use_engine = False + engine_chunks: Optional[list] = None + engine_parallel = 1 + engine_hash: Optional[str] = None + + if mode == "date_range" and should_decompose_by_time(start_date, end_date): + engine_chunks = decompose_by_time_range( + start_date, + end_date, + grain_days=_REJECT_ENGINE_GRAIN_DAYS, + ) + engine_parallel = 
_REJECT_ENGINE_PARALLEL + use_engine = True + logger.info( + "Engine activated for date_range: %d chunks (query_id=%s, grain_days=%d, parallel=%d)", + len(engine_chunks), query_id, _REJECT_ENGINE_GRAIN_DAYS, engine_parallel, + ) + elif mode == "container" and should_decompose_by_ids(container_ids): + id_batches = decompose_by_ids(container_ids) + engine_chunks = [{"ids": batch} for batch in id_batches] + use_engine = True + logger.info( + "Engine activated for container IDs: %d batches (query_id=%s)", + len(engine_chunks), query_id, + ) - filtered = _apply_policy_filters( - df, + if use_engine and engine_chunks: + # --- Engine path --- + engine_hash = compute_query_hash(query_id_input) + redis_clear_batch("reject", engine_hash) + + def _run_reject_chunk(chunk, max_rows_per_chunk=None): + """Execute one chunk of the reject query via read_sql_df_slow.""" + chunk_where_parts: List[str] = [] + chunk_params: Dict[str, Any] = {} + chunk_wf_filter = "" + + if "chunk_start" in chunk: + # Time-range chunk + chunk_where_parts.append( + "r.TXNDATE >= TO_DATE(:start_date, 'YYYY-MM-DD')" + " AND r.TXNDATE < TO_DATE(:end_date, 'YYYY-MM-DD') + 1" + ) + chunk_params["start_date"] = chunk["chunk_start"] + chunk_params["end_date"] = chunk["chunk_end"] + elif "ids" in chunk: + # ID-batch chunk + b = QueryBuilder() + b.add_in_condition("r.CONTAINERID", chunk["ids"]) + cid_w, cid_p = b.build_where_only() + cid_c = cid_w.strip() + if cid_c.upper().startswith("WHERE "): + cid_c = cid_c[6:].strip() + chunk_where_parts.append(cid_c) + chunk_params.update(cid_p) + # Workflow filter for container mode + wfb = QueryBuilder() + wfb.add_in_condition("r0.CONTAINERID", chunk["ids"]) + wf_w, _ = wfb.build_where_only() + wf_c = wf_w.strip() + if wf_c.upper().startswith("WHERE "): + wf_c = wf_c[6:].strip() + chunk_wf_filter = wf_c + + chunk_where = " AND ".join(chunk_where_parts) + chunk_sql = _prepare_sql( + "list", + where_clause="", + base_variant="lot", + base_where=chunk_where, + workflow_filter=chunk_wf_filter, + ) + limit = max_rows_per_chunk if max_rows_per_chunk else 500000 + chunk_params["offset"] = 0 + chunk_params["limit"] = limit + result = read_sql_df(chunk_sql, chunk_params) + return result if result is not None else pd.DataFrame() + + execute_plan( + engine_chunks, + _run_reject_chunk, + parallel=engine_parallel, + skip_cached=False, + query_hash=engine_hash, + cache_prefix="reject", + chunk_ttl=_CACHE_TTL, + max_rows_per_chunk=_REJECT_ENGINE_MAX_ROWS_PER_CHUNK, + ) + df = merge_chunks( + "reject", + engine_hash, + max_total_rows=_REJECT_ENGINE_MAX_TOTAL_ROWS, + ) + else: + # --- Direct path (short query, no engine overhead) --- + sql = _prepare_sql( + "list", + where_clause="", + base_variant="lot", + base_where=base_where, + workflow_filter=workflow_filter, + ) + all_params = {**base_params, "offset": 0, "limit": 500000} + df = read_sql_df(sql, all_params) + if df is None: + df = pd.DataFrame() + + # ---- Cache unfiltered, return filtered ---- + if not df.empty: + _store_query_result(query_id, df) + if engine_hash: + redis_clear_batch("reject", engine_hash) + + filtered = _apply_policy_filters( + df, include_excluded_scrap=include_excluded_scrap, exclude_material_scrap=exclude_material_scrap, exclude_pb_diode=exclude_pb_diode, @@ -387,23 +553,23 @@ def _build_primary_response( # ============================================================ -def apply_view( - *, - query_id: str, - packages: Optional[List[str]] = None, - workcenter_groups: Optional[List[str]] = None, +def apply_view( + *, + query_id: str, + 
packages: Optional[List[str]] = None, + workcenter_groups: Optional[List[str]] = None, reason: Optional[str] = None, - metric_filter: str = "all", - trend_dates: Optional[List[str]] = None, - detail_reason: Optional[str] = None, - pareto_dimension: Optional[str] = None, - pareto_values: Optional[List[str]] = None, - pareto_selections: Optional[Dict[str, List[str]]] = None, - page: int = 1, - per_page: int = 50, - include_excluded_scrap: bool = False, - exclude_material_scrap: bool = True, - exclude_pb_diode: bool = True, + metric_filter: str = "all", + trend_dates: Optional[List[str]] = None, + detail_reason: Optional[str] = None, + pareto_dimension: Optional[str] = None, + pareto_values: Optional[List[str]] = None, + pareto_selections: Optional[Dict[str, List[str]]] = None, + page: int = 1, + per_page: int = 50, + include_excluded_scrap: bool = False, + exclude_material_scrap: bool = True, + exclude_pb_diode: bool = True, ) -> Optional[Dict[str, Any]]: """Read cache → apply filters → return derived data. Returns None if expired.""" df = _get_cached_df(query_id) @@ -439,18 +605,18 @@ def apply_view( detail_df = detail_df[ detail_df["TXN_DAY"].apply(lambda d: _to_date_str(d) in date_set) ] - if detail_reason: - detail_df = detail_df[ - detail_df["LOSSREASONNAME"].str.strip() == detail_reason.strip() - ] - detail_df = _apply_pareto_selection_filter( - detail_df, - pareto_dimension=pareto_dimension, - pareto_values=pareto_values, - pareto_selections=pareto_selections, - ) - - detail_page = _paginate_detail(detail_df, page=page, per_page=per_page) + if detail_reason: + detail_df = detail_df[ + detail_df["LOSSREASONNAME"].str.strip() == detail_reason.strip() + ] + detail_df = _apply_pareto_selection_filter( + detail_df, + pareto_dimension=pareto_dimension, + pareto_values=pareto_values, + pareto_selections=pareto_selections, + ) + + detail_page = _paginate_detail(detail_df, page=page, per_page=per_page) return { "analytics_raw": analytics_raw, @@ -459,7 +625,7 @@ def apply_view( } -def _apply_supplementary_filters( +def _apply_supplementary_filters( df: pd.DataFrame, *, packages: Optional[List[str]] = None, @@ -496,69 +662,69 @@ def _apply_supplementary_filters( elif metric_filter == "defect" and "DEFECT_QTY" in df.columns: mask &= df["DEFECT_QTY"] > 0 - return df[mask] - - -def _normalize_pareto_values(values: Optional[List[str]]) -> List[str]: - normalized: List[str] = [] - seen = set() - for value in values or []: - item = _normalize_text(value) - if not item or item in seen: - continue - seen.add(item) - normalized.append(item) - return normalized - - -def _apply_pareto_selection_filter( - df: pd.DataFrame, - *, - pareto_dimension: Optional[str] = None, - pareto_values: Optional[List[str]] = None, - pareto_selections: Optional[Dict[str, List[str]]] = None, -) -> pd.DataFrame: - """Apply Pareto multi-select filters on detail/export datasets.""" - if df is None or df.empty: - return df - - normalized_selections = _normalize_pareto_selections(pareto_selections) - if normalized_selections: - filtered = df - for dim in _PARETO_DIMENSIONS: - selected_values = normalized_selections.get(dim) - if not selected_values: - continue - dim_col = _DIM_TO_DF_COLUMN.get(dim) - if not dim_col: - raise ValueError(f"不支援的 pareto_dimension: {dim}") - if dim_col not in filtered.columns: - return filtered.iloc[0:0] - value_set = set(selected_values) - normalized_dimension_values = filtered[dim_col].map( - lambda value: _normalize_text(value) or "(未知)" - ) - filtered = 
filtered[normalized_dimension_values.isin(value_set)] - if filtered.empty: - return filtered - return filtered - - normalized_values = _normalize_pareto_values(pareto_values) - if not normalized_values: - return df - - dimension = _normalize_text(pareto_dimension).lower() or "reason" - dim_col = _DIM_TO_DF_COLUMN.get(dimension) - if not dim_col: - raise ValueError(f"不支援的 pareto_dimension: {pareto_dimension}") - if dim_col not in df.columns: - return df.iloc[0:0] - - value_set = set(normalized_values) - normalized_dimension_values = df[dim_col].map( - lambda value: _normalize_text(value) or "(未知)" - ) - return df[normalized_dimension_values.isin(value_set)] + return df[mask] + + +def _normalize_pareto_values(values: Optional[List[str]]) -> List[str]: + normalized: List[str] = [] + seen = set() + for value in values or []: + item = _normalize_text(value) + if not item or item in seen: + continue + seen.add(item) + normalized.append(item) + return normalized + + +def _apply_pareto_selection_filter( + df: pd.DataFrame, + *, + pareto_dimension: Optional[str] = None, + pareto_values: Optional[List[str]] = None, + pareto_selections: Optional[Dict[str, List[str]]] = None, +) -> pd.DataFrame: + """Apply Pareto multi-select filters on detail/export datasets.""" + if df is None or df.empty: + return df + + normalized_selections = _normalize_pareto_selections(pareto_selections) + if normalized_selections: + filtered = df + for dim in _PARETO_DIMENSIONS: + selected_values = normalized_selections.get(dim) + if not selected_values: + continue + dim_col = _DIM_TO_DF_COLUMN.get(dim) + if not dim_col: + raise ValueError(f"不支援的 pareto_dimension: {dim}") + if dim_col not in filtered.columns: + return filtered.iloc[0:0] + value_set = set(selected_values) + normalized_dimension_values = filtered[dim_col].map( + lambda value: _normalize_text(value) or "(未知)" + ) + filtered = filtered[normalized_dimension_values.isin(value_set)] + if filtered.empty: + return filtered + return filtered + + normalized_values = _normalize_pareto_values(pareto_values) + if not normalized_values: + return df + + dimension = _normalize_text(pareto_dimension).lower() or "reason" + dim_col = _DIM_TO_DF_COLUMN.get(dimension) + if not dim_col: + raise ValueError(f"不支援的 pareto_dimension: {pareto_dimension}") + if dim_col not in df.columns: + return df.iloc[0:0] + + value_set = set(normalized_values) + normalized_dimension_values = df[dim_col].map( + lambda value: _normalize_text(value) or "(未知)" + ) + return df[normalized_dimension_values.isin(value_set)] # ============================================================ @@ -785,193 +951,193 @@ def _extract_available_filters(df: pd.DataFrame) -> dict: # ============================================================ # Dimension → DF column mapping (matches _DIMENSION_COLUMN_MAP in reject_history_service) -_DIM_TO_DF_COLUMN = { - "reason": "LOSSREASONNAME", - "package": "PRODUCTLINENAME", - "type": "PJ_TYPE", - "workflow": "WORKFLOWNAME", - "workcenter": "WORKCENTER_GROUP", - "equipment": "PRIMARY_EQUIPMENTNAME", -} -_PARETO_DIMENSIONS = tuple(_DIM_TO_DF_COLUMN.keys()) -_PARETO_TOP20_DIMENSIONS = {"type", "workflow", "equipment"} - - -def _normalize_metric_mode(metric_mode: str) -> str: - mode = _normalize_text(metric_mode).lower() - if mode not in {"reject_total", "defect"}: - raise ValueError("Invalid metric_mode, supported: reject_total, defect") - return mode - - -def _normalize_pareto_scope(pareto_scope: str) -> str: - scope = _normalize_text(pareto_scope).lower() or "top80" - if scope not in 
{"top80", "all"}: - raise ValueError("Invalid pareto_scope, supported: top80, all") - return scope - - -def _normalize_pareto_display_scope(display_scope: str) -> str: - scope = _normalize_text(display_scope).lower() or "all" - if scope not in {"all", "top20"}: - raise ValueError("Invalid pareto_display_scope, supported: all, top20") - return scope - - -def _normalize_pareto_selections( - pareto_selections: Optional[Dict[str, List[str]]], -) -> Dict[str, List[str]]: - normalized: Dict[str, List[str]] = {} - for dim, values in (pareto_selections or {}).items(): - dim_key = _normalize_text(dim).lower() - if not dim_key: - continue - if dim_key not in _DIM_TO_DF_COLUMN: - raise ValueError(f"不支援的 pareto_dimension: {dim}") - normalized_values = _normalize_pareto_values(values) - if normalized_values: - normalized[dim_key] = normalized_values - return normalized - - -def _build_dimension_pareto_items( - df: pd.DataFrame, - *, - dim_col: str, - metric_mode: str, - pareto_scope: str, -) -> List[Dict[str, Any]]: - if df is None or df.empty: - return [] - if dim_col not in df.columns: - return [] - - metric_col = "DEFECT_QTY" if metric_mode == "defect" else "REJECT_TOTAL_QTY" - if metric_col not in df.columns: - return [] - - agg_dict = {} - for col in ["MOVEIN_QTY", "REJECT_TOTAL_QTY", "DEFECT_QTY"]: - if col in df.columns: - agg_dict[col] = (col, "sum") - - grouped = df.groupby(dim_col, sort=False).agg(**agg_dict).reset_index() - if grouped.empty: - return [] - - if "CONTAINERID" in df.columns: - lot_counts = ( - df.groupby(dim_col)["CONTAINERID"] - .nunique() - .reset_index() - .rename(columns={"CONTAINERID": "AFFECTED_LOT_COUNT"}) - ) - grouped = grouped.merge(lot_counts, on=dim_col, how="left") - else: - grouped["AFFECTED_LOT_COUNT"] = 0 - - grouped["METRIC_VALUE"] = grouped[metric_col].fillna(0) - grouped = grouped[grouped["METRIC_VALUE"] > 0].sort_values( - "METRIC_VALUE", ascending=False - ) - if grouped.empty: - return [] - - total_metric = grouped["METRIC_VALUE"].sum() - grouped["PCT"] = (grouped["METRIC_VALUE"] / total_metric * 100).round(4) - grouped["CUM_PCT"] = grouped["PCT"].cumsum().round(4) - - items: List[Dict[str, Any]] = [] - for _, row in grouped.iterrows(): - items.append({ - "reason": _normalize_text(row.get(dim_col)) or "(未知)", - "metric_value": _as_float(row.get("METRIC_VALUE")), - "MOVEIN_QTY": _as_int(row.get("MOVEIN_QTY")), - "REJECT_TOTAL_QTY": _as_int(row.get("REJECT_TOTAL_QTY")), - "DEFECT_QTY": _as_int(row.get("DEFECT_QTY")), - "count": _as_int(row.get("AFFECTED_LOT_COUNT")), - "pct": round(_as_float(row.get("PCT")), 4), - "cumPct": round(_as_float(row.get("CUM_PCT")), 4), - }) - - if pareto_scope == "top80" and items: - top_items = [item for item in items if _as_float(item.get("cumPct")) <= 80.0] - if not top_items: - top_items = [items[0]] - return top_items - return items - - -def _apply_cross_filter( - df: pd.DataFrame, - selections: Dict[str, List[str]], - exclude_dim: str, -) -> pd.DataFrame: - if df is None or df.empty or not selections: - return df - - filtered = df - for dim in _PARETO_DIMENSIONS: - if dim == exclude_dim: - continue - selected_values = selections.get(dim) - if not selected_values: - continue - dim_col = _DIM_TO_DF_COLUMN.get(dim) - if not dim_col: - raise ValueError(f"不支援的 pareto_dimension: {dim}") - if dim_col not in filtered.columns: - return filtered.iloc[0:0] - value_set = set(selected_values) - normalized_dimension_values = filtered[dim_col].map( - lambda value: _normalize_text(value) or "(未知)" - ) - filtered = 
filtered[normalized_dimension_values.isin(value_set)] - if filtered.empty: - return filtered - return filtered +_DIM_TO_DF_COLUMN = { + "reason": "LOSSREASONNAME", + "package": "PRODUCTLINENAME", + "type": "PJ_TYPE", + "workflow": "WORKFLOWNAME", + "workcenter": "WORKCENTER_GROUP", + "equipment": "PRIMARY_EQUIPMENTNAME", +} +_PARETO_DIMENSIONS = tuple(_DIM_TO_DF_COLUMN.keys()) +_PARETO_TOP20_DIMENSIONS = {"type", "workflow", "equipment"} -def compute_dimension_pareto( - *, - query_id: str, - dimension: str = "reason", - metric_mode: str = "reject_total", - pareto_scope: str = "top80", - packages: Optional[List[str]] = None, - workcenter_groups: Optional[List[str]] = None, - reason: Optional[str] = None, - trend_dates: Optional[List[str]] = None, - include_excluded_scrap: bool = False, - exclude_material_scrap: bool = True, - exclude_pb_diode: bool = True, -) -> Optional[Dict[str, Any]]: - """Compute dimension pareto from cached DataFrame (no Oracle query).""" - metric_mode = _normalize_metric_mode(metric_mode) - pareto_scope = _normalize_pareto_scope(pareto_scope) - dimension = _normalize_text(dimension).lower() or "reason" - if dimension not in _DIM_TO_DF_COLUMN: - raise ValueError( - f"Invalid dimension, supported: {', '.join(sorted(_DIM_TO_DF_COLUMN.keys()))}" - ) - - df = _get_cached_df(query_id) - if df is None: - return None - - # Keep cache-based pareto behavior aligned with primary/view policy filters. - df = _apply_policy_filters( - df, - include_excluded_scrap=include_excluded_scrap, - exclude_material_scrap=exclude_material_scrap, - exclude_pb_diode=exclude_pb_diode, - ) - if df is None or df.empty: - return {"items": [], "dimension": dimension, "metric_mode": metric_mode} - - dim_col = _DIM_TO_DF_COLUMN.get(dimension) - if dim_col not in df.columns: - return {"items": [], "dimension": dimension, "metric_mode": metric_mode} +def _normalize_metric_mode(metric_mode: str) -> str: + mode = _normalize_text(metric_mode).lower() + if mode not in {"reject_total", "defect"}: + raise ValueError("Invalid metric_mode, supported: reject_total, defect") + return mode + + +def _normalize_pareto_scope(pareto_scope: str) -> str: + scope = _normalize_text(pareto_scope).lower() or "top80" + if scope not in {"top80", "all"}: + raise ValueError("Invalid pareto_scope, supported: top80, all") + return scope + + +def _normalize_pareto_display_scope(display_scope: str) -> str: + scope = _normalize_text(display_scope).lower() or "all" + if scope not in {"all", "top20"}: + raise ValueError("Invalid pareto_display_scope, supported: all, top20") + return scope + + +def _normalize_pareto_selections( + pareto_selections: Optional[Dict[str, List[str]]], +) -> Dict[str, List[str]]: + normalized: Dict[str, List[str]] = {} + for dim, values in (pareto_selections or {}).items(): + dim_key = _normalize_text(dim).lower() + if not dim_key: + continue + if dim_key not in _DIM_TO_DF_COLUMN: + raise ValueError(f"不支援的 pareto_dimension: {dim}") + normalized_values = _normalize_pareto_values(values) + if normalized_values: + normalized[dim_key] = normalized_values + return normalized + + +def _build_dimension_pareto_items( + df: pd.DataFrame, + *, + dim_col: str, + metric_mode: str, + pareto_scope: str, +) -> List[Dict[str, Any]]: + if df is None or df.empty: + return [] + if dim_col not in df.columns: + return [] + + metric_col = "DEFECT_QTY" if metric_mode == "defect" else "REJECT_TOTAL_QTY" + if metric_col not in df.columns: + return [] + + agg_dict = {} + for col in ["MOVEIN_QTY", "REJECT_TOTAL_QTY", "DEFECT_QTY"]: + 
if col in df.columns: + agg_dict[col] = (col, "sum") + + grouped = df.groupby(dim_col, sort=False).agg(**agg_dict).reset_index() + if grouped.empty: + return [] + + if "CONTAINERID" in df.columns: + lot_counts = ( + df.groupby(dim_col)["CONTAINERID"] + .nunique() + .reset_index() + .rename(columns={"CONTAINERID": "AFFECTED_LOT_COUNT"}) + ) + grouped = grouped.merge(lot_counts, on=dim_col, how="left") + else: + grouped["AFFECTED_LOT_COUNT"] = 0 + + grouped["METRIC_VALUE"] = grouped[metric_col].fillna(0) + grouped = grouped[grouped["METRIC_VALUE"] > 0].sort_values( + "METRIC_VALUE", ascending=False + ) + if grouped.empty: + return [] + + total_metric = grouped["METRIC_VALUE"].sum() + grouped["PCT"] = (grouped["METRIC_VALUE"] / total_metric * 100).round(4) + grouped["CUM_PCT"] = grouped["PCT"].cumsum().round(4) + + items: List[Dict[str, Any]] = [] + for _, row in grouped.iterrows(): + items.append({ + "reason": _normalize_text(row.get(dim_col)) or "(未知)", + "metric_value": _as_float(row.get("METRIC_VALUE")), + "MOVEIN_QTY": _as_int(row.get("MOVEIN_QTY")), + "REJECT_TOTAL_QTY": _as_int(row.get("REJECT_TOTAL_QTY")), + "DEFECT_QTY": _as_int(row.get("DEFECT_QTY")), + "count": _as_int(row.get("AFFECTED_LOT_COUNT")), + "pct": round(_as_float(row.get("PCT")), 4), + "cumPct": round(_as_float(row.get("CUM_PCT")), 4), + }) + + if pareto_scope == "top80" and items: + top_items = [item for item in items if _as_float(item.get("cumPct")) <= 80.0] + if not top_items: + top_items = [items[0]] + return top_items + return items + + +def _apply_cross_filter( + df: pd.DataFrame, + selections: Dict[str, List[str]], + exclude_dim: str, +) -> pd.DataFrame: + if df is None or df.empty or not selections: + return df + + filtered = df + for dim in _PARETO_DIMENSIONS: + if dim == exclude_dim: + continue + selected_values = selections.get(dim) + if not selected_values: + continue + dim_col = _DIM_TO_DF_COLUMN.get(dim) + if not dim_col: + raise ValueError(f"不支援的 pareto_dimension: {dim}") + if dim_col not in filtered.columns: + return filtered.iloc[0:0] + value_set = set(selected_values) + normalized_dimension_values = filtered[dim_col].map( + lambda value: _normalize_text(value) or "(未知)" + ) + filtered = filtered[normalized_dimension_values.isin(value_set)] + if filtered.empty: + return filtered + return filtered + + +def compute_dimension_pareto( + *, + query_id: str, + dimension: str = "reason", + metric_mode: str = "reject_total", + pareto_scope: str = "top80", + packages: Optional[List[str]] = None, + workcenter_groups: Optional[List[str]] = None, + reason: Optional[str] = None, + trend_dates: Optional[List[str]] = None, + include_excluded_scrap: bool = False, + exclude_material_scrap: bool = True, + exclude_pb_diode: bool = True, +) -> Optional[Dict[str, Any]]: + """Compute dimension pareto from cached DataFrame (no Oracle query).""" + metric_mode = _normalize_metric_mode(metric_mode) + pareto_scope = _normalize_pareto_scope(pareto_scope) + dimension = _normalize_text(dimension).lower() or "reason" + if dimension not in _DIM_TO_DF_COLUMN: + raise ValueError( + f"Invalid dimension, supported: {', '.join(sorted(_DIM_TO_DF_COLUMN.keys()))}" + ) + + df = _get_cached_df(query_id) + if df is None: + return None + + # Keep cache-based pareto behavior aligned with primary/view policy filters. 
+ df = _apply_policy_filters( + df, + include_excluded_scrap=include_excluded_scrap, + exclude_material_scrap=exclude_material_scrap, + exclude_pb_diode=exclude_pb_diode, + ) + if df is None or df.empty: + return {"items": [], "dimension": dimension, "metric_mode": metric_mode} + + dim_col = _DIM_TO_DF_COLUMN.get(dimension) + if dim_col not in df.columns: + return {"items": [], "dimension": dimension, "metric_mode": metric_mode} # Apply supplementary filters filtered = _apply_supplementary_filters( @@ -992,103 +1158,103 @@ def compute_dimension_pareto( if filtered.empty: return {"items": [], "dimension": dimension, "metric_mode": metric_mode} - items = _build_dimension_pareto_items( - filtered, - dim_col=dim_col, - metric_mode=metric_mode, - pareto_scope=pareto_scope, - ) - - return { - "items": items, - "dimension": dimension, - "metric_mode": metric_mode, - } - - -def compute_batch_pareto( - *, - query_id: str, - metric_mode: str = "reject_total", - pareto_scope: str = "top80", - pareto_display_scope: str = "all", - packages: Optional[List[str]] = None, - workcenter_groups: Optional[List[str]] = None, - reason: Optional[str] = None, - trend_dates: Optional[List[str]] = None, - pareto_selections: Optional[Dict[str, List[str]]] = None, - include_excluded_scrap: bool = False, - exclude_material_scrap: bool = True, - exclude_pb_diode: bool = True, -) -> Optional[Dict[str, Any]]: - """Compute all six Pareto dimensions from cached DataFrame (no Oracle query).""" - metric_mode = _normalize_metric_mode(metric_mode) - pareto_scope = _normalize_pareto_scope(pareto_scope) - pareto_display_scope = _normalize_pareto_display_scope(pareto_display_scope) - normalized_selections = _normalize_pareto_selections(pareto_selections) - - df = _get_cached_df(query_id) - if df is None: - return None - - df = _apply_policy_filters( - df, - include_excluded_scrap=include_excluded_scrap, - exclude_material_scrap=exclude_material_scrap, - exclude_pb_diode=exclude_pb_diode, - ) - if df is None or df.empty: - return { - "dimensions": { - dim: {"items": [], "dimension": dim, "metric_mode": metric_mode} - for dim in _PARETO_DIMENSIONS - } - } - - filtered = _apply_supplementary_filters( - df, - packages=packages, - workcenter_groups=workcenter_groups, - reason=reason, - ) - if filtered is None or filtered.empty: - return { - "dimensions": { - dim: {"items": [], "dimension": dim, "metric_mode": metric_mode} - for dim in _PARETO_DIMENSIONS - } - } - - if trend_dates and "TXN_DAY" in filtered.columns: - date_set = set(trend_dates) - filtered = filtered[ - filtered["TXN_DAY"].apply(lambda d: _to_date_str(d) in date_set) - ] - - dimensions: Dict[str, Dict[str, Any]] = {} - for dim in _PARETO_DIMENSIONS: - dim_col = _DIM_TO_DF_COLUMN.get(dim) - dim_df = _apply_cross_filter(filtered, normalized_selections, exclude_dim=dim) - items = _build_dimension_pareto_items( - dim_df, - dim_col=dim_col, - metric_mode=metric_mode, - pareto_scope=pareto_scope, - ) - if pareto_display_scope == "top20" and dim in _PARETO_TOP20_DIMENSIONS: - items = items[:20] - dimensions[dim] = { - "items": items, - "dimension": dim, - "metric_mode": metric_mode, - } - - return { - "dimensions": dimensions, - "metric_mode": metric_mode, - "pareto_scope": pareto_scope, - "pareto_display_scope": pareto_display_scope, - } + items = _build_dimension_pareto_items( + filtered, + dim_col=dim_col, + metric_mode=metric_mode, + pareto_scope=pareto_scope, + ) + + return { + "items": items, + "dimension": dimension, + "metric_mode": metric_mode, + } + + +def 
compute_batch_pareto( + *, + query_id: str, + metric_mode: str = "reject_total", + pareto_scope: str = "top80", + pareto_display_scope: str = "all", + packages: Optional[List[str]] = None, + workcenter_groups: Optional[List[str]] = None, + reason: Optional[str] = None, + trend_dates: Optional[List[str]] = None, + pareto_selections: Optional[Dict[str, List[str]]] = None, + include_excluded_scrap: bool = False, + exclude_material_scrap: bool = True, + exclude_pb_diode: bool = True, +) -> Optional[Dict[str, Any]]: + """Compute all six Pareto dimensions from cached DataFrame (no Oracle query).""" + metric_mode = _normalize_metric_mode(metric_mode) + pareto_scope = _normalize_pareto_scope(pareto_scope) + pareto_display_scope = _normalize_pareto_display_scope(pareto_display_scope) + normalized_selections = _normalize_pareto_selections(pareto_selections) + + df = _get_cached_df(query_id) + if df is None: + return None + + df = _apply_policy_filters( + df, + include_excluded_scrap=include_excluded_scrap, + exclude_material_scrap=exclude_material_scrap, + exclude_pb_diode=exclude_pb_diode, + ) + if df is None or df.empty: + return { + "dimensions": { + dim: {"items": [], "dimension": dim, "metric_mode": metric_mode} + for dim in _PARETO_DIMENSIONS + } + } + + filtered = _apply_supplementary_filters( + df, + packages=packages, + workcenter_groups=workcenter_groups, + reason=reason, + ) + if filtered is None or filtered.empty: + return { + "dimensions": { + dim: {"items": [], "dimension": dim, "metric_mode": metric_mode} + for dim in _PARETO_DIMENSIONS + } + } + + if trend_dates and "TXN_DAY" in filtered.columns: + date_set = set(trend_dates) + filtered = filtered[ + filtered["TXN_DAY"].apply(lambda d: _to_date_str(d) in date_set) + ] + + dimensions: Dict[str, Dict[str, Any]] = {} + for dim in _PARETO_DIMENSIONS: + dim_col = _DIM_TO_DF_COLUMN.get(dim) + dim_df = _apply_cross_filter(filtered, normalized_selections, exclude_dim=dim) + items = _build_dimension_pareto_items( + dim_df, + dim_col=dim_col, + metric_mode=metric_mode, + pareto_scope=pareto_scope, + ) + if pareto_display_scope == "top20" and dim in _PARETO_TOP20_DIMENSIONS: + items = items[:20] + dimensions[dim] = { + "items": items, + "dimension": dim, + "metric_mode": metric_mode, + } + + return { + "dimensions": dimensions, + "metric_mode": metric_mode, + "pareto_scope": pareto_scope, + "pareto_display_scope": pareto_display_scope, + } # ============================================================ @@ -1096,22 +1262,22 @@ def compute_batch_pareto( # ============================================================ -def export_csv_from_cache( +def export_csv_from_cache( *, query_id: str, packages: Optional[List[str]] = None, workcenter_groups: Optional[List[str]] = None, reason: Optional[str] = None, - metric_filter: str = "all", - trend_dates: Optional[List[str]] = None, - detail_reason: Optional[str] = None, - pareto_dimension: Optional[str] = None, - pareto_values: Optional[List[str]] = None, - pareto_selections: Optional[Dict[str, List[str]]] = None, - include_excluded_scrap: bool = False, - exclude_material_scrap: bool = True, - exclude_pb_diode: bool = True, -) -> Optional[list]: + metric_filter: str = "all", + trend_dates: Optional[List[str]] = None, + detail_reason: Optional[str] = None, + pareto_dimension: Optional[str] = None, + pareto_values: Optional[List[str]] = None, + pareto_selections: Optional[Dict[str, List[str]]] = None, + include_excluded_scrap: bool = False, + exclude_material_scrap: bool = True, + exclude_pb_diode: bool = 
True, +) -> Optional[list]: """Read cache → apply filters → return list of dicts for CSV export.""" df = _get_cached_df(query_id) if df is None: @@ -1137,18 +1303,18 @@ def export_csv_from_cache( filtered = filtered[ filtered["TXN_DAY"].apply(lambda d: _to_date_str(d) in date_set) ] - if detail_reason and "LOSSREASONNAME" in filtered.columns: - filtered = filtered[ - filtered["LOSSREASONNAME"].str.strip() == detail_reason.strip() - ] - filtered = _apply_pareto_selection_filter( - filtered, - pareto_dimension=pareto_dimension, - pareto_values=pareto_values, - pareto_selections=pareto_selections, - ) - - rows = [] + if detail_reason and "LOSSREASONNAME" in filtered.columns: + filtered = filtered[ + filtered["LOSSREASONNAME"].str.strip() == detail_reason.strip() + ] + filtered = _apply_pareto_selection_filter( + filtered, + pareto_dimension=pareto_dimension, + pareto_values=pareto_values, + pareto_selections=pareto_selections, + ) + + rows = [] for _, row in filtered.iterrows(): rows.append( { diff --git a/src/mes_dashboard/services/resource_dataset_cache.py b/src/mes_dashboard/services/resource_dataset_cache.py index 98f1c03..2fec290 100644 --- a/src/mes_dashboard/services/resource_dataset_cache.py +++ b/src/mes_dashboard/services/resource_dataset_cache.py @@ -11,9 +11,7 @@ Cache layers: from __future__ import annotations -import base64 import hashlib -import io import json import logging from functools import lru_cache @@ -24,11 +22,7 @@ import pandas as pd from mes_dashboard.core.cache import ProcessLevelCache, register_process_cache from mes_dashboard.core.database import read_sql_df_slow as read_sql_df -from mes_dashboard.core.redis_client import ( - REDIS_ENABLED, - get_key, - get_redis_client, -) +from mes_dashboard.core.redis_df_store import redis_load_df, redis_store_df logger = logging.getLogger("mes_dashboard.resource_dataset_cache") @@ -67,44 +61,16 @@ def _make_query_id(params: dict) -> str: # ============================================================ -# Redis L2 helpers (parquet <-> base64 string) +# Redis L2 helpers (delegated to shared redis_df_store) # ============================================================ -def _redis_key(query_id: str) -> str: - return get_key(f"{_REDIS_NAMESPACE}:{query_id}") - - def _redis_store_df(query_id: str, df: pd.DataFrame) -> None: - if not REDIS_ENABLED: - return - client = get_redis_client() - if client is None: - return - try: - buf = io.BytesIO() - df.to_parquet(buf, engine="pyarrow", index=False) - encoded = base64.b64encode(buf.getvalue()).decode("ascii") - client.setex(_redis_key(query_id), _CACHE_TTL, encoded) - except Exception as exc: - logger.warning("Failed to store DataFrame in Redis: %s", exc) + redis_store_df(f"{_REDIS_NAMESPACE}:{query_id}", df, ttl=_CACHE_TTL) def _redis_load_df(query_id: str) -> Optional[pd.DataFrame]: - if not REDIS_ENABLED: - return None - client = get_redis_client() - if client is None: - return None - try: - encoded = client.get(_redis_key(query_id)) - if encoded is None: - return None - raw = base64.b64decode(encoded) - return pd.read_parquet(io.BytesIO(raw), engine="pyarrow") - except Exception as exc: - logger.warning("Failed to load DataFrame from Redis: %s", exc) - return None + return redis_load_df(f"{_REDIS_NAMESPACE}:{query_id}") # ============================================================ @@ -233,12 +199,47 @@ def execute_primary_query( "detail": _empty_detail(), } - sql = _load_sql("base_facts") - sql = sql.replace("{{ HISTORYID_FILTER }}", historyid_filter) - params = {"start_date": 
start_date, "end_date": end_date} - df = read_sql_df(sql, params) - if df is None: - df = pd.DataFrame() + from mes_dashboard.services.batch_query_engine import ( + decompose_by_time_range, + execute_plan, + merge_chunks, + compute_query_hash, + should_decompose_by_time, + ) + + base_sql = _load_sql("base_facts") + base_sql = base_sql.replace("{{ HISTORYID_FILTER }}", historyid_filter) + + if should_decompose_by_time(start_date, end_date): + # --- Engine path for long date ranges --- + engine_chunks = decompose_by_time_range(start_date, end_date) + engine_hash = compute_query_hash(query_id_input) + + def _run_resource_chunk(chunk, max_rows_per_chunk=None): + params = { + "start_date": chunk["chunk_start"], + "end_date": chunk["chunk_end"], + } + result = read_sql_df(base_sql, params) + return result if result is not None else pd.DataFrame() + + logger.info( + "Engine activated for resource: %d chunks (query_id=%s)", + len(engine_chunks), query_id, + ) + execute_plan( + engine_chunks, _run_resource_chunk, + query_hash=engine_hash, + cache_prefix="resource", + chunk_ttl=_CACHE_TTL, + ) + df = merge_chunks("resource", engine_hash) + else: + # --- Direct path (short query) --- + params = {"start_date": start_date, "end_date": end_date} + df = read_sql_df(base_sql, params) + if df is None: + df = pd.DataFrame() if not df.empty: _store_df(query_id, df) diff --git a/tests/e2e/test_reject_history_e2e.py b/tests/e2e/test_reject_history_e2e.py new file mode 100644 index 0000000..0478679 --- /dev/null +++ b/tests/e2e/test_reject_history_e2e.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +"""E2E tests for reject-history long-range query flow.""" + +from __future__ import annotations + +import os + +import pytest +import requests + + +def _post_reject_query(app_server: str, body: dict, timeout: float = 420.0) -> requests.Response: + return requests.post( + f"{app_server}/api/reject-history/query", + json=body, + timeout=timeout, + ) + + +@pytest.mark.e2e +@pytest.mark.skipif( + os.environ.get("RUN_LONG_E2E") != "1", + reason="Long-range reject-history E2E disabled; set RUN_LONG_E2E=1 to run.", +) +class TestRejectHistoryLongRangeE2E: + """Real backend E2E checks for long-range reject history query.""" + + def test_query_365_day_range_returns_success(self, app_server: str): + response = _post_reject_query( + app_server, + { + "mode": "date_range", + "start_date": "2025-01-01", + "end_date": "2025-12-31", + "include_excluded_scrap": False, + "exclude_material_scrap": True, + "exclude_pb_diode": True, + }, + ) + + assert response.status_code == 200, response.text[:500] + payload = response.json() + assert payload.get("success") is True, payload + assert payload.get("query_id") + + def test_query_then_view_returns_cached_result(self, app_server: str): + query_resp = _post_reject_query( + app_server, + { + "mode": "date_range", + "start_date": "2025-01-01", + "end_date": "2025-12-31", + }, + ) + assert query_resp.status_code == 200, query_resp.text[:500] + query_payload = query_resp.json() + assert query_payload.get("success") is True, query_payload + query_id = query_payload.get("query_id") + assert query_id + + view_resp = requests.get( + f"{app_server}/api/reject-history/view", + params={ + "query_id": query_id, + "page": 1, + "per_page": 50, + "exclude_material_scrap": "true", + "exclude_pb_diode": "true", + }, + timeout=120, + ) + assert view_resp.status_code == 200, view_resp.text[:500] + view_payload = view_resp.json() + assert view_payload.get("success") is True, view_payload + + def 
test_query_then_export_cached_returns_csv(self, app_server: str): + query_resp = _post_reject_query( + app_server, + { + "mode": "date_range", + "start_date": "2025-01-01", + "end_date": "2025-12-31", + }, + ) + assert query_resp.status_code == 200, query_resp.text[:500] + query_payload = query_resp.json() + assert query_payload.get("success") is True, query_payload + query_id = query_payload.get("query_id") + assert query_id + + export_resp = requests.get( + f"{app_server}/api/reject-history/export-cached", + params={"query_id": query_id}, + timeout=120, + ) + assert export_resp.status_code == 200, export_resp.text[:300] + assert "text/csv" in export_resp.headers.get("Content-Type", "") + assert "LOT" in export_resp.text[:200] diff --git a/tests/stress/test_reject_history_stress.py b/tests/stress/test_reject_history_stress.py new file mode 100644 index 0000000..3ae592d --- /dev/null +++ b/tests/stress/test_reject_history_stress.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +"""Stress tests for reject-history long-range query stability.""" + +from __future__ import annotations + +import concurrent.futures +import os +import time + +import pytest +import requests + +try: + import redis +except Exception: # pragma: no cover - optional runtime dependency + redis = None + + +@pytest.mark.stress +@pytest.mark.load +@pytest.mark.skipif( + os.environ.get("RUN_LONG_STRESS") != "1", + reason="Long-range reject-history stress disabled; set RUN_LONG_STRESS=1 to run.", +) +class TestRejectHistoryLongRangeStress: + """Concurrent long-range reject-history queries should stay recoverable.""" + + @staticmethod + def _redis_used_memory_bytes() -> int | None: + if redis is None: + return None + redis_url = os.environ.get("STRESS_REDIS_URL", os.environ.get("REDIS_URL", "redis://localhost:6379/0")) + try: + client = redis.Redis.from_url(redis_url, decode_responses=True) + info = client.info("memory") + used = info.get("used_memory") + return int(used) if used is not None else None + except Exception: + return None + + @staticmethod + def _run_query(base_url: str, timeout: float, seed: int) -> tuple[bool, float, str]: + start = time.time() + try: + year = 2024 + (seed % 2) + response = requests.post( + f"{base_url}/api/reject-history/query", + json={ + "mode": "date_range", + "start_date": f"{year}-01-01", + "end_date": f"{year}-12-31", + "exclude_material_scrap": True, + "exclude_pb_diode": True, + }, + timeout=timeout, + ) + duration = time.time() - start + if response.status_code != 200: + return False, duration, f"HTTP {response.status_code}" + payload = response.json() + if payload.get("success") is True and payload.get("query_id"): + return True, duration, "" + return False, duration, f"success={payload.get('success')} error={payload.get('error')}" + except Exception as exc: # pragma: no cover - runtime/network dependent + return False, time.time() - start, str(exc)[:180] + + def test_concurrent_365_day_queries_no_crash(self, base_url: str, stress_result): + result = stress_result("Reject History Long-Range Concurrent") + timeout = float(os.environ.get("STRESS_REJECT_HISTORY_TIMEOUT", "420")) + concurrent_users = int(os.environ.get("STRESS_REJECT_HISTORY_CONCURRENCY", "3")) + rounds = int(os.environ.get("STRESS_REJECT_HISTORY_ROUNDS", "2")) + max_redis_delta_mb = int(os.environ.get("STRESS_REJECT_REDIS_MAX_DELTA_MB", "256")) + total_requests = concurrent_users * rounds + redis_before = self._redis_used_memory_bytes() + + started = time.time() + with 
concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_users) as executor: + futures = [ + executor.submit(self._run_query, base_url, timeout, idx) + for idx in range(total_requests) + ] + for future in concurrent.futures.as_completed(futures): + ok, duration, error = future.result() + if ok: + result.add_success(duration) + else: + result.add_failure(error, duration) + result.total_duration = time.time() - started + + print(result.report()) + assert result.total_requests == total_requests + assert result.success_rate >= 90.0, f"Success rate too low: {result.success_rate:.2f}%" + + health_resp = requests.get(f"{base_url}/health", timeout=10) + assert health_resp.status_code in (200, 503) + + redis_after = self._redis_used_memory_bytes() + if redis_before is not None and redis_after is not None: + delta_mb = (redis_after - redis_before) / (1024 * 1024) + assert delta_mb <= max_redis_delta_mb, ( + f"Redis memory delta too high: {delta_mb:.1f}MB > {max_redis_delta_mb}MB" + ) diff --git a/tests/test_batch_query_engine.py b/tests/test_batch_query_engine.py new file mode 100644 index 0000000..77553b9 --- /dev/null +++ b/tests/test_batch_query_engine.py @@ -0,0 +1,576 @@ +# -*- coding: utf-8 -*- +"""Unit tests for BatchQueryEngine module.""" + +import pytest +from unittest.mock import patch, MagicMock, call + +import pandas as pd + +from mes_dashboard.services.batch_query_engine import ( + compute_query_hash, + decompose_by_ids, + decompose_by_time_range, + execute_plan, + merge_chunks, + iterate_chunks, + should_decompose_by_time, + should_decompose_by_ids, +) + + +# ============================================================ +# 4.1 decompose_by_time_range +# ============================================================ + + +class TestDecomposeByTimeRange: + def test_90_days_yields_3_chunks(self): + chunks = decompose_by_time_range("2025-01-01", "2025-03-31", grain_days=31) + assert len(chunks) == 3 + # First chunk: Jan 1 – Jan 31 + assert chunks[0] == {"chunk_start": "2025-01-01", "chunk_end": "2025-01-31"} + # Second chunk: Feb 1 – Mar 3 + assert chunks[1]["chunk_start"] == "2025-02-01" + # Third chunk ends Mar 31 + assert chunks[2]["chunk_end"] == "2025-03-31" + + def test_31_days_yields_1_chunk(self): + chunks = decompose_by_time_range("2025-01-01", "2025-01-31", grain_days=31) + assert len(chunks) == 1 + assert chunks[0] == {"chunk_start": "2025-01-01", "chunk_end": "2025-01-31"} + + def test_single_day(self): + chunks = decompose_by_time_range("2025-06-15", "2025-06-15") + assert len(chunks) == 1 + assert chunks[0] == {"chunk_start": "2025-06-15", "chunk_end": "2025-06-15"} + + def test_contiguous_no_overlap_no_gap(self): + """Verify closed-interval boundary semantics: no overlap, no gap.""" + chunks = decompose_by_time_range("2025-01-01", "2025-06-30", grain_days=31) + for i in range(1, len(chunks)): + prev_end = chunks[i - 1]["chunk_end"] + cur_start = chunks[i]["chunk_start"] + from datetime import datetime, timedelta + prev_dt = datetime.strptime(prev_end, "%Y-%m-%d") + cur_dt = datetime.strptime(cur_start, "%Y-%m-%d") + assert cur_dt == prev_dt + timedelta(days=1), ( + f"Gap/overlap between chunk {i-1} end={prev_end} and chunk {i} start={cur_start}" + ) + # First starts at start_date, last ends at end_date + assert chunks[0]["chunk_start"] == "2025-01-01" + assert chunks[-1]["chunk_end"] == "2025-06-30" + + def test_final_chunk_may_be_shorter(self): + chunks = decompose_by_time_range("2025-01-01", "2025-02-10", grain_days=31) + assert len(chunks) == 2 + # Second chunk: Feb 1 – 
Feb 10 (10 days < 31) + assert chunks[1] == {"chunk_start": "2025-02-01", "chunk_end": "2025-02-10"} + + def test_inverted_range_raises(self): + with pytest.raises(ValueError, match="must be <="): + decompose_by_time_range("2025-12-31", "2025-01-01") + + def test_365_days(self): + chunks = decompose_by_time_range("2025-01-01", "2025-12-31", grain_days=31) + assert len(chunks) == 12 # roughly 365/31 ≈ 12 + + +# ============================================================ +# 4.2 decompose_by_ids +# ============================================================ + + +class TestDecomposeByIds: + def test_2500_ids_yields_3_batches(self): + ids = list(range(2500)) + batches = decompose_by_ids(ids, batch_size=1000) + assert len(batches) == 3 + assert len(batches[0]) == 1000 + assert len(batches[1]) == 1000 + assert len(batches[2]) == 500 + + def test_500_ids_yields_1_batch(self): + ids = list(range(500)) + batches = decompose_by_ids(ids, batch_size=1000) + assert len(batches) == 1 + assert len(batches[0]) == 500 + + def test_empty_ids(self): + assert decompose_by_ids([]) == [] + + def test_exact_batch_size(self): + ids = list(range(1000)) + batches = decompose_by_ids(ids, batch_size=1000) + assert len(batches) == 1 + + +# ============================================================ +# 4.3 execute_plan sequential +# ============================================================ + + +class TestExecutePlanSequential: + def _mock_redis(self): + """Set up mock redis for chunk store/load/exists.""" + stored = {} + mock_client = MagicMock() + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + return mock_client, stored + + def test_sequential_execution_stores_chunks(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client, stored = self._mock_redis() + + call_log = [] + + def fake_query_fn(chunk, max_rows_per_chunk=None): + call_log.append(chunk) + return pd.DataFrame({"V": [1, 2]}) + + chunks = [ + {"chunk_start": "2025-01-01", "chunk_end": "2025-01-31"}, + {"chunk_start": "2025-02-01", "chunk_end": "2025-02-28"}, + ] + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + qh = execute_plan( + chunks, fake_query_fn, + query_hash="testhash", + cache_prefix="test", + skip_cached=False, + ) + + assert qh == "testhash" + assert len(call_log) == 2 + # Chunks should be stored in Redis + assert any("chunk:0" in k for k in stored) + assert any("chunk:1" in k for k in stored) + + +# ============================================================ +# 4.4 execute_plan parallel +# ============================================================ + + +class TestExecutePlanParallel: + def test_parallel_uses_threadpool(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + call_count = {"n": 0} + + def fake_query_fn(chunk, 
max_rows_per_chunk=None): + call_count["n"] += 1 + return pd.DataFrame({"V": [1]}) + + chunks = [{"i": i} for i in range(4)] + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "_effective_parallelism", return_value=2): + qh = execute_plan( + chunks, fake_query_fn, + parallel=2, + query_hash="ptest", + cache_prefix="p", + skip_cached=False, + ) + + assert call_count["n"] == 4 + + +# ============================================================ +# 4.5 partial cache hit +# ============================================================ + + +class TestPartialCacheHit: + def test_skips_cached_chunks(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + # Pre-populate chunks 0 and 1 as "cached" + pre_cached_keys = set() + + def fake_exists(k): + return 1 if k in pre_cached_keys else (1 if k in stored else 0) + + mock_client.exists.side_effect = fake_exists + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + # Pre-store 2 chunks + rds.redis_store_chunk("test", "hash5", 0, pd.DataFrame({"A": [1]}), ttl=60) + rds.redis_store_chunk("test", "hash5", 1, pd.DataFrame({"A": [2]}), ttl=60) + + # Now mark those keys as existing + pre_cached_keys.update(stored.keys()) + + call_log = [] + + def fake_query_fn(chunk, max_rows_per_chunk=None): + call_log.append(chunk) + return pd.DataFrame({"A": [99]}) + + chunks = [{"i": i} for i in range(5)] + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + execute_plan( + chunks, fake_query_fn, + query_hash="hash5", + cache_prefix="test", + skip_cached=True, + ) + + # Only chunks 2, 3, 4 should have been executed + assert len(call_log) == 3 + + +# ============================================================ +# 4.6 memory guard +# ============================================================ + + +class TestMemoryGuard: + def test_oversized_chunk_discarded(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + def oversized_query_fn(chunk, max_rows_per_chunk=None): + # Create DF that reports large memory + df = pd.DataFrame({"X": [1]}) + return df + + chunks = [{"i": 0}] + + # Set memory limit to 0 MB so any DF exceeds it + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "BATCH_CHUNK_MAX_MEMORY_MB", 0): + qh = execute_plan( + chunks, oversized_query_fn, + query_hash="memtest", + cache_prefix="m", + 
skip_cached=False, + ) + + # Chunk should NOT be stored (memory exceeded) + assert not any("chunk:0" in k for k in stored) + + +# ============================================================ +# 4.7 result row count limit +# ============================================================ + + +class TestMaxRowsPerChunk: + def test_max_rows_passed_to_query_fn(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + mock_client.setex.return_value = None + mock_client.get.return_value = None + mock_client.exists.return_value = 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + received_max_rows = [] + + def capture_query_fn(chunk, max_rows_per_chunk=None): + received_max_rows.append(max_rows_per_chunk) + return pd.DataFrame({"V": [1]}) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + execute_plan( + [{"i": 0}], capture_query_fn, + query_hash="rowtest", + cache_prefix="r", + max_rows_per_chunk=5000, + skip_cached=False, + ) + + assert received_max_rows == [5000] + + +# ============================================================ +# 4.8 merge_chunks +# ============================================================ + + +class TestMergeChunks: + def test_merge_produces_correct_df(self): + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.hgetall.return_value = {"total": "3", "completed": "3", "failed": "0"} + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + rds.redis_store_chunk("t", "h", 0, pd.DataFrame({"A": [1, 2]})) + rds.redis_store_chunk("t", "h", 1, pd.DataFrame({"A": [3, 4]})) + rds.redis_store_chunk("t", "h", 2, pd.DataFrame({"A": [5]})) + + import mes_dashboard.services.batch_query_engine as bqe + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + merged = merge_chunks("t", "h") + + assert len(merged) == 5 + assert list(merged["A"]) == [1, 2, 3, 4, 5] + + def test_merge_respects_max_total_rows(self): + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.hgetall.return_value = {"total": "3", "completed": "3", "failed": "0"} + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + rds.redis_store_chunk("t", "cap", 0, pd.DataFrame({"A": [1, 2]})) + rds.redis_store_chunk("t", "cap", 1, pd.DataFrame({"A": [3, 4]})) + rds.redis_store_chunk("t", "cap", 2, pd.DataFrame({"A": [5, 6]})) + + import mes_dashboard.services.batch_query_engine as bqe + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + merged = merge_chunks("t", "cap", 
max_total_rows=4) + + assert len(merged) == 4 + assert list(merged["A"]) == [1, 2, 3, 4] + + +# ============================================================ +# 4.9 progress tracking +# ============================================================ + + +class TestProgressTracking: + def test_hset_updated_after_each_chunk(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + mock_client.setex.return_value = None + mock_client.get.return_value = None + mock_client.exists.return_value = 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + hset_calls = [] + original_hset = mock_client.hset + + def track_hset(key, mapping=None): + hset_calls.append(mapping.copy() if mapping else {}) + return original_hset(key, mapping=mapping) + + mock_client.hset.side_effect = track_hset + + def fake_query_fn(chunk, max_rows_per_chunk=None): + return pd.DataFrame({"V": [1]}) + + chunks = [{"i": 0}, {"i": 1}, {"i": 2}] + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + execute_plan( + chunks, fake_query_fn, + query_hash="progtest", + cache_prefix="p", + skip_cached=False, + ) + + # Should have initial + 3 per-chunk + final = 5 hset calls + assert len(hset_calls) >= 4 + # Last call should show completed status + last = hset_calls[-1] + assert last["status"] == "completed" + assert last["completed"] == "3" + + +# ============================================================ +# 4.10 chunk failure resilience +# ============================================================ + + +class TestChunkFailureResilience: + def test_one_chunk_fails_others_complete(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + call_count = {"n": 0} + + def failing_query_fn(chunk, max_rows_per_chunk=None): + call_count["n"] += 1 + if chunk.get("i") == 1: + raise RuntimeError("Oracle timeout") + return pd.DataFrame({"V": [chunk["i"]]}) + + chunks = [{"i": 0}, {"i": 1}, {"i": 2}] + + hset_calls = [] + mock_client.hset.side_effect = lambda k, mapping=None: hset_calls.append( + mapping.copy() if mapping else {} + ) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client): + qh = execute_plan( + chunks, failing_query_fn, + query_hash="failtest", + cache_prefix="f", + skip_cached=False, + ) + + # All 3 chunks attempted + assert call_count["n"] == 3 + # Final metadata should reflect partial failure + last = hset_calls[-1] + assert last["status"] == "partial" + assert last["completed"] == "2" + assert last["failed"] == "1" + assert last["has_partial_failure"] == "True" + + def test_chunk_store_failure_is_marked_partial(self): + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = 
lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + mock_client.hset.return_value = None + mock_client.expire.return_value = None + + def query_fn(chunk, max_rows_per_chunk=None): + return pd.DataFrame({"V": [chunk["i"]]}) + + original_store_chunk = bqe.redis_store_chunk + + def fail_one_store(prefix, query_hash, idx, df, ttl=900): + if idx == 1: + return False + return original_store_chunk(prefix, query_hash, idx, df, ttl=ttl) + + hset_calls = [] + mock_client.hset.side_effect = lambda k, mapping=None: hset_calls.append( + mapping.copy() if mapping else {} + ) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "get_redis_client", return_value=mock_client), \ + patch.object(bqe, "redis_store_chunk", side_effect=fail_one_store): + execute_plan( + [{"i": 0}, {"i": 1}, {"i": 2}], + query_fn, + query_hash="storefail", + cache_prefix="sf", + skip_cached=False, + ) + + last = hset_calls[-1] + assert last["status"] == "partial" + assert last["completed"] == "2" + assert last["failed"] == "1" + + +# ============================================================ +# query_hash stability +# ============================================================ + + +class TestQueryHash: + def test_same_params_different_order(self): + h1 = compute_query_hash({"a": 1, "b": [3, 1, 2]}) + h2 = compute_query_hash({"b": [2, 1, 3], "a": 1}) + assert h1 == h2 + + def test_different_params_different_hash(self): + h1 = compute_query_hash({"mode": "date_range", "start": "2025-01-01"}) + h2 = compute_query_hash({"mode": "date_range", "start": "2025-06-01"}) + assert h1 != h2 + + def test_hash_is_16_chars(self): + h = compute_query_hash({"x": 1}) + assert len(h) == 16 + + +# ============================================================ +# should_decompose helpers +# ============================================================ + + +class TestShouldDecompose: + def test_long_range_true(self): + assert should_decompose_by_time("2025-01-01", "2025-12-31") + + def test_short_range_false(self): + assert not should_decompose_by_time("2025-01-01", "2025-02-01") + + def test_large_ids_true(self): + assert should_decompose_by_ids(list(range(2000))) + + def test_small_ids_false(self): + assert not should_decompose_by_ids(list(range(500))) diff --git a/tests/test_database_slow_iter.py b/tests/test_database_slow_iter.py index b0c346c..2fca668 100644 --- a/tests/test_database_slow_iter.py +++ b/tests/test_database_slow_iter.py @@ -117,3 +117,4 @@ def test_runtime_config_includes_fetchmany_size(): assert "slow_fetchmany_size" in runtime assert isinstance(runtime["slow_fetchmany_size"], int) assert runtime["slow_fetchmany_size"] > 0 + assert "slow_pool_enabled" in runtime diff --git a/tests/test_database_slow_pool.py b/tests/test_database_slow_pool.py new file mode 100644 index 0000000..981daaa --- /dev/null +++ b/tests/test_database_slow_pool.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +"""Unit tests for isolated slow-query pool path.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import mes_dashboard.core.database as db + + +@patch.object(db, "oracledb") +@patch.object(db, "get_slow_engine") +@patch.object(db, "_get_slow_query_semaphore") +@patch.object(db, "get_db_runtime_config") +def test_read_sql_df_slow_uses_slow_pool_when_enabled( + mock_runtime, + mock_sem_fn, + mock_get_slow_engine, + mock_oracledb, +): + """Slow query should checkout connection from 
isolated slow pool.""" + mock_runtime.return_value = { + "slow_pool_enabled": True, + "slow_call_timeout_ms": 60000, + "slow_fetchmany_size": 5000, + "tcp_connect_timeout": 10, + "retry_count": 1, + "retry_delay": 1.0, + } + + sem = MagicMock() + sem.acquire.return_value = True + mock_sem_fn.return_value = sem + + cursor = MagicMock() + cursor.description = [("COL_A",), ("COL_B",)] + cursor.fetchall.return_value = [("v1", "v2")] + + conn = MagicMock() + conn.cursor.return_value = cursor + + engine = MagicMock() + engine.raw_connection.return_value = conn + mock_get_slow_engine.return_value = engine + + df = db.read_sql_df_slow("SELECT 1", {"p0": "x"}) + + assert list(df.columns) == ["COL_A", "COL_B"] + assert len(df) == 1 + mock_get_slow_engine.assert_called_once() + mock_oracledb.connect.assert_not_called() + conn.close.assert_called_once() + sem.release.assert_called_once() + + +@patch.object(db, "oracledb") +@patch.object(db, "get_slow_engine") +@patch.object(db, "_get_slow_query_semaphore") +@patch.object(db, "get_db_runtime_config") +def test_read_sql_df_slow_iter_uses_slow_pool_when_enabled( + mock_runtime, + mock_sem_fn, + mock_get_slow_engine, + mock_oracledb, +): + """Slow iterator query should checkout connection from isolated slow pool.""" + mock_runtime.return_value = { + "slow_pool_enabled": True, + "slow_call_timeout_ms": 60000, + "slow_fetchmany_size": 2, + "tcp_connect_timeout": 10, + "retry_count": 1, + "retry_delay": 1.0, + } + + sem = MagicMock() + sem.acquire.return_value = True + mock_sem_fn.return_value = sem + + cursor = MagicMock() + cursor.description = [("COL_A",), ("COL_B",)] + cursor.fetchmany.side_effect = [ + [("r1a", "r1b")], + [], + ] + + conn = MagicMock() + conn.cursor.return_value = cursor + + engine = MagicMock() + engine.raw_connection.return_value = conn + mock_get_slow_engine.return_value = engine + + batches = list(db.read_sql_df_slow_iter("SELECT 1", {"p0": "x"}, batch_size=2)) + + assert batches == [(["COL_A", "COL_B"], [("r1a", "r1b")])] + mock_get_slow_engine.assert_called_once() + mock_oracledb.connect.assert_not_called() + conn.close.assert_called_once() + sem.release.assert_called_once() + diff --git a/tests/test_hold_dataset_cache.py b/tests/test_hold_dataset_cache.py new file mode 100644 index 0000000..0eab169 --- /dev/null +++ b/tests/test_hold_dataset_cache.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +"""Unit tests for hold_dataset_cache — engine integration (task 6.4).""" + +from __future__ import annotations + +import pandas as pd + +from mes_dashboard.services import hold_dataset_cache as cache_svc + + +class TestHoldEngineDecomposition: + """6.4 — hold-history with long date range triggers engine.""" + + def test_long_range_triggers_engine(self, monkeypatch): + """90-day range → engine decomposition activated.""" + import mes_dashboard.services.batch_query_engine as engine_mod + + engine_calls = {"execute": 0, "merge": 0} + + def fake_execute_plan(chunks, query_fn, **kwargs): + engine_calls["execute"] += 1 + assert len(chunks) == 3 # 90 days / 31 = 3 chunks + return kwargs.get("query_hash", "fake_hash") + + result_df = pd.DataFrame({ + "CONTAINERID": ["C1"], + "HOLDTYPE": ["Quality"], + }) + + def fake_merge_chunks(prefix, qhash, **kwargs): + engine_calls["merge"] += 1 + return result_df + + monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan) + monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._get_cached_df", + lambda _: None, + ) + 
monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._load_sql", + lambda name: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._derive_all_views", + lambda df, **kw: { + "summary": {"total": 1}, + "detail": {"items": [], "pagination": {"total": 1}}, + }, + ) + + result = cache_svc.execute_primary_query( + start_date="2025-01-01", + end_date="2025-03-31", + ) + + assert engine_calls["execute"] == 1 + assert engine_calls["merge"] == 1 + + def test_short_range_skips_engine(self, monkeypatch): + """30-day range → direct path, no engine.""" + engine_calls = {"execute": 0} + + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._get_cached_df", + lambda _: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._load_sql", + lambda name: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache.read_sql_df", + lambda sql, params: pd.DataFrame({"CONTAINERID": ["C1"]}), + ) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.hold_dataset_cache._derive_all_views", + lambda df, **kw: { + "summary": {"total": 1}, + "detail": {"items": [], "pagination": {"total": 1}}, + }, + ) + + result = cache_svc.execute_primary_query( + start_date="2025-06-01", + end_date="2025-06-30", + ) + + assert engine_calls["execute"] == 0 # Engine NOT used diff --git a/tests/test_job_query_engine.py b/tests/test_job_query_engine.py new file mode 100644 index 0000000..f3c4333 --- /dev/null +++ b/tests/test_job_query_engine.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +"""Unit tests for job_query_service — engine integration (tasks 9.1-9.4).""" + +from __future__ import annotations + +import pandas as pd + +from mes_dashboard.services import job_query_service as job_svc + + +class TestJobQueryEngineDecomposition: + """9.4 — full-year query with many resources → engine decomposition.""" + + def test_long_range_triggers_engine(self, monkeypatch): + """90-day range → engine decomposition for job query.""" + import mes_dashboard.services.batch_query_engine as engine_mod + import mes_dashboard.core.redis_df_store as rds + + engine_calls = {"execute": 0, "merge": 0} + + def fake_execute_plan(chunks, query_fn, **kwargs): + engine_calls["execute"] += 1 + assert len(chunks) == 3 # 90 days / 31 = 3 chunks + assert kwargs.get("cache_prefix") == "job" + return kwargs.get("query_hash", "fake_hash") + + result_df = pd.DataFrame({ + "JOBID": ["J1", "J2"], + "RESOURCEID": ["R1", "R2"], + }) + + def fake_merge_chunks(prefix, qhash, **kwargs): + engine_calls["merge"] += 1 + return result_df + + monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan) + monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks) + monkeypatch.setattr(rds, "redis_load_df", lambda key: None) + monkeypatch.setattr(rds, "redis_store_df", lambda key, df, ttl=None: None) + monkeypatch.setattr( + "mes_dashboard.services.job_query_service.SQLLoader", + type("FakeLoader", (), { + "load": staticmethod(lambda name: "SELECT 1 FROM dual WHERE {{ RESOURCE_FILTER }}"), + }), + ) + + result = job_svc.get_jobs_by_resources( + resource_ids=["R1", "R2", "R3"], + start_date="2025-01-01", + end_date="2025-03-31", + ) + + assert engine_calls["execute"] == 1 + assert engine_calls["merge"] == 1 + assert result["total"] == 2 + 
assert "error" not in result + + def test_short_range_skips_engine(self, monkeypatch): + """30-day range → direct path, no engine.""" + import mes_dashboard.core.redis_df_store as rds + + engine_calls = {"execute": 0} + + monkeypatch.setattr(rds, "redis_load_df", lambda key: None) + monkeypatch.setattr(rds, "redis_store_df", lambda key, df, ttl=None: None) + monkeypatch.setattr( + "mes_dashboard.services.job_query_service.SQLLoader", + type("FakeLoader", (), { + "load": staticmethod(lambda name: "SELECT 1 FROM dual WHERE {{ RESOURCE_FILTER }}"), + }), + ) + monkeypatch.setattr( + "mes_dashboard.services.job_query_service.read_sql_df", + lambda sql, params: pd.DataFrame({"JOBID": ["J1"]}), + ) + + result = job_svc.get_jobs_by_resources( + resource_ids=["R1"], + start_date="2025-06-01", + end_date="2025-06-30", + ) + + assert engine_calls["execute"] == 0 # Engine NOT used + assert result["total"] == 1 + + def test_redis_cache_hit_skips_query(self, monkeypatch): + """Redis cache hit → returns cached DataFrame without Oracle query.""" + import mes_dashboard.core.redis_df_store as rds + + query_calls = {"sql": 0} + + cached_df = pd.DataFrame({ + "JOBID": ["J-CACHED"], + "RESOURCEID": ["R1"], + }) + + monkeypatch.setattr(rds, "redis_load_df", lambda key: cached_df) + + def fail_sql(*args, **kwargs): + query_calls["sql"] += 1 + raise RuntimeError("Should not reach Oracle") + + monkeypatch.setattr( + "mes_dashboard.services.job_query_service.read_sql_df", + fail_sql, + ) + + result = job_svc.get_jobs_by_resources( + resource_ids=["R1"], + start_date="2025-06-01", + end_date="2025-06-30", + ) + + assert query_calls["sql"] == 0 # Oracle NOT called + assert result["total"] == 1 + assert result["data"][0]["JOBID"] == "J-CACHED" diff --git a/tests/test_mid_section_defect_engine.py b/tests/test_mid_section_defect_engine.py new file mode 100644 index 0000000..0c7c9d8 --- /dev/null +++ b/tests/test_mid_section_defect_engine.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +"""Unit tests for mid_section_defect_service — engine integration (task 8.4).""" + +from __future__ import annotations + +import pandas as pd + +from mes_dashboard.services import mid_section_defect_service as msd_svc + + +class TestDetectionEngineDecomposition: + """8.4 — large date range + high-volume station → engine decomposition.""" + + def test_long_range_triggers_engine(self, monkeypatch): + """90-day range → engine decomposition for detection query.""" + import mes_dashboard.services.batch_query_engine as engine_mod + + engine_calls = {"execute": 0, "merge": 0} + + def fake_execute_plan(chunks, query_fn, **kwargs): + engine_calls["execute"] += 1 + assert len(chunks) == 3 # 90 days / 31 = 3 chunks + assert kwargs.get("cache_prefix") == "msd_detect" + return kwargs.get("query_hash", "fake_hash") + + result_df = pd.DataFrame({ + "CONTAINERID": ["C1", "C2"], + "WORKCENTERNAME": ["TEST-WC-A", "TEST-WC-B"], + }) + + def fake_merge_chunks(prefix, qhash, **kwargs): + engine_calls["merge"] += 1 + return result_df + + monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan) + monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks) + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.cache_get", + lambda key: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.cache_set", + lambda key, val, ttl=None: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.SQLLoader", + type("FakeLoader", (), { + "load_with_params": staticmethod(lambda 
name, **kw: "SELECT 1 FROM dual"), + }), + ) + + df = msd_svc._fetch_station_detection_data( + start_date="2025-01-01", + end_date="2025-03-31", + station="測試", + ) + + assert engine_calls["execute"] == 1 + assert engine_calls["merge"] == 1 + assert df is not None + assert len(df) == 2 + + def test_short_range_skips_engine(self, monkeypatch): + """30-day range → direct path, no engine.""" + engine_calls = {"execute": 0} + + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.cache_get", + lambda key: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.cache_set", + lambda key, val, ttl=None: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.SQLLoader", + type("FakeLoader", (), { + "load_with_params": staticmethod(lambda name, **kw: "SELECT 1 FROM dual"), + }), + ) + monkeypatch.setattr( + "mes_dashboard.services.mid_section_defect_service.read_sql_df", + lambda sql, params: pd.DataFrame({"CONTAINERID": ["C1"]}), + ) + + df = msd_svc._fetch_station_detection_data( + start_date="2025-06-01", + end_date="2025-06-30", + station="測試", + ) + + assert engine_calls["execute"] == 0 # Engine NOT used + assert df is not None + assert len(df) == 1 diff --git a/tests/test_query_spool_store.py b/tests/test_query_spool_store.py new file mode 100644 index 0000000..afe25c9 --- /dev/null +++ b/tests/test_query_spool_store.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +"""Unit tests for parquet query spool store.""" + +from __future__ import annotations + +import fnmatch +import json +import os +import time + +import pandas as pd + +from mes_dashboard.core.redis_client import get_key +from mes_dashboard.core import query_spool_store as spool + + +class FakeRedis: + def __init__(self) -> None: + self._data: dict[str, str] = {} + self._expires: dict[str, int] = {} + + def _purge_if_expired(self, key: str) -> None: + exp = self._expires.get(key) + if exp is not None and exp <= int(time.time()): + self._data.pop(key, None) + self._expires.pop(key, None) + + def setex(self, key: str, ttl: int, value: str) -> bool: + self._data[key] = value + self._expires[key] = int(time.time()) + int(ttl) + return True + + def get(self, key: str): + self._purge_if_expired(key) + return self._data.get(key) + + def delete(self, *keys) -> int: + deleted = 0 + for key in keys: + if key in self._data: + deleted += 1 + self._data.pop(key, None) + self._expires.pop(key, None) + return deleted + + def scan_iter(self, match: str | None = None, count: int = 100): + for key in list(self._data.keys()): + self._purge_if_expired(key) + if key not in self._data: + continue + if match and not fnmatch.fnmatch(key, match): + continue + yield key + + +def _build_df() -> pd.DataFrame: + return pd.DataFrame( + { + "CONTAINERID": ["C1", "C2"], + "LOSSREASONNAME": ["001_A", "002_B"], + "REJECT_TOTAL_QTY": [10, 20], + } + ) + + +def test_spool_store_and_load_roundtrip(monkeypatch, tmp_path): + fake = FakeRedis() + monkeypatch.setattr(spool, "QUERY_SPOOL_ENABLED", True) + monkeypatch.setattr(spool, "QUERY_SPOOL_DIR", tmp_path / "query_spool") + monkeypatch.setattr(spool, "get_redis_client", lambda: fake) + + ok = spool.store_spooled_df("reject_dataset", "qid-roundtrip-1", _build_df(), ttl_seconds=1200) + assert ok is True + + metadata = spool.get_spool_metadata("reject_dataset", "qid-roundtrip-1") + assert metadata is not None + assert metadata.get("row_count") == 2 + + loaded = spool.load_spooled_df("reject_dataset", "qid-roundtrip-1") + assert loaded is not 
None + pd.testing.assert_frame_equal( + loaded.sort_values("CONTAINERID").reset_index(drop=True), + _build_df().sort_values("CONTAINERID").reset_index(drop=True), + ) + + +def test_spool_load_returns_none_when_metadata_hash_mismatch(monkeypatch, tmp_path): + fake = FakeRedis() + monkeypatch.setattr(spool, "QUERY_SPOOL_ENABLED", True) + monkeypatch.setattr(spool, "QUERY_SPOOL_DIR", tmp_path / "query_spool") + monkeypatch.setattr(spool, "get_redis_client", lambda: fake) + + assert spool.store_spooled_df("reject_dataset", "qid-hash-1", _build_df(), ttl_seconds=1200) + key = get_key(spool._meta_key("reject_dataset", "qid-hash-1")) + metadata = json.loads(fake.get(key)) + metadata["columns_hash"] = "deadbeefdeadbeef" + fake.setex(key, 1200, json.dumps(metadata, ensure_ascii=False)) + + loaded = spool.load_spooled_df("reject_dataset", "qid-hash-1") + assert loaded is None + assert fake.get(key) is None + + +def test_spool_load_returns_none_when_file_missing(monkeypatch, tmp_path): + fake = FakeRedis() + monkeypatch.setattr(spool, "QUERY_SPOOL_ENABLED", True) + monkeypatch.setattr(spool, "QUERY_SPOOL_DIR", tmp_path / "query_spool") + monkeypatch.setattr(spool, "get_redis_client", lambda: fake) + + assert spool.store_spooled_df("reject_dataset", "qid-missing-file-1", _build_df(), ttl_seconds=1200) + metadata = spool.get_spool_metadata("reject_dataset", "qid-missing-file-1") + assert metadata is not None + path = spool._path_from_relative(metadata["relative_path"]) + assert path is not None and path.exists() + path.unlink() + + loaded = spool.load_spooled_df("reject_dataset", "qid-missing-file-1") + assert loaded is None + assert spool.get_spool_metadata("reject_dataset", "qid-missing-file-1") is None + + +def test_cleanup_expired_and_orphan_files(monkeypatch, tmp_path): + fake = FakeRedis() + root = tmp_path / "query_spool" + monkeypatch.setattr(spool, "QUERY_SPOOL_ENABLED", True) + monkeypatch.setattr(spool, "QUERY_SPOOL_DIR", root) + monkeypatch.setattr(spool, "QUERY_SPOOL_ORPHAN_GRACE_SECONDS", 1) + monkeypatch.setattr(spool, "get_redis_client", lambda: fake) + + now = int(time.time()) + + assert spool.store_spooled_df("reject_dataset", "qid-valid-1", _build_df(), ttl_seconds=1200) + assert spool.store_spooled_df("reject_dataset", "qid-expired-1", _build_df(), ttl_seconds=1200) + + expired_key = get_key(spool._meta_key("reject_dataset", "qid-expired-1")) + expired_meta = json.loads(fake.get(expired_key)) + expired_path = spool._path_from_relative(expired_meta["relative_path"]) + assert expired_path is not None and expired_path.exists() + expired_meta["expires_at"] = now - 10 + fake.setex(expired_key, 1200, json.dumps(expired_meta, ensure_ascii=False)) + + orphan_dir = root / "reject_dataset" + orphan_dir.mkdir(parents=True, exist_ok=True) + orphan_path = orphan_dir / "orphan.parquet" + _build_df().to_parquet(orphan_path, engine="pyarrow", index=False) + old_time = now - 120 + os.utime(orphan_path, (old_time, old_time)) + + stats = spool.cleanup_expired_spool(namespace="reject_dataset") + assert stats["meta_deleted"] >= 1 + assert stats["expired_files_deleted"] >= 1 + assert stats["orphan_files_deleted"] >= 1 + assert not orphan_path.exists() + assert not expired_path.exists() + assert spool.get_spool_metadata("reject_dataset", "qid-valid-1") is not None diff --git a/tests/test_query_tool_engine.py b/tests/test_query_tool_engine.py new file mode 100644 index 0000000..bc95e53 --- /dev/null +++ b/tests/test_query_tool_engine.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +"""Unit tests for 
query_tool_service — slow-query migration + caching (tasks 10.1-10.5).""" + +from __future__ import annotations + +from unittest.mock import patch, MagicMock + +import pandas as pd + +from mes_dashboard.services import query_tool_service as qt_svc + + +class TestSlowQueryMigration: + """10.2 — verify high-risk read_sql_df paths migrated to read_sql_df_slow.""" + + def test_resolve_by_lot_id_uses_slow(self, monkeypatch): + """_resolve_by_lot_id should call read_sql_df_slow, not read_sql_df.""" + calls = {"slow": 0, "fast": 0} + + def fake_slow(sql, params=None, **kw): + calls["slow"] += 1 + return pd.DataFrame({"CONTAINERID": ["C1"], "CONTAINERNAME": ["LOT-1"]}) + + def fake_fast(sql, params=None): + calls["fast"] += 1 + return pd.DataFrame() + + monkeypatch.setattr(qt_svc, "read_sql_df_slow", fake_slow) + monkeypatch.setattr(qt_svc, "read_sql_df", fake_fast) + monkeypatch.setattr(qt_svc, "SQLLoader", + type("FakeLoader", (), { + "load_with_params": staticmethod(lambda name, **kw: "SELECT 1 FROM dual"), + }), + ) + + result = qt_svc._resolve_by_lot_id(["LOT-1"]) + + assert calls["slow"] == 1 + assert calls["fast"] == 0 + + def test_resolve_by_work_order_uses_slow(self, monkeypatch): + """_resolve_by_work_order should call read_sql_df_slow.""" + calls = {"slow": 0, "fast": 0} + + def fake_slow(sql, params=None, **kw): + calls["slow"] += 1 + return pd.DataFrame({ + "CONTAINERID": ["C1"], + "CONTAINERNAME": ["LOT-1"], + "MFGORDERNAME": ["GA25010101"], + }) + + def fake_fast(sql, params=None): + calls["fast"] += 1 + return pd.DataFrame() + + monkeypatch.setattr(qt_svc, "read_sql_df_slow", fake_slow) + monkeypatch.setattr(qt_svc, "read_sql_df", fake_fast) + monkeypatch.setattr(qt_svc, "SQLLoader", + type("FakeLoader", (), { + "load_with_params": staticmethod(lambda name, **kw: "SELECT 1 FROM dual"), + }), + ) + + result = qt_svc._resolve_by_work_order(["GA25010101"]) + + assert calls["slow"] >= 1 + assert calls["fast"] == 0 + + def test_equipment_status_hours_uses_slow(self, monkeypatch): + """get_equipment_status_hours should call read_sql_df_slow.""" + import mes_dashboard.core.redis_df_store as rds + + calls = {"slow": 0, "fast": 0} + + def fake_slow(sql, params=None, **kw): + calls["slow"] += 1 + return pd.DataFrame({ + "RESOURCEID": ["EQ1"], + "PRD_HOURS": [100.0], + "SBY_HOURS": [20.0], + "UDT_HOURS": [10.0], + "SDT_HOURS": [5.0], + "EGT_HOURS": [3.0], + "NST_HOURS": [2.0], + "TOTAL_HOURS": [140.0], + }) + + def fake_fast(sql, params=None): + calls["fast"] += 1 + return pd.DataFrame() + + monkeypatch.setattr(qt_svc, "read_sql_df_slow", fake_slow) + monkeypatch.setattr(qt_svc, "read_sql_df", fake_fast) + monkeypatch.setattr(rds, "redis_load_df", lambda key: None) + monkeypatch.setattr(rds, "redis_store_df", lambda key, df, ttl=None: None) + monkeypatch.setattr(qt_svc, "SQLLoader", + type("FakeLoader", (), { + "load_with_params": staticmethod(lambda name, **kw: "SELECT 1 FROM dual"), + }), + ) + + result = qt_svc.get_equipment_status_hours( + equipment_ids=["EQ1"], + start_date="2025-01-01", + end_date="2025-01-31", + ) + + assert calls["slow"] == 1 + assert calls["fast"] == 0 + assert "error" not in result + assert result["totals"]["PRD_HOURS"] == 100.0 + + +class TestEquipmentCaching: + """10.4/10.5 — equipment query caching via Redis.""" + + def test_equipment_status_cache_hit(self, monkeypatch): + """Redis cache hit → returns cached result without Oracle query.""" + import mes_dashboard.core.redis_df_store as rds + + calls = {"sql": 0} + + cached_df = pd.DataFrame({ + "RESOURCEID": 
["EQ-CACHED"], + "PRD_HOURS": [50.0], + "SBY_HOURS": [10.0], + "UDT_HOURS": [5.0], + "SDT_HOURS": [2.0], + "EGT_HOURS": [1.0], + "NST_HOURS": [0.0], + "TOTAL_HOURS": [68.0], + }) + + monkeypatch.setattr(rds, "redis_load_df", lambda key: cached_df) + + def fail_sql(*args, **kwargs): + calls["sql"] += 1 + raise RuntimeError("Should not reach Oracle") + + monkeypatch.setattr(qt_svc, "read_sql_df_slow", fail_sql) + monkeypatch.setattr(qt_svc, "read_sql_df", fail_sql) + + result = qt_svc.get_equipment_status_hours( + equipment_ids=["EQ1"], + start_date="2025-01-01", + end_date="2025-01-31", + ) + + assert calls["sql"] == 0 # Oracle NOT called + assert result["data"][0]["RESOURCEID"] == "EQ-CACHED" diff --git a/tests/test_redis_df_store.py b/tests/test_redis_df_store.py new file mode 100644 index 0000000..b396f6c --- /dev/null +++ b/tests/test_redis_df_store.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +"""Unit tests for redis_df_store module.""" + +import pytest +from unittest.mock import patch, MagicMock +from decimal import Decimal + +import pandas as pd + + +class TestRedisStoreDf: + """3.1 — round-trip store/load.""" + + def test_round_trip(self): + """Store a DF, load it back, verify equality.""" + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + stored = {} + + def fake_setex(key, ttl, value): + stored[key] = value + + def fake_get(key): + return stored.get(key) + + mock_client.setex.side_effect = fake_setex + mock_client.get.side_effect = fake_get + + df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + rds.redis_store_df("test:key", df, ttl=60) + loaded = rds.redis_load_df("test:key") + + assert loaded is not None + pd.testing.assert_frame_equal(loaded, df) + + def test_store_empty_df(self): + """Round-trip with an empty DataFrame preserves schema.""" + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + + df = pd.DataFrame({"COL": pd.Series([], dtype="int64")}) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + rds.redis_store_df("test:empty", df, ttl=60) + loaded = rds.redis_load_df("test:empty") + + assert loaded is not None + assert len(loaded) == 0 + assert list(loaded.columns) == ["COL"] + + def test_decimal_object_column_round_trip(self): + """Mixed-precision Decimal object columns should store without serialization errors.""" + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + + df = pd.DataFrame( + { + "REJECT_SHARE_PCT": [Decimal("12.345"), Decimal("1.2"), None], + "REJECT_RATE_PCT": [Decimal("0.123456"), Decimal("10.9"), Decimal("9.000001")], + "LABEL": ["A", "B", "C"], + } + ) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + assert rds.redis_store_df("test:decimal", df, ttl=60) + loaded = rds.redis_load_df("test:decimal") + + assert loaded is not None + assert loaded["REJECT_SHARE_PCT"].dtype.kind in ("f", "i") + assert loaded["REJECT_RATE_PCT"].dtype.kind in ("f", "i") + assert loaded.loc[0, "REJECT_SHARE_PCT"] == pytest.approx(12.345) + 
assert loaded.loc[2, "REJECT_RATE_PCT"] == pytest.approx(9.000001) + + +class TestChunkHelpers: + """3.2 — chunk-level helpers round-trip.""" + + def test_chunk_round_trip(self): + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + stored = {} + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + + df = pd.DataFrame({"X": [10, 20]}) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + rds.redis_store_chunk("reject", "abc123", 0, df, ttl=60) + assert rds.redis_chunk_exists("reject", "abc123", 0) + loaded = rds.redis_load_chunk("reject", "abc123", 0) + + assert loaded is not None + pd.testing.assert_frame_equal(loaded, df) + + def test_chunk_not_exists(self): + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + mock_client.exists.return_value = 0 + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + assert not rds.redis_chunk_exists("reject", "abc123", 99) + + def test_clear_batch_removes_chunk_and_meta_keys(self): + import mes_dashboard.core.redis_df_store as rds + + mock_client = MagicMock() + deleted = {"keys": []} + + mock_client.keys.return_value = [ + "mes-dashboard:batch:reject:q123:chunk:0", + "mes-dashboard:batch:reject:q123:chunk:1", + ] + mock_client.delete.side_effect = lambda *keys: deleted["keys"].extend(keys) or len(keys) + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=mock_client): + count = rds.redis_clear_batch("reject", "q123") + + assert count == 3 + assert any("chunk:0" in key for key in deleted["keys"]) + assert any("chunk:1" in key for key in deleted["keys"]) + assert any("meta" in key for key in deleted["keys"]) + + +class TestRedisUnavailable: + """3.3 — graceful fallback when Redis is unavailable.""" + + def test_store_no_redis(self): + """store returns without error when Redis disabled.""" + import mes_dashboard.core.redis_df_store as rds + + df = pd.DataFrame({"A": [1]}) + with patch.object(rds, "REDIS_ENABLED", False): + rds.redis_store_df("key", df) # no exception + + def test_load_no_redis(self): + """load returns None when Redis disabled.""" + import mes_dashboard.core.redis_df_store as rds + + with patch.object(rds, "REDIS_ENABLED", False): + result = rds.redis_load_df("key") + assert result is None + + def test_chunk_exists_no_redis(self): + import mes_dashboard.core.redis_df_store as rds + + with patch.object(rds, "REDIS_ENABLED", False): + assert not rds.redis_chunk_exists("p", "h", 0) + + def test_store_client_none(self): + """store returns without error when client is None.""" + import mes_dashboard.core.redis_df_store as rds + + df = pd.DataFrame({"A": [1]}) + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=None): + rds.redis_store_df("key", df) # no exception + + def test_load_client_none(self): + """load returns None when client is None.""" + import mes_dashboard.core.redis_df_store as rds + + with patch.object(rds, "REDIS_ENABLED", True), \ + patch.object(rds, "get_redis_client", return_value=None): + result = rds.redis_load_df("key") + assert result is None diff --git a/tests/test_reject_dataset_cache.py b/tests/test_reject_dataset_cache.py index f0fdf14..34339d4 100644 --- 
a/tests/test_reject_dataset_cache.py +++ b/tests/test_reject_dataset_cache.py @@ -3,6 +3,9 @@ from __future__ import annotations +from decimal import Decimal +from unittest.mock import MagicMock + import pandas as pd import pytest @@ -292,3 +295,359 @@ def test_apply_pareto_selection_filter_supports_multi_dimension_and_logic(): assert len(filtered) == 1 assert set(filtered["CONTAINERNAME"].tolist()) == {"LOT-002"} + + +# ============================================================ +# 5.9 — 365-day date range → engine decomposition, no Oracle timeout +# ============================================================ + + +class TestEngineDecompositionDateRange: + """Verify engine routing for long date ranges.""" + + def test_365_day_range_triggers_engine(self, monkeypatch): + """5.9: 365-day date range → chunks decomposed, engine path used.""" + import mes_dashboard.services.batch_query_engine as engine_mod + + # Track calls via engine module (local imports inside function pull from here) + engine_calls = { + "decompose": 0, + "execute": 0, + "merge": 0, + "chunk_count": 0, + "parallel": 0, + "max_rows_per_chunk": 0, + } + + original_decompose = engine_mod.decompose_by_time_range + + def tracked_decompose(*args, **kwargs): + engine_calls["decompose"] += 1 + return original_decompose(*args, **kwargs) + + def fake_execute_plan(chunks, query_fn, **kwargs): + engine_calls["execute"] += 1 + engine_calls["chunk_count"] = len(chunks) + engine_calls["parallel"] = int(kwargs.get("parallel", 1)) + engine_calls["max_rows_per_chunk"] = int(kwargs.get("max_rows_per_chunk", 0)) + return kwargs.get("query_hash", "fake_hash") + + result_df = pd.DataFrame({ + "CONTAINERID": ["C1"], + "LOSSREASONNAME": ["R1"], + "REJECT_TOTAL_QTY": [10], + }) + + def fake_merge_chunks(prefix, qhash, **kwargs): + engine_calls["merge"] += 1 + return result_df + + # Mock on engine module (local imports will pick these up) + monkeypatch.setattr(engine_mod, "decompose_by_time_range", tracked_decompose) + monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan) + monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks) + # Mock service-level helpers + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._prepare_sql", + lambda *a, **kw: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._get_cached_df", + lambda _: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._apply_policy_filters", + lambda df, **kw: df, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._build_primary_response", + lambda qid, df, meta, ri: {"query_id": qid, "rows": len(df)}, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._build_where_clause", + lambda **kw: ("", {}, {}), + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._validate_range", + lambda *a: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache.redis_clear_batch", + lambda *a, **kw: 0, + ) + + result = cache_svc.execute_primary_query( + mode="date_range", + start_date="2025-01-01", + end_date="2025-12-31", + ) + + assert engine_calls["decompose"] == 1 + assert engine_calls["execute"] == 1 + assert engine_calls["merge"] == 1 + assert result["rows"] == 1 + + expected_chunks = original_decompose( + "2025-01-01", + "2025-12-31", + grain_days=cache_svc._REJECT_ENGINE_GRAIN_DAYS, + ) + assert 
engine_calls["chunk_count"] == len(expected_chunks) + assert engine_calls["parallel"] == cache_svc._REJECT_ENGINE_PARALLEL + assert engine_calls["max_rows_per_chunk"] == cache_svc._REJECT_ENGINE_MAX_ROWS_PER_CHUNK + + def test_short_range_skips_engine(self, monkeypatch): + """Short date range (<= threshold) uses direct path, no engine.""" + import mes_dashboard.services.batch_query_engine as engine_mod + + engine_calls = {"decompose": 0} + + original_decompose = engine_mod.decompose_by_time_range + + def tracked_decompose(*args, **kwargs): + engine_calls["decompose"] += 1 + return original_decompose(*args, **kwargs) + + monkeypatch.setattr(engine_mod, "decompose_by_time_range", tracked_decompose) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._get_cached_df", + lambda _: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._prepare_sql", + lambda *a, **kw: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache.read_sql_df", + lambda sql, params: pd.DataFrame({"CONTAINERID": ["C1"]}), + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._apply_policy_filters", + lambda df, **kw: df, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._build_primary_response", + lambda qid, df, meta, ri: {"query_id": qid, "rows": len(df)}, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._build_where_clause", + lambda **kw: ("", {}, {}), + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache.redis_clear_batch", + lambda *a, **kw: 0, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._validate_range", + lambda *a: None, + ) + + result = cache_svc.execute_primary_query( + mode="date_range", + start_date="2025-06-01", + end_date="2025-06-30", + ) + + assert engine_calls["decompose"] == 0 # Engine NOT used + assert result["rows"] == 1 + + +# ============================================================ +# 5.10 — Large workorder (500+ containers) → ID batching +# ============================================================ + + +class TestEngineDecompositionContainerIDs: + """Verify engine routing for large container ID sets.""" + + def test_large_container_set_triggers_engine(self, monkeypatch): + """5.10: 1500 container IDs → engine ID batching activated.""" + import mes_dashboard.services.batch_query_engine as engine_mod + + engine_calls = {"execute": 0, "merge": 0} + fake_ids = [f"CID-{i:04d}" for i in range(1500)] + + def fake_execute_plan(chunks, query_fn, **kwargs): + engine_calls["execute"] += 1 + # Verify correct number of chunks + assert len(chunks) == 2 # 1500 / 1000 = 2 batches + return kwargs.get("query_hash", "fake_hash") + + result_df = pd.DataFrame({"CONTAINERID": fake_ids[:5]}) + + def fake_merge_chunks(prefix, qhash, **kwargs): + engine_calls["merge"] += 1 + return result_df + + monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan) + monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache.resolve_containers", + lambda input_type, values: { + "container_ids": fake_ids, + "resolution_info": {"type": input_type, "count": len(fake_ids)}, + }, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._get_cached_df", + lambda _: None, + ) + monkeypatch.setattr( + 
"mes_dashboard.services.reject_dataset_cache._prepare_sql", + lambda *a, **kw: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._apply_policy_filters", + lambda df, **kw: df, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._build_primary_response", + lambda qid, df, meta, ri: {"query_id": qid, "rows": len(df)}, + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache._build_where_clause", + lambda **kw: ("", {}, {}), + ) + monkeypatch.setattr( + "mes_dashboard.services.reject_dataset_cache.redis_clear_batch", + lambda *a, **kw: 0, + ) + + result = cache_svc.execute_primary_query( + mode="container", + container_input_type="workorder", + container_values=["WO-BIG"], + ) + + assert engine_calls["execute"] == 1 + assert engine_calls["merge"] == 1 + + +def test_engine_path_stores_mixed_precision_decimal_chunks_without_redis_serialization_error( + monkeypatch, caplog +): + """Long-range engine path should handle Decimal object columns in chunk cache.""" + import mes_dashboard.core.redis_df_store as rds + import mes_dashboard.services.batch_query_engine as bqe + + mock_client = MagicMock() + stored = {} + hashes = {} + + mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v}) + mock_client.get.side_effect = lambda k: stored.get(k) + mock_client.exists.side_effect = lambda k: 1 if k in stored else 0 + mock_client.hset.side_effect = lambda k, mapping=None: hashes.setdefault(k, {}).update(mapping or {}) + mock_client.hgetall.side_effect = lambda k: hashes.get(k, {}) + mock_client.expire.return_value = None + + engine_row = pd.DataFrame( + { + "CONTAINERID": ["C-1", "C-2"], + "LOSSREASONNAME": ["001_A", "002_B"], + "REJECT_TOTAL_QTY": [10, 20], + "REJECT_SHARE_PCT": [Decimal("12.345"), Decimal("1.2")], + "REJECT_RATE_PCT": [Decimal("0.123456"), Decimal("9.000001")], + } + ) + + monkeypatch.setattr(cache_svc, "_get_cached_df", lambda _: None) + monkeypatch.setattr(cache_svc, "_prepare_sql", lambda *a, **kw: "SELECT 1 FROM dual") + monkeypatch.setattr(cache_svc, "_build_where_clause", lambda **kw: ("", {}, {})) + monkeypatch.setattr(cache_svc, "_validate_range", lambda *a: None) + monkeypatch.setattr(cache_svc, "_apply_policy_filters", lambda df, **kw: df) + monkeypatch.setattr(cache_svc, "_build_primary_response", lambda qid, df, meta, ri: {"rows": len(df)}) + monkeypatch.setattr(cache_svc, "read_sql_df", lambda sql, params: engine_row.copy()) + monkeypatch.setattr(cache_svc, "redis_clear_batch", lambda *a, **kw: 0) + + monkeypatch.setattr(rds, "REDIS_ENABLED", True) + monkeypatch.setattr(rds, "get_redis_client", lambda: mock_client) + monkeypatch.setattr(bqe, "get_redis_client", lambda: mock_client) + result = cache_svc.execute_primary_query( + mode="date_range", + start_date="2025-01-01", + end_date="2025-12-31", + ) + + expected_chunks = bqe.decompose_by_time_range( + "2025-01-01", + "2025-12-31", + grain_days=cache_svc._REJECT_ENGINE_GRAIN_DAYS, + ) + assert result["rows"] == len(expected_chunks) * 2 + assert "Failed to store DataFrame in Redis" not in caplog.text + assert any("batch:reject" in key for key in stored) + + +def test_large_result_spills_to_parquet_and_view_export_use_spool_fallback(monkeypatch): + """13.8: long-range oversized result should use spool and still serve view/export.""" + spool_data = {} + df = _build_detail_filter_df().copy() + + 
cache_svc._dataset_cache.clear() + monkeypatch.setattr(cache_svc, "_redis_load_df", lambda _qid: None) + monkeypatch.setattr(cache_svc, "_validate_range", lambda *_: None) + monkeypatch.setattr(cache_svc, "_build_where_clause", lambda **kw: ("", {}, {})) + monkeypatch.setattr(cache_svc, "_prepare_sql", lambda *a, **kw: "SELECT 1 FROM dual") + monkeypatch.setattr(cache_svc, "read_sql_df", lambda sql, params: df.copy()) + monkeypatch.setattr(cache_svc, "_apply_policy_filters", lambda data, **kw: data) + monkeypatch.setattr( + cache_svc, + "_build_primary_response", + lambda qid, result_df, meta, resolution_info: {"query_id": qid, "rows": len(result_df)}, + ) + + monkeypatch.setattr(cache_svc, "_REJECT_ENGINE_SPILL_ENABLED", True) + monkeypatch.setattr(cache_svc, "_REJECT_ENGINE_MAX_TOTAL_ROWS", 1) + monkeypatch.setattr(cache_svc, "_REJECT_ENGINE_MAX_RESULT_MB", 1) + monkeypatch.setattr(cache_svc, "_store_df", lambda *_a, **_kw: (_ for _ in ()).throw(AssertionError("_store_df should not be called for spill path"))) + monkeypatch.setattr(cache_svc, "_redis_delete_df", lambda *_a, **_kw: None) + + def fake_store_spooled_df(namespace, query_id, data, ttl_seconds=None): + spool_data[(namespace, query_id)] = data.copy() + return True + + def fake_load_spooled_df(namespace, query_id): + stored = spool_data.get((namespace, query_id)) + return stored.copy() if stored is not None else None + + monkeypatch.setattr(cache_svc, "store_spooled_df", fake_store_spooled_df) + monkeypatch.setattr(cache_svc, "load_spooled_df", fake_load_spooled_df) + + result = cache_svc.execute_primary_query( + mode="date_range", + start_date="2025-01-01", + end_date="2025-01-31", + ) + + query_id = result["query_id"] + assert result["rows"] == len(df) + assert (cache_svc._REDIS_NAMESPACE, query_id) in spool_data + + # Force cache miss for L1/L2 and verify spool fallback serves view/export. 
+ cache_svc._dataset_cache.clear() + monkeypatch.setattr(cache_svc, "_redis_load_df", lambda _qid: None) + monkeypatch.setattr( + "mes_dashboard.services.scrap_reason_exclusion_cache.get_excluded_reasons", + lambda: [], + ) + + view_result = cache_svc.apply_view(query_id=query_id, page=1, per_page=200) + assert view_result is not None + assert view_result["detail"]["pagination"]["total"] == len(df) + + export_rows = cache_svc.export_csv_from_cache(query_id=query_id) + assert export_rows is not None + assert len(export_rows) == len(df) diff --git a/tests/test_resource_dataset_cache.py b/tests/test_resource_dataset_cache.py new file mode 100644 index 0000000..7c8e41f --- /dev/null +++ b/tests/test_resource_dataset_cache.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +"""Unit tests for resource_dataset_cache — engine integration (task 7.4).""" + +from __future__ import annotations + +import pandas as pd + +from mes_dashboard.services import resource_dataset_cache as cache_svc + + +class TestResourceEngineDecomposition: + """7.4 — resource-history with long date range triggers engine.""" + + def test_long_range_triggers_engine(self, monkeypatch): + """90-day range → engine decomposition activated.""" + import mes_dashboard.services.batch_query_engine as engine_mod + + engine_calls = {"execute": 0, "merge": 0} + + def fake_execute_plan(chunks, query_fn, **kwargs): + engine_calls["execute"] += 1 + assert len(chunks) == 3 # 90 days / 31 = 3 chunks + return kwargs.get("query_hash", "fake_hash") + + result_df = pd.DataFrame({ + "HISTORYID": [1, 2], + "RESOURCEID": ["R1", "R2"], + }) + + def fake_merge_chunks(prefix, qhash, **kwargs): + engine_calls["merge"] += 1 + return result_df + + monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan) + monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_cached_df", + lambda _: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._load_sql", + lambda name: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_filtered_resources_and_lookup", + lambda **kw: ( + [{"RESOURCEID": "R1", "RESOURCENAME": "Machine-1"}], + {"R1": {"RESOURCENAME": "Machine-1"}}, + "h.HISTORYID IN (SELECT HISTORYID FROM RESOURCEHISTORY)", + ), + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_resource_lookup", + lambda: {}, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_workcenter_mapping", + lambda: {}, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._derive_summary", + lambda df, rl, wc, gran: {"total_hours": 100}, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._derive_detail", + lambda df, rl, wc: {"items": [], "pagination": {"total": 2}}, + ) + + result = cache_svc.execute_primary_query( + start_date="2025-01-01", + end_date="2025-03-31", + workcenter_groups=["WB"], + ) + + assert engine_calls["execute"] == 1 + assert engine_calls["merge"] == 1 + assert result["query_id"] is not None + + def test_short_range_skips_engine(self, monkeypatch): + """30-day range → direct path, no engine.""" + engine_calls = {"execute": 0} + + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_cached_df", + lambda _: None, + ) + monkeypatch.setattr( + 
"mes_dashboard.services.resource_dataset_cache._load_sql", + lambda name: "SELECT 1 FROM dual", + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache.read_sql_df", + lambda sql, params: pd.DataFrame({"HISTORYID": [1]}), + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._store_df", + lambda *a, **kw: None, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_filtered_resources_and_lookup", + lambda **kw: ( + [{"RESOURCEID": "R1"}], + {"R1": {"RESOURCENAME": "Machine-1"}}, + "h.HISTORYID IN (SELECT HISTORYID FROM RESOURCEHISTORY)", + ), + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_resource_lookup", + lambda: {}, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._get_workcenter_mapping", + lambda: {}, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._derive_summary", + lambda df, rl, wc, gran: {}, + ) + monkeypatch.setattr( + "mes_dashboard.services.resource_dataset_cache._derive_detail", + lambda df, rl, wc: {"items": [], "pagination": {"total": 1}}, + ) + + result = cache_svc.execute_primary_query( + start_date="2025-06-01", + end_date="2025-06-30", + workcenter_groups=["WB"], + ) + + assert engine_calls["execute"] == 0 # Engine NOT used