feat(reject-history): fix silent data loss by propagating partial failure metadata to frontend
Chunk failures in BatchQueryEngine were silently discarded — `has_partial_failure` was tracked in Redis but never surfaced to the API response or frontend. Users could see incomplete data without any warning. This commit closes the gap end-to-end: Backend: - Track failed chunk time ranges (`failed_ranges`) in batch engine progress metadata - Add single retry for transient Oracle errors (timeout, connection) in `_execute_single_chunk` - Read `get_batch_progress()` after merge but before `redis_clear_batch()` cleanup - Inject `has_partial_failure`, `failed_chunk_count`, `failed_ranges` into API response meta - Persist partial failure flag to independent Redis key with TTL aligned to data storage layer - Add shared container-resolution policy module with wildcard/expansion guardrails - Refactor reason filter from single-value to multi-select (`reason` → `reasons`) Frontend: - Add client-side date range validation (730-day limit) before API submission - Display amber warning banner on partial failure with specific failed date ranges - Support generic fallback message for container-mode queries without date ranges - Update FilterPanel to support multi-select reason chips Specs & tests: - Create batch-query-resilience spec; update reject-history-api and reject-history-page specs - Add 7 new tests for retry, memory guard, failed ranges, partial failure propagation, TTL - Cross-service regression verified (hold, resource, job, msd — 411 tests pass) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
22
.env.example
22
.env.example
@@ -59,6 +59,16 @@ QUERY_TOOL_MAX_CONTAINER_IDS=200
|
||||
RESOURCE_DETAIL_DEFAULT_LIMIT=500
|
||||
RESOURCE_DETAIL_MAX_LIMIT=500
|
||||
|
||||
# Shared container-resolution guardrails
|
||||
# 0 = disable raw input count cap (recommended: rely on expansion limits instead)
|
||||
CONTAINER_RESOLVE_INPUT_MAX_VALUES=0
|
||||
# Wildcard pattern must include this many literal-prefix chars before %/_ (e.g., GA25%)
|
||||
CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN=4
|
||||
# Per-token expansion guard (avoid one wildcard exploding into too many container IDs)
|
||||
CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN=2000
|
||||
# Total resolved container-ID guard for a single resolve request
|
||||
CONTAINER_RESOLVE_MAX_CONTAINER_IDS=30000
|
||||
|
||||
# Trust boundary for forwarded headers (safe default: false)
|
||||
# Direct-exposure deployment (no reverse proxy): keep this false
|
||||
TRUST_PROXY_HEADERS=false
|
||||
@@ -101,14 +111,14 @@ GUNICORN_WORKERS=2
|
||||
GUNICORN_THREADS=4
|
||||
|
||||
# Worker timeout (seconds): should stay above DB/query-tool slow paths
|
||||
GUNICORN_TIMEOUT=130
|
||||
GUNICORN_TIMEOUT=360
|
||||
|
||||
# Graceful shutdown timeout for worker reloads (seconds)
|
||||
GUNICORN_GRACEFUL_TIMEOUT=60
|
||||
GUNICORN_GRACEFUL_TIMEOUT=300
|
||||
|
||||
# Worker recycle policy (set 0 to disable)
|
||||
GUNICORN_MAX_REQUESTS=5000
|
||||
GUNICORN_MAX_REQUESTS_JITTER=500
|
||||
GUNICORN_MAX_REQUESTS=1200
|
||||
GUNICORN_MAX_REQUESTS_JITTER=300
|
||||
|
||||
# ============================================================
|
||||
# Redis Configuration (for WIP cache)
|
||||
@@ -201,6 +211,8 @@ TRACE_EVENTS_MAX_WORKERS=2
|
||||
# Max parallel workers for EventFetcher batch queries (per domain)
|
||||
# Recommend: 2 (peak concurrent slow queries = TRACE_EVENTS_MAX_WORKERS × this)
|
||||
EVENT_FETCHER_MAX_WORKERS=2
|
||||
# false = any failed batch raises error (avoid silent partial data)
|
||||
EVENT_FETCHER_ALLOW_PARTIAL_RESULTS=false
|
||||
|
||||
# Max parallel workers for forward pipeline WIP+rejects fetching
|
||||
FORWARD_PIPELINE_MAX_WORKERS=2
|
||||
@@ -351,7 +363,7 @@ REJECT_ENGINE_SPOOL_CLEANUP_INTERVAL_SECONDS=300
|
||||
REJECT_ENGINE_SPOOL_ORPHAN_GRACE_SECONDS=600
|
||||
|
||||
# Batch query engine thresholds
|
||||
BATCH_QUERY_TIME_THRESHOLD_DAYS=60
|
||||
BATCH_QUERY_TIME_THRESHOLD_DAYS=10
|
||||
BATCH_QUERY_ID_THRESHOLD=1000
|
||||
BATCH_CHUNK_MAX_MEMORY_MB=256
|
||||
|
||||
|
||||
@@ -284,6 +284,15 @@ QUERY_TOOL_MAX_CONTAINER_IDS=200
|
||||
RESOURCE_DETAIL_DEFAULT_LIMIT=500
|
||||
RESOURCE_DETAIL_MAX_LIMIT=500
|
||||
|
||||
# 共用解析防護(LOT/WAFER/工單)
|
||||
CONTAINER_RESOLVE_INPUT_MAX_VALUES=0 # 0=不限制輸入筆數
|
||||
CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN=4 # 萬用字元前最少字首長度(例如 GA25%)
|
||||
CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN=2000
|
||||
CONTAINER_RESOLVE_MAX_CONTAINER_IDS=30000
|
||||
|
||||
# EventFetcher 批次容錯策略
|
||||
EVENT_FETCHER_ALLOW_PARTIAL_RESULTS=false # false=任一批次失敗即整體失敗,避免靜默缺資料
|
||||
|
||||
# 反向代理信任邊界(無反向代理時務必維持 false)
|
||||
TRUST_PROXY_HEADERS=false
|
||||
TRUSTED_PROXY_IPS=127.0.0.1
|
||||
|
||||
@@ -35,7 +35,7 @@ export function toRejectFilterSnapshot(input = {}) {
|
||||
endDate: normalizeText(input.endDate),
|
||||
workcenterGroups: normalizeArray(input.workcenterGroups),
|
||||
packages: normalizeArray(input.packages),
|
||||
reason: normalizeText(input.reason),
|
||||
reasons: normalizeArray(input.reasons),
|
||||
includeExcludedScrap: normalizeBoolean(input.includeExcludedScrap, false),
|
||||
excludeMaterialScrap: normalizeBoolean(input.excludeMaterialScrap, true),
|
||||
excludePbDiode: normalizeBoolean(input.excludePbDiode, true),
|
||||
@@ -77,7 +77,7 @@ export function pruneRejectFilterSelections(filters = {}, options = {}) {
|
||||
const removed = {
|
||||
workcenterGroups: [],
|
||||
packages: [],
|
||||
reason: '',
|
||||
reasons: [],
|
||||
};
|
||||
|
||||
if (hasWorkcenterOptions) {
|
||||
@@ -100,9 +100,14 @@ export function pruneRejectFilterSelections(filters = {}, options = {}) {
|
||||
});
|
||||
}
|
||||
|
||||
if (next.reason && hasReasonOptions && !validReasons.has(next.reason)) {
|
||||
removed.reason = next.reason;
|
||||
next.reason = '';
|
||||
if (hasReasonOptions) {
|
||||
next.reasons = next.reasons.filter((value) => {
|
||||
if (validReasons.has(value)) {
|
||||
return true;
|
||||
}
|
||||
removed.reasons.push(value);
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
@@ -111,7 +116,7 @@ export function pruneRejectFilterSelections(filters = {}, options = {}) {
|
||||
removedCount:
|
||||
removed.workcenterGroups.length +
|
||||
removed.packages.length +
|
||||
(removed.reason ? 1 : 0),
|
||||
removed.reasons.length,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -126,13 +131,13 @@ export function buildRejectOptionsRequestParams(filters = {}) {
|
||||
exclude_material_scrap: next.excludeMaterialScrap,
|
||||
exclude_pb_diode: next.excludePbDiode,
|
||||
};
|
||||
if (next.reason) {
|
||||
params.reason = next.reason;
|
||||
if (next.reasons.length > 0) {
|
||||
params.reasons = next.reasons;
|
||||
}
|
||||
return params;
|
||||
}
|
||||
|
||||
export function buildRejectCommonQueryParams(filters = {}, { reason = '' } = {}) {
|
||||
export function buildRejectCommonQueryParams(filters = {}, { reasons: extraReasons = [] } = {}) {
|
||||
const next = toRejectFilterSnapshot(filters);
|
||||
const params = {
|
||||
start_date: next.startDate,
|
||||
@@ -143,9 +148,9 @@ export function buildRejectCommonQueryParams(filters = {}, { reason = '' } = {})
|
||||
exclude_material_scrap: next.excludeMaterialScrap,
|
||||
exclude_pb_diode: next.excludePbDiode,
|
||||
};
|
||||
const effectiveReason = normalizeText(reason) || next.reason;
|
||||
if (effectiveReason) {
|
||||
params.reasons = [effectiveReason];
|
||||
const merged = normalizeArray([...next.reasons, ...normalizeArray(extraReasons)]);
|
||||
if (merged.length > 0) {
|
||||
params.reasons = merged;
|
||||
}
|
||||
return params;
|
||||
}
|
||||
@@ -168,6 +173,30 @@ export function parseMultiLineInput(text) {
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Validate a start/end date pair for the reject-history query form.
 *
 * Returns a user-facing (zh-TW) error message string, or '' when the range
 * is valid. Mirrors the backend 730-day limit so invalid ranges are caught
 * before an API request is sent.
 *
 * @param {string} startDate - Inclusive start date, expected as `YYYY-MM-DD`.
 * @param {string} endDate - Inclusive end date, expected as `YYYY-MM-DD`.
 * @returns {string} Localized error message, or '' when the range is valid.
 */
export function validateDateRange(startDate, endDate) {
  const MAX_QUERY_DAYS = 730;
  const start = normalizeText(startDate);
  const end = normalizeText(endDate);
  if (!start || !end) {
    return '請先設定開始與結束日期';
  }

  const startDt = new Date(`${start}T00:00:00`);
  const endDt = new Date(`${end}T00:00:00`);
  if (Number.isNaN(startDt.getTime()) || Number.isNaN(endDt.getTime())) {
    return '日期格式不正確';
  }
  if (endDt < startDt) {
    return '結束日期必須大於起始日期';
  }
  const dayMs = 24 * 60 * 60 * 1000;
  // Math.round, NOT Math.floor: these are local-time midnights, so a range
  // spanning a DST transition differs from a whole number of 24h days by
  // ±1 hour. floor would under-count by one day across a spring-forward
  // transition, letting a 731-day range pass the 730-day limit.
  const days = Math.round((endDt - startDt) / dayMs) + 1;
  if (days > MAX_QUERY_DAYS) {
    return '查詢範圍不可超過 730 天(約兩年)';
  }
  return '';
}
|
||||
|
||||
export function buildViewParams(queryId, {
|
||||
supplementaryFilters = {},
|
||||
metricFilter = 'all',
|
||||
@@ -185,8 +214,8 @@ export function buildViewParams(queryId, {
|
||||
if (supplementaryFilters.workcenterGroups?.length > 0) {
|
||||
params.workcenter_groups = supplementaryFilters.workcenterGroups;
|
||||
}
|
||||
if (supplementaryFilters.reason) {
|
||||
params.reason = supplementaryFilters.reason;
|
||||
if (supplementaryFilters.reasons?.length > 0) {
|
||||
params.reasons = supplementaryFilters.reasons;
|
||||
}
|
||||
if (metricFilter && metricFilter !== 'all') {
|
||||
params.metric_filter = metricFilter;
|
||||
|
||||
@@ -5,6 +5,7 @@ import { apiGet, apiPost } from '../core/api.js';
|
||||
import {
|
||||
buildViewParams,
|
||||
parseMultiLineInput,
|
||||
validateDateRange,
|
||||
} from '../core/reject-history-filters.js';
|
||||
import { replaceRuntimeHistory } from '../core/shell-navigation.js';
|
||||
|
||||
@@ -104,14 +105,14 @@ const availableFilters = ref({ workcenterGroups: [], packages: [], reasons: [] }
|
||||
const supplementaryFilters = reactive({
|
||||
packages: [],
|
||||
workcenterGroups: [],
|
||||
reason: '',
|
||||
reasons: [],
|
||||
});
|
||||
|
||||
// ---- Interactive state ----
|
||||
const page = ref(1);
|
||||
const selectedTrendDates = ref([]);
|
||||
const trendLegendSelected = ref({ '扣帳報廢量': true, '不扣帳報廢量': true });
|
||||
const paretoDisplayScope = ref('all');
|
||||
const paretoDisplayScope = ref('top20');
|
||||
const paretoSelections = reactive(createEmptyParetoSelections());
|
||||
const paretoData = reactive(createEmptyParetoData());
|
||||
|
||||
@@ -146,6 +147,7 @@ const loading = reactive({
|
||||
exporting: false,
|
||||
});
|
||||
const errorMessage = ref('');
|
||||
const partialFailureWarning = ref('');
|
||||
const lastQueryAt = ref('');
|
||||
|
||||
// ---- Request staleness tracking ----
|
||||
@@ -241,8 +243,8 @@ function buildBatchParetoParams() {
|
||||
if (supplementaryFilters.workcenterGroups.length > 0) {
|
||||
params.workcenter_groups = supplementaryFilters.workcenterGroups;
|
||||
}
|
||||
if (supplementaryFilters.reason) {
|
||||
params.reason = supplementaryFilters.reason;
|
||||
if (supplementaryFilters.reasons.length > 0) {
|
||||
params.reasons = supplementaryFilters.reasons;
|
||||
}
|
||||
if (selectedTrendDates.value.length > 0) {
|
||||
params.trend_dates = selectedTrendDates.value;
|
||||
@@ -301,11 +303,20 @@ async function executePrimaryQuery() {
|
||||
loading.querying = true;
|
||||
loading.list = true;
|
||||
errorMessage.value = '';
|
||||
partialFailureWarning.value = '';
|
||||
|
||||
try {
|
||||
const body = { mode: queryMode.value };
|
||||
|
||||
if (queryMode.value === 'date_range') {
|
||||
const dateValidationError = validateDateRange(
|
||||
draftFilters.startDate,
|
||||
draftFilters.endDate,
|
||||
);
|
||||
if (dateValidationError) {
|
||||
errorMessage.value = dateValidationError;
|
||||
return;
|
||||
}
|
||||
body.start_date = draftFilters.startDate;
|
||||
body.end_date = draftFilters.endDate;
|
||||
} else {
|
||||
@@ -321,6 +332,19 @@ async function executePrimaryQuery() {
|
||||
if (isStaleRequest(requestId)) return;
|
||||
|
||||
const result = unwrapApiResult(resp, '主查詢執行失敗');
|
||||
const meta = result.meta || {};
|
||||
if (meta.has_partial_failure) {
|
||||
const failedChunkCount = Number(meta.failed_chunk_count || 0);
|
||||
const failedRanges = Array.isArray(meta.failed_ranges) ? meta.failed_ranges : [];
|
||||
if (failedRanges.length > 0) {
|
||||
const rangesText = failedRanges
|
||||
.map((item) => `${item.start} ~ ${item.end}`)
|
||||
.join('、');
|
||||
partialFailureWarning.value = `警告:以下日期區間的資料擷取失敗(${failedChunkCount} 個批次):${rangesText}。目前顯示結果可能不完整。`;
|
||||
} else {
|
||||
partialFailureWarning.value = `警告:${failedChunkCount} 個查詢批次的資料擷取失敗。目前顯示結果可能不完整。`;
|
||||
}
|
||||
}
|
||||
|
||||
committedPrimary.mode = queryMode.value;
|
||||
committedPrimary.startDate = draftFilters.startDate;
|
||||
@@ -344,7 +368,7 @@ async function executePrimaryQuery() {
|
||||
|
||||
supplementaryFilters.packages = [];
|
||||
supplementaryFilters.workcenterGroups = [];
|
||||
supplementaryFilters.reason = '';
|
||||
supplementaryFilters.reasons = [];
|
||||
page.value = 1;
|
||||
selectedTrendDates.value = [];
|
||||
resetParetoSelections();
|
||||
@@ -445,7 +469,7 @@ function clearFilters() {
|
||||
draftFilters.excludeMaterialScrap = true;
|
||||
draftFilters.excludePbDiode = true;
|
||||
draftFilters.paretoTop80 = true;
|
||||
paretoDisplayScope.value = 'all';
|
||||
paretoDisplayScope.value = 'top20';
|
||||
resetParetoSelections();
|
||||
void executePrimaryQuery();
|
||||
}
|
||||
@@ -520,7 +544,7 @@ function clearParetoSelection() {
|
||||
function onSupplementaryChange(filters) {
|
||||
supplementaryFilters.packages = filters.packages || [];
|
||||
supplementaryFilters.workcenterGroups = filters.workcenterGroups || [];
|
||||
supplementaryFilters.reason = filters.reason || '';
|
||||
supplementaryFilters.reasons = filters.reasons || [];
|
||||
page.value = 1;
|
||||
selectedTrendDates.value = [];
|
||||
resetParetoSelections();
|
||||
@@ -545,7 +569,7 @@ function removeFilterChip(chip) {
|
||||
}
|
||||
|
||||
if (chip.type === 'reason') {
|
||||
supplementaryFilters.reason = '';
|
||||
supplementaryFilters.reasons = supplementaryFilters.reasons.filter((r) => r !== chip.value);
|
||||
page.value = 1;
|
||||
updateUrlState();
|
||||
void Promise.all([refreshView(), fetchBatchPareto()]);
|
||||
@@ -584,7 +608,7 @@ async function exportCsv() {
|
||||
params.set('query_id', queryId.value);
|
||||
for (const pkg of supplementaryFilters.packages) params.append('packages', pkg);
|
||||
for (const wc of supplementaryFilters.workcenterGroups) params.append('workcenter_groups', wc);
|
||||
if (supplementaryFilters.reason) params.set('reason', supplementaryFilters.reason);
|
||||
for (const r of supplementaryFilters.reasons) params.append('reasons', r);
|
||||
params.set('metric_filter', metricFilterParam());
|
||||
for (const date of selectedTrendDates.value) params.append('trend_dates', date);
|
||||
for (const [dimension, key] of Object.entries(PARETO_SELECTION_PARAM_MAP)) {
|
||||
@@ -760,13 +784,13 @@ const activeFilterChips = computed(() => {
|
||||
value: '',
|
||||
});
|
||||
|
||||
if (supplementaryFilters.reason) {
|
||||
for (const reason of supplementaryFilters.reasons) {
|
||||
chips.push({
|
||||
key: `reason:${supplementaryFilters.reason}`,
|
||||
label: `原因: ${supplementaryFilters.reason}`,
|
||||
key: `reason:${reason}`,
|
||||
label: `原因: ${reason}`,
|
||||
removable: true,
|
||||
type: 'reason',
|
||||
value: supplementaryFilters.reason,
|
||||
value: reason,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -866,16 +890,14 @@ function updateUrlState() {
|
||||
|
||||
appendArrayParams(params, 'packages', supplementaryFilters.packages);
|
||||
appendArrayParams(params, 'workcenter_groups', supplementaryFilters.workcenterGroups);
|
||||
if (supplementaryFilters.reason) {
|
||||
params.set('reason', supplementaryFilters.reason);
|
||||
}
|
||||
appendArrayParams(params, 'reasons', supplementaryFilters.reasons);
|
||||
|
||||
appendArrayParams(params, 'trend_dates', selectedTrendDates.value);
|
||||
for (const [dimension, key] of Object.entries(PARETO_SELECTION_PARAM_MAP)) {
|
||||
appendArrayParams(params, key, paretoSelections[dimension] || []);
|
||||
}
|
||||
|
||||
if (paretoDisplayScope.value !== 'all') {
|
||||
if (paretoDisplayScope.value !== 'top20') {
|
||||
params.set('pareto_display_scope', paretoDisplayScope.value);
|
||||
}
|
||||
if (!committedPrimary.paretoTop80) {
|
||||
@@ -945,7 +967,7 @@ function restoreFromUrl() {
|
||||
|
||||
supplementaryFilters.packages = readArrayParam(params, 'packages');
|
||||
supplementaryFilters.workcenterGroups = readArrayParam(params, 'workcenter_groups');
|
||||
supplementaryFilters.reason = String(params.get('reason') || '').trim();
|
||||
supplementaryFilters.reasons = readArrayParam(params, 'reasons');
|
||||
|
||||
selectedTrendDates.value = readArrayParam(params, 'trend_dates');
|
||||
|
||||
@@ -969,7 +991,7 @@ function restoreFromUrl() {
|
||||
}
|
||||
|
||||
const urlParetoDisplayScope = String(params.get('pareto_display_scope') || '').trim().toLowerCase();
|
||||
paretoDisplayScope.value = urlParetoDisplayScope === 'top20' ? 'top20' : 'all';
|
||||
paretoDisplayScope.value = urlParetoDisplayScope === 'all' ? 'all' : 'top20';
|
||||
|
||||
const parsedPage = Number(params.get('page') || '1');
|
||||
page.value = Number.isFinite(parsedPage) && parsedPage > 0 ? parsedPage : 1;
|
||||
@@ -1001,6 +1023,9 @@ onMounted(() => {
|
||||
</header>
|
||||
|
||||
<div v-if="errorMessage" class="error-banner">{{ errorMessage }}</div>
|
||||
<div v-if="partialFailureWarning" class="warning-banner">
|
||||
{{ partialFailureWarning }}
|
||||
</div>
|
||||
|
||||
<FilterPanel
|
||||
:filters="draftFilters"
|
||||
|
||||
@@ -8,23 +8,23 @@ const props = defineProps({
|
||||
containerInput: { type: String, default: '' },
|
||||
availableFilters: { type: Object, default: () => ({}) },
|
||||
supplementaryFilters: { type: Object, default: () => ({}) },
|
||||
queryId: { type: String, default: '' },
|
||||
resolutionInfo: { type: Object, default: null },
|
||||
loading: { type: Object, required: true },
|
||||
activeFilterChips: { type: Array, default: () => [] },
|
||||
paretoDisplayScope: { type: String, default: 'all' },
|
||||
});
|
||||
queryId: { type: String, default: '' },
|
||||
resolutionInfo: { type: Object, default: null },
|
||||
loading: { type: Object, required: true },
|
||||
activeFilterChips: { type: Array, default: () => [] },
|
||||
paretoDisplayScope: { type: String, default: 'all' },
|
||||
});
|
||||
|
||||
const emit = defineEmits([
|
||||
'apply',
|
||||
'clear',
|
||||
'export-csv',
|
||||
'remove-chip',
|
||||
'pareto-scope-toggle',
|
||||
'pareto-display-scope-change',
|
||||
'update:queryMode',
|
||||
'update:containerInputType',
|
||||
'update:containerInput',
|
||||
'remove-chip',
|
||||
'pareto-scope-toggle',
|
||||
'pareto-display-scope-change',
|
||||
'update:queryMode',
|
||||
'update:containerInputType',
|
||||
'update:containerInput',
|
||||
'supplementary-change',
|
||||
]);
|
||||
|
||||
@@ -32,7 +32,7 @@ function emitSupplementary(patch) {
|
||||
emit('supplementary-change', {
|
||||
packages: props.supplementaryFilters.packages || [],
|
||||
workcenterGroups: props.supplementaryFilters.workcenterGroups || [],
|
||||
reason: props.supplementaryFilters.reason || '',
|
||||
reasons: props.supplementaryFilters.reasons || [],
|
||||
...patch,
|
||||
});
|
||||
}
|
||||
@@ -86,23 +86,23 @@ function emitSupplementary(patch) {
|
||||
|
||||
<!-- Container mode -->
|
||||
<template v-else>
|
||||
<div class="filter-group">
|
||||
<label class="filter-label" for="container-type">輸入類型</label>
|
||||
<select
|
||||
id="container-type"
|
||||
class="filter-input"
|
||||
:value="containerInputType"
|
||||
@change="$emit('update:containerInputType', $event.target.value)"
|
||||
>
|
||||
<option value="lot">LOT</option>
|
||||
<option value="work_order">工單</option>
|
||||
<option value="wafer_lot">WAFER LOT</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="filter-group filter-group-wide">
|
||||
<label class="filter-label" for="container-input"
|
||||
>輸入值 (每行一個,支援 * 或 % wildcard)</label
|
||||
>
|
||||
<div class="filter-group filter-group-full container-input-group">
|
||||
<div class="container-label-row">
|
||||
<label class="filter-label" for="container-type">輸入類型</label>
|
||||
<select
|
||||
id="container-type"
|
||||
class="filter-input container-type-select"
|
||||
:value="containerInputType"
|
||||
@change="$emit('update:containerInputType', $event.target.value)"
|
||||
>
|
||||
<option value="lot">LOT</option>
|
||||
<option value="work_order">工單</option>
|
||||
<option value="wafer_lot">WAFER LOT</option>
|
||||
</select>
|
||||
<label class="filter-label" for="container-input"
|
||||
>輸入值 (每行一個,支援 * 或 % wildcard)</label
|
||||
>
|
||||
</div>
|
||||
<textarea
|
||||
id="container-input"
|
||||
class="filter-input filter-textarea"
|
||||
@@ -124,12 +124,12 @@ function emitSupplementary(patch) {
|
||||
<input v-model="filters.excludeMaterialScrap" type="checkbox" />
|
||||
排除原物料報廢
|
||||
</label>
|
||||
<label class="checkbox-pill">
|
||||
<input v-model="filters.excludePbDiode" type="checkbox" />
|
||||
排除 PB_* 系列
|
||||
</label>
|
||||
</div>
|
||||
<div class="filter-actions">
|
||||
<label class="checkbox-pill">
|
||||
<input v-model="filters.excludePbDiode" type="checkbox" />
|
||||
排除 PB_* 系列
|
||||
</label>
|
||||
</div>
|
||||
<div class="filter-actions">
|
||||
<button
|
||||
class="btn btn-primary"
|
||||
:disabled="loading.querying"
|
||||
@@ -181,30 +181,30 @@ function emitSupplementary(patch) {
|
||||
</template>
|
||||
</div>
|
||||
|
||||
<!-- Supplementary filters (only after primary query) -->
|
||||
<div v-if="queryId" class="supplementary-panel">
|
||||
<div class="supplementary-header">補充篩選 (快取內篩選)</div>
|
||||
<div class="supplementary-toolbar">
|
||||
<label class="checkbox-pill">
|
||||
<input
|
||||
:checked="filters.paretoTop80"
|
||||
type="checkbox"
|
||||
@change="$emit('pareto-scope-toggle', $event.target.checked)"
|
||||
/>
|
||||
Pareto 僅顯示累計前 80%
|
||||
</label>
|
||||
<label class="filter-label">顯示範圍</label>
|
||||
<select
|
||||
class="dimension-select pareto-scope-select"
|
||||
:value="paretoDisplayScope"
|
||||
@change="$emit('pareto-display-scope-change', $event.target.value)"
|
||||
>
|
||||
<option value="all">全部顯示</option>
|
||||
<option value="top20">只顯示 TOP 20</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="supplementary-row">
|
||||
<div class="filter-group">
|
||||
<!-- Supplementary filters (only after primary query) -->
|
||||
<div v-if="queryId" class="supplementary-panel">
|
||||
<div class="supplementary-header">補充篩選 (快取內篩選)</div>
|
||||
<div class="supplementary-toolbar">
|
||||
<label class="checkbox-pill">
|
||||
<input
|
||||
:checked="filters.paretoTop80"
|
||||
type="checkbox"
|
||||
@change="$emit('pareto-scope-toggle', $event.target.checked)"
|
||||
/>
|
||||
Pareto 僅顯示累計前 80%
|
||||
</label>
|
||||
<label class="filter-label">顯示範圍</label>
|
||||
<select
|
||||
class="dimension-select pareto-scope-select"
|
||||
:value="paretoDisplayScope"
|
||||
@change="$emit('pareto-display-scope-change', $event.target.value)"
|
||||
>
|
||||
<option value="all">全部顯示</option>
|
||||
<option value="top20">只顯示 TOP 20</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="supplementary-row">
|
||||
<div class="filter-group">
|
||||
<label class="filter-label">WORKCENTER GROUP</label>
|
||||
<MultiSelect
|
||||
:model-value="supplementaryFilters.workcenterGroups"
|
||||
@@ -227,22 +227,14 @@ function emitSupplementary(patch) {
|
||||
</div>
|
||||
|
||||
<div class="filter-group">
|
||||
<label class="filter-label" for="supp-reason">報廢原因</label>
|
||||
<select
|
||||
id="supp-reason"
|
||||
class="filter-input"
|
||||
:value="supplementaryFilters.reason"
|
||||
@change="emitSupplementary({ reason: $event.target.value })"
|
||||
>
|
||||
<option value="">全部原因</option>
|
||||
<option
|
||||
v-for="r in availableFilters.reasons || []"
|
||||
:key="r"
|
||||
:value="r"
|
||||
>
|
||||
{{ r }}
|
||||
</option>
|
||||
</select>
|
||||
<label class="filter-label">報廢原因</label>
|
||||
<MultiSelect
|
||||
:model-value="supplementaryFilters.reasons"
|
||||
:options="availableFilters.reasons || []"
|
||||
placeholder="全部原因"
|
||||
searchable
|
||||
@update:model-value="emitSupplementary({ reasons: $event })"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -41,6 +41,19 @@
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.container-label-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.container-type-select {
|
||||
width: auto;
|
||||
min-width: 120px;
|
||||
max-width: 180px;
|
||||
}
|
||||
|
||||
.supplementary-panel {
|
||||
border-top: 1px solid var(--border);
|
||||
padding: 16px 18px;
|
||||
@@ -119,6 +132,15 @@
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.warning-banner {
|
||||
margin-bottom: 14px;
|
||||
padding: 10px 12px;
|
||||
border-radius: 6px;
|
||||
background: #fffbeb;
|
||||
color: #b45309;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.filter-panel {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
schema: spec-driven
|
||||
created: 2026-03-03
|
||||
@@ -0,0 +1,80 @@
|
||||
## Context
|
||||
|
||||
報廢歷史查詢使用 `BatchQueryEngine` 將長日期範圍拆成 10 天 chunks 平行查詢 Oracle。每個 chunk 有記憶體上限(256 MB)和 timeout(300s)防護。當 chunk 失敗時,`has_partial_failure` 旗標寫入 Redis HSET(key: `batch:reject:{hash}:meta`),但此資訊**在三個斷點被丟失**:
|
||||
|
||||
1. `reject_dataset_cache.py` 的 `execute_primary_query()` 未讀取 batch progress metadata
|
||||
2. API route 直接 `jsonify({"success": True, **result})`,在 partial chunk failure 路徑下仍回 HTTP 200 + `success: true`,不區分完整與不完整結果
|
||||
3. 前端 `App.vue` 沒有任何 partial failure 處理邏輯
|
||||
|
||||
另一個問題:`redis_clear_batch()` 在 `execute_primary_query()` 的清理階段會刪除 metadata key,所以讀取必須在清理之前。
|
||||
|
||||
前端的 730 天日期上限驗證只在後端 `_validate_range()` 做,前端缺乏即時回饋。
|
||||
|
||||
## Goals / Non-Goals
|
||||
|
||||
**Goals:**
|
||||
|
||||
- 將 `has_partial_failure` 從 Redis metadata 傳遞到 API response `meta` 欄位
|
||||
- 追蹤失敗 chunk 的時間範圍,讓前端可顯示具體的缺漏區間
|
||||
- 前端顯示 amber warning banner,告知使用者資料可能不完整
|
||||
- 前端加入日期範圍即時驗證,避免無效 API 請求
|
||||
- 對 transient error(Oracle timeout、連線失敗)加入單次重試,減少不必要的 partial failure
|
||||
- 持久化 partial failure 旗標到獨立 Redis key,讓 cache-hit 路徑也能還原警告狀態
|
||||
|
||||
**Non-Goals:**
|
||||
|
||||
- 不改變現有 chunk 分片策略或記憶體上限數值
|
||||
- 不實作前端的自動重查/重試機制
|
||||
- 不修改 `EVENT_FETCHER_ALLOW_PARTIAL_RESULTS` 的行為(預設已是安全的 false)
|
||||
- 不加入 progress bar / 即時進度追蹤 UI
|
||||
|
||||
## Decisions
|
||||
|
||||
### D1: 在 `redis_clear_batch` 之前讀取 metadata
|
||||
|
||||
**決定**: 在 `execute_primary_query()` 中,`merge_chunks()` 之後、`redis_clear_batch()` 之前,呼叫 `get_batch_progress("reject", engine_hash)` 讀取 partial failure 狀態。
|
||||
|
||||
**理由**: `redis_clear_batch` 會刪除包含 metadata 的 key,之後就讀不到了。此時 chunk 資料已合併完成,是最後可讀取 metadata 的時機點。
|
||||
|
||||
### D2: 用獨立 Redis key 持久化 partial failure flag,TTL 對齊實際資料層
|
||||
|
||||
**決定**: 在 `_store_query_result()` 之後,將 partial failure 資訊存到 `reject_dataset:{query_id}:partial_failure` Redis HSET。**TTL 必須與資料實際存活的層一致**:若資料 spill 到 parquet spool(`_REJECT_ENGINE_SPOOL_TTL_SECONDS = 21600s`),partial failure flag 的 TTL 也要用 21600s;若資料存在 L1/L2(`_CACHE_TTL = 900s`),flag TTL 用 900s。實作方式:`_store_partial_failure_flag()` 接受 `ttl` 參數,由呼叫端根據 `should_spill` 判斷傳入 `_REJECT_ENGINE_SPOOL_TTL_SECONDS` 或 `_CACHE_TTL`。Cache-hit 路徑透過 `_load_partial_failure_flag(query_id)` 還原。
|
||||
|
||||
**替代方案 A**: 將 flag 嵌入 DataFrame 的 attrs 或另外 pickle。
|
||||
**為何不採用**: DataFrame attrs 在 parquet 序列化時會丟失;pickle 增加反序列化風險。
|
||||
|
||||
**替代方案 B**: 固定 TTL=900s。
|
||||
**為何不採用**: 大查詢 spill 到 parquet spool(21600s TTL),資料還能讀 6 小時,但 partial failure flag 15 分鐘就過期,造成「資料讀得到但警告消失」。
|
||||
|
||||
### D3: 在 `_update_progress` 中追蹤 failed_ranges(僅 time-range chunk)
|
||||
|
||||
**決定**: 擴充 `_update_progress()` 接受 `failed_ranges: Optional[List[Dict]]` 參數,以 JSON 字串存入 Redis HSET。Sequential 和 parallel path 均從失敗的 chunk descriptor 提取 `chunk_start` / `chunk_end`。**僅當 chunk descriptor 包含 `chunk_start`/`chunk_end` 時才記錄**(即 `decompose_by_time_range` 產生的 time-range chunk)。
|
||||
|
||||
**container-id 分塊的情境**: reject 的 container 模式使用 `decompose_by_ids()`,chunk 結構為 `{"ids": [...]}` 不含日期範圍。此時 `failed_ranges` 為空 list,前端透過 `failed_chunk_count > 0` 顯示 generic 警告訊息(「N 個查詢批次的資料擷取失敗」),不含日期區間。
|
||||
|
||||
**理由**: chunk descriptor 的結構由 decompose 函式決定,engine 層不應假設所有 chunk 都有時間範圍。
|
||||
|
||||
### D4: Memory guard 失敗不重試
|
||||
|
||||
**決定**: `_execute_single_chunk()` 加入 `max_retries=1`,但只對 `_is_retryable_error()` 回傳 true 的 exception 重試。Memory guard(記憶體超限)和 Redis store 失敗直接 return False,不重試。
|
||||
|
||||
**理由**: Memory guard 代表該時段資料量確實過大,重試結果相同;Oracle timeout 和連線錯誤則可能是暫態問題。
|
||||
|
||||
### D5: 前端 warning banner 使用既有 amber 色系
|
||||
|
||||
**決定**: 新增 `.warning-banner` CSS class,使用 `background: #fffbeb; color: #b45309`,與既有 `.resolution-warn` 的 amber 色系一致。放在 `.error-banner` 之後。
|
||||
|
||||
**替代方案**: 使用 toast/notification 元件。
|
||||
**為何不採用**: 此專案無 toast 系統,amber banner 與 red error-banner 模式統一。
|
||||
|
||||
### D6: 前端日期驗證函式放在共用 filters module
|
||||
|
||||
**決定**: 在 `frontend/src/core/reject-history-filters.js` 新增 `validateDateRange()`,複用 `resource-history/App.vue:231-248` 的驗證模式。
|
||||
|
||||
**理由**: reject-history-filters.js 已是此頁面的 filter 工具模組,validateDateRange 屬於 filter 驗證邏輯。
|
||||
|
||||
## Risks / Trade-offs
|
||||
|
||||
- **[中] 重試邏輯影響所有 execute_plan 呼叫端** — `_execute_single_chunk()` 是 shared function,被 reject / hold / resource / job / msd 五個服務共用。重試邏輯為加法行為(新增 retry loop 包在既有 try/except 外),成功路徑不變。→ 需要對其他 4 個服務執行 smoke test(既有測試通過即可)。若需更保守,可加入 `max_retries` 參數讓呼叫端控制(預設 1),但目前判斷統一重試對所有服務都是正面效果。
|
||||
- **[低] 重試增加 Oracle 負擔** — 單次重試最多增加 1 倍的失敗查詢量。→ 透過 `_is_retryable_error()` 嚴格過濾,只重試 transient error,且 parallel path 最多 3 worker,影響可控。
|
||||
- **[低] failed_ranges JSON 大小** — 理論上 73 chunks(730/10)全部失敗會產生 73 筆 range,JSON < 5 KB。→ 遠低於 Redis HSET 欄位限制。
|
||||
@@ -0,0 +1,34 @@
|
||||
## Why
|
||||
|
||||
報廢歷史查詢的防爆機制(時間分片 + 記憶體上限 256 MB + Oracle timeout 300s)在 chunk 失敗時會丟棄該 chunk 的資料,`has_partial_failure` 旗標僅寫入 Redis metadata,**從未傳遞到 API response 或前端**。使用者查到不完整資料卻毫不知情,影響決策正確性。此外,730 天日期上限僅在後端驗證,前端無即時提示,導致不必要的等待。
|
||||
|
||||
## What Changes
|
||||
|
||||
- 後端 `reject_dataset_cache` 在 `execute_plan()` 後讀取 batch progress metadata,將 `has_partial_failure`、失敗 chunk 數量及失敗時間範圍注入 API response `meta` 欄位
|
||||
- 後端 `batch_query_engine` 追蹤失敗 chunk 的時間區間描述,寫入 Redis metadata 的 `failed_ranges` 欄位
|
||||
- 後端 `_execute_single_chunk()` 對 transient error(Oracle timeout / 連線錯誤)加入單次重試,memory guard 失敗不重試
|
||||
- 前端新增 amber warning banner,當 `meta.has_partial_failure` 為 true 時顯示不完整資料警告及失敗的日期區間
|
||||
- 前端新增日期範圍即時驗證(730 天上限),在 API 發送前攔截無效範圍
|
||||
|
||||
## Capabilities
|
||||
|
||||
### New Capabilities
|
||||
|
||||
- `batch-query-resilience`: 批次查詢引擎的失敗範圍追蹤、partial failure metadata 傳遞、及 transient error 單次重試機制
|
||||
|
||||
### Modified Capabilities
|
||||
|
||||
- `reject-history-api`: API response `meta` 新增 `has_partial_failure`、`failed_chunk_count`、`failed_ranges` 欄位,讓前端得知查詢結果完整性
|
||||
- `reject-history-page`: 新增 amber warning banner 顯示 partial failure 警告;新增前端日期範圍即時驗證(730 天上限)
|
||||
|
||||
## Impact
|
||||
|
||||
- **後端服務 — batch_query_engine.py(共用模組,影響所有使用 execute_plan 的服務)**:
|
||||
- 追蹤 failed_ranges + 重試邏輯修改的是 `_execute_single_chunk()`,此函式被 **reject / hold / resource / job / msd** 五個 dataset cache 服務共用
|
||||
- 重試邏輯為加法行為(新增 retry loop),不改變既有成功路徑,對其他服務向後相容
|
||||
- `failed_ranges` 追蹤僅在 chunk descriptor 含 `chunk_start`/`chunk_end` 時才記錄,container-id 分塊(僅 reject container 模式使用)不受影響
|
||||
- 需對 hold / resource / job / msd 執行回歸 smoke test
|
||||
- **後端服務 — reject_dataset_cache.py**: 讀取 metadata + 注入 response + 持久化 partial failure flag
|
||||
- **前端**: `App.vue`(warning banner + 日期驗證)、`reject-history-filters.js`(validateDateRange 函式)、`style.css`(.warning-banner 樣式)
|
||||
- **API 契約**: response `meta` 新增可選欄位(向後相容,現有前端不受影響)
|
||||
- **測試**: `test_batch_query_engine.py`、`test_reject_dataset_cache.py` 需新增對應測試案例;hold / resource / job / msd 需回歸驗證
|
||||
@@ -0,0 +1,82 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: BatchQueryEngine SHALL track failed chunk time ranges in progress metadata
|
||||
The engine SHALL record the time ranges of failed chunks in Redis progress metadata so consumers can report which date intervals have missing data.
|
||||
|
||||
#### Scenario: Failed chunk range recorded in sequential path
|
||||
- **WHEN** a chunk with `chunk_start` and `chunk_end` keys fails during sequential execution
|
||||
- **THEN** `_update_progress()` SHALL store a `failed_ranges` field in the Redis HSET metadata
|
||||
- **THEN** `failed_ranges` SHALL be a JSON array of objects, each with `start` and `end` string keys
|
||||
- **THEN** the array SHALL contain one entry per failed chunk
|
||||
|
||||
#### Scenario: Failed chunk range recorded in parallel path
|
||||
- **WHEN** a chunk with `chunk_start` and `chunk_end` keys fails during parallel execution
|
||||
- **THEN** the failed chunk's time range SHALL be appended to `failed_ranges` in the same format as the sequential path
|
||||
|
||||
#### Scenario: No failed ranges when all chunks succeed
|
||||
- **WHEN** all chunks complete successfully
|
||||
- **THEN** the `failed_ranges` field SHALL NOT be present in Redis metadata
|
||||
|
||||
#### Scenario: ID-batch chunks produce no failed_ranges entries
|
||||
- **WHEN** a chunk created by `decompose_by_ids()` (containing only an `ids` key, no `chunk_start`/`chunk_end`) fails
|
||||
- **THEN** no entry SHALL be appended to `failed_ranges` for that chunk
|
||||
- **THEN** `has_partial_failure` SHALL still be set to `True`
|
||||
- **THEN** `failed` count SHALL still be incremented
|
||||
|
||||
#### Scenario: get_batch_progress returns failed_ranges
|
||||
- **WHEN** `get_batch_progress()` is called after execution with failed chunks
|
||||
- **THEN** the returned dict SHALL include `failed_ranges` as a JSON string parseable to a list of `{start, end}` objects
|
||||
|
||||
### Requirement: BatchQueryEngine SHALL retry transient chunk failures once
|
||||
The engine SHALL retry chunk execution once for transient errors (Oracle timeout, connection errors) but SHALL NOT retry deterministic failures (memory guard, Redis store).
|
||||
|
||||
#### Scenario: Oracle timeout retried once
|
||||
- **WHEN** `_execute_single_chunk()` raises an exception matching retryable Oracle error patterns (e.g. `DPY-4024`, `ORA-01013`)
|
||||
- **THEN** the chunk SHALL be retried exactly once
|
||||
- **WHEN** the retry succeeds
|
||||
- **THEN** the chunk SHALL be marked as successful
|
||||
|
||||
#### Scenario: Connection error retried once
|
||||
- **WHEN** `_execute_single_chunk()` raises `TimeoutError`, `ConnectionError`, or `OSError`
|
||||
- **THEN** the chunk SHALL be retried exactly once
|
||||
|
||||
#### Scenario: Retry exhausted marks chunk as failed
|
||||
- **WHEN** a chunk fails on both the initial attempt and the retry
|
||||
- **THEN** the chunk SHALL be marked as failed
|
||||
- **THEN** `has_partial_failure` SHALL be set to `True`
|
||||
|
||||
#### Scenario: Memory guard failure NOT retried
|
||||
- **WHEN** a chunk's DataFrame exceeds `BATCH_CHUNK_MAX_MEMORY_MB`
|
||||
- **THEN** the chunk SHALL return `False` immediately without retry
|
||||
- **THEN** the query function SHALL have been called exactly once for that chunk
|
||||
|
||||
#### Scenario: Redis store failure NOT retried
|
||||
- **WHEN** `redis_store_chunk()` returns `False`
|
||||
- **THEN** the chunk SHALL return `False` immediately without retry
|
||||
|
||||
### Requirement: reject_dataset_cache SHALL propagate partial failure metadata to API response
|
||||
The cache service SHALL read batch execution metadata and include partial failure information in the API response `meta` field.
|
||||
|
||||
#### Scenario: Partial failure metadata included in response
|
||||
- **WHEN** `execute_primary_query()` uses the batch engine path and `get_batch_progress()` returns `has_partial_failure=True`
|
||||
- **THEN** the response `meta` dict SHALL include `has_partial_failure: true`
|
||||
- **THEN** the response `meta` dict SHALL include `failed_chunk_count` as an integer
|
||||
- **THEN** if `failed_ranges` is present, the response `meta` dict SHALL include `failed_ranges` as a list of `{start, end}` objects
|
||||
|
||||
#### Scenario: Metadata read before redis_clear_batch
|
||||
- **WHEN** `execute_primary_query()` calls `get_batch_progress()`
|
||||
- **THEN** the call SHALL occur after `merge_chunks()` and before `redis_clear_batch()`
|
||||
|
||||
#### Scenario: No partial failure on successful query
|
||||
- **WHEN** all chunks complete successfully
|
||||
- **THEN** the response `meta` dict SHALL NOT include `has_partial_failure`
|
||||
|
||||
#### Scenario: Cache-hit path restores partial failure flag
|
||||
- **WHEN** a cached DataFrame is returned (cache hit) and a partial failure flag was stored during the original query
|
||||
- **THEN** the response `meta` dict SHALL include the same `has_partial_failure`, `failed_chunk_count`, and `failed_ranges` as the original response
|
||||
|
||||
#### Scenario: Partial failure flag TTL matches data storage layer
|
||||
- **WHEN** partial failure is detected and the query result is spilled to parquet spool
|
||||
- **THEN** the partial failure flag SHALL be stored with TTL equal to `_REJECT_ENGINE_SPOOL_TTL_SECONDS` (default 21600 seconds)
|
||||
- **WHEN** partial failure is detected and the query result is stored in L1/L2 Redis cache
|
||||
- **THEN** the partial failure flag SHALL be stored with TTL equal to `_CACHE_TTL` (default 900 seconds)
|
||||
@@ -0,0 +1,36 @@
|
||||
## MODIFIED Requirements
|
||||
|
||||
### Requirement: Reject History API SHALL validate required query parameters
|
||||
The API SHALL validate date parameters and basic paging bounds before executing database work.
|
||||
|
||||
#### Scenario: Missing required dates
|
||||
- **WHEN** a reject-history endpoint requiring date range is called without `start_date` or `end_date`
|
||||
- **THEN** the API SHALL return HTTP 400 with a descriptive validation error
|
||||
|
||||
#### Scenario: Invalid date order
|
||||
- **WHEN** `end_date` is earlier than `start_date`
|
||||
- **THEN** the API SHALL return HTTP 400 and SHALL NOT run SQL queries
|
||||
|
||||
#### Scenario: Date range exceeds maximum
|
||||
- **WHEN** the date range between `start_date` and `end_date` exceeds 730 days
|
||||
- **THEN** the API SHALL return HTTP 400 with error message "日期範圍不可超過 730 天"
|
||||
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: Reject History API primary query response SHALL include partial failure metadata
|
||||
The primary query endpoint SHALL include batch execution completeness information in the response `meta` field when chunks fail during batch query execution.
|
||||
|
||||
#### Scenario: Partial failure metadata in response
|
||||
- **WHEN** `POST /api/reject-history/query` completes with some chunks failing
|
||||
- **THEN** the response SHALL include `meta.has_partial_failure: true`
|
||||
- **THEN** the response SHALL include `meta.failed_chunk_count` as a positive integer
|
||||
- **THEN** the response SHALL include `meta.failed_ranges` as an array of `{start, end}` date strings (if available)
|
||||
- **THEN** the HTTP status SHALL still be 200 (data is partially available)
|
||||
|
||||
#### Scenario: No partial failure metadata on full success
|
||||
- **WHEN** `POST /api/reject-history/query` completes with all chunks succeeding
|
||||
- **THEN** the response `meta` SHALL NOT include `has_partial_failure`, `failed_chunk_count`, or `failed_ranges`
|
||||
|
||||
#### Scenario: Partial failure metadata preserved on cache hit
|
||||
- **WHEN** `POST /api/reject-history/query` returns cached data that originally had partial failures
|
||||
- **THEN** the response SHALL include the same `meta.has_partial_failure`, `meta.failed_chunk_count`, and `meta.failed_ranges` as the original response
|
||||
@@ -0,0 +1,58 @@
|
||||
## ADDED Requirements
|
||||
|
||||
### Requirement: Reject History page SHALL display partial failure warning banner
|
||||
The page SHALL display an amber warning banner when the query result contains partial failures, informing users that displayed data may be incomplete.
|
||||
|
||||
#### Scenario: Warning banner displayed on partial failure
|
||||
- **WHEN** the primary query response includes `meta.has_partial_failure: true`
|
||||
- **THEN** an amber warning banner SHALL be displayed below the error banner position
|
||||
- **THEN** the warning message SHALL be in Traditional Chinese
|
||||
|
||||
#### Scenario: Warning banner shows failed date ranges
|
||||
- **WHEN** `meta.failed_ranges` contains date range objects
|
||||
- **THEN** the warning banner SHALL display the specific failed date ranges (e.g., "以下日期區間的資料擷取失敗:2025-01-01 ~ 2025-01-10")
|
||||
|
||||
#### Scenario: Warning banner shows generic message without ranges (container mode or missing range data)
|
||||
- **WHEN** `meta.has_partial_failure` is true but `meta.failed_ranges` is empty or absent (e.g., container-id batch query)
|
||||
- **THEN** the warning banner SHALL display a generic message with the failed chunk count (e.g., "3 個查詢批次的資料擷取失敗")
|
||||
|
||||
#### Scenario: Warning banner cleared on new query
|
||||
- **WHEN** user initiates a new primary query
|
||||
- **THEN** the warning banner SHALL be cleared before the new query executes
|
||||
- **THEN** if the new query also has partial failures, the warning SHALL update with new failure information
|
||||
|
||||
#### Scenario: Warning banner coexists with error banner
|
||||
- **WHEN** both an error message and a partial failure warning exist
|
||||
- **THEN** the error banner SHALL appear first, followed by the warning banner
|
||||
|
||||
#### Scenario: Warning banner visual style
|
||||
- **WHEN** the warning banner is rendered
|
||||
- **THEN** it SHALL use amber/orange color scheme (background `#fffbeb`, text `#b45309`)
|
||||
- **THEN** the style SHALL be consistent with the existing `.resolution-warn` color pattern
|
||||
|
||||
### Requirement: Reject History page SHALL validate date range before query submission
|
||||
The page SHALL validate the date range on the client side before sending the API request, providing immediate feedback for invalid ranges.
|
||||
|
||||
#### Scenario: Date range exceeds 730-day limit
|
||||
- **WHEN** user selects a date range exceeding 730 days and clicks "查詢"
|
||||
- **THEN** the page SHALL display an error message "查詢範圍不可超過 730 天(約兩年)"
|
||||
- **THEN** the API request SHALL NOT be sent
|
||||
|
||||
#### Scenario: Missing start or end date
|
||||
- **WHEN** user clicks "查詢" without setting both start_date and end_date (in date_range mode)
|
||||
- **THEN** the page SHALL display an error message "請先設定開始與結束日期"
|
||||
- **THEN** the API request SHALL NOT be sent
|
||||
|
||||
#### Scenario: End date before start date
|
||||
- **WHEN** user selects an end_date earlier than start_date
|
||||
- **THEN** the page SHALL display an error message "結束日期必須大於起始日期"
|
||||
- **THEN** the API request SHALL NOT be sent
|
||||
|
||||
#### Scenario: Valid date range proceeds normally
|
||||
- **WHEN** user selects a valid date range within 730 days and clicks "查詢"
|
||||
- **THEN** no validation error SHALL be shown
|
||||
- **THEN** the API request SHALL proceed normally
|
||||
|
||||
#### Scenario: Container mode skips date validation
|
||||
- **WHEN** query mode is "container" (not "date_range")
|
||||
- **THEN** date range validation SHALL be skipped
|
||||
@@ -0,0 +1,46 @@
|
||||
## 1. 前端日期範圍即時驗證
|
||||
|
||||
- [x] 1.1 在 `frontend/src/core/reject-history-filters.js` 末尾新增 `validateDateRange(startDate, endDate)` 函式(MAX_QUERY_DAYS=730),回傳空字串表示通過、非空字串為錯誤訊息
|
||||
- [x] 1.2 在 `frontend/src/reject-history/App.vue` import `validateDateRange`,在 `executePrimaryQuery()` 的 API 呼叫前(`errorMessage.value = ''` 重置之後)加入 date_range 模式的驗證邏輯,驗證失敗時設定 `errorMessage` 並 return
|
||||
|
||||
## 2. 後端追蹤失敗 chunk 時間範圍
|
||||
|
||||
- [x] 2.1 在 `batch_query_engine.py` 的 `_update_progress()` 簽名加入 `failed_ranges: Optional[List] = None` 參數,在 mapping dict 中條件性加入 `json.dumps(failed_ranges)` 欄位
|
||||
- [x] 2.2 在 `execute_plan()` 的 sequential path(`for idx, chunk in enumerate(chunks)` 迴圈區段)新增 `failed_range_list = []`,chunk 失敗時從 chunk descriptor 條件性提取 `chunk_start`/`chunk_end` append 到 list(僅 time-range chunk 才有),傳入每次 `_update_progress()` 呼叫
|
||||
- [x] 2.3 在 `_execute_parallel()` 修改 `futures` dict 為 `futures[future] = (idx, chunk)` 以保留 chunk descriptor,新增 `failed_range_list`,失敗時條件性 append range,返回值改為 4-tuple `(completed, failed, has_partial_failure, failed_range_list)`;同步更新 `execute_plan()` 中呼叫 `_execute_parallel()` 的解構為 4-tuple
|
||||
|
||||
## 3. 後端 chunk 失敗單次重試
|
||||
|
||||
- [x] 3.1 在 `batch_query_engine.py` 新增 `_RETRYABLE_PATTERNS` 常數和 `_is_retryable_error(exc)` 函式,辨識 Oracle timeout / 連線錯誤
|
||||
- [x] 3.2 修改 `_execute_single_chunk()` 加入 `max_retries: int = 1` 參數,將 try/except 包在 retry loop 中:memory guard 和 Redis store 失敗直接 return False 不重試;exception 中若 `_is_retryable_error()` 為 True 則 log warning 並 continue
|
||||
|
||||
## 4. 後端傳遞 partial failure 到 API response
|
||||
|
||||
- [x] 4.1 在 `reject_dataset_cache.py` 的 `execute_primary_query()` 內 batch_query_engine local import 區塊加入 `get_batch_progress`
|
||||
- [x] 4.2 在 `execute_primary_query()` 的 `merge_chunks()` 呼叫之後、`redis_clear_batch()` 呼叫之前,呼叫 `get_batch_progress("reject", engine_hash)` 讀取 `has_partial_failure`、`failed`、`failed_ranges`
|
||||
- [x] 4.3 在 `redis_clear_batch()` 之後、`_apply_policy_filters()` 之前,將 partial failure 資訊條件性注入 `meta` dict(`has_partial_failure`、`failed_chunk_count`、`failed_ranges`)
|
||||
- [x] 4.4 新增 `_store_partial_failure_flag(query_id, failed_count, failed_ranges, ttl)` 和 `_load_partial_failure_flag(query_id)` 兩個 helper,使用 Redis HSET 存取 `reject_dataset:{query_id}:partial_failure`;`ttl` 由呼叫端傳入
|
||||
- [x] 4.5 在 `_store_query_result()` 呼叫之後呼叫 `_store_partial_failure_flag()`,TTL 根據 `_store_query_result()` 內的 `should_spill` 判斷:spill 到 spool 時用 `_REJECT_ENGINE_SPOOL_TTL_SECONDS`(21600s),否則用 `_CACHE_TTL`(900s);在 `_get_cached_df()` cache-hit 路徑呼叫 `_load_partial_failure_flag()` 並 `meta.update()`
|
||||
|
||||
## 5. 前端 partial failure 警告 banner
|
||||
|
||||
- [x] 5.1 在 `frontend/src/reject-history/App.vue` 新增 `partialFailureWarning` ref,在 `executePrimaryQuery()` 開頭重置,在讀取 result 後根據 `result.meta.has_partial_failure` 設定警告訊息(含 failed_ranges 的日期區間文字;無 ranges 時用 failed_chunk_count 的 generic 訊息)
|
||||
- [x] 5.2 在 App.vue template 的 error-banner `<div>` 之後加入 `<div v-if="partialFailureWarning" class="warning-banner">{{ partialFailureWarning }}</div>`
|
||||
- [x] 5.3 在 `frontend/src/reject-history/style.css` 的 `.error-banner` 規則之後加入 `.warning-banner` 樣式(background: #fffbeb, color: #b45309)
|
||||
|
||||
## 6. 測試
|
||||
|
||||
- [x] 6.1 在 `tests/test_batch_query_engine.py` 新增 `test_transient_failure_retried_once`:mock query_fn 第一次 raise TimeoutError、第二次成功,assert chunk 最終成功且 query_fn 被呼叫 2 次
|
||||
- [x] 6.2 在 `tests/test_batch_query_engine.py` 新增 `test_memory_guard_not_retried`:mock query_fn 回傳超大 DataFrame,assert query_fn 僅被呼叫 1 次
|
||||
- [x] 6.3 在 `tests/test_batch_query_engine.py` 新增 `test_failed_ranges_tracked`:3 chunks 其中 1 個失敗,assert Redis metadata 含 `failed_ranges` JSON
|
||||
- [x] 6.4 在 `tests/test_reject_dataset_cache.py` 新增 `test_partial_failure_in_response_meta`:mock `get_batch_progress` 回傳 `has_partial_failure=True`,assert response `meta` 包含旗標和 `failed_ranges`
|
||||
- [x] 6.5 在 `tests/test_reject_dataset_cache.py` 新增 `test_cache_hit_restores_partial_failure`:先寫入 partial failure flag,cache hit 時 assert meta 有旗標
|
||||
- [x] 6.6 在 `tests/test_reject_dataset_cache.py` 新增 `test_partial_failure_ttl_matches_spool`:當 should_spill=True 時 assert flag TTL 為 `_REJECT_ENGINE_SPOOL_TTL_SECONDS`,否則為 `_CACHE_TTL`
|
||||
- [x] 6.7 在 `tests/test_batch_query_engine.py` 新增 `test_id_batch_chunk_no_failed_ranges`:container-id 分塊 chunk 失敗時 assert `failed_ranges` 為空 list 但 `has_partial_failure=True`
|
||||
|
||||
## 7. 跨服務回歸驗證
|
||||
|
||||
- [x] 7.1 執行 `pytest tests/test_batch_query_engine.py tests/test_reject_dataset_cache.py -v` 確認本次修改的測試全部通過
|
||||
- [x] 7.2 執行 hold_dataset_cache 相關測試確認重試邏輯不影響 hold:`pytest tests/ -k "hold" -v`
|
||||
- [x] 7.3 執行 resource / job / msd 相關測試確認回歸:`pytest tests/ -k "resource or job or mid_section" -v`
|
||||
- [x] 7.4 若任何跨服務測試失敗,檢查是否為 `_execute_single_chunk` 簽名變更(`max_retries` 參數)導致,確認 keyword-only 預設值不影響既有呼叫
|
||||
openspec/specs/batch-query-resilience/spec.md — new file (86 lines)
@@ -0,0 +1,86 @@
|
||||
# batch-query-resilience Specification
|
||||
|
||||
## Purpose
|
||||
Batch query engine resilience features: failed chunk range tracking, transient error retry, and partial failure metadata propagation to API consumers.
|
||||
|
||||
## Requirements
|
||||
### Requirement: BatchQueryEngine SHALL track failed chunk time ranges in progress metadata
|
||||
The engine SHALL record the time ranges of failed chunks in Redis progress metadata so consumers can report which date intervals have missing data.
|
||||
|
||||
#### Scenario: Failed chunk range recorded in sequential path
|
||||
- **WHEN** a chunk with `chunk_start` and `chunk_end` keys fails during sequential execution
|
||||
- **THEN** `_update_progress()` SHALL store a `failed_ranges` field in the Redis HSET metadata
|
||||
- **THEN** `failed_ranges` SHALL be a JSON array of objects, each with `start` and `end` string keys
|
||||
- **THEN** the array SHALL contain one entry per failed chunk
|
||||
|
||||
#### Scenario: Failed chunk range recorded in parallel path
|
||||
- **WHEN** a chunk with `chunk_start` and `chunk_end` keys fails during parallel execution
|
||||
- **THEN** the failed chunk's time range SHALL be appended to `failed_ranges` in the same format as the sequential path
|
||||
|
||||
#### Scenario: No failed ranges when all chunks succeed
|
||||
- **WHEN** all chunks complete successfully
|
||||
- **THEN** the `failed_ranges` field SHALL NOT be present in Redis metadata
|
||||
|
||||
#### Scenario: ID-batch chunks produce no failed_ranges entries
|
||||
- **WHEN** a chunk created by `decompose_by_ids()` (containing only an `ids` key, no `chunk_start`/`chunk_end`) fails
|
||||
- **THEN** no entry SHALL be appended to `failed_ranges` for that chunk
|
||||
- **THEN** `has_partial_failure` SHALL still be set to `True`
|
||||
- **THEN** `failed` count SHALL still be incremented
|
||||
|
||||
#### Scenario: get_batch_progress returns failed_ranges
|
||||
- **WHEN** `get_batch_progress()` is called after execution with failed chunks
|
||||
- **THEN** the returned dict SHALL include `failed_ranges` as a JSON string parseable to a list of `{start, end}` objects
|
||||
|
||||
### Requirement: BatchQueryEngine SHALL retry transient chunk failures once
|
||||
The engine SHALL retry chunk execution once for transient errors (Oracle timeout, connection errors) but SHALL NOT retry deterministic failures (memory guard, Redis store).
|
||||
|
||||
#### Scenario: Oracle timeout retried once
|
||||
- **WHEN** `_execute_single_chunk()` raises an exception matching retryable Oracle error patterns (e.g. `DPY-4024`, `ORA-01013`)
|
||||
- **THEN** the chunk SHALL be retried exactly once
|
||||
- **WHEN** the retry succeeds
|
||||
- **THEN** the chunk SHALL be marked as successful
|
||||
|
||||
#### Scenario: Connection error retried once
|
||||
- **WHEN** `_execute_single_chunk()` raises `TimeoutError`, `ConnectionError`, or `OSError`
|
||||
- **THEN** the chunk SHALL be retried exactly once
|
||||
|
||||
#### Scenario: Retry exhausted marks chunk as failed
|
||||
- **WHEN** a chunk fails on both the initial attempt and the retry
|
||||
- **THEN** the chunk SHALL be marked as failed
|
||||
- **THEN** `has_partial_failure` SHALL be set to `True`
|
||||
|
||||
#### Scenario: Memory guard failure NOT retried
|
||||
- **WHEN** a chunk's DataFrame exceeds `BATCH_CHUNK_MAX_MEMORY_MB`
|
||||
- **THEN** the chunk SHALL return `False` immediately without retry
|
||||
- **THEN** the query function SHALL have been called exactly once for that chunk
|
||||
|
||||
#### Scenario: Redis store failure NOT retried
|
||||
- **WHEN** `redis_store_chunk()` returns `False`
|
||||
- **THEN** the chunk SHALL return `False` immediately without retry
|
||||
|
||||
### Requirement: reject_dataset_cache SHALL propagate partial failure metadata to API response
|
||||
The cache service SHALL read batch execution metadata and include partial failure information in the API response `meta` field.
|
||||
|
||||
#### Scenario: Partial failure metadata included in response
|
||||
- **WHEN** `execute_primary_query()` uses the batch engine path and `get_batch_progress()` returns `has_partial_failure=True`
|
||||
- **THEN** the response `meta` dict SHALL include `has_partial_failure: true`
|
||||
- **THEN** the response `meta` dict SHALL include `failed_chunk_count` as an integer
|
||||
- **THEN** if `failed_ranges` is present, the response `meta` dict SHALL include `failed_ranges` as a list of `{start, end}` objects
|
||||
|
||||
#### Scenario: Metadata read before redis_clear_batch
|
||||
- **WHEN** `execute_primary_query()` calls `get_batch_progress()`
|
||||
- **THEN** the call SHALL occur after `merge_chunks()` and before `redis_clear_batch()`
|
||||
|
||||
#### Scenario: No partial failure on successful query
|
||||
- **WHEN** all chunks complete successfully
|
||||
- **THEN** the response `meta` dict SHALL NOT include `has_partial_failure`
|
||||
|
||||
#### Scenario: Cache-hit path restores partial failure flag
|
||||
- **WHEN** a cached DataFrame is returned (cache hit) and a partial failure flag was stored during the original query
|
||||
- **THEN** the response `meta` dict SHALL include the same `has_partial_failure`, `failed_chunk_count`, and `failed_ranges` as the original response
|
||||
|
||||
#### Scenario: Partial failure flag TTL matches data storage layer
|
||||
- **WHEN** partial failure is detected and the query result is spilled to parquet spool
|
||||
- **THEN** the partial failure flag SHALL be stored with TTL equal to `_REJECT_ENGINE_SPOOL_TTL_SECONDS` (default 21600 seconds)
|
||||
- **WHEN** partial failure is detected and the query result is stored in L1/L2 Redis cache
|
||||
- **THEN** the partial failure flag SHALL be stored with TTL equal to `_CACHE_TTL` (default 900 seconds)
|
||||
@@ -14,6 +14,28 @@ The API SHALL validate date parameters and basic paging bounds before executing
|
||||
- **WHEN** `end_date` is earlier than `start_date`
|
||||
- **THEN** the API SHALL return HTTP 400 and SHALL NOT run SQL queries
|
||||
|
||||
#### Scenario: Date range exceeds maximum
|
||||
- **WHEN** the date range between `start_date` and `end_date` exceeds 730 days
|
||||
- **THEN** the API SHALL return HTTP 400 with error message "日期範圍不可超過 730 天"
|
||||
|
||||
### Requirement: Reject History API primary query response SHALL include partial failure metadata
|
||||
The primary query endpoint SHALL include batch execution completeness information in the response `meta` field when chunks fail during batch query execution.
|
||||
|
||||
#### Scenario: Partial failure metadata in response
|
||||
- **WHEN** `POST /api/reject-history/query` completes with some chunks failing
|
||||
- **THEN** the response SHALL include `meta.has_partial_failure: true`
|
||||
- **THEN** the response SHALL include `meta.failed_chunk_count` as a positive integer
|
||||
- **THEN** the response SHALL include `meta.failed_ranges` as an array of `{start, end}` date strings (if available)
|
||||
- **THEN** the HTTP status SHALL still be 200 (data is partially available)
|
||||
|
||||
#### Scenario: No partial failure metadata on full success
|
||||
- **WHEN** `POST /api/reject-history/query` completes with all chunks succeeding
|
||||
- **THEN** the response `meta` SHALL NOT include `has_partial_failure`, `failed_chunk_count`, or `failed_ranges`
|
||||
|
||||
#### Scenario: Partial failure metadata preserved on cache hit
|
||||
- **WHEN** `POST /api/reject-history/query` returns cached data that originally had partial failures
|
||||
- **THEN** the response SHALL include the same `meta.has_partial_failure`, `meta.failed_chunk_count`, and `meta.failed_ranges` as the original response
|
||||
|
||||
### Requirement: Reject History API SHALL provide summary metrics endpoint
|
||||
The API SHALL provide aggregated summary metrics for the selected filter context.
|
||||
|
||||
|
||||
@@ -236,6 +236,63 @@ The page template SHALL delegate sections to focused sub-components, following t
|
||||
- **THEN** `App.vue` SHALL hold all reactive state and API logic
|
||||
- **THEN** sub-components SHALL receive data via props and communicate via events
|
||||
|
||||
### Requirement: Reject History page SHALL display partial failure warning banner
|
||||
The page SHALL display an amber warning banner when the query result contains partial failures, informing users that displayed data may be incomplete.
|
||||
|
||||
#### Scenario: Warning banner displayed on partial failure
|
||||
- **WHEN** the primary query response includes `meta.has_partial_failure: true`
|
||||
- **THEN** an amber warning banner SHALL be displayed below the error banner position
|
||||
- **THEN** the warning message SHALL be in Traditional Chinese
|
||||
|
||||
#### Scenario: Warning banner shows failed date ranges
|
||||
- **WHEN** `meta.failed_ranges` contains date range objects
|
||||
- **THEN** the warning banner SHALL display the specific failed date ranges (e.g., "以下日期區間的資料擷取失敗:2025-01-01 ~ 2025-01-10")
|
||||
|
||||
#### Scenario: Warning banner shows generic message without ranges (container mode or missing range data)
|
||||
- **WHEN** `meta.has_partial_failure` is true but `meta.failed_ranges` is empty or absent (e.g., container-id batch query)
|
||||
- **THEN** the warning banner SHALL display a generic message with the failed chunk count (e.g., "3 個查詢批次的資料擷取失敗")
|
||||
|
||||
#### Scenario: Warning banner cleared on new query
|
||||
- **WHEN** user initiates a new primary query
|
||||
- **THEN** the warning banner SHALL be cleared before the new query executes
|
||||
- **THEN** if the new query also has partial failures, the warning SHALL update with new failure information
|
||||
|
||||
#### Scenario: Warning banner coexists with error banner
|
||||
- **WHEN** both an error message and a partial failure warning exist
|
||||
- **THEN** the error banner SHALL appear first, followed by the warning banner
|
||||
|
||||
#### Scenario: Warning banner visual style
|
||||
- **WHEN** the warning banner is rendered
|
||||
- **THEN** it SHALL use amber/orange color scheme (background `#fffbeb`, text `#b45309`)
|
||||
- **THEN** the style SHALL be consistent with the existing `.resolution-warn` color pattern
|
||||
|
||||
### Requirement: Reject History page SHALL validate date range before query submission
|
||||
The page SHALL validate the date range on the client side before sending the API request, providing immediate feedback for invalid ranges.
|
||||
|
||||
#### Scenario: Date range exceeds 730-day limit
|
||||
- **WHEN** user selects a date range exceeding 730 days and clicks "查詢"
|
||||
- **THEN** the page SHALL display an error message "查詢範圍不可超過 730 天(約兩年)"
|
||||
- **THEN** the API request SHALL NOT be sent
|
||||
|
||||
#### Scenario: Missing start or end date
|
||||
- **WHEN** user clicks "查詢" without setting both start_date and end_date (in date_range mode)
|
||||
- **THEN** the page SHALL display an error message "請先設定開始與結束日期"
|
||||
- **THEN** the API request SHALL NOT be sent
|
||||
|
||||
#### Scenario: End date before start date
|
||||
- **WHEN** user selects an end_date earlier than start_date
|
||||
- **THEN** the page SHALL display an error message "結束日期必須大於起始日期"
|
||||
- **THEN** the API request SHALL NOT be sent
|
||||
|
||||
#### Scenario: Valid date range proceeds normally
|
||||
- **WHEN** user selects a valid date range within 730 days and clicks "查詢"
|
||||
- **THEN** no validation error SHALL be shown
|
||||
- **THEN** the API request SHALL proceed normally
|
||||
|
||||
#### Scenario: Container mode skips date validation
|
||||
- **WHEN** query mode is "container" (not "date_range")
|
||||
- **THEN** date range validation SHALL be skipped
|
||||
|
||||
### Requirement: Frontend API timeout
|
||||
The reject-history page SHALL use a 360-second API timeout (up from 60 seconds) for all Oracle-backed API calls.
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ from flask import Blueprint, Response, jsonify, request
|
||||
|
||||
from mes_dashboard.core.cache import cache_get, cache_set, make_cache_key
|
||||
from mes_dashboard.core.rate_limit import configured_rate_limit
|
||||
from mes_dashboard.core.request_validation import parse_json_payload
|
||||
from mes_dashboard.core.utils import parse_bool_query
|
||||
from mes_dashboard.services.reject_dataset_cache import (
|
||||
apply_view,
|
||||
@@ -344,7 +345,7 @@ def api_reject_history_reason_pareto():
|
||||
pareto_scope=pareto_scope,
|
||||
packages=_parse_multi_param("packages") or None,
|
||||
workcenter_groups=_parse_multi_param("workcenter_groups") or None,
|
||||
reason=request.args.get("reason", "").strip() or None,
|
||||
reasons=_parse_multi_param("reasons") or None,
|
||||
trend_dates=_parse_multi_param("trend_dates") or None,
|
||||
include_excluded_scrap=include_excluded_scrap,
|
||||
exclude_material_scrap=exclude_material_scrap,
|
||||
@@ -404,7 +405,7 @@ def api_reject_history_batch_pareto():
|
||||
pareto_display_scope=pareto_display_scope,
|
||||
packages=_parse_multi_param("packages") or None,
|
||||
workcenter_groups=_parse_multi_param("workcenter_groups") or None,
|
||||
reason=request.args.get("reason", "").strip() or None,
|
||||
reasons=_parse_multi_param("reasons") or None,
|
||||
trend_dates=_parse_multi_param("trend_dates") or None,
|
||||
pareto_selections=_parse_multi_pareto_selections(),
|
||||
include_excluded_scrap=include_excluded_scrap,
|
||||
@@ -548,7 +549,9 @@ def api_reject_history_analytics():
|
||||
@reject_history_bp.route("/api/reject-history/query", methods=["POST"])
|
||||
def api_reject_history_query():
|
||||
"""Primary query: execute Oracle → cache DataFrame → return results."""
|
||||
body = request.get_json(silent=True) or {}
|
||||
body, payload_error = parse_json_payload(require_non_empty_object=True)
|
||||
if payload_error is not None:
|
||||
return jsonify({"success": False, "error": payload_error.message}), payload_error.status_code
|
||||
|
||||
mode = str(body.get("mode", "")).strip()
|
||||
if mode not in ("date_range", "container"):
|
||||
@@ -599,7 +602,7 @@ def api_reject_history_view():
|
||||
page = request.args.get("page", 1, type=int) or 1
|
||||
per_page = request.args.get("per_page", 50, type=int) or 50
|
||||
metric_filter = request.args.get("metric_filter", "all").strip().lower() or "all"
|
||||
reason = request.args.get("reason", "").strip() or None
|
||||
reasons = _parse_multi_param("reasons") or None
|
||||
detail_reason = request.args.get("detail_reason", "").strip() or None
|
||||
pareto_selections = _parse_multi_pareto_selections()
|
||||
pareto_dimension = None
|
||||
@@ -618,7 +621,7 @@ def api_reject_history_view():
|
||||
query_id=query_id,
|
||||
packages=_parse_multi_param("packages") or None,
|
||||
workcenter_groups=_parse_multi_param("workcenter_groups") or None,
|
||||
reason=reason,
|
||||
reasons=reasons,
|
||||
metric_filter=metric_filter,
|
||||
trend_dates=_parse_multi_param("trend_dates") or None,
|
||||
detail_reason=detail_reason,
|
||||
@@ -653,7 +656,7 @@ def api_reject_history_export_cached():
|
||||
return jsonify({"success": False, "error": "缺少必要參數: query_id"}), 400
|
||||
|
||||
metric_filter = request.args.get("metric_filter", "all").strip().lower() or "all"
|
||||
reason = request.args.get("reason", "").strip() or None
|
||||
reasons = _parse_multi_param("reasons") or None
|
||||
detail_reason = request.args.get("detail_reason", "").strip() or None
|
||||
pareto_selections = _parse_multi_pareto_selections()
|
||||
pareto_dimension = None
|
||||
@@ -672,7 +675,7 @@ def api_reject_history_export_cached():
|
||||
query_id=query_id,
|
||||
packages=_parse_multi_param("packages") or None,
|
||||
workcenter_groups=_parse_multi_param("workcenter_groups") or None,
|
||||
reason=reason,
|
||||
reasons=reasons,
|
||||
metric_filter=metric_filter,
|
||||
trend_dates=_parse_multi_param("trend_dates") or None,
|
||||
detail_reason=detail_reason,
|
||||
|
||||
@@ -56,6 +56,18 @@ from mes_dashboard.core.redis_df_store import (
|
||||
|
||||
logger = logging.getLogger("mes_dashboard.batch_query_engine")
|
||||
|
||||
|
||||
_RETRYABLE_PATTERNS = (
|
||||
"dpy-4024",
|
||||
"ora-01013",
|
||||
"ora-03113",
|
||||
"ora-03135",
|
||||
"ora-12514",
|
||||
"ora-12541",
|
||||
"timeout",
|
||||
"timed out",
|
||||
)
|
||||
|
||||
# ============================================================
|
||||
# Configuration (env-overridable)
|
||||
# ============================================================
|
||||
@@ -65,7 +77,7 @@ BATCH_CHUNK_MAX_MEMORY_MB: int = int(
|
||||
)
|
||||
|
||||
BATCH_QUERY_TIME_THRESHOLD_DAYS: int = int(
|
||||
os.getenv("BATCH_QUERY_TIME_THRESHOLD_DAYS", "60")
|
||||
os.getenv("BATCH_QUERY_TIME_THRESHOLD_DAYS", "10")
|
||||
)
|
||||
|
||||
BATCH_QUERY_ID_THRESHOLD: int = int(
|
||||
@@ -196,6 +208,7 @@ def _update_progress(
|
||||
failed: int,
|
||||
status: str = "running",
|
||||
has_partial_failure: bool = False,
|
||||
failed_ranges: Optional[List[Dict[str, str]]] = None,
|
||||
ttl: int = 900,
|
||||
) -> None:
|
||||
"""Write/update batch progress metadata to Redis."""
|
||||
@@ -212,6 +225,10 @@ def _update_progress(
|
||||
"status": status,
|
||||
"has_partial_failure": str(has_partial_failure),
|
||||
}
|
||||
if failed_ranges is not None:
|
||||
mapping["failed_ranges"] = json.dumps(
|
||||
failed_ranges, ensure_ascii=False, default=str
|
||||
)
|
||||
try:
|
||||
client.hset(key, mapping=mapping)
|
||||
client.expire(key, ttl)
|
||||
@@ -279,6 +296,7 @@ def execute_plan(
|
||||
completed = 0
|
||||
failed = 0
|
||||
has_partial_failure = False
|
||||
failed_range_list: Optional[List[Dict[str, str]]] = None
|
||||
|
||||
_update_progress(
|
||||
cache_prefix, query_hash,
|
||||
@@ -296,7 +314,9 @@ def execute_plan(
|
||||
_update_progress(
|
||||
cache_prefix, query_hash,
|
||||
total=total, completed=completed, failed=failed,
|
||||
has_partial_failure=has_partial_failure, ttl=chunk_ttl,
|
||||
has_partial_failure=has_partial_failure,
|
||||
failed_ranges=failed_range_list,
|
||||
ttl=chunk_ttl,
|
||||
)
|
||||
continue
|
||||
ok = _execute_single_chunk(
|
||||
@@ -308,14 +328,24 @@ def execute_plan(
|
||||
else:
|
||||
failed += 1
|
||||
has_partial_failure = True
|
||||
if failed_range_list is None:
|
||||
failed_range_list = []
|
||||
chunk_start = chunk.get("chunk_start")
|
||||
chunk_end = chunk.get("chunk_end")
|
||||
if chunk_start and chunk_end:
|
||||
failed_range_list.append(
|
||||
{"start": str(chunk_start), "end": str(chunk_end)}
|
||||
)
|
||||
_update_progress(
|
||||
cache_prefix, query_hash,
|
||||
total=total, completed=completed, failed=failed,
|
||||
has_partial_failure=has_partial_failure, ttl=chunk_ttl,
|
||||
has_partial_failure=has_partial_failure,
|
||||
failed_ranges=failed_range_list,
|
||||
ttl=chunk_ttl,
|
||||
)
|
||||
else:
|
||||
# --- Parallel path ---
|
||||
completed, failed, has_partial_failure = _execute_parallel(
|
||||
completed, failed, has_partial_failure, failed_range_list = _execute_parallel(
|
||||
chunks, query_fn, cache_prefix, query_hash,
|
||||
chunk_ttl, max_rows_per_chunk, skip_cached,
|
||||
effective_parallel,
|
||||
@@ -327,6 +357,7 @@ def execute_plan(
|
||||
total=total, completed=completed, failed=failed,
|
||||
status=final_status,
|
||||
has_partial_failure=has_partial_failure,
|
||||
failed_ranges=failed_range_list,
|
||||
ttl=chunk_ttl,
|
||||
)
|
||||
|
||||
@@ -366,53 +397,59 @@ def _execute_single_chunk(
|
||||
query_hash: str,
|
||||
chunk_ttl: int,
|
||||
max_rows_per_chunk: Optional[int],
|
||||
max_retries: int = 1,
|
||||
) -> bool:
|
||||
"""Run one chunk through *query_fn*, apply guards, store result.
|
||||
|
||||
Returns True on success, False on failure.
|
||||
"""
|
||||
try:
|
||||
df = query_fn(chunk, max_rows_per_chunk=max_rows_per_chunk)
|
||||
if df is None:
|
||||
df = pd.DataFrame()
|
||||
attempts = max(0, int(max_retries)) + 1
|
||||
for attempt in range(attempts):
|
||||
try:
|
||||
df = query_fn(chunk, max_rows_per_chunk=max_rows_per_chunk)
|
||||
if df is None:
|
||||
df = pd.DataFrame()
|
||||
|
||||
# ---- Memory guard ----
|
||||
mem_bytes = df.memory_usage(deep=True).sum()
|
||||
mem_mb = mem_bytes / (1024 * 1024)
|
||||
if mem_mb > BATCH_CHUNK_MAX_MEMORY_MB:
|
||||
logger.warning(
|
||||
"Chunk %d memory %.1f MB exceeds limit %d MB — discarded",
|
||||
idx, mem_mb, BATCH_CHUNK_MAX_MEMORY_MB,
|
||||
# ---- Memory guard ----
|
||||
mem_bytes = df.memory_usage(deep=True).sum()
|
||||
mem_mb = mem_bytes / (1024 * 1024)
|
||||
if mem_mb > BATCH_CHUNK_MAX_MEMORY_MB:
|
||||
logger.warning(
|
||||
"Chunk %d memory %.1f MB exceeds limit %d MB — discarded",
|
||||
idx, mem_mb, BATCH_CHUNK_MAX_MEMORY_MB,
|
||||
)
|
||||
return False
|
||||
|
||||
# ---- Store to Redis ----
|
||||
stored = redis_store_chunk(cache_prefix, query_hash, idx, df, ttl=chunk_ttl)
|
||||
if not stored:
|
||||
logger.warning(
|
||||
"Chunk %d failed to persist into Redis, marking as failed", idx
|
||||
)
|
||||
return False
|
||||
|
||||
logger.debug(
|
||||
"Chunk %d completed: %d rows, %.1f MB",
|
||||
idx, len(df), mem_mb,
|
||||
)
|
||||
return True
|
||||
|
||||
except Exception as exc:
|
||||
should_retry = attempt < attempts - 1 and _is_retryable_error(exc)
|
||||
if should_retry:
|
||||
logger.warning(
|
||||
"Chunk %d transient failure on attempt %d/%d: %s; retrying",
|
||||
idx,
|
||||
attempt + 1,
|
||||
attempts,
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
logger.error(
|
||||
"Chunk %d failed: %s", idx, exc, exc_info=True,
|
||||
)
|
||||
return False
|
||||
|
||||
# ---- Truncation flag ----
|
||||
truncated = (
|
||||
max_rows_per_chunk is not None
|
||||
and len(df) == max_rows_per_chunk
|
||||
)
|
||||
if truncated:
|
||||
logger.info("Chunk %d returned exactly max_rows_per_chunk=%d (truncated)", idx, max_rows_per_chunk)
|
||||
|
||||
# ---- Store to Redis ----
|
||||
stored = redis_store_chunk(cache_prefix, query_hash, idx, df, ttl=chunk_ttl)
|
||||
if not stored:
|
||||
logger.warning(
|
||||
"Chunk %d failed to persist into Redis, marking as failed", idx
|
||||
)
|
||||
return False
|
||||
|
||||
logger.debug(
|
||||
"Chunk %d completed: %d rows, %.1f MB",
|
||||
idx, len(df), mem_mb,
|
||||
)
|
||||
return True
|
||||
|
||||
except Exception as exc:
|
||||
logger.error(
|
||||
"Chunk %d failed: %s", idx, exc, exc_info=True,
|
||||
)
|
||||
return False
|
||||
return False
|
||||
|
||||
|
||||
def _execute_parallel(
|
||||
@@ -427,12 +464,13 @@ def _execute_parallel(
|
||||
) -> tuple:
|
||||
"""Execute chunks in parallel via ThreadPoolExecutor.
|
||||
|
||||
Returns (completed, failed, has_partial_failure).
|
||||
Returns (completed, failed, has_partial_failure, failed_ranges).
|
||||
"""
|
||||
total = len(chunks)
|
||||
completed = 0
|
||||
failed = 0
|
||||
has_partial_failure = False
|
||||
failed_range_list: Optional[List[Dict[str, str]]] = None
|
||||
|
||||
futures = {}
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
@@ -445,10 +483,10 @@ def _execute_parallel(
|
||||
idx, chunk, query_fn,
|
||||
cache_prefix, query_hash, chunk_ttl, max_rows_per_chunk,
|
||||
)
|
||||
futures[future] = idx
|
||||
futures[future] = (idx, chunk)
|
||||
|
||||
for future in as_completed(futures):
|
||||
idx = futures[future]
|
||||
idx, chunk = futures[future]
|
||||
try:
|
||||
ok = future.result()
|
||||
if ok:
|
||||
@@ -456,18 +494,46 @@ def _execute_parallel(
|
||||
else:
|
||||
failed += 1
|
||||
has_partial_failure = True
|
||||
if failed_range_list is None:
|
||||
failed_range_list = []
|
||||
chunk_start = chunk.get("chunk_start")
|
||||
chunk_end = chunk.get("chunk_end")
|
||||
if chunk_start and chunk_end:
|
||||
failed_range_list.append(
|
||||
{"start": str(chunk_start), "end": str(chunk_end)}
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("Chunk %d future error: %s", idx, exc)
|
||||
failed += 1
|
||||
has_partial_failure = True
|
||||
if failed_range_list is None:
|
||||
failed_range_list = []
|
||||
chunk_start = chunk.get("chunk_start")
|
||||
chunk_end = chunk.get("chunk_end")
|
||||
if chunk_start and chunk_end:
|
||||
failed_range_list.append(
|
||||
{"start": str(chunk_start), "end": str(chunk_end)}
|
||||
)
|
||||
|
||||
_update_progress(
|
||||
cache_prefix, query_hash,
|
||||
total=total, completed=completed, failed=failed,
|
||||
has_partial_failure=has_partial_failure, ttl=chunk_ttl,
|
||||
has_partial_failure=has_partial_failure,
|
||||
failed_ranges=failed_range_list,
|
||||
ttl=chunk_ttl,
|
||||
)
|
||||
|
||||
return completed, failed, has_partial_failure
|
||||
return completed, failed, has_partial_failure, failed_range_list
|
||||
|
||||
|
||||
def _is_retryable_error(exc: Exception) -> bool:
|
||||
"""Return True for transient Oracle/network timeout errors."""
|
||||
if isinstance(exc, (TimeoutError, ConnectionError, OSError)):
|
||||
return True
|
||||
text = str(exc).strip().lower()
|
||||
if not text:
|
||||
return False
|
||||
return any(pattern in text for pattern in _RETRYABLE_PATTERNS)
|
||||
|
||||
|
||||
# ============================================================
|
||||
|
||||
152
src/mes_dashboard/services/container_resolution_policy.py
Normal file
152
src/mes_dashboard/services/container_resolution_policy.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Shared guardrails for LOT/WAFER/工單 container resolution."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
raw = os.getenv(name)
|
||||
if raw is None:
|
||||
return int(default)
|
||||
try:
|
||||
return int(raw)
|
||||
except (TypeError, ValueError):
|
||||
return int(default)
|
||||
|
||||
|
||||
def _normalize_wildcard_token(value: str) -> str:
|
||||
return str(value or "").replace("*", "%")
|
||||
|
||||
|
||||
def _is_pattern_token(value: str) -> bool:
|
||||
token = _normalize_wildcard_token(value)
|
||||
return "%" in token or "_" in token
|
||||
|
||||
|
||||
def _literal_prefix_before_wildcard(value: str) -> str:
|
||||
token = _normalize_wildcard_token(value)
|
||||
for idx, ch in enumerate(token):
|
||||
if ch in ("%", "_"):
|
||||
return token[:idx]
|
||||
return token
|
||||
|
||||
|
||||
def normalize_input_values(values: Iterable[Any]) -> List[str]:
|
||||
normalized: List[str] = []
|
||||
seen = set()
|
||||
for raw in values or []:
|
||||
token = str(raw or "").strip()
|
||||
if not token or token in seen:
|
||||
continue
|
||||
seen.add(token)
|
||||
normalized.append(token)
|
||||
return normalized
|
||||
|
||||
|
||||
def validate_resolution_request(input_type: str, values: Iterable[Any]) -> Optional[str]:
|
||||
"""Validate resolver request without hard-capping raw input count."""
|
||||
tokens = normalize_input_values(values)
|
||||
if not tokens:
|
||||
return "請輸入至少一個查詢條件"
|
||||
|
||||
# Compatibility switch. Default 0 means "no count cap".
|
||||
max_values = max(_env_int("CONTAINER_RESOLVE_INPUT_MAX_VALUES", 0), 0)
|
||||
if max_values and len(tokens) > max_values:
|
||||
return f"輸入數量超過上限 ({max_values} 筆)"
|
||||
|
||||
# Wildcard safety: avoid full-table scans like "%" or "_".
|
||||
min_prefix_len = max(_env_int("CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN", 2), 0)
|
||||
if min_prefix_len > 0:
|
||||
invalid_patterns: List[str] = []
|
||||
for token in tokens:
|
||||
if not _is_pattern_token(token):
|
||||
continue
|
||||
if len(_literal_prefix_before_wildcard(token).strip()) < min_prefix_len:
|
||||
invalid_patterns.append(token)
|
||||
if invalid_patterns:
|
||||
sample = ", ".join(invalid_patterns[:3])
|
||||
suffix = "..." if len(invalid_patterns) > 3 else ""
|
||||
return (
|
||||
f"{input_type} 萬用字元條件過於寬鬆(需至少 {min_prefix_len} 碼前綴): "
|
||||
f"{sample}{suffix}"
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_container_ids(rows: Iterable[Dict[str, Any]]) -> List[str]:
|
||||
ids: List[str] = []
|
||||
seen = set()
|
||||
for row in rows or []:
|
||||
cid = str(
|
||||
row.get("container_id")
|
||||
or row.get("CONTAINERID")
|
||||
or ""
|
||||
).strip()
|
||||
if not cid or cid in seen:
|
||||
continue
|
||||
seen.add(cid)
|
||||
ids.append(cid)
|
||||
return ids
|
||||
|
||||
|
||||
def assess_resolution_result(result: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Assess expansion result against guardrails."""
|
||||
expansion_info = result.get("expansion_info") or {}
|
||||
max_expand_per_token = max(
|
||||
_env_int("CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN", 2000),
|
||||
1,
|
||||
)
|
||||
offenders: List[Dict[str, Any]] = []
|
||||
for token, count in expansion_info.items():
|
||||
try:
|
||||
c = int(count)
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if c > max_expand_per_token:
|
||||
offenders.append({"token": str(token), "count": c})
|
||||
|
||||
unique_ids = extract_container_ids(result.get("data") or [])
|
||||
max_container_ids = max(
|
||||
_env_int("CONTAINER_RESOLVE_MAX_CONTAINER_IDS", 30000),
|
||||
1,
|
||||
)
|
||||
return {
|
||||
"max_expansion_per_token": max_expand_per_token,
|
||||
"expansion_offenders": offenders,
|
||||
"max_container_ids": max_container_ids,
|
||||
"resolved_container_ids": len(unique_ids),
|
||||
"over_container_limit": len(unique_ids) > max_container_ids,
|
||||
}
|
||||
|
||||
|
||||
def validate_resolution_result(
|
||||
result: Dict[str, Any],
|
||||
*,
|
||||
strict: bool = True,
|
||||
) -> Optional[str]:
|
||||
"""Validate expansion result guardrails.
|
||||
|
||||
strict=True: exceed guardrail -> return error message.
|
||||
strict=False: exceed guardrail -> allow caller to continue (split/decompose path).
|
||||
"""
|
||||
assessment = assess_resolution_result(result)
|
||||
offenders = assessment.get("expansion_offenders") or []
|
||||
if offenders and strict:
|
||||
first = offenders[0]
|
||||
token = str(first.get("token") or "")
|
||||
count = int(first.get("count") or 0)
|
||||
return (
|
||||
f"單一條件展開過大 ({count} 筆,限制 {assessment['max_expansion_per_token']}),"
|
||||
f"請縮小範圍: {token}"
|
||||
)
|
||||
|
||||
if bool(assessment.get("over_container_limit")) and strict:
|
||||
return (
|
||||
f"解析結果過大({assessment['resolved_container_ids']} 筆 CONTAINERID,限制 {assessment['max_container_ids']})"
|
||||
",請縮小查詢條件"
|
||||
)
|
||||
return None
|
||||
@@ -21,6 +21,10 @@ logger = logging.getLogger("mes_dashboard.event_fetcher")
|
||||
ORACLE_IN_BATCH_SIZE = 1000
|
||||
EVENT_FETCHER_MAX_WORKERS = int(os.getenv('EVENT_FETCHER_MAX_WORKERS', '2'))
|
||||
CACHE_SKIP_CID_THRESHOLD = int(os.getenv('EVENT_FETCHER_CACHE_SKIP_CID_THRESHOLD', '10000'))
|
||||
EVENT_FETCHER_ALLOW_PARTIAL_RESULTS = (
|
||||
os.getenv('EVENT_FETCHER_ALLOW_PARTIAL_RESULTS', 'false').strip().lower()
|
||||
in {'1', 'true', 'yes', 'on'}
|
||||
)
|
||||
|
||||
_DOMAIN_SPECS: Dict[str, Dict[str, Any]] = {
|
||||
"history": {
|
||||
@@ -280,16 +284,23 @@ class EventFetcher:
|
||||
for batch in batches:
|
||||
_fetch_and_group_batch(batch)
|
||||
else:
|
||||
failures = []
|
||||
with ThreadPoolExecutor(max_workers=min(len(batches), EVENT_FETCHER_MAX_WORKERS)) as executor:
|
||||
futures = {executor.submit(_fetch_and_group_batch, b): b for b in batches}
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
future.result()
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
failures.append((futures[future], exc))
|
||||
logger.error(
|
||||
"EventFetcher batch query failed domain=%s batch_size=%s",
|
||||
domain, len(futures[future]), exc_info=True,
|
||||
)
|
||||
if failures and not EVENT_FETCHER_ALLOW_PARTIAL_RESULTS:
|
||||
failed_cids = sum(len(batch) for batch, _ in failures)
|
||||
raise RuntimeError(
|
||||
f"EventFetcher chunk failed (domain={domain}, failed_chunks={len(failures)}, failed_cids={failed_cids})"
|
||||
)
|
||||
|
||||
result = dict(grouped)
|
||||
del grouped
|
||||
|
||||
@@ -150,7 +150,7 @@ def get_jobs_by_resources(
|
||||
) -> Dict[str, Any]:
|
||||
"""Query jobs for selected resources within date range.
|
||||
|
||||
For date ranges exceeding BATCH_QUERY_TIME_THRESHOLD_DAYS (default 60),
|
||||
For date ranges exceeding BATCH_QUERY_TIME_THRESHOLD_DAYS (default 10),
|
||||
the query is decomposed into monthly chunks via BatchQueryEngine.
|
||||
Results are cached in Redis to avoid redundant Oracle queries.
|
||||
|
||||
|
||||
@@ -863,7 +863,7 @@ def _fetch_station_detection_data(
|
||||
) -> Optional[pd.DataFrame]:
|
||||
"""Execute station_detection.sql and return raw DataFrame.
|
||||
|
||||
For date ranges exceeding BATCH_QUERY_TIME_THRESHOLD_DAYS (default 60),
|
||||
For date ranges exceeding BATCH_QUERY_TIME_THRESHOLD_DAYS (default 10),
|
||||
the query is decomposed into monthly chunks via BatchQueryEngine to
|
||||
prevent Oracle timeout on high-volume stations.
|
||||
"""
|
||||
|
||||
@@ -26,9 +26,15 @@ from typing import Any, Dict, List, Optional, Generator, Iterable, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from mes_dashboard.core.database import read_sql_df
|
||||
from mes_dashboard.sql import QueryBuilder, SQLLoader
|
||||
from mes_dashboard.services.event_fetcher import EventFetcher
|
||||
from mes_dashboard.core.database import read_sql_df
|
||||
from mes_dashboard.sql import QueryBuilder, SQLLoader
|
||||
from mes_dashboard.services.container_resolution_policy import (
|
||||
assess_resolution_result,
|
||||
normalize_input_values,
|
||||
validate_resolution_request,
|
||||
validate_resolution_result,
|
||||
)
|
||||
from mes_dashboard.services.event_fetcher import EventFetcher
|
||||
|
||||
try:
|
||||
from mes_dashboard.core.database import read_sql_df_slow
|
||||
@@ -89,7 +95,7 @@ def validate_date_range(start_date: str, end_date: str, max_days: int = MAX_DATE
|
||||
return f'日期格式錯誤: {e}'
|
||||
|
||||
|
||||
def validate_lot_input(input_type: str, values: List[str]) -> Optional[str]:
|
||||
def validate_lot_input(input_type: str, values: List[str]) -> Optional[str]:
|
||||
"""Validate LOT input based on type.
|
||||
|
||||
Args:
|
||||
@@ -99,23 +105,7 @@ def validate_lot_input(input_type: str, values: List[str]) -> Optional[str]:
|
||||
Returns:
|
||||
Error message if validation fails, None if valid.
|
||||
"""
|
||||
if not values:
|
||||
return '請輸入至少一個查詢條件'
|
||||
|
||||
limits = {
|
||||
'lot_id': MAX_LOT_IDS,
|
||||
'wafer_lot': MAX_LOT_IDS,
|
||||
'gd_lot_id': MAX_LOT_IDS,
|
||||
'serial_number': MAX_SERIAL_NUMBERS,
|
||||
'work_order': MAX_WORK_ORDERS,
|
||||
'gd_work_order': MAX_GD_WORK_ORDERS,
|
||||
}
|
||||
|
||||
limit = limits.get(input_type, MAX_LOT_IDS)
|
||||
if len(values) > limit:
|
||||
return f'輸入數量超過上限 ({limit} 筆)'
|
||||
|
||||
return None
|
||||
return validate_resolution_request(input_type, values)
|
||||
|
||||
|
||||
def validate_equipment_input(equipment_ids: List[str]) -> Optional[str]:
|
||||
@@ -344,27 +334,50 @@ def resolve_lots(input_type: str, values: List[str]) -> Dict[str, Any]:
|
||||
return {'error': validation_error}
|
||||
|
||||
# Clean values
|
||||
cleaned = [v.strip() for v in values if v.strip()]
|
||||
if not cleaned:
|
||||
return {'error': '請輸入有效的查詢條件'}
|
||||
cleaned = normalize_input_values(values)
|
||||
if not cleaned:
|
||||
return {'error': '請輸入有效的查詢條件'}
|
||||
|
||||
try:
|
||||
if input_type == 'lot_id':
|
||||
return _resolve_by_lot_id(cleaned)
|
||||
elif input_type == 'wafer_lot':
|
||||
return _resolve_by_wafer_lot(cleaned)
|
||||
elif input_type == 'gd_lot_id':
|
||||
return _resolve_by_gd_lot_id(cleaned)
|
||||
elif input_type == 'serial_number':
|
||||
return _resolve_by_serial_number(cleaned)
|
||||
elif input_type == 'work_order':
|
||||
return _resolve_by_work_order(cleaned)
|
||||
elif input_type == 'gd_work_order':
|
||||
return _resolve_by_gd_work_order(cleaned)
|
||||
else:
|
||||
return {'error': f'不支援的輸入類型: {input_type}'}
|
||||
|
||||
except Exception as exc:
|
||||
if input_type == 'lot_id':
|
||||
result = _resolve_by_lot_id(cleaned)
|
||||
elif input_type == 'wafer_lot':
|
||||
result = _resolve_by_wafer_lot(cleaned)
|
||||
elif input_type == 'gd_lot_id':
|
||||
result = _resolve_by_gd_lot_id(cleaned)
|
||||
elif input_type == 'serial_number':
|
||||
result = _resolve_by_serial_number(cleaned)
|
||||
elif input_type == 'work_order':
|
||||
result = _resolve_by_work_order(cleaned)
|
||||
elif input_type == 'gd_work_order':
|
||||
result = _resolve_by_gd_work_order(cleaned)
|
||||
else:
|
||||
return {'error': f'不支援的輸入類型: {input_type}'}
|
||||
|
||||
guard_assessment = assess_resolution_result(result)
|
||||
overflow_tokens = guard_assessment.get("expansion_offenders") or []
|
||||
overflow_total = bool(guard_assessment.get("over_container_limit"))
|
||||
if overflow_tokens or overflow_total:
|
||||
logger.warning(
|
||||
"Resolution guardrail overflow (input_type=%s, offenders=%s, resolved=%s, max=%s); continuing with decompose path",
|
||||
input_type,
|
||||
len(overflow_tokens),
|
||||
guard_assessment.get("resolved_container_ids"),
|
||||
guard_assessment.get("max_container_ids"),
|
||||
)
|
||||
result["guardrail"] = {
|
||||
"overflow": True,
|
||||
"expansion_offenders": overflow_tokens,
|
||||
"resolved_container_ids": guard_assessment.get("resolved_container_ids"),
|
||||
"max_container_ids": guard_assessment.get("max_container_ids"),
|
||||
}
|
||||
# Keep compatibility: validation API remains available for strict call sites.
|
||||
guard_error = validate_resolution_result(result, strict=False)
|
||||
if guard_error:
|
||||
return {'error': guard_error}
|
||||
return result
|
||||
|
||||
except Exception as exc:
|
||||
logger.error(f"LOT resolution failed: {exc}")
|
||||
return {'error': f'解析失敗: {str(exc)}'}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -30,6 +30,31 @@ WITH spec_map AS (
|
||||
WHERE SPEC IS NOT NULL
|
||||
GROUP BY SPEC
|
||||
),
|
||||
reject_scope AS (
|
||||
SELECT DISTINCT
|
||||
r.WIPTRACKINGGROUPKEYID
|
||||
FROM DWH.DW_MES_LOTREJECTHISTORY r
|
||||
WHERE {{ BASE_WHERE }}
|
||||
AND r.WIPTRACKINGGROUPKEYID IS NOT NULL
|
||||
),
|
||||
wip_workflow_map AS (
|
||||
SELECT
|
||||
WIPTRACKINGGROUPKEYID,
|
||||
WORKFLOWNAME
|
||||
FROM (
|
||||
SELECT
|
||||
lwh.WIPTRACKINGGROUPKEYID,
|
||||
lwh.WORKFLOWNAME,
|
||||
ROW_NUMBER() OVER (
|
||||
PARTITION BY lwh.WIPTRACKINGGROUPKEYID
|
||||
ORDER BY lwh.MOVEOUTTIMESTAMP DESC NULLS LAST
|
||||
) AS rn
|
||||
FROM DWH.DW_MES_LOTWIPHISTORY lwh
|
||||
INNER JOIN reject_scope rs
|
||||
ON rs.WIPTRACKINGGROUPKEYID = lwh.WIPTRACKINGGROUPKEYID
|
||||
)
|
||||
WHERE rn = 1
|
||||
),
|
||||
reject_raw AS (
|
||||
SELECT
|
||||
TRUNC(r.TXNDATE) AS TXN_DAY,
|
||||
@@ -105,7 +130,7 @@ reject_raw AS (
|
||||
FROM DWH.DW_MES_LOTREJECTHISTORY r
|
||||
LEFT JOIN DWH.DW_MES_CONTAINER c
|
||||
ON c.CONTAINERID = r.CONTAINERID
|
||||
LEFT JOIN DWH.DW_MES_LOTWIPHISTORY lwh
|
||||
LEFT JOIN wip_workflow_map lwh
|
||||
ON lwh.WIPTRACKINGGROUPKEYID = r.WIPTRACKINGGROUPKEYID
|
||||
LEFT JOIN spec_map sm
|
||||
ON sm.SPEC = TRIM(r.SPECNAME)
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
-- :end_date - End date (YYYY-MM-DD)
|
||||
|
||||
WITH spec_map AS (
|
||||
SELECT
|
||||
SPEC,
|
||||
SELECT
|
||||
SPEC,
|
||||
MIN(WORK_CENTER) KEEP (
|
||||
DENSE_RANK FIRST ORDER BY WORKCENTERSEQUENCE_GROUP
|
||||
) AS WORK_CENTER,
|
||||
@@ -15,9 +15,34 @@ WITH spec_map AS (
|
||||
DENSE_RANK FIRST ORDER BY WORKCENTERSEQUENCE_GROUP
|
||||
) AS WORKCENTER_GROUP,
|
||||
MIN(WORKCENTERSEQUENCE_GROUP) AS WORKCENTERSEQUENCE_GROUP
|
||||
FROM DWH.DW_MES_SPEC_WORKCENTER_V
|
||||
WHERE SPEC IS NOT NULL
|
||||
GROUP BY SPEC
|
||||
FROM DWH.DW_MES_SPEC_WORKCENTER_V
|
||||
WHERE SPEC IS NOT NULL
|
||||
GROUP BY SPEC
|
||||
),
|
||||
reject_scope AS (
|
||||
SELECT DISTINCT
|
||||
r.WIPTRACKINGGROUPKEYID
|
||||
FROM DWH.DW_MES_LOTREJECTHISTORY r
|
||||
WHERE {{ BASE_WHERE }}
|
||||
AND r.WIPTRACKINGGROUPKEYID IS NOT NULL
|
||||
),
|
||||
wip_workflow_map AS (
|
||||
SELECT
|
||||
WIPTRACKINGGROUPKEYID,
|
||||
WORKFLOWNAME
|
||||
FROM (
|
||||
SELECT
|
||||
lwh.WIPTRACKINGGROUPKEYID,
|
||||
lwh.WORKFLOWNAME,
|
||||
ROW_NUMBER() OVER (
|
||||
PARTITION BY lwh.WIPTRACKINGGROUPKEYID
|
||||
ORDER BY lwh.MOVEOUTTIMESTAMP DESC NULLS LAST
|
||||
) AS rn
|
||||
FROM DWH.DW_MES_LOTWIPHISTORY lwh
|
||||
INNER JOIN reject_scope rs
|
||||
ON rs.WIPTRACKINGGROUPKEYID = lwh.WIPTRACKINGGROUPKEYID
|
||||
)
|
||||
WHERE rn = 1
|
||||
),
|
||||
reject_raw AS (
|
||||
SELECT
|
||||
@@ -99,7 +124,7 @@ reject_raw AS (
|
||||
FROM DWH.DW_MES_LOTREJECTHISTORY r
|
||||
LEFT JOIN DWH.DW_MES_CONTAINER c
|
||||
ON c.CONTAINERID = r.CONTAINERID
|
||||
LEFT JOIN DWH.DW_MES_LOTWIPHISTORY lwh
|
||||
LEFT JOIN wip_workflow_map lwh
|
||||
ON lwh.WIPTRACKINGGROUPKEYID = r.WIPTRACKINGGROUPKEYID
|
||||
LEFT JOIN spec_map sm
|
||||
ON sm.SPEC = TRIM(r.SPECNAME)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Unit tests for BatchQueryEngine module."""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock, call
|
||||
|
||||
@@ -482,8 +483,8 @@ class TestChunkFailureResilience:
|
||||
skip_cached=False,
|
||||
)
|
||||
|
||||
# All 3 chunks attempted
|
||||
assert call_count["n"] == 3
|
||||
# One chunk retried once due retryable timeout pattern.
|
||||
assert call_count["n"] == 4
|
||||
# Final metadata should reflect partial failure
|
||||
last = hset_calls[-1]
|
||||
assert last["status"] == "partial"
|
||||
@@ -567,10 +568,147 @@ class TestShouldDecompose:
|
||||
assert should_decompose_by_time("2025-01-01", "2025-12-31")
|
||||
|
||||
def test_short_range_false(self):
|
||||
assert not should_decompose_by_time("2025-01-01", "2025-02-01")
|
||||
assert not should_decompose_by_time("2025-01-01", "2025-01-11")
|
||||
|
||||
def test_large_ids_true(self):
|
||||
assert should_decompose_by_ids(list(range(2000)))
|
||||
|
||||
def test_small_ids_false(self):
|
||||
assert not should_decompose_by_ids(list(range(500)))
|
||||
|
||||
|
||||
class TestRetryAndFailedRanges:
|
||||
def _mock_redis(self):
|
||||
mock_client = MagicMock()
|
||||
stored = {}
|
||||
hashes = {}
|
||||
|
||||
mock_client.setex.side_effect = lambda k, t, v: stored.update({k: v})
|
||||
mock_client.get.side_effect = lambda k: stored.get(k)
|
||||
mock_client.exists.side_effect = lambda k: 1 if k in stored else 0
|
||||
mock_client.hset.side_effect = lambda k, mapping=None: hashes.setdefault(k, {}).update(mapping or {})
|
||||
mock_client.hgetall.side_effect = lambda k: hashes.get(k, {})
|
||||
mock_client.expire.return_value = None
|
||||
return mock_client
|
||||
|
||||
def test_transient_failure_retried_once(self):
|
||||
import mes_dashboard.core.redis_df_store as rds
|
||||
import mes_dashboard.services.batch_query_engine as bqe
|
||||
|
||||
mock_client = self._mock_redis()
|
||||
call_count = {"n": 0}
|
||||
|
||||
def flaky_query_fn(chunk, max_rows_per_chunk=None):
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
raise TimeoutError("connection timed out")
|
||||
return pd.DataFrame({"V": [1]})
|
||||
|
||||
with patch.object(rds, "REDIS_ENABLED", True), \
|
||||
patch.object(rds, "get_redis_client", return_value=mock_client), \
|
||||
patch.object(bqe, "get_redis_client", return_value=mock_client):
|
||||
execute_plan(
|
||||
[{"chunk_start": "2025-01-01", "chunk_end": "2025-01-10"}],
|
||||
flaky_query_fn,
|
||||
query_hash="retryonce",
|
||||
cache_prefix="retry",
|
||||
skip_cached=False,
|
||||
)
|
||||
progress = bqe.get_batch_progress("retry", "retryonce")
|
||||
|
||||
assert call_count["n"] == 2
|
||||
assert progress is not None
|
||||
assert progress.get("status") == "completed"
|
||||
assert progress.get("failed") == "0"
|
||||
|
||||
def test_memory_guard_not_retried(self):
|
||||
import mes_dashboard.core.redis_df_store as rds
|
||||
import mes_dashboard.services.batch_query_engine as bqe
|
||||
|
||||
mock_client = self._mock_redis()
|
||||
call_count = {"n": 0}
|
||||
|
||||
def large_df_query_fn(chunk, max_rows_per_chunk=None):
|
||||
call_count["n"] += 1
|
||||
return pd.DataFrame({"V": [1]})
|
||||
|
||||
with patch.object(rds, "REDIS_ENABLED", True), \
|
||||
patch.object(rds, "get_redis_client", return_value=mock_client), \
|
||||
patch.object(bqe, "get_redis_client", return_value=mock_client), \
|
||||
patch.object(bqe, "BATCH_CHUNK_MAX_MEMORY_MB", 0):
|
||||
execute_plan(
|
||||
[{"chunk_start": "2025-01-01", "chunk_end": "2025-01-10"}],
|
||||
large_df_query_fn,
|
||||
query_hash="memnoretry",
|
||||
cache_prefix="retry",
|
||||
skip_cached=False,
|
||||
)
|
||||
|
||||
assert call_count["n"] == 1
|
||||
|
||||
def test_failed_ranges_tracked(self):
|
||||
import mes_dashboard.core.redis_df_store as rds
|
||||
import mes_dashboard.services.batch_query_engine as bqe
|
||||
|
||||
mock_client = self._mock_redis()
|
||||
|
||||
def query_fn(chunk, max_rows_per_chunk=None):
|
||||
if chunk["chunk_start"] == "2025-01-11":
|
||||
raise RuntimeError("chunk failure")
|
||||
return pd.DataFrame({"V": [1]})
|
||||
|
||||
chunks = [
|
||||
{"chunk_start": "2025-01-01", "chunk_end": "2025-01-10"},
|
||||
{"chunk_start": "2025-01-11", "chunk_end": "2025-01-20"},
|
||||
{"chunk_start": "2025-01-21", "chunk_end": "2025-01-30"},
|
||||
]
|
||||
with patch.object(rds, "REDIS_ENABLED", True), \
|
||||
patch.object(rds, "get_redis_client", return_value=mock_client), \
|
||||
patch.object(bqe, "get_redis_client", return_value=mock_client):
|
||||
execute_plan(
|
||||
chunks,
|
||||
query_fn,
|
||||
query_hash="franges",
|
||||
cache_prefix="retry",
|
||||
skip_cached=False,
|
||||
)
|
||||
progress = bqe.get_batch_progress("retry", "franges")
|
||||
|
||||
assert progress is not None
|
||||
assert progress.get("has_partial_failure") == "True"
|
||||
assert progress.get("failed") == "1"
|
||||
failed_ranges = json.loads(progress.get("failed_ranges", "[]"))
|
||||
assert failed_ranges == [{"start": "2025-01-11", "end": "2025-01-20"}]
|
||||
|
||||
def test_id_batch_chunk_no_failed_ranges(self):
|
||||
import mes_dashboard.core.redis_df_store as rds
|
||||
import mes_dashboard.services.batch_query_engine as bqe
|
||||
|
||||
mock_client = self._mock_redis()
|
||||
|
||||
def query_fn(chunk, max_rows_per_chunk=None):
|
||||
if chunk.get("ids") == ["B"]:
|
||||
raise RuntimeError("id chunk failed")
|
||||
return pd.DataFrame({"V": [1]})
|
||||
|
||||
chunks = [
|
||||
{"ids": ["A"]},
|
||||
{"ids": ["B"]},
|
||||
]
|
||||
with patch.object(rds, "REDIS_ENABLED", True), \
|
||||
patch.object(rds, "get_redis_client", return_value=mock_client), \
|
||||
patch.object(bqe, "get_redis_client", return_value=mock_client):
|
||||
execute_plan(
|
||||
chunks,
|
||||
query_fn,
|
||||
query_hash="idfail",
|
||||
cache_prefix="retry",
|
||||
skip_cached=False,
|
||||
)
|
||||
progress = bqe.get_batch_progress("retry", "idfail")
|
||||
|
||||
assert progress is not None
|
||||
assert progress.get("has_partial_failure") == "True"
|
||||
assert progress.get("failed") == "1"
|
||||
failed_ranges = json.loads(progress.get("failed_ranges", "[]"))
|
||||
assert failed_ranges == []
|
||||
|
||||
73
tests/test_container_resolution_policy.py
Normal file
73
tests/test_container_resolution_policy.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Unit tests for shared container resolution policy helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from mes_dashboard.services import container_resolution_policy as policy
|
||||
|
||||
|
||||
def test_validate_resolution_request_rejects_empty_values():
|
||||
assert policy.validate_resolution_request("lot_id", []) is not None
|
||||
|
||||
|
||||
def test_validate_resolution_request_rejects_broad_pattern(monkeypatch):
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN", "2")
|
||||
error = policy.validate_resolution_request("lot_id", ["%"])
|
||||
assert error is not None
|
||||
assert "萬用字元條件過於寬鬆" in error
|
||||
|
||||
|
||||
def test_validate_resolution_request_allows_pattern_with_prefix(monkeypatch):
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN", "2")
|
||||
error = policy.validate_resolution_request("lot_id", ["GA26%"])
|
||||
assert error is None
|
||||
|
||||
|
||||
def test_validate_resolution_result_rejects_excessive_expansion(monkeypatch):
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN", "3")
|
||||
result = {
|
||||
"data": [{"container_id": "C1"}],
|
||||
"expansion_info": {"GA%": 10},
|
||||
}
|
||||
error = policy.validate_resolution_result(result)
|
||||
assert error is not None
|
||||
assert "單一條件展開過大" in error
|
||||
|
||||
|
||||
def test_validate_resolution_result_rejects_excessive_container_count(monkeypatch):
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_MAX_CONTAINER_IDS", "2")
|
||||
result = {
|
||||
"data": [
|
||||
{"container_id": "C1"},
|
||||
{"container_id": "C2"},
|
||||
{"container_id": "C3"},
|
||||
],
|
||||
"expansion_info": {},
|
||||
}
|
||||
error = policy.validate_resolution_result(result)
|
||||
assert error is not None
|
||||
assert "解析結果過大" in error
|
||||
|
||||
|
||||
def test_validate_resolution_result_non_strict_allows_overflow(monkeypatch):
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_MAX_CONTAINER_IDS", "2")
|
||||
result = {
|
||||
"data": [
|
||||
{"container_id": "C1"},
|
||||
{"container_id": "C2"},
|
||||
{"container_id": "C3"},
|
||||
],
|
||||
"expansion_info": {"GA%": 999},
|
||||
}
|
||||
error = policy.validate_resolution_result(result, strict=False)
|
||||
assert error is None
|
||||
|
||||
|
||||
def test_extract_container_ids_deduplicates_and_preserves_order():
|
||||
rows = [
|
||||
{"container_id": "C1"},
|
||||
{"container_id": "C1"},
|
||||
{"CONTAINERID": "C2"},
|
||||
{"container_id": "C3"},
|
||||
]
|
||||
assert policy.extract_container_ids(rows) == ["C1", "C2", "C3"]
|
||||
@@ -198,3 +198,60 @@ def test_fetch_events_sanitizes_nan_values(
|
||||
result = EventFetcher.fetch_events(["CID-1"], "upstream_history")
|
||||
|
||||
assert result["CID-1"][0]["VALUE"] is None
|
||||
|
||||
|
||||
@patch("mes_dashboard.services.event_fetcher.cache_set")
|
||||
@patch("mes_dashboard.services.event_fetcher.cache_get", return_value=None)
|
||||
@patch("mes_dashboard.services.event_fetcher.read_sql_df_slow_iter")
|
||||
@patch("mes_dashboard.services.event_fetcher.SQLLoader.load")
|
||||
def test_fetch_events_raises_when_parallel_batch_fails_and_partial_disabled(
|
||||
mock_sql_load,
|
||||
mock_iter,
|
||||
_mock_cache_get,
|
||||
_mock_cache_set,
|
||||
monkeypatch,
|
||||
):
|
||||
mock_sql_load.return_value = "SELECT * FROM t WHERE h.CONTAINERID = :container_id {{ WORKCENTER_FILTER }}"
|
||||
monkeypatch.setattr("mes_dashboard.services.event_fetcher.EVENT_FETCHER_ALLOW_PARTIAL_RESULTS", False)
|
||||
monkeypatch.setattr("mes_dashboard.services.event_fetcher.EVENT_FETCHER_MAX_WORKERS", 2)
|
||||
|
||||
def _side_effect(sql, params, timeout_seconds=60):
|
||||
if "CID-1000" in params.values():
|
||||
raise RuntimeError("chunk fail")
|
||||
return iter([])
|
||||
|
||||
mock_iter.side_effect = _side_effect
|
||||
cids = [f"CID-{i}" for i in range(1001)] # force >1 batch
|
||||
|
||||
try:
|
||||
EventFetcher.fetch_events(cids, "history")
|
||||
assert False, "expected RuntimeError"
|
||||
except RuntimeError as exc:
|
||||
assert "chunk failed" in str(exc)
|
||||
|
||||
|
||||
@patch("mes_dashboard.services.event_fetcher.cache_set")
|
||||
@patch("mes_dashboard.services.event_fetcher.cache_get", return_value=None)
|
||||
@patch("mes_dashboard.services.event_fetcher.read_sql_df_slow_iter")
|
||||
@patch("mes_dashboard.services.event_fetcher.SQLLoader.load")
|
||||
def test_fetch_events_allows_partial_when_enabled(
|
||||
mock_sql_load,
|
||||
mock_iter,
|
||||
_mock_cache_get,
|
||||
_mock_cache_set,
|
||||
monkeypatch,
|
||||
):
|
||||
mock_sql_load.return_value = "SELECT * FROM t WHERE h.CONTAINERID = :container_id {{ WORKCENTER_FILTER }}"
|
||||
monkeypatch.setattr("mes_dashboard.services.event_fetcher.EVENT_FETCHER_ALLOW_PARTIAL_RESULTS", True)
|
||||
monkeypatch.setattr("mes_dashboard.services.event_fetcher.EVENT_FETCHER_MAX_WORKERS", 2)
|
||||
|
||||
def _side_effect(sql, params, timeout_seconds=60):
|
||||
if "CID-1000" in params.values():
|
||||
raise RuntimeError("chunk fail")
|
||||
return iter([])
|
||||
|
||||
mock_iter.side_effect = _side_effect
|
||||
cids = [f"CID-{i}" for i in range(1001)]
|
||||
|
||||
result = EventFetcher.fetch_events(cids, "history")
|
||||
assert result == {}
|
||||
|
||||
@@ -77,7 +77,7 @@ class TestJobQueryEngineDecomposition:
|
||||
result = job_svc.get_jobs_by_resources(
|
||||
resource_ids=["R1"],
|
||||
start_date="2025-06-01",
|
||||
end_date="2025-06-30",
|
||||
end_date="2025-06-05",
|
||||
)
|
||||
|
||||
assert engine_calls["execute"] == 0 # Engine NOT used
|
||||
|
||||
@@ -191,7 +191,7 @@ class TestErrorLeakageProtection:
|
||||
def test_query_error_masks_internal_details(self, mock_read):
|
||||
mock_read.side_effect = RuntimeError("ORA-00942: table or view does not exist")
|
||||
|
||||
result = get_jobs_by_resources(["RES001"], "2024-01-01", "2024-01-31")
|
||||
result = get_jobs_by_resources(["RES001"], "2024-01-01", "2024-01-05")
|
||||
|
||||
assert result["error"] == QUERY_ERROR_MESSAGE
|
||||
assert "ORA-00942" not in result["error"]
|
||||
|
||||
@@ -85,7 +85,7 @@ class TestDetectionEngineDecomposition:
|
||||
|
||||
df = msd_svc._fetch_station_detection_data(
|
||||
start_date="2025-06-01",
|
||||
end_date="2025-06-30",
|
||||
end_date="2025-06-05",
|
||||
station="測試",
|
||||
)
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ from unittest.mock import patch, MagicMock
|
||||
from mes_dashboard import create_app
|
||||
from mes_dashboard.core.cache import NoOpCache
|
||||
from mes_dashboard.core.rate_limit import reset_rate_limits_for_tests
|
||||
from mes_dashboard.services.query_tool_service import MAX_DATE_RANGE_DAYS, MAX_LOT_IDS
|
||||
from mes_dashboard.services.query_tool_service import MAX_DATE_RANGE_DAYS
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -118,20 +118,19 @@ class TestResolveEndpoint:
|
||||
data = json.loads(response.data)
|
||||
assert 'error' in data
|
||||
|
||||
def test_values_over_limit(self, client):
|
||||
"""Should reject values exceeding limit."""
|
||||
values = [f'GA{i:09d}' for i in range(MAX_LOT_IDS + 1)]
|
||||
def test_rejects_too_broad_wildcard(self, client):
|
||||
"""Should reject wildcard patterns that are too broad."""
|
||||
response = client.post(
|
||||
'/api/query-tool/resolve',
|
||||
json={
|
||||
'input_type': 'lot_id',
|
||||
'values': values
|
||||
}
|
||||
)
|
||||
'values': ['%']
|
||||
}
|
||||
)
|
||||
assert response.status_code == 400
|
||||
data = json.loads(response.data)
|
||||
assert 'error' in data
|
||||
assert '超過上限' in data['error'] or str(MAX_LOT_IDS) in data['error']
|
||||
assert '萬用字元條件過於寬鬆' in data['error']
|
||||
|
||||
@patch('mes_dashboard.routes.query_tool_routes.resolve_lots')
|
||||
def test_resolve_success(self, mock_resolve, client):
|
||||
|
||||
@@ -90,7 +90,7 @@ class TestValidateDateRange:
|
||||
assert '格式' in result or 'format' in result.lower()
|
||||
|
||||
|
||||
class TestValidateLotInput:
|
||||
class TestValidateLotInput:
|
||||
"""Tests for validate_lot_input function."""
|
||||
|
||||
def test_valid_lot_ids(self):
|
||||
@@ -117,53 +117,24 @@ class TestValidateLotInput:
|
||||
assert result is not None
|
||||
assert '至少一個' in result
|
||||
|
||||
def test_exceeds_lot_id_limit(self):
|
||||
"""Should reject LOT IDs exceeding limit."""
|
||||
values = [f'GA{i:09d}' for i in range(MAX_LOT_IDS + 1)]
|
||||
result = validate_lot_input('lot_id', values)
|
||||
assert result is not None
|
||||
assert '超過上限' in result
|
||||
assert str(MAX_LOT_IDS) in result
|
||||
|
||||
def test_exceeds_serial_number_limit(self):
|
||||
"""Should reject serial numbers exceeding limit."""
|
||||
values = [f'SN{i:06d}' for i in range(MAX_SERIAL_NUMBERS + 1)]
|
||||
result = validate_lot_input('serial_number', values)
|
||||
assert result is not None
|
||||
assert '超過上限' in result
|
||||
assert str(MAX_SERIAL_NUMBERS) in result
|
||||
|
||||
def test_exceeds_work_order_limit(self):
|
||||
"""Should reject work orders exceeding limit."""
|
||||
values = [f'WO{i:06d}' for i in range(MAX_WORK_ORDERS + 1)]
|
||||
result = validate_lot_input('work_order', values)
|
||||
assert result is not None
|
||||
assert '超過上限' in result
|
||||
assert str(MAX_WORK_ORDERS) in result
|
||||
def test_large_input_list_allowed_when_no_count_cap(self, monkeypatch):
|
||||
"""Should allow large lists when count cap is disabled."""
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_INPUT_MAX_VALUES", "0")
|
||||
values = [f'GA{i:09d}' for i in range(MAX_LOT_IDS + 50)]
|
||||
result = validate_lot_input('lot_id', values)
|
||||
assert result is None
|
||||
|
||||
def test_exceeds_gd_work_order_limit(self):
|
||||
"""Should reject GD work orders exceeding limit."""
|
||||
values = [f'GD{i:06d}' for i in range(MAX_GD_WORK_ORDERS + 1)]
|
||||
result = validate_lot_input('gd_work_order', values)
|
||||
def test_rejects_too_broad_wildcard_pattern(self, monkeypatch):
|
||||
"""Should reject broad wildcard like '%' to prevent full scan."""
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN", "2")
|
||||
result = validate_lot_input('lot_id', ['%'])
|
||||
assert result is not None
|
||||
assert '超過上限' in result
|
||||
assert str(MAX_GD_WORK_ORDERS) in result
|
||||
|
||||
def test_exactly_at_limit(self):
|
||||
"""Should accept values exactly at limit."""
|
||||
values = [f'GA{i:09d}' for i in range(MAX_LOT_IDS)]
|
||||
result = validate_lot_input('lot_id', values)
|
||||
assert result is None
|
||||
|
||||
def test_unknown_input_type_uses_default_limit(self):
|
||||
"""Should use default limit for unknown input types."""
|
||||
values = [f'X{i}' for i in range(MAX_LOT_IDS)]
|
||||
result = validate_lot_input('unknown_type', values)
|
||||
assert result is None
|
||||
|
||||
values_over = [f'X{i}' for i in range(MAX_LOT_IDS + 1)]
|
||||
result = validate_lot_input('unknown_type', values_over)
|
||||
assert result is not None
|
||||
assert '萬用字元條件過於寬鬆' in result
|
||||
|
||||
def test_accepts_wildcard_with_prefix(self, monkeypatch):
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN", "2")
|
||||
result = validate_lot_input('lot_id', ['GA25%'])
|
||||
assert result is None
|
||||
|
||||
|
||||
class TestValidateEquipmentInput:
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from decimal import Decimal
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
@@ -400,6 +401,72 @@ class TestEngineDecompositionDateRange:
|
||||
assert engine_calls["parallel"] == cache_svc._REJECT_ENGINE_PARALLEL
|
||||
assert engine_calls["max_rows_per_chunk"] == cache_svc._REJECT_ENGINE_MAX_ROWS_PER_CHUNK
|
||||
|
||||
def test_engine_chunk_uses_paged_fetch_without_truncation(self, monkeypatch):
|
||||
"""Engine chunk should fetch all pages (offset paging), not truncate at page size."""
|
||||
import mes_dashboard.services.batch_query_engine as engine_mod
|
||||
|
||||
offsets = []
|
||||
captured = {"df": pd.DataFrame(), "merge_kwargs": None}
|
||||
|
||||
def fake_read_sql(sql, params):
|
||||
offset = int(params.get("offset", 0))
|
||||
limit = int(params.get("limit", 0))
|
||||
offsets.append(offset)
|
||||
total_rows = 5
|
||||
remaining = max(total_rows - offset, 0)
|
||||
take = min(limit, remaining)
|
||||
if take <= 0:
|
||||
return pd.DataFrame()
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"CONTAINERID": [f"C{offset + i}" for i in range(take)],
|
||||
"LOSSREASONNAME": ["R1"] * take,
|
||||
"REJECT_TOTAL_QTY": [1] * take,
|
||||
}
|
||||
)
|
||||
|
||||
def fake_execute_plan(chunks, query_fn, **kwargs):
|
||||
page_size = kwargs.get("max_rows_per_chunk")
|
||||
captured["df"] = query_fn(chunks[0], max_rows_per_chunk=page_size)
|
||||
return kwargs.get("query_hash", "qh")
|
||||
|
||||
def fake_merge_chunks(prefix, qhash, **kwargs):
|
||||
captured["merge_kwargs"] = kwargs
|
||||
return captured["df"]
|
||||
|
||||
monkeypatch.setattr(cache_svc, "_REJECT_ENGINE_MAX_ROWS_PER_CHUNK", 2)
|
||||
monkeypatch.setattr(engine_mod, "should_decompose_by_time", lambda *_a, **_kw: True)
|
||||
monkeypatch.setattr(
|
||||
engine_mod,
|
||||
"decompose_by_time_range",
|
||||
lambda *_a, **_kw: [{"chunk_start": "2025-01-01", "chunk_end": "2025-01-31"}],
|
||||
)
|
||||
monkeypatch.setattr(engine_mod, "execute_plan", fake_execute_plan)
|
||||
monkeypatch.setattr(engine_mod, "merge_chunks", fake_merge_chunks)
|
||||
monkeypatch.setattr(cache_svc, "read_sql_df", fake_read_sql)
|
||||
monkeypatch.setattr(cache_svc, "_get_cached_df", lambda _qid: None)
|
||||
monkeypatch.setattr(cache_svc, "_prepare_sql", lambda *a, **kw: "SELECT 1 FROM dual")
|
||||
monkeypatch.setattr(cache_svc, "_build_where_clause", lambda **kw: ("", {}, {}))
|
||||
monkeypatch.setattr(cache_svc, "_validate_range", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(cache_svc, "_apply_policy_filters", lambda df, **kw: df)
|
||||
monkeypatch.setattr(cache_svc, "_store_query_result", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(cache_svc, "redis_clear_batch", lambda *_a, **_kw: 0)
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_build_primary_response",
|
||||
lambda qid, df, meta, ri: {"query_id": qid, "rows": len(df)},
|
||||
)
|
||||
|
||||
result = cache_svc.execute_primary_query(
|
||||
mode="date_range",
|
||||
start_date="2025-01-01",
|
||||
end_date="2025-03-01",
|
||||
)
|
||||
|
||||
assert result["rows"] == 5
|
||||
assert offsets == [0, 2, 4]
|
||||
assert captured["merge_kwargs"] == {}
|
||||
|
||||
def test_short_range_skips_engine(self, monkeypatch):
|
||||
"""Short date range (<= threshold) uses direct path, no engine."""
|
||||
import mes_dashboard.services.batch_query_engine as engine_mod
|
||||
@@ -453,7 +520,7 @@ class TestEngineDecompositionDateRange:
|
||||
result = cache_svc.execute_primary_query(
|
||||
mode="date_range",
|
||||
start_date="2025-06-01",
|
||||
end_date="2025-06-30",
|
||||
end_date="2025-06-10",
|
||||
)
|
||||
|
||||
assert engine_calls["decompose"] == 0 # Engine NOT used
|
||||
@@ -629,7 +696,7 @@ def test_large_result_spills_to_parquet_and_view_export_use_spool_fallback(monke
|
||||
result = cache_svc.execute_primary_query(
|
||||
mode="date_range",
|
||||
start_date="2025-01-01",
|
||||
end_date="2025-01-31",
|
||||
end_date="2025-01-05",
|
||||
)
|
||||
|
||||
query_id = result["query_id"]
|
||||
@@ -651,3 +718,185 @@ def test_large_result_spills_to_parquet_and_view_export_use_spool_fallback(monke
|
||||
export_rows = cache_svc.export_csv_from_cache(query_id=query_id)
|
||||
assert export_rows is not None
|
||||
assert len(export_rows) == len(df)
|
||||
|
||||
|
||||
def test_resolve_containers_deduplicates_container_ids(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_RESOLVERS",
|
||||
{
|
||||
"lot": lambda values: {
|
||||
"data": [
|
||||
{"container_id": "CID-1"},
|
||||
{"container_id": "CID-1"},
|
||||
{"container_id": "CID-2"},
|
||||
],
|
||||
"input_count": len(values),
|
||||
"not_found": [],
|
||||
"expansion_info": {"LOT%": 2},
|
||||
}
|
||||
},
|
||||
)
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN", "10")
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_MAX_CONTAINER_IDS", "10")
|
||||
|
||||
resolved = cache_svc.resolve_containers("lot", ["LOT%"])
|
||||
|
||||
assert resolved["container_ids"] == ["CID-1", "CID-2"]
|
||||
assert resolved["resolution_info"]["resolved_count"] == 2
|
||||
|
||||
|
||||
def test_resolve_containers_allows_oversized_expansion_and_sets_guardrail(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_RESOLVERS",
|
||||
{
|
||||
"lot": lambda values: {
|
||||
"data": [{"container_id": "CID-1"}],
|
||||
"input_count": len(values),
|
||||
"not_found": [],
|
||||
"expansion_info": {"GA%": 999},
|
||||
}
|
||||
},
|
||||
)
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_MAX_EXPANSION_PER_TOKEN", "50")
|
||||
monkeypatch.setenv("CONTAINER_RESOLVE_PATTERN_MIN_PREFIX_LEN", "2")
|
||||
|
||||
resolved = cache_svc.resolve_containers("lot", ["GA%"])
|
||||
guardrail = resolved["resolution_info"].get("guardrail") or {}
|
||||
assert guardrail.get("overflow") is True
|
||||
assert len(guardrail.get("expansion_offenders") or []) == 1
|
||||
|
||||
|
||||
def test_partial_failure_in_response_meta(monkeypatch):
|
||||
import mes_dashboard.services.batch_query_engine as engine_mod
|
||||
|
||||
df = pd.DataFrame({"CONTAINERID": ["C1"], "LOSSREASONNAME": ["R1"], "REJECT_TOTAL_QTY": [1]})
|
||||
|
||||
monkeypatch.setattr(cache_svc, "_get_cached_df", lambda _qid: None)
|
||||
monkeypatch.setattr(cache_svc, "_validate_range", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(cache_svc, "_build_where_clause", lambda **kw: ("", {}, {}))
|
||||
monkeypatch.setattr(cache_svc, "_prepare_sql", lambda *a, **kw: "SELECT 1 FROM dual")
|
||||
monkeypatch.setattr(cache_svc, "_apply_policy_filters", lambda data, **kw: data)
|
||||
monkeypatch.setattr(cache_svc, "_store_query_result", lambda *_a, **_kw: False)
|
||||
monkeypatch.setattr(cache_svc, "redis_clear_batch", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_build_primary_response",
|
||||
lambda qid, result_df, meta, resolution_info: {"query_id": qid, "meta": meta},
|
||||
)
|
||||
monkeypatch.setattr(cache_svc, "_store_partial_failure_flag", lambda *_a, **_kw: None)
|
||||
|
||||
monkeypatch.setattr(engine_mod, "should_decompose_by_time", lambda *_a, **_kw: True)
|
||||
monkeypatch.setattr(
|
||||
engine_mod,
|
||||
"decompose_by_time_range",
|
||||
lambda *_a, **_kw: [{"chunk_start": "2025-01-01", "chunk_end": "2025-01-10"}],
|
||||
)
|
||||
monkeypatch.setattr(engine_mod, "execute_plan", lambda *a, **kw: kw.get("query_hash"))
|
||||
monkeypatch.setattr(engine_mod, "merge_chunks", lambda *a, **kw: df.copy())
|
||||
monkeypatch.setattr(
|
||||
engine_mod,
|
||||
"get_batch_progress",
|
||||
lambda *_a, **_kw: {
|
||||
"has_partial_failure": "True",
|
||||
"failed": "2",
|
||||
"failed_ranges": json.dumps([{"start": "2025-01-01", "end": "2025-01-10"}]),
|
||||
},
|
||||
)
|
||||
|
||||
result = cache_svc.execute_primary_query(
|
||||
mode="date_range",
|
||||
start_date="2025-01-01",
|
||||
end_date="2025-03-01",
|
||||
)
|
||||
meta = result.get("meta") or {}
|
||||
assert meta.get("has_partial_failure") is True
|
||||
assert meta.get("failed_chunk_count") == 2
|
||||
assert meta.get("failed_ranges") == [{"start": "2025-01-01", "end": "2025-01-10"}]
|
||||
|
||||
|
||||
def test_cache_hit_restores_partial_failure(monkeypatch):
|
||||
cached_df = pd.DataFrame({"CONTAINERID": ["C1"], "LOSSREASONNAME": ["R1"], "REJECT_TOTAL_QTY": [1]})
|
||||
|
||||
monkeypatch.setattr(cache_svc, "_get_cached_df", lambda _qid: cached_df)
|
||||
monkeypatch.setattr(cache_svc, "_validate_range", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(cache_svc, "_build_where_clause", lambda **kw: ("", {}, {}))
|
||||
monkeypatch.setattr(cache_svc, "_apply_policy_filters", lambda data, **kw: data)
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_load_partial_failure_flag",
|
||||
lambda _qid: {
|
||||
"has_partial_failure": True,
|
||||
"failed_chunk_count": 3,
|
||||
"failed_ranges": [],
|
||||
},
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_build_primary_response",
|
||||
lambda qid, result_df, meta, resolution_info: {"query_id": qid, "meta": meta},
|
||||
)
|
||||
|
||||
result = cache_svc.execute_primary_query(
|
||||
mode="date_range",
|
||||
start_date="2025-01-01",
|
||||
end_date="2025-01-31",
|
||||
)
|
||||
meta = result.get("meta") or {}
|
||||
assert meta.get("has_partial_failure") is True
|
||||
assert meta.get("failed_chunk_count") == 3
|
||||
assert meta.get("failed_ranges") == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"store_result,expected_ttl",
|
||||
[
|
||||
(True, cache_svc._REJECT_ENGINE_SPOOL_TTL_SECONDS),
|
||||
(False, cache_svc._CACHE_TTL),
|
||||
],
|
||||
)
|
||||
def test_partial_failure_ttl_matches_spool(monkeypatch, store_result, expected_ttl):
|
||||
import mes_dashboard.services.batch_query_engine as engine_mod
|
||||
|
||||
df = pd.DataFrame({"CONTAINERID": ["C1"], "LOSSREASONNAME": ["R1"], "REJECT_TOTAL_QTY": [1]})
|
||||
captured = {"ttls": []}
|
||||
|
||||
monkeypatch.setattr(cache_svc, "_get_cached_df", lambda _qid: None)
|
||||
monkeypatch.setattr(cache_svc, "_validate_range", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(cache_svc, "_build_where_clause", lambda **kw: ("", {}, {}))
|
||||
monkeypatch.setattr(cache_svc, "_prepare_sql", lambda *a, **kw: "SELECT 1 FROM dual")
|
||||
monkeypatch.setattr(cache_svc, "_apply_policy_filters", lambda data, **kw: data)
|
||||
monkeypatch.setattr(cache_svc, "_store_query_result", lambda *_a, **_kw: store_result)
|
||||
monkeypatch.setattr(cache_svc, "redis_clear_batch", lambda *_a, **_kw: None)
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_build_primary_response",
|
||||
lambda qid, result_df, meta, resolution_info: {"query_id": qid, "meta": meta},
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
cache_svc,
|
||||
"_store_partial_failure_flag",
|
||||
lambda _qid, _failed, _ranges, ttl: captured["ttls"].append(ttl),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(engine_mod, "should_decompose_by_time", lambda *_a, **_kw: True)
|
||||
monkeypatch.setattr(
|
||||
engine_mod,
|
||||
"decompose_by_time_range",
|
||||
lambda *_a, **_kw: [{"chunk_start": "2025-01-01", "chunk_end": "2025-01-10"}],
|
||||
)
|
||||
monkeypatch.setattr(engine_mod, "execute_plan", lambda *a, **kw: kw.get("query_hash"))
|
||||
monkeypatch.setattr(engine_mod, "merge_chunks", lambda *a, **kw: df.copy())
|
||||
monkeypatch.setattr(
|
||||
engine_mod,
|
||||
"get_batch_progress",
|
||||
lambda *_a, **_kw: {"has_partial_failure": "True", "failed": "1", "failed_ranges": "[]"},
|
||||
)
|
||||
|
||||
cache_svc.execute_primary_query(
|
||||
mode="date_range",
|
||||
start_date="2025-01-01",
|
||||
end_date="2025-03-01",
|
||||
)
|
||||
assert captured["ttls"] == [expected_ttl]
|
||||
|
||||
Reference in New Issue
Block a user