Files
DashBoard/frontend/src/shared-composables/useTraceProgress.js
egg dbe0da057c feat(trace-pipeline): memory triage, async job queue, and NDJSON streaming
Three proposals addressing the 2026-02-25 trace pipeline OOM crash (114K CIDs):

1. trace-events-memory-triage: fetchmany iterator (read_sql_df_slow_iter),
   admission control (50K CID limit for non-MSD), cache skip for large queries,
   early memory release with gc.collect()

2. trace-async-job-queue: RQ-based async jobs for queries >20K CIDs,
   separate worker process with isolated memory, frontend polling via
   useTraceProgress composable, systemd service + deploy scripts

3. trace-streaming-response: chunked Redis storage (TRACE_STREAM_BATCH_SIZE=5000),
   NDJSON stream endpoint (GET /api/trace/job/<id>/stream), frontend
   ReadableStream consumer for progressive rendering, backward-compatible
   with legacy single-key storage

All three proposals archived. 1101 tests pass, frontend builds clean.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 21:01:27 +08:00

391 lines
11 KiB
JavaScript

import { reactive, ref } from 'vue';
import { apiGet, apiPost, ensureMesApiAvailable } from '../core/api.js';
// Module-load side effect: verifies the MES API layer is usable before any
// trace call is made. NOTE(review): assumes ensureMesApiAvailable() is safe
// to call at import time — confirm in core/api.js.
ensureMesApiAvailable();
// Per-stage HTTP timeout for the synchronous trace endpoints (6 minutes).
const DEFAULT_STAGE_TIMEOUT_MS = 360000;
// Delay between successive async-job status polls.
const JOB_POLL_INTERVAL_MS = 3000;
// Hard ceiling on total async-job polling time.
const JOB_POLL_MAX_MS = 1800000; // 30 minutes
// Event domains requested per trace profile; frozen so callers cannot mutate
// the shared table. 'mid_section_defect_forward' is selected by execute()
// when profile === 'mid_section_defect' and params.direction === 'forward'.
const PROFILE_DOMAINS = Object.freeze({
query_tool: ['history', 'materials', 'rejects', 'holds', 'jobs'],
mid_section_defect: ['upstream_history', 'materials'],
mid_section_defect_forward: ['upstream_history', 'downstream_rejects'],
});
/**
 * Map a pipeline stage name to the key used in stage_results/stage_errors.
 * Any stage other than 'seed-resolve' and 'lineage' maps to 'events'.
 *
 * @param {string|null} stageName - Current stage name (may be null).
 * @returns {string} 'seed' | 'lineage' | 'events'
 */
function stageKey(stageName) {
  switch (stageName) {
    case 'seed-resolve':
      return 'seed';
    case 'lineage':
      return 'lineage';
    default:
      return 'events';
  }
}
/**
 * Extract a de-duplicated list of container ids from a seed-resolve payload.
 * Ids are stringified and trimmed; blank or repeated ids are dropped while
 * first-seen order is preserved.
 *
 * @param {object|null} seedPayload - Payload whose .seeds is an array of rows.
 * @returns {string[]} Unique, trimmed container ids in first-seen order.
 */
function normalizeSeedContainerIds(seedPayload) {
  const source = Array.isArray(seedPayload?.seeds) ? seedPayload.seeds : [];
  const unique = new Set();
  for (const entry of source) {
    const candidate = String(entry?.container_id || '').trim();
    if (candidate && !unique.has(candidate)) {
      unique.add(candidate);
    }
  }
  // Set preserves insertion order, so spreading keeps first-seen ordering.
  return [...unique];
}
/**
 * Expand the seed container ids with every container reachable through the
 * lineage payload.
 *
 * Forward direction performs a breadth-first walk over children_map starting
 * from the seeds; backward direction flattens every ancestors[] value list.
 * Ids are stringified and trimmed; blanks and duplicates are skipped. The
 * seeds always come first in the returned list.
 *
 * @param {string[]} seedContainerIds - Seed ids (kept verbatim at the front).
 * @param {object|null} lineagePayload - { children_map } or { ancestors }.
 * @param {string} direction - 'forward' for BFS over children, anything else
 *   flattens ancestors.
 * @returns {string[]} Seeds plus newly discovered ids, discovery-ordered.
 */
function collectAllContainerIds(seedContainerIds, lineagePayload, direction) {
  const known = new Set(seedContainerIds);
  const collected = [...seedContainerIds];
  // Normalize, record, and return the id when it is new; null otherwise.
  const addIfNew = (raw) => {
    const id = String(raw || '').trim();
    if (!id || known.has(id)) return null;
    known.add(id);
    collected.push(id);
    return id;
  };
  if (direction === 'forward') {
    // Breadth-first traversal of the children map from every seed.
    const childrenMap = lineagePayload?.children_map || {};
    const pending = [...seedContainerIds];
    while (pending.length > 0) {
      const children = childrenMap[pending.shift()];
      if (!Array.isArray(children)) continue;
      for (const child of children) {
        const added = addIfNew(child);
        if (added) pending.push(added);
      }
    }
  } else {
    // Backward: every ancestors value list is flattened in object order.
    const ancestors = lineagePayload?.ancestors || {};
    for (const values of Object.values(ancestors)) {
      if (!Array.isArray(values)) continue;
      for (const value of values) {
        addIfNew(value);
      }
    }
  }
  return collected;
}
/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Poll an async trace job until it completes or fails.
 *
 * Fetches the job status every JOB_POLL_INTERVAL_MS until the server reports
 * 'finished' or 'failed', the caller aborts via signal, or JOB_POLL_MAX_MS
 * elapses.
 *
 * @param {string} statusUrl - The job status endpoint URL
 * @param {object} options - { signal, onProgress }
 * @returns {Promise<string>} 'finished' when the job succeeds
 * @throws {DOMException} AbortError when the signal is aborted
 * @throws {Error} errorCode 'JOB_FAILED' when the server reports failure,
 *   'JOB_POLL_TIMEOUT' when the polling budget is exhausted
 */
async function pollJobUntilComplete(statusUrl, { signal, onProgress } = {}) {
  const deadline = Date.now() + JOB_POLL_MAX_MS;
  for (;;) {
    if (signal?.aborted) {
      throw new DOMException('Aborted', 'AbortError');
    }
    const status = await apiGet(statusUrl, { timeout: 15000, signal });
    if (typeof onProgress === 'function') {
      onProgress(status);
    }
    if (status.status === 'finished') {
      return 'finished';
    }
    if (status.status === 'failed') {
      const failure = new Error(status.error || '非同步查詢失敗');
      failure.errorCode = 'JOB_FAILED';
      throw failure;
    }
    if (Date.now() > deadline) {
      const timedOut = new Error('非同步查詢超時');
      timedOut.errorCode = 'JOB_POLL_TIMEOUT';
      throw timedOut;
    }
    await sleep(JOB_POLL_INTERVAL_MS);
  }
}
/**
 * Consume an NDJSON stream from the server, calling onChunk for each parsed
 * line. Blank lines and lines that fail to parse as JSON are skipped so one
 * malformed record cannot abort an otherwise healthy stream.
 *
 * @param {string} url - The stream endpoint URL
 * @param {object} options - { signal, onChunk }
 * @returns {Promise<void>}
 * @throws {Error} errorCode 'STREAM_FAILED' on a non-2xx response or when the
 *   response carries no readable body
 */
async function consumeNDJSONStream(url, { signal, onChunk } = {}) {
  const response = await fetch(url, { signal });
  if (!response.ok) {
    const text = await response.text().catch(() => '');
    const error = new Error(`Stream request failed: HTTP ${response.status} ${text}`);
    error.errorCode = 'STREAM_FAILED';
    throw error;
  }
  // response.body can be null (e.g. an empty body in some runtimes); raise a
  // STREAM_FAILED error instead of an opaque TypeError from getReader().
  if (!response.body) {
    const error = new Error('Stream request failed: response has no body');
    error.errorCode = 'STREAM_FAILED';
    throw error;
  }
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';
  // Parse one NDJSON line and forward it; silently skip blanks and bad JSON.
  const emit = (rawLine) => {
    const trimmed = rawLine.trim();
    if (!trimmed) return;
    try {
      const chunk = JSON.parse(trimmed);
      if (typeof onChunk === 'function') onChunk(chunk);
    } catch {
      // skip malformed NDJSON lines
    }
  };
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop(); // keep the incomplete trailing line
      for (const line of lines) emit(line);
    }
    // Flush any bytes still buffered by the streaming decoder (a multi-byte
    // UTF-8 character split at the final chunk boundary) before parsing the
    // tail; without this flush the last line could be truncated/corrupted.
    buffer += decoder.decode();
    emit(buffer);
  } finally {
    reader.releaseLock();
  }
}
/**
 * Vue composable that drives the three-stage trace pipeline:
 *   1. 'seed-resolve' -> POST /api/trace/seed-resolve
 *   2. 'lineage'      -> POST /api/trace/lineage
 *   3. 'events'       -> POST /api/trace/events, which may answer
 *      synchronously, or asynchronously via a job (status polling plus an
 *      optional NDJSON result stream).
 *
 * @param {object} options
 * @param {string} options.profile - Key into PROFILE_DOMAINS selecting which
 *   event domains the events stage requests.
 * @returns {object} { current_stage, completed_stages, stage_results,
 *   stage_errors, job_progress, is_running, execute, reset, abort }
 */
export function useTraceProgress({ profile } = {}) {
// Stage currently in flight ('seed-resolve' | 'lineage' | 'events') or null when idle.
const current_stage = ref(null);
// Stage names that completed successfully, in completion order.
const completed_stages = ref([]);
const is_running = ref(false);
// Raw payload returned by each stage, keyed via stageKey().
const stage_results = reactive({
seed: null,
lineage: null,
events: null,
});
// Per-stage failure info ({ code, message }) recorded by execute()'s catch block.
const stage_errors = reactive({
seed: null,
lineage: null,
events: null,
});
// Async job progress (populated when events stage uses async path)
const job_progress = reactive({
active: false,
job_id: null,
status: null,
elapsed_seconds: 0,
progress: '',
});
// AbortController of the in-flight execute(); null when nothing is running.
let activeController = null;
// Clear all stage results, errors, and job progress back to initial values.
// Does NOT touch is_running and does NOT abort an in-flight request — call
// abort() for that (execute() calls both, in that order).
function reset() {
completed_stages.value = [];
current_stage.value = null;
stage_results.seed = null;
stage_results.lineage = null;
stage_results.events = null;
stage_errors.seed = null;
stage_errors.lineage = null;
stage_errors.events = null;
job_progress.active = false;
job_progress.job_id = null;
job_progress.status = null;
job_progress.elapsed_seconds = 0;
job_progress.progress = '';
}
// Cancel the in-flight execute(), if any, by aborting its controller.
function abort() {
if (activeController) {
activeController.abort();
activeController = null;
}
}
/**
 * Run the full trace pipeline for the given query parameters.
 *
 * Aborts any previous run, resets state, then executes the three stages in
 * order. Never rejects for stage failures: errors are recorded in
 * stage_errors (keyed by the stage that was running) and the possibly
 * partial stage_results is returned. Only an unsupported profile throws.
 *
 * @param {object} params - Query params forwarded to every stage;
 *   params.direction ('backward' by default) selects lineage traversal.
 * @returns {Promise<object>} stage_results (partial on error/abort).
 * @throws {Error} When the resolved profile has no PROFILE_DOMAINS entry.
 */
async function execute(params = {}) {
const direction = params.direction || 'backward';
// mid_section_defect traced forward swaps in a dedicated domain set.
const domainKey = profile === 'mid_section_defect' && direction === 'forward'
? 'mid_section_defect_forward'
: profile;
const domains = PROFILE_DOMAINS[domainKey];
if (!domains) {
throw new Error(`Unsupported trace profile: ${profile}`);
}
abort();
reset();
is_running.value = true;
const controller = new AbortController();
activeController = controller;
try {
// Stage 1: resolve seed container ids from the raw query params.
current_stage.value = 'seed-resolve';
const seedPayload = await apiPost(
'/api/trace/seed-resolve',
{ profile, params },
{ timeout: DEFAULT_STAGE_TIMEOUT_MS, signal: controller.signal },
);
stage_results.seed = seedPayload;
completed_stages.value = [...completed_stages.value, 'seed-resolve'];
const seedContainerIds = normalizeSeedContainerIds(seedPayload);
// No seeds means nothing to trace; return early with only the seed result.
if (seedContainerIds.length === 0) {
return stage_results;
}
// Stage 2: fetch the lineage graph for the seed containers.
// cache_key lets the server reuse work done during seed-resolve.
current_stage.value = 'lineage';
const lineagePayload = await apiPost(
'/api/trace/lineage',
{
profile,
container_ids: seedContainerIds,
cache_key: seedPayload?.cache_key || null,
params,
},
{ timeout: DEFAULT_STAGE_TIMEOUT_MS, signal: controller.signal },
);
stage_results.lineage = lineagePayload;
completed_stages.value = [...completed_stages.value, 'lineage'];
// Stage 3: fetch events for every container reachable via the lineage.
const allContainerIds = collectAllContainerIds(seedContainerIds, lineagePayload, direction);
current_stage.value = 'events';
const eventsPayload = await apiPost(
'/api/trace/events',
{
profile,
container_ids: allContainerIds,
domains,
cache_key: seedPayload?.cache_key || null,
params,
seed_container_ids: seedContainerIds,
lineage: {
ancestors: lineagePayload?.ancestors || {},
children_map: lineagePayload?.children_map || {},
seed_roots: lineagePayload?.seed_roots || {},
},
},
{ timeout: DEFAULT_STAGE_TIMEOUT_MS, signal: controller.signal },
);
// Async path: server returned 202 with job_id
if (eventsPayload?.async === true && eventsPayload?.status_url) {
job_progress.active = true;
job_progress.job_id = eventsPayload.job_id;
job_progress.status = 'queued';
// Phase 1: poll until job finishes
await pollJobUntilComplete(eventsPayload.status_url, {
signal: controller.signal,
onProgress: (status) => {
job_progress.status = status.status;
job_progress.elapsed_seconds = status.elapsed_seconds || 0;
job_progress.progress = status.progress || '';
},
});
// Phase 2: stream result via NDJSON (or fall back to full result)
const streamUrl = eventsPayload.stream_url;
if (streamUrl) {
job_progress.progress = 'streaming';
// Accumulates streamed chunks into the same shape a sync events
// response would have: { stage, results: {domain: {...}}, aggregation }.
const streamedResult = { stage: 'events', results: {}, aggregation: null };
let totalRecords = 0;
await consumeNDJSONStream(streamUrl, {
signal: controller.signal,
onChunk: (chunk) => {
// Chunk protocol: domain_start announces a domain and its total,
// records appends rows, aggregation/warning/full_result are terminal
// or metadata chunks. 'records' for an unannounced domain is dropped.
if (chunk.type === 'domain_start') {
streamedResult.results[chunk.domain] = { data: [], count: 0, total: chunk.total };
} else if (chunk.type === 'records' && streamedResult.results[chunk.domain]) {
const domainResult = streamedResult.results[chunk.domain];
domainResult.data.push(...chunk.data);
domainResult.count = domainResult.data.length;
totalRecords += chunk.data.length;
job_progress.progress = `streaming: ${totalRecords} records`;
} else if (chunk.type === 'aggregation') {
streamedResult.aggregation = chunk.data;
} else if (chunk.type === 'warning') {
streamedResult.error = chunk.code;
streamedResult.failed_domains = chunk.failed_domains;
} else if (chunk.type === 'full_result') {
// Legacy fallback: server sent full result as single chunk
Object.assign(streamedResult, chunk.data);
}
},
});
stage_results.events = streamedResult;
} else {
// No stream_url: fall back to fetching full result
const resultUrl = `${eventsPayload.status_url}/result`;
stage_results.events = await apiGet(resultUrl, {
timeout: DEFAULT_STAGE_TIMEOUT_MS,
signal: controller.signal,
});
}
job_progress.active = false;
} else {
// Sync path
stage_results.events = eventsPayload;
}
completed_stages.value = [...completed_stages.value, 'events'];
return stage_results;
} catch (error) {
// Aborts are not errors: return whatever partial results we have.
if (error?.name === 'AbortError') {
return stage_results;
}
// Attribute the failure to whichever stage was running.
const key = stageKey(current_stage.value);
stage_errors[key] = {
code: error?.errorCode || null,
message: error?.message || '追溯查詢失敗',
};
return stage_results;
} finally {
// Only clear the shared controller if a newer execute() hasn't replaced it.
if (activeController === controller) {
activeController = null;
}
current_stage.value = null;
is_running.value = false;
job_progress.active = false;
}
}
return {
current_stage,
completed_stages,
stage_results,
stage_errors,
job_progress,
is_running,
execute,
reset,
abort,
};
}