feat(trace-pipeline): memory triage, async job queue, and NDJSON streaming
Three proposals addressing the 2026-02-25 trace pipeline OOM crash (114K CIDs): 1. trace-events-memory-triage: fetchmany iterator (read_sql_df_slow_iter), admission control (50K CID limit for non-MSD), cache skip for large queries, early memory release with gc.collect() 2. trace-async-job-queue: RQ-based async jobs for queries >20K CIDs, separate worker process with isolated memory, frontend polling via useTraceProgress composable, systemd service + deploy scripts 3. trace-streaming-response: chunked Redis storage (TRACE_STREAM_BATCH_SIZE=5000), NDJSON stream endpoint (GET /api/trace/job/<id>/stream), frontend ReadableStream consumer for progressive rendering, backward-compatible with legacy single-key storage All three proposals archived. 1101 tests pass, frontend builds clean. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -242,9 +242,14 @@ show_next_steps() {
|
||||
echo " sudo chmod 640 .env"
|
||||
echo " sudo cp deploy/mes-dashboard.service /etc/systemd/system/"
|
||||
echo " sudo cp deploy/mes-dashboard-watchdog.service /etc/systemd/system/"
|
||||
echo " sudo cp deploy/mes-dashboard-trace-worker.service /etc/systemd/system/"
|
||||
echo " sudo systemctl daemon-reload"
|
||||
echo " sudo systemctl enable --now mes-dashboard mes-dashboard-watchdog"
|
||||
echo ""
|
||||
echo "Optional: enable async trace worker (for large queries)"
|
||||
echo " # Set TRACE_WORKER_ENABLED=true in .env"
|
||||
echo " sudo systemctl enable --now mes-dashboard-trace-worker"
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
}
|
||||
|
||||
|
||||
@@ -25,6 +25,9 @@ PORT=$(echo "$DEFAULT_PORT" | cut -d: -f2)
|
||||
REDIS_ENABLED="${REDIS_ENABLED:-true}"
|
||||
# Worker watchdog configuration
|
||||
WATCHDOG_ENABLED="${WATCHDOG_ENABLED:-true}"
|
||||
# RQ trace worker configuration
|
||||
TRACE_WORKER_ENABLED="${TRACE_WORKER_ENABLED:-false}"
|
||||
TRACE_WORKER_QUEUE="${TRACE_WORKER_QUEUE:-trace-events}"
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
@@ -73,6 +76,8 @@ resolve_runtime_paths() {
|
||||
WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/gunicorn.pid}"
|
||||
WATCHDOG_STATE_FILE="${WATCHDOG_STATE_FILE:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart_state.json}"
|
||||
WATCHDOG_PROCESS_PID_FILE="${WATCHDOG_PROCESS_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/worker_watchdog.pid}"
|
||||
RQ_WORKER_PID_FILE="${WATCHDOG_RUNTIME_DIR}/rq_trace_worker.pid"
|
||||
RQ_WORKER_LOG="${LOG_DIR}/rq_worker.log"
|
||||
PID_FILE="${WATCHDOG_PID_FILE}"
|
||||
export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE WATCHDOG_PROCESS_PID_FILE
|
||||
}
|
||||
@@ -585,6 +590,126 @@ stop_watchdog() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# ============================================================
|
||||
# RQ Trace Worker Management Functions
|
||||
# ============================================================
|
||||
get_rq_worker_pid() {
|
||||
if [ -f "$RQ_WORKER_PID_FILE" ]; then
|
||||
local pid
|
||||
pid=$(cat "$RQ_WORKER_PID_FILE" 2>/dev/null || true)
|
||||
if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
|
||||
echo "$pid"
|
||||
return 0
|
||||
fi
|
||||
rm -f "$RQ_WORKER_PID_FILE"
|
||||
fi
|
||||
|
||||
local discovered_pid
|
||||
discovered_pid=$(pgrep -f "[r]q worker.*${TRACE_WORKER_QUEUE}" 2>/dev/null | head -1 || true)
|
||||
if [ -n "${discovered_pid}" ] && kill -0 "${discovered_pid}" 2>/dev/null; then
|
||||
echo "${discovered_pid}" > "$RQ_WORKER_PID_FILE"
|
||||
echo "${discovered_pid}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
is_rq_worker_running() {
|
||||
get_rq_worker_pid &>/dev/null
|
||||
}
|
||||
|
||||
start_rq_worker() {
|
||||
if ! is_enabled "${TRACE_WORKER_ENABLED:-false}"; then
|
||||
log_info "RQ trace worker is disabled (TRACE_WORKER_ENABLED=${TRACE_WORKER_ENABLED:-false})"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [ "$REDIS_ENABLED" != "true" ]; then
|
||||
log_warn "Redis is disabled; cannot start RQ trace worker"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if is_rq_worker_running; then
|
||||
local pid
|
||||
pid=$(get_rq_worker_pid)
|
||||
log_success "RQ trace worker already running (PID: ${pid})"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Check if rq is installed
|
||||
if ! python -c "import rq" 2>/dev/null; then
|
||||
log_warn "rq package not installed; skip trace worker (pip install rq)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
log_info "Starting RQ trace worker (queue: ${TRACE_WORKER_QUEUE})..."
|
||||
local redis_url="${REDIS_URL:-redis://localhost:6379/0}"
|
||||
if command -v setsid >/dev/null 2>&1; then
|
||||
setsid rq worker "${TRACE_WORKER_QUEUE}" --url "${redis_url}" >> "$RQ_WORKER_LOG" 2>&1 < /dev/null &
|
||||
else
|
||||
nohup rq worker "${TRACE_WORKER_QUEUE}" --url "${redis_url}" >> "$RQ_WORKER_LOG" 2>&1 < /dev/null &
|
||||
fi
|
||||
local pid=$!
|
||||
echo "$pid" > "$RQ_WORKER_PID_FILE"
|
||||
|
||||
sleep 1
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log_success "RQ trace worker started (PID: ${pid})"
|
||||
return 0
|
||||
fi
|
||||
|
||||
rm -f "$RQ_WORKER_PID_FILE"
|
||||
log_error "Failed to start RQ trace worker"
|
||||
return 1
|
||||
}
|
||||
|
||||
stop_rq_worker() {
|
||||
if ! is_rq_worker_running; then
|
||||
rm -f "$RQ_WORKER_PID_FILE"
|
||||
return 0
|
||||
fi
|
||||
|
||||
local pid
|
||||
pid=$(get_rq_worker_pid)
|
||||
log_info "Stopping RQ trace worker (PID: ${pid})..."
|
||||
kill -TERM "$pid" 2>/dev/null || true
|
||||
|
||||
local count=0
|
||||
while kill -0 "$pid" 2>/dev/null && [ $count -lt 10 ]; do
|
||||
sleep 1
|
||||
count=$((count + 1))
|
||||
done
|
||||
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
kill -9 "$pid" 2>/dev/null || true
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
rm -f "$RQ_WORKER_PID_FILE"
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log_error "Failed to stop RQ trace worker"
|
||||
return 1
|
||||
fi
|
||||
|
||||
log_success "RQ trace worker stopped"
|
||||
return 0
|
||||
}
|
||||
|
||||
rq_worker_status() {
|
||||
if ! is_enabled "${TRACE_WORKER_ENABLED:-false}"; then
|
||||
echo -e " RQ Worker:${YELLOW} DISABLED${NC}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if is_rq_worker_running; then
|
||||
local pid=$(get_rq_worker_pid)
|
||||
echo -e " RQ Worker:${GREEN} RUNNING${NC} (PID: ${pid}, queue: ${TRACE_WORKER_QUEUE})"
|
||||
else
|
||||
echo -e " RQ Worker:${RED} STOPPED${NC}"
|
||||
fi
|
||||
}
|
||||
|
||||
do_start() {
|
||||
local foreground=false
|
||||
|
||||
@@ -662,6 +787,7 @@ do_start() {
|
||||
log_info "Access URL: http://localhost:${PORT}"
|
||||
log_info "Logs: ${LOG_DIR}/"
|
||||
start_watchdog || return 1
|
||||
start_rq_worker
|
||||
echo "[$(timestamp)] Server started (PID: ${pid})" >> "$STARTUP_LOG"
|
||||
else
|
||||
log_error "Failed to start server"
|
||||
@@ -723,6 +849,7 @@ do_stop() {
|
||||
fi
|
||||
fi
|
||||
|
||||
stop_rq_worker
|
||||
stop_watchdog
|
||||
}
|
||||
|
||||
@@ -768,6 +895,7 @@ do_status() {
|
||||
else
|
||||
echo -e " Watchdog:${YELLOW} DISABLED${NC}"
|
||||
fi
|
||||
rq_worker_status
|
||||
|
||||
if is_running; then
|
||||
echo ""
|
||||
|
||||
Reference in New Issue
Block a user