chore: finalize vite migration hardening and archive openspec changes
This commit is contained in:
223
scripts/run_cache_benchmarks.py
Executable file
223
scripts/run_cache_benchmarks.py
Executable file
@@ -0,0 +1,223 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Benchmark cache query baseline vs indexed selection.
|
||||
|
||||
This benchmark is used as a repeatable governance harness for P1 cache/query
|
||||
efficiency work. It focuses on deterministic synthetic workloads so operators
|
||||
can compare relative latency and memory amplification over time.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import random
|
||||
import statistics
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Repository root: scripts/ lives one level below the project root.
ROOT = Path(__file__).resolve().parents[1]
# Deterministic fixture supplying default workload sizes and governance thresholds.
FIXTURE_PATH = ROOT / "tests" / "fixtures" / "cache_benchmark_fixture.json"
|
||||
|
||||
|
||||
def load_fixture(path: Path | None = None) -> dict[str, Any]:
    """Load the benchmark fixture JSON.

    Args:
        path: Fixture file location. ``None`` (the default) resolves to
            ``FIXTURE_PATH`` at call time rather than import time.

    Returns:
        The parsed fixture payload.

    Raises:
        ValueError: If the payload lacks the mandatory ``rows`` key.
    """
    target = FIXTURE_PATH if path is None else path
    # Explicit encoding keeps parsing stable across platform default locales.
    payload = json.loads(target.read_text(encoding="utf-8"))
    if "rows" not in payload:
        raise ValueError("fixture requires rows")
    return payload
|
||||
|
||||
|
||||
def build_dataset(rows: int, seed: int) -> pd.DataFrame:
    """Construct a deterministic synthetic WIP dataset for benchmarking."""
    # Seed both RNGs so repeated runs draw identical samples.
    random.seed(seed)
    np.random.seed(seed)

    wc_pool = [f"WC-{n:02d}" for n in range(1, 31)]
    pkg_pool = ["QFN", "DFN", "SOT", "SOP", "BGA", "TSOP"]
    type_pool = ["TYPE-A", "TYPE-B", "TYPE-C", "TYPE-D"]
    status_pool = ["RUN", "QUEUE", "HOLD"]
    # Empty strings dominate the pool so most rows carry no hold reason.
    reason_pool = ["", "", "", "YieldLimit", "特殊需求管控", "PM Hold"]

    columns = {}
    # NOTE: the np.random calls below must stay in this exact order so the
    # seeded draw sequence (and thus the dataset) is identical for a given seed.
    columns["WORKCENTER_GROUP"] = np.random.choice(wc_pool, rows)
    columns["PACKAGE_LEF"] = np.random.choice(pkg_pool, rows)
    columns["PJ_TYPE"] = np.random.choice(type_pool, rows)
    columns["WIP_STATUS"] = np.random.choice(status_pool, rows, p=[0.45, 0.35, 0.20])
    columns["HOLDREASONNAME"] = np.random.choice(reason_pool, rows)
    columns["QTY"] = np.random.randint(1, 500, rows)
    columns["WORKORDER"] = [f"WO-{row:06d}" for row in range(rows)]
    columns["LOTID"] = [f"LOT-{row:07d}" for row in range(rows)]
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
def _build_index(df: pd.DataFrame) -> dict[str, dict[str, set[int]]]:
|
||||
def by_column(column: str) -> dict[str, set[int]]:
|
||||
grouped = df.groupby(column, dropna=True, sort=False).indices
|
||||
return {str(k): {int(i) for i in v} for k, v in grouped.items()}
|
||||
|
||||
return {
|
||||
"workcenter": by_column("WORKCENTER_GROUP"),
|
||||
"package": by_column("PACKAGE_LEF"),
|
||||
"type": by_column("PJ_TYPE"),
|
||||
"status": by_column("WIP_STATUS"),
|
||||
}
|
||||
|
||||
|
||||
def _baseline_query(df: pd.DataFrame, query: dict[str, str]) -> int:
|
||||
subset = df
|
||||
if query.get("workcenter"):
|
||||
subset = subset[subset["WORKCENTER_GROUP"] == query["workcenter"]]
|
||||
if query.get("package"):
|
||||
subset = subset[subset["PACKAGE_LEF"] == query["package"]]
|
||||
if query.get("type"):
|
||||
subset = subset[subset["PJ_TYPE"] == query["type"]]
|
||||
if query.get("status"):
|
||||
subset = subset[subset["WIP_STATUS"] == query["status"]]
|
||||
return int(len(subset))
|
||||
|
||||
|
||||
def _indexed_query(_df: pd.DataFrame, indexes: dict[str, dict[str, set[int]]], query: dict[str, str]) -> int:
|
||||
selected: set[int] | None = None
|
||||
for key, bucket in (
|
||||
("workcenter", "workcenter"),
|
||||
("package", "package"),
|
||||
("type", "type"),
|
||||
("status", "status"),
|
||||
):
|
||||
current = indexes[bucket].get(query.get(key, ""))
|
||||
if current is None:
|
||||
return 0
|
||||
if selected is None:
|
||||
selected = set(current)
|
||||
else:
|
||||
selected.intersection_update(current)
|
||||
if not selected:
|
||||
return 0
|
||||
return len(selected or ())
|
||||
|
||||
|
||||
def _build_queries(df: pd.DataFrame, query_count: int, seed: int) -> list[dict[str, str]]:
|
||||
random.seed(seed + 17)
|
||||
workcenters = sorted(df["WORKCENTER_GROUP"].dropna().astype(str).unique().tolist())
|
||||
packages = sorted(df["PACKAGE_LEF"].dropna().astype(str).unique().tolist())
|
||||
types = sorted(df["PJ_TYPE"].dropna().astype(str).unique().tolist())
|
||||
statuses = sorted(df["WIP_STATUS"].dropna().astype(str).unique().tolist())
|
||||
|
||||
queries: list[dict[str, str]] = []
|
||||
for _ in range(query_count):
|
||||
queries.append(
|
||||
{
|
||||
"workcenter": random.choice(workcenters),
|
||||
"package": random.choice(packages),
|
||||
"type": random.choice(types),
|
||||
"status": random.choice(statuses),
|
||||
}
|
||||
)
|
||||
return queries
|
||||
|
||||
|
||||
def _p95(values: list[float]) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
sorted_values = sorted(values)
|
||||
index = min(max(math.ceil(0.95 * len(sorted_values)) - 1, 0), len(sorted_values) - 1)
|
||||
return sorted_values[index]
|
||||
|
||||
|
||||
def run_benchmark(rows: int, query_count: int, seed: int) -> dict[str, Any]:
    """Run the baseline-vs-indexed query benchmark over a synthetic dataset.

    Args:
        rows: Number of synthetic WIP rows to generate.
        query_count: Number of random filter queries to run per strategy.
        seed: RNG seed controlling both dataset and query generation.

    Returns:
        Report dict with ``latency_ms`` and ``memory_bytes`` sections,
        including the indexed-vs-baseline p95 ratio and an index memory
        amplification ratio.

    Raises:
        AssertionError: If indexed and baseline row counts ever disagree.
    """
    df = build_dataset(rows=rows, seed=seed)
    queries = _build_queries(df, query_count=query_count, seed=seed)
    indexes = _build_index(df)

    baseline_latencies: list[float] = []
    indexed_latencies: list[float] = []
    baseline_rows: list[int] = []
    indexed_rows: list[int] = []

    # Time each query under both strategies; latencies are recorded in ms.
    for query in queries:
        start = time.perf_counter()
        baseline_rows.append(_baseline_query(df, query))
        baseline_latencies.append((time.perf_counter() - start) * 1000)

        start = time.perf_counter()
        indexed_rows.append(_indexed_query(df, indexes, query))
        indexed_latencies.append((time.perf_counter() - start) * 1000)

    # Correctness guard: both strategies must report identical row counts.
    if baseline_rows != indexed_rows:
        raise AssertionError("benchmark correctness drift: indexed result mismatch")

    frame_bytes = int(df.memory_usage(index=True, deep=True).sum())
    # Rough index footprint estimate: ~16 bytes per stored row id overall.
    index_entries = sum(len(bucket) for buckets in indexes.values() for bucket in buckets.values())
    index_bytes_estimate = int(index_entries * 16)

    baseline_p95 = _p95(baseline_latencies)
    indexed_p95 = _p95(indexed_latencies)

    return {
        "rows": rows,
        "query_count": query_count,
        "seed": seed,
        "latency_ms": {
            "baseline_avg": round(statistics.fmean(baseline_latencies), 4),
            "baseline_p95": round(baseline_p95, 4),
            "indexed_avg": round(statistics.fmean(indexed_latencies), 4),
            "indexed_p95": round(indexed_p95, 4),
            # Guard against division by zero when the baseline p95 is 0.
            "p95_ratio_indexed_vs_baseline": round(
                (indexed_p95 / baseline_p95) if baseline_p95 > 0 else 0.0,
                4,
            ),
        },
        "memory_bytes": {
            "frame": frame_bytes,
            "index_estimate": index_bytes_estimate,
            # (frame + index) / frame; max() guards the empty-frame case.
            "amplification_ratio": round(
                (frame_bytes + index_bytes_estimate) / max(frame_bytes, 1),
                4,
            ),
        },
    }
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: run the benchmark, print the JSON report, and
    optionally enforce fixture-defined governance thresholds.

    Returns:
        0 on success (also when ``--enforce`` passes all thresholds).

    Raises:
        SystemExit: When ``--enforce`` is set and a threshold is exceeded.
    """
    # Fixture supplies workload defaults and governance thresholds.
    fixture = load_fixture()

    parser = argparse.ArgumentParser(description="Run cache baseline vs indexed benchmark")
    parser.add_argument("--rows", type=int, default=int(fixture.get("rows", 30000)))
    parser.add_argument("--queries", type=int, default=int(fixture.get("query_count", 400)))
    parser.add_argument("--seed", type=int, default=int(fixture.get("seed", 42)))
    # --enforce turns the run into a pass/fail governance gate.
    parser.add_argument("--enforce", action="store_true")
    args = parser.parse_args()

    report = run_benchmark(rows=args.rows, query_count=args.queries, seed=args.seed)
    print(json.dumps(report, ensure_ascii=False, indent=2))

    if not args.enforce:
        return 0

    thresholds = fixture.get("thresholds") or {}
    max_latency_ratio = float(thresholds.get("max_p95_ratio_indexed_vs_baseline", 1.25))
    max_amplification = float(thresholds.get("max_memory_amplification_ratio", 1.8))

    latency_ratio = float(report["latency_ms"]["p95_ratio_indexed_vs_baseline"])
    amplification_ratio = float(report["memory_bytes"]["amplification_ratio"])

    # Fail loudly (non-zero exit) on any threshold regression.
    if latency_ratio > max_latency_ratio:
        raise SystemExit(
            f"Latency regression: {latency_ratio:.4f} > max allowed {max_latency_ratio:.4f}"
        )
    if amplification_ratio > max_amplification:
        raise SystemExit(
            f"Memory amplification regression: {amplification_ratio:.4f} > max allowed {max_amplification:.4f}"
        )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Delegate to main() so the process exit code reflects the benchmark outcome.
    raise SystemExit(main())
|
||||
40
scripts/start_server.sh
Normal file → Executable file
40
scripts/start_server.sh
Normal file → Executable file
@@ -9,7 +9,7 @@ set -uo pipefail
|
||||
# Configuration
|
||||
# ============================================================
|
||||
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
CONDA_ENV="mes-dashboard"
|
||||
CONDA_ENV="${CONDA_ENV_NAME:-mes-dashboard}"
|
||||
APP_NAME="mes-dashboard"
|
||||
PID_FILE_DEFAULT="${ROOT}/tmp/gunicorn.pid"
|
||||
PID_FILE="${WATCHDOG_PID_FILE:-${PID_FILE_DEFAULT}}"
|
||||
@@ -56,7 +56,7 @@ timestamp() {
|
||||
resolve_runtime_paths() {
|
||||
WATCHDOG_RUNTIME_DIR="${WATCHDOG_RUNTIME_DIR:-${ROOT}/tmp}"
|
||||
WATCHDOG_RESTART_FLAG="${WATCHDOG_RESTART_FLAG:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart.flag}"
|
||||
WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${PID_FILE_DEFAULT}}"
|
||||
WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/gunicorn.pid}"
|
||||
WATCHDOG_STATE_FILE="${WATCHDOG_STATE_FILE:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart_state.json}"
|
||||
PID_FILE="${WATCHDOG_PID_FILE}"
|
||||
export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE
|
||||
@@ -81,8 +81,14 @@ check_conda() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
if [ -n "${CONDA_BIN:-}" ] && [ ! -x "${CONDA_BIN}" ]; then
|
||||
log_error "CONDA_BIN is set but not executable: ${CONDA_BIN}"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Source conda
|
||||
source "$(conda info --base)/etc/profile.d/conda.sh"
|
||||
local conda_cmd="${CONDA_BIN:-$(command -v conda)}"
|
||||
source "$(${conda_cmd} info --base)/etc/profile.d/conda.sh"
|
||||
|
||||
# Check if environment exists
|
||||
if ! conda env list | grep -q "^${CONDA_ENV} "; then
|
||||
@@ -95,6 +101,33 @@ check_conda() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Validate the runtime contract inside the conda env before starting services.
# Returns 0 when the contract diagnostics pass, 1 otherwise (with remediation
# hints logged). Enforcement strictness is controlled by RUNTIME_CONTRACT_ENFORCE.
validate_runtime_contract() {
    conda activate "$CONDA_ENV"
    # Make the project package importable for the inline Python check below.
    export PYTHONPATH="${ROOT}/src:${PYTHONPATH:-}"

    if python - <<'PY'
import os
import sys

from mes_dashboard.core.runtime_contract import build_runtime_contract_diagnostics

strict = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true").strip().lower() in {"1", "true", "yes", "on"}
diag = build_runtime_contract_diagnostics(strict=strict)
if not diag["valid"]:
    for error in diag["errors"]:
        print(f"RUNTIME_CONTRACT_ERROR: {error}")
    raise SystemExit(1)
PY
    then
        log_success "Runtime contract validation passed"
        return 0
    fi

    log_error "Runtime contract validation failed"
    log_info "Fix env vars: WATCHDOG_RUNTIME_DIR / WATCHDOG_RESTART_FLAG / WATCHDOG_PID_FILE / WATCHDOG_STATE_FILE / CONDA_BIN"
    return 1
}
|
||||
|
||||
check_dependencies() {
|
||||
conda activate "$CONDA_ENV"
|
||||
|
||||
@@ -329,6 +362,7 @@ run_all_checks() {
|
||||
check_env_file
|
||||
load_env
|
||||
resolve_runtime_paths
|
||||
validate_runtime_contract || return 1
|
||||
check_port || return 1
|
||||
check_database
|
||||
check_redis
|
||||
|
||||
177
scripts/worker_watchdog.py
Normal file → Executable file
177
scripts/worker_watchdog.py
Normal file → Executable file
@@ -31,6 +31,23 @@ import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
SRC_ROOT = PROJECT_ROOT / "src"
|
||||
if str(SRC_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(SRC_ROOT))
|
||||
|
||||
from mes_dashboard.core.runtime_contract import ( # noqa: E402
|
||||
build_runtime_contract_diagnostics,
|
||||
load_runtime_contract,
|
||||
)
|
||||
from mes_dashboard.core.worker_recovery_policy import ( # noqa: E402
|
||||
decide_restart_request,
|
||||
evaluate_worker_recovery_state,
|
||||
extract_last_requested_at,
|
||||
extract_restart_history,
|
||||
get_worker_recovery_policy_config,
|
||||
)
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -45,7 +62,10 @@ logger = logging.getLogger('mes_dashboard.watchdog')
|
||||
# Configuration
|
||||
# ============================================================
|
||||
|
||||
CHECK_INTERVAL = int(os.getenv('WATCHDOG_CHECK_INTERVAL', '5'))
|
||||
_RUNTIME_CONTRACT = load_runtime_contract(project_root=PROJECT_ROOT)
|
||||
CHECK_INTERVAL = int(
|
||||
os.getenv('WATCHDOG_CHECK_INTERVAL', str(_RUNTIME_CONTRACT['watchdog_check_interval']))
|
||||
)
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
@@ -55,22 +75,11 @@ def _env_int(name: str, default: int) -> int:
|
||||
return default
|
||||
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
DEFAULT_RUNTIME_DIR = Path(
|
||||
os.getenv('WATCHDOG_RUNTIME_DIR', str(PROJECT_ROOT / 'tmp'))
|
||||
)
|
||||
RESTART_FLAG_PATH = os.getenv(
|
||||
'WATCHDOG_RESTART_FLAG',
|
||||
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart.flag')
|
||||
)
|
||||
GUNICORN_PID_FILE = os.getenv(
|
||||
'WATCHDOG_PID_FILE',
|
||||
str(DEFAULT_RUNTIME_DIR / 'gunicorn.pid')
|
||||
)
|
||||
RESTART_STATE_FILE = os.getenv(
|
||||
'WATCHDOG_STATE_FILE',
|
||||
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart_state.json')
|
||||
)
|
||||
DEFAULT_RUNTIME_DIR = Path(_RUNTIME_CONTRACT['watchdog_runtime_dir'])
|
||||
RESTART_FLAG_PATH = _RUNTIME_CONTRACT['watchdog_restart_flag']
|
||||
GUNICORN_PID_FILE = _RUNTIME_CONTRACT['watchdog_pid_file']
|
||||
RESTART_STATE_FILE = _RUNTIME_CONTRACT['watchdog_state_file']
|
||||
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT['version']
|
||||
RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
|
||||
|
||||
|
||||
@@ -78,6 +87,32 @@ RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
|
||||
# Watchdog Implementation
|
||||
# ============================================================
|
||||
|
||||
|
||||
def validate_runtime_contract_or_raise() -> None:
    """Fail fast if runtime contract is inconsistent."""
    # Enforcement is on unless RUNTIME_CONTRACT_ENFORCE is explicitly disabled.
    strict = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    diagnostics = build_runtime_contract_diagnostics(strict=strict)
    if diagnostics["valid"]:
        return

    # Surface every validation error in one actionable message.
    details = "; ".join(diagnostics["errors"])
    raise RuntimeError(f"Runtime contract validation failed: {details}")
|
||||
|
||||
|
||||
def log_restart_audit(event: str, payload: dict) -> None:
    """Emit a structured JSON audit log line for a watchdog restart event."""
    # Stable envelope fields first; caller payload keys may extend/override.
    entry = {
        "event": event,
        "timestamp": datetime.utcnow().isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
        **payload,
    }
    logger.info("worker_watchdog_audit %s", json.dumps(entry, ensure_ascii=False))
|
||||
|
||||
def get_gunicorn_pid() -> int | None:
|
||||
"""Get Gunicorn master PID from PID file.
|
||||
|
||||
@@ -155,7 +190,12 @@ def save_restart_state(
|
||||
requested_at: str | None = None,
|
||||
requested_ip: str | None = None,
|
||||
completed_at: str | None = None,
|
||||
success: bool = True
|
||||
success: bool = True,
|
||||
source: str = "manual",
|
||||
decision: str = "allowed",
|
||||
decision_reason: str | None = None,
|
||||
manual_override: bool = False,
|
||||
policy_state: dict | None = None,
|
||||
) -> None:
|
||||
"""Save restart state for status queries.
|
||||
|
||||
@@ -173,7 +213,12 @@ def save_restart_state(
|
||||
"requested_at": requested_at,
|
||||
"requested_ip": requested_ip,
|
||||
"completed_at": completed_at,
|
||||
"success": success
|
||||
"success": success,
|
||||
"source": source,
|
||||
"decision": decision,
|
||||
"decision_reason": decision_reason,
|
||||
"manual_override": manual_override,
|
||||
"policy_state": policy_state or {},
|
||||
}
|
||||
current_state = load_restart_state()
|
||||
history = current_state.get("history", [])
|
||||
@@ -229,6 +274,47 @@ def process_restart_request() -> bool:
|
||||
return False
|
||||
|
||||
logger.info(f"Restart flag detected: {flag_data}")
|
||||
source = str(flag_data.get("source") or "manual").strip().lower()
|
||||
manual_override = bool(flag_data.get("manual_override"))
|
||||
override_ack = bool(flag_data.get("override_acknowledged"))
|
||||
restart_state = load_restart_state()
|
||||
restart_history = extract_restart_history(restart_state)
|
||||
policy_state = evaluate_worker_recovery_state(
|
||||
restart_history,
|
||||
last_requested_at=extract_last_requested_at(restart_state),
|
||||
)
|
||||
decision = decide_restart_request(
|
||||
policy_state,
|
||||
source=source,
|
||||
manual_override=manual_override,
|
||||
override_acknowledged=override_ack,
|
||||
)
|
||||
|
||||
if not decision["allowed"]:
|
||||
remove_restart_flag()
|
||||
save_restart_state(
|
||||
requested_by=flag_data.get("user"),
|
||||
requested_at=flag_data.get("timestamp"),
|
||||
requested_ip=flag_data.get("ip"),
|
||||
completed_at=datetime.now().isoformat(),
|
||||
success=False,
|
||||
source=source,
|
||||
decision=decision["decision"],
|
||||
decision_reason=decision["reason"],
|
||||
manual_override=manual_override,
|
||||
policy_state=policy_state,
|
||||
)
|
||||
log_restart_audit(
|
||||
"restart_blocked",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"decision": decision,
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
return True
|
||||
|
||||
# Get Gunicorn master PID
|
||||
pid = get_gunicorn_pid()
|
||||
@@ -242,7 +328,22 @@ def process_restart_request() -> bool:
|
||||
requested_at=flag_data.get("timestamp"),
|
||||
requested_ip=flag_data.get("ip"),
|
||||
completed_at=datetime.now().isoformat(),
|
||||
success=False
|
||||
success=False,
|
||||
source=source,
|
||||
decision="failed",
|
||||
decision_reason="gunicorn_pid_unavailable",
|
||||
manual_override=manual_override,
|
||||
policy_state=policy_state,
|
||||
)
|
||||
log_restart_audit(
|
||||
"restart_failed",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"decision_reason": "gunicorn_pid_unavailable",
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
return True
|
||||
|
||||
@@ -258,7 +359,12 @@ def process_restart_request() -> bool:
|
||||
requested_at=flag_data.get("timestamp"),
|
||||
requested_ip=flag_data.get("ip"),
|
||||
completed_at=datetime.now().isoformat(),
|
||||
success=success
|
||||
success=success,
|
||||
source=source,
|
||||
decision="executed" if success else "failed",
|
||||
decision_reason="signal_sighup" if success else "signal_failed",
|
||||
manual_override=manual_override,
|
||||
policy_state=policy_state,
|
||||
)
|
||||
|
||||
if success:
|
||||
@@ -267,17 +373,44 @@ def process_restart_request() -> bool:
|
||||
f"Requested by: {flag_data.get('user', 'unknown')}, "
|
||||
f"IP: {flag_data.get('ip', 'unknown')}"
|
||||
)
|
||||
log_restart_audit(
|
||||
"restart_executed",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"manual_override": manual_override,
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
else:
|
||||
log_restart_audit(
|
||||
"restart_failed",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"decision_reason": "signal_failed",
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def run_watchdog() -> None:
|
||||
"""Main watchdog loop."""
|
||||
validate_runtime_contract_or_raise()
|
||||
policy = get_worker_recovery_policy_config()
|
||||
logger.info(
|
||||
f"Worker watchdog started - "
|
||||
f"Check interval: {CHECK_INTERVAL}s, "
|
||||
f"Flag path: {RESTART_FLAG_PATH}, "
|
||||
f"PID file: {GUNICORN_PID_FILE}"
|
||||
f"PID file: {GUNICORN_PID_FILE}, "
|
||||
f"Policy(cooldown={policy['cooldown_seconds']}s, "
|
||||
f"retry_budget={policy['retry_budget']}, "
|
||||
f"window={policy['window_seconds']}s, "
|
||||
f"guarded={policy['guarded_mode_enabled']})"
|
||||
)
|
||||
|
||||
while True:
|
||||
|
||||
Reference in New Issue
Block a user