chore: finalize vite migration hardening and archive openspec changes
This commit is contained in:
177
scripts/worker_watchdog.py
Normal file → Executable file
177
scripts/worker_watchdog.py
Normal file → Executable file
@@ -31,6 +31,23 @@ import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# PROJECT_ROOT is the parent of the directory that contains this file
# (parents[1] of the fully resolved script path); SRC_ROOT is its "src"
# subdirectory.
# SRC_ROOT is put at the front of sys.path, only if it is not already listed,
# before the mes_dashboard imports that follow -- presumably so the script can
# be run directly without the package being installed (confirm).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
SRC_ROOT = PROJECT_ROOT / "src"
|
||||
if str(SRC_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(SRC_ROOT))
|
||||
|
||||
from mes_dashboard.core.runtime_contract import ( # noqa: E402
|
||||
build_runtime_contract_diagnostics,
|
||||
load_runtime_contract,
|
||||
)
|
||||
from mes_dashboard.core.worker_recovery_policy import ( # noqa: E402
|
||||
decide_restart_request,
|
||||
evaluate_worker_recovery_state,
|
||||
extract_last_requested_at,
|
||||
extract_restart_history,
|
||||
get_worker_recovery_policy_config,
|
||||
)
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -45,7 +62,10 @@ logger = logging.getLogger('mes_dashboard.watchdog')
|
||||
# Configuration
|
||||
# ============================================================
|
||||
|
||||
CHECK_INTERVAL = int(os.getenv('WATCHDOG_CHECK_INTERVAL', '5'))
|
||||
_RUNTIME_CONTRACT = load_runtime_contract(project_root=PROJECT_ROOT)
|
||||
CHECK_INTERVAL = int(
|
||||
os.getenv('WATCHDOG_CHECK_INTERVAL', str(_RUNTIME_CONTRACT['watchdog_check_interval']))
|
||||
)
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
@@ -55,22 +75,11 @@ def _env_int(name: str, default: int) -> int:
|
||||
return default
|
||||
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
DEFAULT_RUNTIME_DIR = Path(
|
||||
os.getenv('WATCHDOG_RUNTIME_DIR', str(PROJECT_ROOT / 'tmp'))
|
||||
)
|
||||
RESTART_FLAG_PATH = os.getenv(
|
||||
'WATCHDOG_RESTART_FLAG',
|
||||
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart.flag')
|
||||
)
|
||||
GUNICORN_PID_FILE = os.getenv(
|
||||
'WATCHDOG_PID_FILE',
|
||||
str(DEFAULT_RUNTIME_DIR / 'gunicorn.pid')
|
||||
)
|
||||
RESTART_STATE_FILE = os.getenv(
|
||||
'WATCHDOG_STATE_FILE',
|
||||
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart_state.json')
|
||||
)
|
||||
DEFAULT_RUNTIME_DIR = Path(_RUNTIME_CONTRACT['watchdog_runtime_dir'])
|
||||
RESTART_FLAG_PATH = _RUNTIME_CONTRACT['watchdog_restart_flag']
|
||||
GUNICORN_PID_FILE = _RUNTIME_CONTRACT['watchdog_pid_file']
|
||||
RESTART_STATE_FILE = _RUNTIME_CONTRACT['watchdog_state_file']
|
||||
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT['version']
|
||||
RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
|
||||
|
||||
|
||||
@@ -78,6 +87,32 @@ RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
|
||||
# Watchdog Implementation
|
||||
# ============================================================
|
||||
|
||||
|
||||
def validate_runtime_contract_or_raise() -> None:
    """Abort when the runtime contract diagnostics report a failure.

    Reads the ``RUNTIME_CONTRACT_ENFORCE`` environment variable (default
    ``"true"``). After stripping whitespace and lower-casing, the values
    ``1``, ``true``, ``yes`` and ``on`` enable strict mode; anything else
    disables it. The flag is forwarded to
    ``build_runtime_contract_diagnostics``.

    Raises:
        RuntimeError: If the diagnostics' ``"valid"`` entry is falsy. The
            message lists the ``"errors"`` entries joined with ``"; "``.
    """
    truthy_flags = ("1", "true", "yes", "on")
    enforce_setting = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true")
    strict_mode = enforce_setting.strip().lower() in truthy_flags

    report = build_runtime_contract_diagnostics(strict=strict_mode)
    if not report["valid"]:
        joined_errors = "; ".join(report["errors"])
        raise RuntimeError(f"Runtime contract validation failed: {joined_errors}")
|
||||
|
||||
|
||||
def log_restart_audit(event: str, payload: dict) -> None:
    """Write one structured restart-audit record to the watchdog logger.

    The record is a JSON object holding the event name, a timezone-aware UTC
    timestamp in ISO 8601 form, the runtime contract version, and every key
    of ``payload``. Payload keys are merged last, so a payload key named
    ``event``, ``timestamp`` or ``runtime_contract_version`` overrides the
    generated value.

    Args:
        event: Audit event name stored under ``"event"``.
        payload: Extra fields merged into the record.
    """
    # Function-scope import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    entry = {
        "event": event,
        # datetime.utcnow() is deprecated (Python 3.12) and returns a naive
        # value; an aware timestamp carries an explicit +00:00 offset.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
        **payload,
    }
    # default=str is deliberate: an audit line must never abort restart
    # handling because a payload value is not JSON-native.
    serialized = json.dumps(entry, ensure_ascii=False, default=str)
    logger.info("worker_watchdog_audit %s", serialized)
|
||||
|
||||
def get_gunicorn_pid() -> int | None:
|
||||
"""Get Gunicorn master PID from PID file.
|
||||
|
||||
@@ -155,7 +190,12 @@ def save_restart_state(
|
||||
requested_at: str | None = None,
|
||||
requested_ip: str | None = None,
|
||||
completed_at: str | None = None,
|
||||
success: bool = True
|
||||
success: bool = True,
|
||||
source: str = "manual",
|
||||
decision: str = "allowed",
|
||||
decision_reason: str | None = None,
|
||||
manual_override: bool = False,
|
||||
policy_state: dict | None = None,
|
||||
) -> None:
|
||||
"""Save restart state for status queries.
|
||||
|
||||
@@ -173,7 +213,12 @@ def save_restart_state(
|
||||
"requested_at": requested_at,
|
||||
"requested_ip": requested_ip,
|
||||
"completed_at": completed_at,
|
||||
"success": success
|
||||
"success": success,
|
||||
"source": source,
|
||||
"decision": decision,
|
||||
"decision_reason": decision_reason,
|
||||
"manual_override": manual_override,
|
||||
"policy_state": policy_state or {},
|
||||
}
|
||||
current_state = load_restart_state()
|
||||
history = current_state.get("history", [])
|
||||
@@ -229,6 +274,47 @@ def process_restart_request() -> bool:
|
||||
return False
|
||||
|
||||
logger.info(f"Restart flag detected: {flag_data}")
|
||||
source = str(flag_data.get("source") or "manual").strip().lower()
|
||||
manual_override = bool(flag_data.get("manual_override"))
|
||||
override_ack = bool(flag_data.get("override_acknowledged"))
|
||||
restart_state = load_restart_state()
|
||||
restart_history = extract_restart_history(restart_state)
|
||||
policy_state = evaluate_worker_recovery_state(
|
||||
restart_history,
|
||||
last_requested_at=extract_last_requested_at(restart_state),
|
||||
)
|
||||
decision = decide_restart_request(
|
||||
policy_state,
|
||||
source=source,
|
||||
manual_override=manual_override,
|
||||
override_acknowledged=override_ack,
|
||||
)
|
||||
|
||||
if not decision["allowed"]:
|
||||
remove_restart_flag()
|
||||
save_restart_state(
|
||||
requested_by=flag_data.get("user"),
|
||||
requested_at=flag_data.get("timestamp"),
|
||||
requested_ip=flag_data.get("ip"),
|
||||
completed_at=datetime.now().isoformat(),
|
||||
success=False,
|
||||
source=source,
|
||||
decision=decision["decision"],
|
||||
decision_reason=decision["reason"],
|
||||
manual_override=manual_override,
|
||||
policy_state=policy_state,
|
||||
)
|
||||
log_restart_audit(
|
||||
"restart_blocked",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"decision": decision,
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
return True
|
||||
|
||||
# Get Gunicorn master PID
|
||||
pid = get_gunicorn_pid()
|
||||
@@ -242,7 +328,22 @@ def process_restart_request() -> bool:
|
||||
requested_at=flag_data.get("timestamp"),
|
||||
requested_ip=flag_data.get("ip"),
|
||||
completed_at=datetime.now().isoformat(),
|
||||
success=False
|
||||
success=False,
|
||||
source=source,
|
||||
decision="failed",
|
||||
decision_reason="gunicorn_pid_unavailable",
|
||||
manual_override=manual_override,
|
||||
policy_state=policy_state,
|
||||
)
|
||||
log_restart_audit(
|
||||
"restart_failed",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"decision_reason": "gunicorn_pid_unavailable",
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
return True
|
||||
|
||||
@@ -258,7 +359,12 @@ def process_restart_request() -> bool:
|
||||
requested_at=flag_data.get("timestamp"),
|
||||
requested_ip=flag_data.get("ip"),
|
||||
completed_at=datetime.now().isoformat(),
|
||||
success=success
|
||||
success=success,
|
||||
source=source,
|
||||
decision="executed" if success else "failed",
|
||||
decision_reason="signal_sighup" if success else "signal_failed",
|
||||
manual_override=manual_override,
|
||||
policy_state=policy_state,
|
||||
)
|
||||
|
||||
if success:
|
||||
@@ -267,17 +373,44 @@ def process_restart_request() -> bool:
|
||||
f"Requested by: {flag_data.get('user', 'unknown')}, "
|
||||
f"IP: {flag_data.get('ip', 'unknown')}"
|
||||
)
|
||||
log_restart_audit(
|
||||
"restart_executed",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"manual_override": manual_override,
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
else:
|
||||
log_restart_audit(
|
||||
"restart_failed",
|
||||
{
|
||||
"source": source,
|
||||
"actor": flag_data.get("user"),
|
||||
"ip": flag_data.get("ip"),
|
||||
"decision_reason": "signal_failed",
|
||||
"policy_state": policy_state,
|
||||
},
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def run_watchdog() -> None:
|
||||
"""Main watchdog loop."""
|
||||
validate_runtime_contract_or_raise()
|
||||
policy = get_worker_recovery_policy_config()
|
||||
logger.info(
|
||||
f"Worker watchdog started - "
|
||||
f"Check interval: {CHECK_INTERVAL}s, "
|
||||
f"Flag path: {RESTART_FLAG_PATH}, "
|
||||
f"PID file: {GUNICORN_PID_FILE}"
|
||||
f"PID file: {GUNICORN_PID_FILE}, "
|
||||
f"Policy(cooldown={policy['cooldown_seconds']}s, "
|
||||
f"retry_budget={policy['retry_budget']}, "
|
||||
f"window={policy['window_seconds']}s, "
|
||||
f"guarded={policy['guarded_mode_enabled']})"
|
||||
)
|
||||
|
||||
while True:
|
||||
|
||||
Reference in New Issue
Block a user