chore: finalize vite migration hardening and archive openspec changes

This commit is contained in:
beabigegg
2026-02-08 20:03:36 +08:00
parent b56e80381b
commit c8e225101e
119 changed files with 6547 additions and 1301 deletions

177
scripts/worker_watchdog.py Normal file → Executable file
View File

@@ -31,6 +31,23 @@ import time
from datetime import datetime
from pathlib import Path
# Resolve the project root (one directory above this script's folder) and put
# <project>/src at the front of sys.path, if it is not already listed, before
# the mes_dashboard imports that follow.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
sys.path.insert(0, str(SRC_ROOT))
from mes_dashboard.core.runtime_contract import ( # noqa: E402
build_runtime_contract_diagnostics,
load_runtime_contract,
)
from mes_dashboard.core.worker_recovery_policy import ( # noqa: E402
decide_restart_request,
evaluate_worker_recovery_state,
extract_last_requested_at,
extract_restart_history,
get_worker_recovery_policy_config,
)
# Configure logging
logging.basicConfig(
level=logging.INFO,
@@ -45,7 +62,10 @@ logger = logging.getLogger('mes_dashboard.watchdog')
# Configuration
# ============================================================
CHECK_INTERVAL = int(os.getenv('WATCHDOG_CHECK_INTERVAL', '5'))
# Load the runtime contract once at import time; the module-level constants
# read their values from this mapping.
_RUNTIME_CONTRACT = load_runtime_contract(project_root=PROJECT_ROOT)
# Polling interval for the watchdog (logged with an "s" suffix at startup).
# WATCHDOG_CHECK_INTERVAL, when set, overrides the contract's
# 'watchdog_check_interval' value.
# NOTE(review): int() raises ValueError at import time if the env value is not
# an integer string — confirm that failing fast here is intended.
CHECK_INTERVAL = int(
os.getenv('WATCHDOG_CHECK_INTERVAL', str(_RUNTIME_CONTRACT['watchdog_check_interval']))
)
def _env_int(name: str, default: int) -> int:
@@ -55,22 +75,11 @@ def _env_int(name: str, default: int) -> int:
return default
PROJECT_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_RUNTIME_DIR = Path(
os.getenv('WATCHDOG_RUNTIME_DIR', str(PROJECT_ROOT / 'tmp'))
)
RESTART_FLAG_PATH = os.getenv(
'WATCHDOG_RESTART_FLAG',
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart.flag')
)
GUNICORN_PID_FILE = os.getenv(
'WATCHDOG_PID_FILE',
str(DEFAULT_RUNTIME_DIR / 'gunicorn.pid')
)
RESTART_STATE_FILE = os.getenv(
'WATCHDOG_STATE_FILE',
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart_state.json')
)
# Runtime file locations and the contract version, all read from the loaded
# runtime contract.
# NOTE(review): any environment-variable overrides for these paths are
# presumably resolved inside load_runtime_contract — confirm.
DEFAULT_RUNTIME_DIR = Path(_RUNTIME_CONTRACT['watchdog_runtime_dir'])
RESTART_FLAG_PATH = _RUNTIME_CONTRACT['watchdog_restart_flag']
GUNICORN_PID_FILE = _RUNTIME_CONTRACT['watchdog_pid_file']
RESTART_STATE_FILE = _RUNTIME_CONTRACT['watchdog_state_file']
# Stamped onto every audit record emitted by log_restart_audit.
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT['version']
RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
@@ -78,6 +87,32 @@ RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
# Watchdog Implementation
# ============================================================
def validate_runtime_contract_or_raise() -> None:
    """Abort startup when the runtime contract diagnostics report a problem.

    Strict mode is read from the ``RUNTIME_CONTRACT_ENFORCE`` environment
    variable (default ``"true"``); the values ``1``, ``true``, ``yes`` and
    ``on`` (case-insensitive, surrounding whitespace ignored) enable it.

    Raises:
        RuntimeError: If the diagnostics are not valid. The message joins
            every reported error with ``"; "``.
    """
    raw_flag = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true")
    strict = raw_flag.strip().lower() in {"1", "true", "yes", "on"}

    diagnostics = build_runtime_contract_diagnostics(strict=strict)
    if not diagnostics["valid"]:
        details = "; ".join(diagnostics["errors"])
        raise RuntimeError(f"Runtime contract validation failed: {details}")
def log_restart_audit(event: str, payload: dict) -> None:
    """Emit one structured audit record for a restart decision or outcome.

    The record is logged at INFO level as a single line of the form
    ``worker_watchdog_audit <json>``.

    Args:
        event: Audit event name, e.g. ``"restart_blocked"``.
        payload: Extra JSON-serializable fields merged into the record. On a
            key collision the value from ``payload`` wins, because it is
            unpacked after the base fields.
    """
    # Local import: only ``datetime`` is imported from the datetime module at
    # file level, and this block must not depend on other edits.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so the emitted string keeps the same naive-UTC
    # ISO format that utcnow().isoformat() produced.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    entry = {
        "event": event,
        "timestamp": utc_now.isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
        **payload,
    }
    logger.info("worker_watchdog_audit %s", json.dumps(entry, ensure_ascii=False))
def get_gunicorn_pid() -> int | None:
"""Get Gunicorn master PID from PID file.
@@ -155,7 +190,12 @@ def save_restart_state(
requested_at: str | None = None,
requested_ip: str | None = None,
completed_at: str | None = None,
success: bool = True
success: bool = True,
source: str = "manual",
decision: str = "allowed",
decision_reason: str | None = None,
manual_override: bool = False,
policy_state: dict | None = None,
) -> None:
"""Save restart state for status queries.
@@ -173,7 +213,12 @@ def save_restart_state(
"requested_at": requested_at,
"requested_ip": requested_ip,
"completed_at": completed_at,
"success": success
"success": success,
"source": source,
"decision": decision,
"decision_reason": decision_reason,
"manual_override": manual_override,
"policy_state": policy_state or {},
}
current_state = load_restart_state()
history = current_state.get("history", [])
@@ -229,6 +274,47 @@ def process_restart_request() -> bool:
return False
logger.info(f"Restart flag detected: {flag_data}")
source = str(flag_data.get("source") or "manual").strip().lower()
manual_override = bool(flag_data.get("manual_override"))
override_ack = bool(flag_data.get("override_acknowledged"))
restart_state = load_restart_state()
restart_history = extract_restart_history(restart_state)
policy_state = evaluate_worker_recovery_state(
restart_history,
last_requested_at=extract_last_requested_at(restart_state),
)
decision = decide_restart_request(
policy_state,
source=source,
manual_override=manual_override,
override_acknowledged=override_ack,
)
if not decision["allowed"]:
remove_restart_flag()
save_restart_state(
requested_by=flag_data.get("user"),
requested_at=flag_data.get("timestamp"),
requested_ip=flag_data.get("ip"),
completed_at=datetime.now().isoformat(),
success=False,
source=source,
decision=decision["decision"],
decision_reason=decision["reason"],
manual_override=manual_override,
policy_state=policy_state,
)
log_restart_audit(
"restart_blocked",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"decision": decision,
"policy_state": policy_state,
},
)
return True
# Get Gunicorn master PID
pid = get_gunicorn_pid()
@@ -242,7 +328,22 @@ def process_restart_request() -> bool:
requested_at=flag_data.get("timestamp"),
requested_ip=flag_data.get("ip"),
completed_at=datetime.now().isoformat(),
success=False
success=False,
source=source,
decision="failed",
decision_reason="gunicorn_pid_unavailable",
manual_override=manual_override,
policy_state=policy_state,
)
log_restart_audit(
"restart_failed",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"decision_reason": "gunicorn_pid_unavailable",
"policy_state": policy_state,
},
)
return True
@@ -258,7 +359,12 @@ def process_restart_request() -> bool:
requested_at=flag_data.get("timestamp"),
requested_ip=flag_data.get("ip"),
completed_at=datetime.now().isoformat(),
success=success
success=success,
source=source,
decision="executed" if success else "failed",
decision_reason="signal_sighup" if success else "signal_failed",
manual_override=manual_override,
policy_state=policy_state,
)
if success:
@@ -267,17 +373,44 @@ def process_restart_request() -> bool:
f"Requested by: {flag_data.get('user', 'unknown')}, "
f"IP: {flag_data.get('ip', 'unknown')}"
)
log_restart_audit(
"restart_executed",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"manual_override": manual_override,
"policy_state": policy_state,
},
)
else:
log_restart_audit(
"restart_failed",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"decision_reason": "signal_failed",
"policy_state": policy_state,
},
)
return True
def run_watchdog() -> None:
"""Main watchdog loop."""
validate_runtime_contract_or_raise()
policy = get_worker_recovery_policy_config()
logger.info(
f"Worker watchdog started - "
f"Check interval: {CHECK_INTERVAL}s, "
f"Flag path: {RESTART_FLAG_PATH}, "
f"PID file: {GUNICORN_PID_FILE}"
f"PID file: {GUNICORN_PID_FILE}, "
f"Policy(cooldown={policy['cooldown_seconds']}s, "
f"retry_budget={policy['retry_budget']}, "
f"window={policy['window_seconds']}s, "
f"guarded={policy['guarded_mode_enabled']})"
)
while True: