#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Worker watchdog for MES Dashboard.

Monitors a restart flag file and signals Gunicorn master to gracefully
reload workers when the flag is detected.

Usage:
    python scripts/worker_watchdog.py

The watchdog:
- Checks for /tmp/mes_dashboard_restart.flag every 5 seconds
- Sends SIGHUP to Gunicorn master process when flag is detected
- Removes the flag file after signaling
- Logs all restart events

Configuration via environment variables:
- WATCHDOG_CHECK_INTERVAL: Check interval in seconds (default: 5)
- WATCHDOG_RESTART_FLAG: Path to restart flag file
- WATCHDOG_PID_FILE: Path to Gunicorn PID file
"""

from __future__ import annotations

import json
import logging
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

# Make the project's src/ tree importable when this script is run directly
# (the package imports below depend on it, hence the noqa: E402 there).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))

from mes_dashboard.core.runtime_contract import ( # noqa: E402
    build_runtime_contract_diagnostics,
    load_runtime_contract,
)
from mes_dashboard.core.worker_recovery_policy import ( # noqa: E402
    decide_restart_request,
    evaluate_worker_recovery_state,
    extract_last_requested_at,
    extract_restart_history,
    get_worker_recovery_policy_config,
)

# Configure logging: emit to stdout so the process supervisor
# (e.g. systemd/journald or a container runtime) captures watchdog output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
    ]
)
logger = logging.getLogger('mes_dashboard.watchdog')

# ============================================================
# Configuration
# ============================================================

# The runtime contract centralizes watchdog paths/intervals shared with the app.
_RUNTIME_CONTRACT = load_runtime_contract(project_root=PROJECT_ROOT)
try:
    CHECK_INTERVAL = int(
        os.getenv('WATCHDOG_CHECK_INTERVAL', str(_RUNTIME_CONTRACT['watchdog_check_interval']))
    )
except ValueError:
    # A malformed env override must not crash the watchdog at import time;
    # fall back to the contract's default interval (mirrors _env_int below,
    # which cannot be used here because it is defined later in the module).
    CHECK_INTERVAL = int(_RUNTIME_CONTRACT['watchdog_check_interval'])

def _env_int(name: str, default: int) -> int:
|
|
try:
|
|
return int(os.getenv(name, str(default)))
|
|
except (TypeError, ValueError):
|
|
return default
|
|
|
|
|
|
# Paths and limits resolved from the runtime contract; see the module
# docstring for the corresponding environment-variable overrides.
DEFAULT_RUNTIME_DIR = Path(_RUNTIME_CONTRACT['watchdog_runtime_dir'])
RESTART_FLAG_PATH = _RUNTIME_CONTRACT['watchdog_restart_flag']  # flag polled each cycle
GUNICORN_PID_FILE = _RUNTIME_CONTRACT['watchdog_pid_file']  # Gunicorn master PID file
RESTART_STATE_FILE = _RUNTIME_CONTRACT['watchdog_state_file']  # persisted restart state/history
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT['version']
# Cap on persisted restart-history entries (env-overridable).
RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)

# ============================================================
# Watchdog Implementation
# ============================================================

def validate_runtime_contract_or_raise() -> None:
    """Fail fast if runtime contract is inconsistent.

    Raises:
        RuntimeError: When the contract diagnostics report errors.
    """
    # Strict enforcement is on unless RUNTIME_CONTRACT_ENFORCE opts out.
    enforce_flag = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true").strip().lower()
    strict = enforce_flag in ("1", "true", "yes", "on")

    diagnostics = build_runtime_contract_diagnostics(strict=strict)
    if not diagnostics["valid"]:
        details = "; ".join(diagnostics["errors"])
        raise RuntimeError(f"Runtime contract validation failed: {details}")

def log_restart_audit(event: str, payload: dict) -> None:
    """Emit a structured (JSON) audit log line for a restart event.

    Args:
        event: Audit event name (e.g. "restart_executed").
        payload: Extra event fields merged into the audit entry.
    """
    entry = {
        "event": event,
        # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated
        # (Python 3.12+) and yields naive datetimes; the aware form also
        # embeds the +00:00 offset in the ISO string.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
        **payload,
    }
    logger.info("worker_watchdog_audit %s", json.dumps(entry, ensure_ascii=False))

def get_gunicorn_pid() -> int | None:
    """Get Gunicorn master PID from PID file.

    Reads the configured PID file and verifies the process is alive
    via ``os.kill(pid, 0)`` (signal 0 performs an existence check only).

    Returns:
        PID of Gunicorn master process, or None if the file is missing,
        unreadable, malformed, or the process no longer exists.
    """
    pid_path = Path(GUNICORN_PID_FILE)

    if not pid_path.exists():
        logger.warning(f"PID file not found: {GUNICORN_PID_FILE}")
        return None

    try:
        pid = int(pid_path.read_text().strip())
        # Signal 0 checks existence/permission without affecting the process.
        os.kill(pid, 0)
        return pid
    except (ValueError, OSError) as e:
        # OSError covers ProcessLookupError/PermissionError from kill() and
        # also read errors if the file vanishes after the exists() check.
        logger.warning(f"Invalid or stale PID file: {e}")
        return None

def read_restart_flag() -> dict | None:
    """Read and parse the restart flag file.

    Returns:
        Dictionary with restart metadata, or None if no flag exists.
        An empty or unreadable flag still yields a dict so that the
        caller treats it as a restart request.
    """
    flag_path = Path(RESTART_FLAG_PATH)
    if not flag_path.exists():
        return None

    try:
        raw = flag_path.read_text().strip()
    except IOError as e:
        logger.warning(f"Error reading restart flag: {e}")
        return {"timestamp": datetime.now().isoformat(), "error": str(e)}

    if not raw:
        # Empty flag file: synthesize minimal metadata.
        return {"timestamp": datetime.now().isoformat()}

    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        logger.warning(f"Error reading restart flag: {e}")
        return {"timestamp": datetime.now().isoformat(), "error": str(e)}

def remove_restart_flag() -> bool:
    """Remove the restart flag file.

    Uses EAFP (unlink and catch FileNotFoundError) instead of an
    exists()/unlink() pair, so a concurrent removal between check and
    delete cannot raise an unhandled exception (TOCTOU race).

    Returns:
        True if this call removed the file, False otherwise.
    """
    flag_path = Path(RESTART_FLAG_PATH)
    try:
        flag_path.unlink()
        return True
    except FileNotFoundError:
        # Nothing to remove (or another process beat us to it).
        return False
    except OSError as e:
        logger.error(f"Failed to remove restart flag: {e}")
        return False

def load_restart_state() -> dict:
    """Load persisted restart state from disk.

    Returns:
        The parsed state dict, or {} if the file is missing, unreadable,
        corrupt, or does not contain a JSON object at the top level.
    """
    state_path = Path(RESTART_STATE_FILE)
    if not state_path.exists():
        return {}
    try:
        state = json.loads(state_path.read_text())
    except (json.JSONDecodeError, IOError):
        return {}
    # Callers call .get() on the result; guard against a state file that
    # holds valid JSON of the wrong top-level type (e.g. a list).
    return state if isinstance(state, dict) else {}

def save_restart_state(
    requested_by: str | None = None,
    requested_at: str | None = None,
    requested_ip: str | None = None,
    completed_at: str | None = None,
    success: bool = True,
    source: str = "manual",
    decision: str = "allowed",
    decision_reason: str | None = None,
    manual_override: bool = False,
    policy_state: dict | None = None,
) -> None:
    """Save restart state for status queries.

    Appends one entry to the bounded restart history and records it as
    ``last_restart``. The state file is written atomically (temp file +
    ``os.replace``) so concurrent readers never observe a torn file.

    Args:
        requested_by: Username who requested the restart.
        requested_at: ISO timestamp when restart was requested.
        requested_ip: IP address of requester.
        completed_at: ISO timestamp when restart was completed.
        success: Whether the restart was successful.
        source: Origin of the request (e.g. "manual").
        decision: Policy decision recorded for this attempt.
        decision_reason: Optional reason backing the decision.
        manual_override: Whether a manual override was requested.
        policy_state: Snapshot of the recovery-policy evaluation.
    """
    state_path = Path(RESTART_STATE_FILE)

    entry = {
        "requested_by": requested_by,
        "requested_at": requested_at,
        "requested_ip": requested_ip,
        "completed_at": completed_at,
        "success": success,
        "source": source,
        "decision": decision,
        "decision_reason": decision_reason,
        "manual_override": manual_override,
        "policy_state": policy_state or {},
    }

    current_state = load_restart_state()
    history = current_state.get("history", [])
    if not isinstance(history, list):
        history = []
    history.append(entry)
    # Keep only the most recent entries, bounded by RESTART_HISTORY_MAX.
    if len(history) > RESTART_HISTORY_MAX:
        history = history[-RESTART_HISTORY_MAX:]

    state = {
        "last_restart": entry,
        "history": history,
        "history_limit": RESTART_HISTORY_MAX,
    }

    try:
        state_path.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file, then atomically replace, so readers
        # (e.g. the dashboard status endpoint) never see a half-written file.
        tmp_path = state_path.with_name(state_path.name + ".tmp")
        tmp_path.write_text(json.dumps(state, indent=2))
        os.replace(tmp_path, state_path)
    except OSError as e:
        logger.error(f"Failed to save restart state: {e}")

def send_reload_signal(pid: int) -> bool:
    """Send SIGHUP to Gunicorn master to reload workers.

    Args:
        pid: PID of Gunicorn master process.

    Returns:
        True if signal was sent successfully, False otherwise.
    """
    try:
        os.kill(pid, signal.SIGHUP)
    except ProcessLookupError:
        logger.error(f"Process {pid} not found")
        return False
    except PermissionError:
        logger.error(f"Permission denied sending signal to PID {pid}")
        return False

    logger.info(f"Sent SIGHUP to Gunicorn master (PID: {pid})")
    return True

def _persist_outcome(
    flag_data: dict,
    *,
    source: str,
    manual_override: bool,
    policy_state: dict,
    success: bool,
    decision: str,
    decision_reason: str | None,
) -> None:
    """Record one restart attempt's outcome in the persisted state file."""
    save_restart_state(
        requested_by=flag_data.get("user"),
        requested_at=flag_data.get("timestamp"),
        requested_ip=flag_data.get("ip"),
        completed_at=datetime.now().isoformat(),
        success=success,
        source=source,
        decision=decision,
        decision_reason=decision_reason,
        manual_override=manual_override,
        policy_state=policy_state,
    )


def process_restart_request() -> bool:
    """Process a restart request if flag file exists.

    Evaluates the worker-recovery policy against persisted restart
    history, then either blocks the request, records a failure (no
    Gunicorn PID), or signals the master with SIGHUP. In every handled
    case the flag file is removed and the outcome is persisted and
    audit-logged.

    Returns:
        True if a restart request was processed (allowed, blocked, or
        failed), False if no restart flag was present.
    """
    flag_data = read_restart_flag()
    if flag_data is None:
        return False

    logger.info(f"Restart flag detected: {flag_data}")
    source = str(flag_data.get("source") or "manual").strip().lower()
    manual_override = bool(flag_data.get("manual_override"))
    override_ack = bool(flag_data.get("override_acknowledged"))

    # Evaluate the recovery policy against the persisted restart history.
    restart_state = load_restart_state()
    policy_state = evaluate_worker_recovery_state(
        extract_restart_history(restart_state),
        last_requested_at=extract_last_requested_at(restart_state),
    )
    decision = decide_restart_request(
        policy_state,
        source=source,
        manual_override=manual_override,
        override_acknowledged=override_ack,
    )

    # Fields shared by every audit entry for this request.
    audit_base = {
        "source": source,
        "actor": flag_data.get("user"),
        "ip": flag_data.get("ip"),
    }

    if not decision["allowed"]:
        remove_restart_flag()
        _persist_outcome(
            flag_data,
            source=source,
            manual_override=manual_override,
            policy_state=policy_state,
            success=False,
            decision=decision["decision"],
            decision_reason=decision["reason"],
        )
        log_restart_audit(
            "restart_blocked",
            {**audit_base, "decision": decision, "policy_state": policy_state},
        )
        return True

    pid = get_gunicorn_pid()
    if pid is None:
        logger.error("Cannot restart: Gunicorn master PID not found")
        # Still remove flag to prevent an infinite retry loop.
        remove_restart_flag()
        _persist_outcome(
            flag_data,
            source=source,
            manual_override=manual_override,
            policy_state=policy_state,
            success=False,
            decision="failed",
            decision_reason="gunicorn_pid_unavailable",
        )
        log_restart_audit(
            "restart_failed",
            {
                **audit_base,
                "decision_reason": "gunicorn_pid_unavailable",
                "policy_state": policy_state,
            },
        )
        return True

    # Signal the master, then clear the flag regardless of signal outcome.
    success = send_reload_signal(pid)
    remove_restart_flag()
    _persist_outcome(
        flag_data,
        source=source,
        manual_override=manual_override,
        policy_state=policy_state,
        success=success,
        decision="executed" if success else "failed",
        decision_reason="signal_sighup" if success else "signal_failed",
    )

    if success:
        logger.info(
            f"Worker restart completed - "
            f"Requested by: {flag_data.get('user', 'unknown')}, "
            f"IP: {flag_data.get('ip', 'unknown')}"
        )
        log_restart_audit(
            "restart_executed",
            {
                **audit_base,
                "manual_override": manual_override,
                "policy_state": policy_state,
            },
        )
    else:
        log_restart_audit(
            "restart_failed",
            {
                **audit_base,
                "decision_reason": "signal_failed",
                "policy_state": policy_state,
            },
        )

    return True

def run_watchdog() -> None:
    """Main watchdog loop: poll for restart requests until interrupted."""
    validate_runtime_contract_or_raise()
    policy = get_worker_recovery_policy_config()

    startup_message = (
        f"Worker watchdog started - "
        f"Check interval: {CHECK_INTERVAL}s, "
        f"Flag path: {RESTART_FLAG_PATH}, "
        f"PID file: {GUNICORN_PID_FILE}, "
        f"Policy(cooldown={policy['cooldown_seconds']}s, "
        f"retry_budget={policy['retry_budget']}, "
        f"window={policy['window_seconds']}s, "
        f"guarded={policy['guarded_mode_enabled']})"
    )
    logger.info(startup_message)

    # Poll forever; a failed cycle is logged but never stops the loop.
    while True:
        try:
            process_restart_request()
        except Exception as e:
            logger.exception(f"Error in watchdog loop: {e}")
        time.sleep(CHECK_INTERVAL)

def main() -> None:
    """Entry point for watchdog script.

    Runs the watchdog loop until interrupted; Ctrl-C exits cleanly
    with status 0.
    """
    try:
        run_watchdog()
    except KeyboardInterrupt:
        logger.info("Watchdog stopped by user")
        sys.exit(0)

# Run the watchdog when executed as a script.
if __name__ == "__main__":
    main()