Files
DashBoard/src/mes_dashboard/routes/admin_routes.py

667 lines
23 KiB
Python

# -*- coding: utf-8 -*-
"""Admin routes for page management and performance monitoring."""
from __future__ import annotations
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from flask import Blueprint, g, jsonify, render_template, request
from mes_dashboard.core.permissions import admin_required
from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS
from mes_dashboard.core.resilience import (
build_recovery_recommendation,
get_resilience_thresholds,
summarize_restart_history,
)
from mes_dashboard.core.runtime_contract import (
build_runtime_contract_diagnostics,
load_runtime_contract,
)
from mes_dashboard.core.worker_recovery_policy import (
decide_restart_request,
evaluate_worker_recovery_state,
extract_last_requested_at,
extract_restart_history,
load_restart_state,
)
from mes_dashboard.services.page_registry import get_all_pages, set_page_status
admin_bp = Blueprint("admin", __name__, url_prefix="/admin")
logger = logging.getLogger("mes_dashboard.admin")
# ============================================================
# Worker Restart Configuration
# ============================================================
_RUNTIME_CONTRACT = load_runtime_contract()
WATCHDOG_RUNTIME_DIR = _RUNTIME_CONTRACT["watchdog_runtime_dir"]
RESTART_FLAG_PATH = _RUNTIME_CONTRACT["watchdog_restart_flag"]
RESTART_STATE_PATH = _RUNTIME_CONTRACT["watchdog_state_file"]
WATCHDOG_PID_PATH = _RUNTIME_CONTRACT["watchdog_pid_file"]
GUNICORN_BIND = _RUNTIME_CONTRACT["gunicorn_bind"]
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT["version"]
# Track last restart request time (in-memory for this worker)
_last_restart_request: float = 0.0
# ============================================================
# Performance Monitoring Routes
# ============================================================
@admin_bp.route("/performance")
@admin_required
def performance():
    """Render the admin performance-monitoring dashboard page.

    All live data is fetched client-side from the /admin/api endpoints.
    """
    template_name = "admin/performance.html"
    return render_template(template_name)
@admin_bp.route("/api/system-status", methods=["GET"])
@admin_required
def api_system_status():
    """API: Get system status for performance dashboard.

    Aggregates database/Redis/circuit-breaker health, cache state,
    resilience thresholds, restart policy state, and runtime-contract
    diagnostics into a single JSON payload for the admin UI.
    """
    # Lazy imports keep DB/Redis machinery out of module import time.
    from mes_dashboard.core.database import get_pool_runtime_config, get_pool_status
    from mes_dashboard.core.redis_client import REDIS_ENABLED
    from mes_dashboard.core.circuit_breaker import get_circuit_breaker_status
    from mes_dashboard.routes.health_routes import (
        check_database,
        check_redis,
        get_route_cache_status,
    )
    # Database status
    db_status, db_error = check_database()
    # Redis status ('disabled' means the Redis integration is turned off,
    # distinct from an enabled-but-failing Redis).
    redis_status = 'disabled'
    if REDIS_ENABLED:
        redis_status, _ = check_redis()
    # Circuit breaker status
    circuit_breaker = get_circuit_breaker_status()
    route_cache = get_route_cache_status()
    pool_runtime = get_pool_runtime_config()
    try:
        pool_state = get_pool_status()
    except Exception:
        # Pool introspection is best-effort; None means "unknown" downstream.
        pool_state = None
    thresholds = get_resilience_thresholds()
    restart_state = _get_restart_state()
    restart_churn = _get_restart_churn_summary(restart_state)
    policy_state = _get_restart_policy_state(restart_state)
    in_cooldown = bool(policy_state.get("cooldown"))
    remaining = int(policy_state.get("cooldown_remaining_seconds") or 0)
    # First matching condition wins; ordered roughly by severity so the
    # most actionable cause is reported when several overlap.
    degraded_reason = None
    if db_status == "error":
        degraded_reason = "database_unreachable"
    elif circuit_breaker.get("state") == "OPEN":
        degraded_reason = "circuit_breaker_open"
    elif (pool_state or {}).get("saturation", 0.0) >= 1.0:
        degraded_reason = "db_pool_saturated"
    elif redis_status == "error":
        degraded_reason = "redis_unavailable"
    elif route_cache.get("degraded"):
        degraded_reason = "route_cache_degraded"
    recommendation = build_recovery_recommendation(
        degraded_reason=degraded_reason,
        pool_saturation=(pool_state or {}).get("saturation"),
        circuit_state=circuit_breaker.get("state"),
        restart_churn_exceeded=bool(restart_churn.get("exceeded")),
        cooldown_active=in_cooldown,
    )
    alerts = _build_restart_alerts(
        pool_saturation=(pool_state or {}).get("saturation"),
        circuit_state=circuit_breaker.get("state"),
        route_cache_degraded=bool(route_cache.get("degraded")),
        policy_state=policy_state,
        thresholds=thresholds,
    )
    runtime_contract = build_runtime_contract_diagnostics(strict=False)
    # Cache status
    from mes_dashboard.routes.health_routes import (
        get_cache_status,
        get_resource_cache_status,
        get_equipment_status_cache_status
    )
    return jsonify({
        "success": True,
        "data": {
            "database": {
                "status": db_status,
                "error": db_error
            },
            "redis": {
                "status": redis_status,
                "enabled": REDIS_ENABLED
            },
            "circuit_breaker": circuit_breaker,
            "cache": {
                "wip": get_cache_status(),
                "resource": get_resource_cache_status(),
                "equipment": get_equipment_status_cache_status()
            },
            "runtime_resilience": {
                "degraded_reason": degraded_reason,
                "pool_runtime": pool_runtime,
                "pool_state": pool_state,
                "route_cache": route_cache,
                "thresholds": thresholds,
                "alerts": alerts,
                "restart_churn": restart_churn,
                # Flattened subset of policy_state the dashboard consumes.
                "policy_state": {
                    "state": policy_state.get("state"),
                    "allowed": policy_state.get("allowed"),
                    "cooldown": policy_state.get("cooldown"),
                    "blocked": policy_state.get("blocked"),
                    "cooldown_remaining_seconds": remaining,
                },
                "recovery_recommendation": recommendation,
                "restart_cooldown": {
                    "active": in_cooldown,
                    "remaining_seconds": remaining if in_cooldown else 0,
                },
            },
            "runtime_contract": runtime_contract,
            "single_port_bind": GUNICORN_BIND,
            "worker_pid": os.getpid()
        }
    })
@admin_bp.route("/api/metrics", methods=["GET"])
@admin_required
def api_metrics():
    """API: Get performance metrics for dashboard."""
    from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics
    summary = get_metrics_summary()
    metrics = get_query_metrics()
    # Copy the summary fields the dashboard consumes, in display order.
    summary_fields = (
        "p50_ms", "p95_ms", "p99_ms", "count",
        "slow_count", "slow_rate", "worker_pid", "collected_at",
    )
    data = {field: summary.get(field) for field in summary_fields}
    # Latency distribution for charts: only the last 100 samples.
    data["latencies"] = metrics.get_latencies()[-100:]
    return jsonify({"success": True, "data": data})
@admin_bp.route("/api/logs", methods=["GET"])
@admin_required
def api_logs():
    """API: Get recent logs from SQLite log store."""
    from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
    if not LOG_STORE_ENABLED:
        # Store disabled: return an empty but well-formed payload.
        return jsonify({
            "success": True,
            "data": {
                "logs": [],
                "enabled": False,
                "total": 0
            }
        })
    # Filtering / pagination parameters supplied by the dashboard UI.
    level = request.args.get("level")
    q = request.args.get("q")
    limit = request.args.get("limit", 50, type=int)
    offset = request.args.get("offset", 0, type=int)
    since = request.args.get("since")
    store = get_log_store()
    # Total matching rows, needed by the client for pagination controls.
    total = store.count_logs(level=level, q=q, since=since)
    page_size = min(limit, 100)  # hard cap of 100 rows per page
    entries = store.query_logs(
        level=level,
        q=q,
        limit=page_size,
        offset=offset,
        since=since
    )
    return jsonify({
        "success": True,
        "data": {
            "logs": entries,
            "count": len(entries),
            "total": total,
            "enabled": True,
            "stats": store.get_stats()
        }
    })
@admin_bp.route("/api/logs/cleanup", methods=["POST"])
@admin_required
def api_logs_cleanup():
    """API: Manually trigger log cleanup.

    Runs the log store's configured retention cleanup and reports the
    entry count and size before and after.

    NOTE(review): an earlier docstring advertised `older_than_days` and
    `keep_count` request parameters, but this handler never reads the
    request body — cleanup_old_logs() is called with no arguments.
    Confirm whether those parameters should be wired through or the
    API docs updated.
    """
    from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
    if not LOG_STORE_ENABLED:
        return jsonify({
            "success": False,
            "error": "Log store is disabled"
        }), 400
    log_store = get_log_store()
    # Get current stats before cleanup
    stats_before = log_store.get_stats()
    # Perform cleanup
    deleted = log_store.cleanup_old_logs()
    # Get stats after cleanup
    stats_after = log_store.get_stats()
    # Audit trail: record which admin triggered the cleanup.
    user = getattr(g, "username", "unknown")
    logger.info(f"Log cleanup triggered by {user}: deleted {deleted} entries")
    return jsonify({
        "success": True,
        "data": {
            "deleted": deleted,
            "before": {
                "count": stats_before.get("count", 0),
                "size_bytes": stats_before.get("size_bytes", 0)
            },
            "after": {
                "count": stats_after.get("count", 0),
                "size_bytes": stats_after.get("size_bytes", 0)
            }
        }
    })
# ============================================================
# Worker Restart Control Routes
# ============================================================
def _get_restart_state() -> dict:
    """Load the watchdog's persisted restart state from disk."""
    state_path = RESTART_STATE_PATH
    return load_restart_state(state_path)
def _iso_from_epoch(ts: float) -> str | None:
if ts <= 0:
return None
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
def _check_restart_cooldown() -> tuple[bool, float]:
    """Check whether restart requests are currently throttled.

    Returns:
        Tuple of (is_in_cooldown, remaining_seconds).
    """
    policy = _get_restart_policy_state()
    if not policy.get("cooldown"):
        return False, 0.0
    remaining = policy.get("cooldown_remaining_seconds") or 0.0
    return True, float(remaining)
def _get_restart_history(state: dict | None = None) -> list[dict]:
    """Return the most recent restart entries (at most 20) for telemetry."""
    if state is None:
        state = _get_restart_state()
    history = extract_restart_history(state)
    return history[-20:]
def _get_restart_churn_summary(state: dict | None = None) -> dict:
    """Summarize restart churn within the active resilience window."""
    return summarize_restart_history(_get_restart_history(state))
def _get_restart_policy_state(state: dict | None = None) -> dict[str, Any]:
    """Return effective worker restart policy state.

    Merges the persisted restart state with this worker's in-memory
    last-request timestamp (whichever is more recent wins) and delegates
    to the shared recovery-policy evaluator.
    """
    payload = state if state is not None else _get_restart_state()
    history = _get_restart_history(payload)
    last_requested = extract_last_requested_at(payload)
    in_memory_requested = _iso_from_epoch(_last_restart_request)
    if in_memory_requested:
        try:
            in_memory_dt = datetime.fromisoformat(in_memory_requested)
            persisted_dt = datetime.fromisoformat(last_requested) if last_requested else None
            # BUGFIX: the in-memory timestamp is always UTC-aware, so comparing
            # it against a naive persisted timestamp raised TypeError (naive and
            # aware datetimes are not comparable) outside any handler.
            # NOTE(review): assumes naive persisted timestamps are UTC —
            # confirm with the watchdog writer.
            if persisted_dt is not None and persisted_dt.tzinfo is None:
                persisted_dt = persisted_dt.replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # Unparseable timestamps: fall back to the persisted value as-is.
            in_memory_dt = None
            persisted_dt = None
        if in_memory_dt and (persisted_dt is None or in_memory_dt > persisted_dt):
            last_requested = in_memory_requested
    return evaluate_worker_recovery_state(
        history,
        last_requested_at=last_requested,
    )
def _build_restart_alerts(
*,
pool_saturation: float | None,
circuit_state: str | None,
route_cache_degraded: bool,
policy_state: dict[str, Any],
thresholds: dict[str, Any],
) -> dict[str, Any]:
saturation = float(pool_saturation or 0.0)
warning = float(thresholds.get("pool_saturation_warning", 0.9))
critical = float(thresholds.get("pool_saturation_critical", 1.0))
return {
"pool_warning": saturation >= warning,
"pool_critical": saturation >= critical,
"circuit_open": circuit_state == "OPEN",
"route_cache_degraded": bool(route_cache_degraded),
"restart_churn_exceeded": bool(policy_state.get("churn_exceeded")),
"restart_blocked": bool(policy_state.get("blocked")),
}
def _log_restart_audit(event: str, payload: dict[str, Any]) -> None:
    """Emit a structured JSON audit line for restart-related events."""
    base = {
        "event": event,
        "timestamp": datetime.now(tz=timezone.utc).isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
    }
    # Payload keys intentionally override the base fields on collision,
    # matching the original {**base, **payload} merge precedence.
    merged = {**base, **payload}
    logger.info("worker_restart_audit %s", json.dumps(merged, ensure_ascii=False))
@admin_bp.route("/api/worker/restart", methods=["POST"])
@admin_required
def api_worker_restart():
    """API: Request worker restart.

    Writes a restart flag file that the watchdog process monitors.
    Enforces a 60-second cooldown between restart requests.

    Body (JSON, optional):
        manual_override: request bypass of a guarded/blocked policy state
        override_acknowledged: must accompany manual_override
        override_reason: required free-text audit reason when overriding
    """
    global _last_restart_request
    payload = request.get_json(silent=True) or {}
    manual_override = bool(payload.get("manual_override"))
    override_acknowledged = bool(payload.get("override_acknowledged"))
    override_reason = str(payload.get("override_reason") or "").strip()
    # Get request metadata
    user = getattr(g, "username", "unknown")
    ip = request.remote_addr or "unknown"
    timestamp = datetime.now(tz=timezone.utc).isoformat()
    state = _get_restart_state()
    policy_state = _get_restart_policy_state(state)
    # Shared policy decides whether this request may proceed.
    decide_restart_request
    decision = decide_restart_request(
        policy_state,
        source="manual",
        manual_override=manual_override,
        override_acknowledged=override_acknowledged,
    )
    # Overrides must carry an audit reason regardless of the decision outcome.
    if manual_override and not override_reason:
        return error_response(
            "RESTART_OVERRIDE_REASON_REQUIRED",
            "Manual override requires non-empty override_reason for audit traceability.",
            status_code=400,
        )
    if not decision["allowed"]:
        # 429 for cooldown throttling, 409 for guarded-mode blocks.
        status_code = 429 if policy_state.get("cooldown") else 409
        if status_code == 429:
            message = (
                f"Restart in cooldown. Please wait "
                f"{int(policy_state.get('cooldown_remaining_seconds') or 0)} seconds."
            )
            code = TOO_MANY_REQUESTS
        else:
            message = (
                "Restart blocked by guarded mode. "
                "Set manual_override=true and override_acknowledged=true to proceed."
            )
            code = "RESTART_POLICY_BLOCKED"
        # Blocked attempts are audited too, with full decision context.
        _log_restart_audit(
            "restart_request_blocked",
            {
                "actor": user,
                "ip": ip,
                "decision": decision,
                "policy_state": policy_state,
            },
        )
        return error_response(
            code,
            message,
            status_code=status_code,
        )
    # Write restart flag file
    flag_path = Path(RESTART_FLAG_PATH)
    flag_data = {
        "user": user,
        "ip": ip,
        "timestamp": timestamp,
        "worker_pid": os.getpid(),
        "source": "manual",
        # Override only counts when both flags were supplied together.
        "manual_override": bool(manual_override and override_acknowledged),
        "override_acknowledged": override_acknowledged,
        "override_reason": override_reason or None,
        "policy_state": policy_state,
        "policy_decision": decision["decision"],
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
    }
    try:
        flag_path.parent.mkdir(parents=True, exist_ok=True)
        # Write-then-rename so the watchdog never observes a partial file.
        tmp_path = flag_path.with_suffix(flag_path.suffix + ".tmp")
        tmp_path.write_text(json.dumps(flag_data, ensure_ascii=False))
        tmp_path.replace(flag_path)
    except IOError as e:
        logger.error(f"Failed to write restart flag: {e}")
        return error_response(
            "RESTART_FAILED",
            f"Failed to request restart: {e}",
            status_code=500
        )
    # Update in-memory cooldown
    _last_restart_request = time.time()
    _log_restart_audit(
        "restart_request_accepted",
        {
            "actor": user,
            "ip": ip,
            "decision": decision,
            "policy_state": policy_state,
            "override_reason": override_reason or None,
        },
    )
    return jsonify({
        "success": True,
        "data": {
            "message": "Restart requested. Workers will reload shortly.",
            "requested_by": user,
            "requested_at": timestamp,
            "policy_state": {
                "state": policy_state.get("state"),
                "allowed": policy_state.get("allowed"),
                "cooldown": policy_state.get("cooldown"),
                "blocked": policy_state.get("blocked"),
                "cooldown_remaining_seconds": policy_state.get("cooldown_remaining_seconds"),
            },
            "decision": decision,
            "single_port_bind": GUNICORN_BIND,
            # Watchdog file paths, echoed for operator diagnostics.
            "watchdog": {
                "runtime_dir": WATCHDOG_RUNTIME_DIR,
                "flag_path": RESTART_FLAG_PATH,
                "pid_path": WATCHDOG_PID_PATH,
                "state_path": RESTART_STATE_PATH,
            },
        }
    })
@admin_bp.route("/api/worker/status", methods=["GET"])
@admin_required
def api_worker_status():
    """API: Get worker status and restart information.

    Reports worker PID and start time, runtime-contract validation,
    cooldown/policy state, restart churn telemetry, and the last
    restart record persisted by the watchdog.
    """
    # Get last restart info
    state = _get_restart_state()
    last_restart = state.get("last_restart", {})
    history = _get_restart_history(state)
    churn = _get_restart_churn_summary(state)
    policy_state = _get_restart_policy_state(state)
    thresholds = get_resilience_thresholds()
    recommendation = build_recovery_recommendation(
        degraded_reason="db_pool_saturated" if policy_state.get("blocked") else None,
        pool_saturation=None,
        circuit_state=None,
        restart_churn_exceeded=bool(churn.get("exceeded")),
        cooldown_active=bool(policy_state.get("cooldown")),
    )
    runtime_contract = build_runtime_contract_diagnostics(strict=False)
    # Get worker start time (psutil is optional)
    worker_start_time = None
    try:
        import psutil
        process = psutil.Process(os.getpid())
        # NOTE(review): fromtimestamp without tz yields a naive local-time
        # ISO string, unlike the UTC-aware timestamps used elsewhere in
        # this module — confirm whether the dashboard expects local time.
        worker_start_time = datetime.fromtimestamp(
            process.create_time()
        ).isoformat()
    except ImportError:
        # psutil not installed, try /proc on Linux
        try:
            stat_path = f"/proc/{os.getpid()}/stat"
            with open(stat_path) as f:
                stat = f.read().split()
            # Field 22 is starttime in clock ticks since boot
            # This is a simplified fallback
            # NOTE(review): this fallback reads /proc but never converts the
            # value, so worker_start_time stays None on the psutil-less path.
            pass
        except Exception:
            pass
    except Exception:
        # Any other psutil failure: leave worker_start_time as None.
        pass
    return jsonify({
        "success": True,
        "data": {
            "worker_pid": os.getpid(),
            "worker_start_time": worker_start_time,
            "runtime_contract": {
                "version": runtime_contract["contract"]["version"],
                "validation": {
                    "valid": runtime_contract["valid"],
                    "errors": runtime_contract["errors"],
                },
                "single_port_bind": GUNICORN_BIND,
                # Watchdog file paths plus on-disk existence checks.
                "watchdog": {
                    "runtime_dir": WATCHDOG_RUNTIME_DIR,
                    "flag_path": RESTART_FLAG_PATH,
                    "flag_exists": Path(RESTART_FLAG_PATH).exists(),
                    "pid_path": WATCHDOG_PID_PATH,
                    "pid_exists": Path(WATCHDOG_PID_PATH).exists(),
                    "state_path": RESTART_STATE_PATH,
                    "state_exists": Path(RESTART_STATE_PATH).exists(),
                },
            },
            "cooldown": {
                "active": bool(policy_state.get("cooldown")),
                "remaining_seconds": int(policy_state.get("cooldown_remaining_seconds") or 0)
            },
            "resilience": {
                "thresholds": thresholds,
                "alerts": {
                    "restart_churn_exceeded": bool(churn.get("exceeded")),
                    "restart_blocked": bool(policy_state.get("blocked")),
                },
                "restart_churn": churn,
                "policy_state": {
                    "state": policy_state.get("state"),
                    "allowed": policy_state.get("allowed"),
                    "cooldown": policy_state.get("cooldown"),
                    "blocked": policy_state.get("blocked"),
                    "cooldown_remaining_seconds": policy_state.get("cooldown_remaining_seconds"),
                    "attempts_in_window": policy_state.get("attempts_in_window"),
                    "retry_budget": policy_state.get("retry_budget"),
                    "churn_threshold": policy_state.get("churn_threshold"),
                    "window_seconds": policy_state.get("window_seconds"),
                },
                "recovery_recommendation": recommendation,
            },
            "restart_history": history,
            "last_restart": {
                "requested_by": last_restart.get("requested_by"),
                "requested_at": last_restart.get("requested_at"),
                "requested_ip": last_restart.get("requested_ip"),
                "completed_at": last_restart.get("completed_at"),
                "success": last_restart.get("success")
            }
        }
    })
# ============================================================
# Page Management Routes
# ============================================================
@admin_bp.route("/pages")
@admin_required
def pages():
    """Render the page-management admin interface."""
    template_name = "admin/pages.html"
    return render_template(template_name)
@admin_bp.route("/api/pages", methods=["GET"])
@admin_required
def api_get_pages():
    """API: Return every registered page configuration."""
    page_list = get_all_pages()
    return jsonify({"success": True, "pages": page_list})
@admin_bp.route("/api/pages/<path:route>", methods=["PUT"])
@admin_required
def api_update_page(route: str):
    """API: Update a page's release status (and optionally its display name).

    Body (JSON):
        status: "released" or "dev" (required)
        name: optional display name passed through to the registry

    Returns 400 for a missing/invalid body or status, 500 if the
    registry update fails.
    """
    # BUGFIX: get_json() returns None for a missing or non-JSON body, which
    # previously crashed on data.get(...) with an unhandled AttributeError
    # (HTTP 500). silent=True plus the `or {}` guard turns it into a 400.
    data = request.get_json(silent=True) or {}
    status = data.get("status")
    name = data.get("name")
    if status not in ("released", "dev"):
        return jsonify({"success": False, "error": "Invalid status"}), 400
    # Registry keys are absolute routes; normalize the URL fragment.
    if not route.startswith("/"):
        route = "/" + route
    try:
        set_page_status(route, status, name)
        return jsonify({"success": True})
    except Exception as e:
        # Surface the failure reason to the admin UI rather than a bare 500.
        return jsonify({"success": False, "error": str(e)}), 500