Files
DashBoard/src/mes_dashboard/routes/admin_routes.py

667 lines
23 KiB
Python

# -*- coding: utf-8 -*-
"""Admin routes for page management and performance monitoring."""
from __future__ import annotations
import json
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from flask import Blueprint, g, jsonify, render_template, request
from mes_dashboard.core.permissions import admin_required
from mes_dashboard.core.response import error_response, TOO_MANY_REQUESTS
from mes_dashboard.core.resilience import (
build_recovery_recommendation,
get_resilience_thresholds,
summarize_restart_history,
)
from mes_dashboard.core.runtime_contract import (
build_runtime_contract_diagnostics,
load_runtime_contract,
)
from mes_dashboard.core.worker_recovery_policy import (
decide_restart_request,
evaluate_worker_recovery_state,
extract_last_requested_at,
extract_restart_history,
load_restart_state,
)
from mes_dashboard.services.page_registry import get_all_pages, set_page_status
admin_bp = Blueprint("admin", __name__, url_prefix="/admin")
logger = logging.getLogger("mes_dashboard.admin")
# ============================================================
# Worker Restart Configuration
# ============================================================
_RUNTIME_CONTRACT = load_runtime_contract()
WATCHDOG_RUNTIME_DIR = _RUNTIME_CONTRACT["watchdog_runtime_dir"]
RESTART_FLAG_PATH = _RUNTIME_CONTRACT["watchdog_restart_flag"]
RESTART_STATE_PATH = _RUNTIME_CONTRACT["watchdog_state_file"]
WATCHDOG_PID_PATH = _RUNTIME_CONTRACT["watchdog_pid_file"]
GUNICORN_BIND = _RUNTIME_CONTRACT["gunicorn_bind"]
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT["version"]
# Track last restart request time (in-memory for this worker)
_last_restart_request: float = 0.0
# ============================================================
# Performance Monitoring Routes
# ============================================================
@admin_bp.route("/performance")
@admin_required
def performance():
    """Render the admin performance-monitoring dashboard page.

    All live data is fetched client-side from the /admin/api endpoints.
    """
    template_name = "admin/performance.html"
    return render_template(template_name)
@admin_bp.route("/api/system-status", methods=["GET"])
@admin_required
def api_system_status():
    """API: Get system status for performance dashboard.

    Aggregates database/Redis/circuit-breaker health, cache state,
    resilience thresholds, restart policy state, and runtime-contract
    diagnostics into a single JSON payload for the admin UI.
    """
    # Lazy imports keep DB/Redis machinery out of module import time.
    from mes_dashboard.core.database import get_pool_runtime_config, get_pool_status
    from mes_dashboard.core.redis_client import REDIS_ENABLED
    from mes_dashboard.core.circuit_breaker import get_circuit_breaker_status
    from mes_dashboard.routes.health_routes import (
        check_database,
        check_redis,
        get_route_cache_status,
    )
    # Database status
    db_status, db_error = check_database()
    # Redis status ('disabled' means the Redis integration is turned off,
    # distinct from an enabled-but-failing Redis).
    redis_status = 'disabled'
    if REDIS_ENABLED:
        redis_status, _ = check_redis()
    # Circuit breaker status
    circuit_breaker = get_circuit_breaker_status()
    route_cache = get_route_cache_status()
    pool_runtime = get_pool_runtime_config()
    try:
        pool_state = get_pool_status()
    except Exception:
        # Pool introspection is best-effort; None means "unknown" downstream.
        pool_state = None
    thresholds = get_resilience_thresholds()
    restart_state = _get_restart_state()
    restart_churn = _get_restart_churn_summary(restart_state)
    policy_state = _get_restart_policy_state(restart_state)
    in_cooldown = bool(policy_state.get("cooldown"))
    remaining = int(policy_state.get("cooldown_remaining_seconds") or 0)
    # First matching condition wins; ordered roughly by severity so the
    # most actionable cause is reported when several overlap.
    degraded_reason = None
    if db_status == "error":
        degraded_reason = "database_unreachable"
    elif circuit_breaker.get("state") == "OPEN":
        degraded_reason = "circuit_breaker_open"
    elif (pool_state or {}).get("saturation", 0.0) >= 1.0:
        degraded_reason = "db_pool_saturated"
    elif redis_status == "error":
        degraded_reason = "redis_unavailable"
    elif route_cache.get("degraded"):
        degraded_reason = "route_cache_degraded"
    recommendation = build_recovery_recommendation(
        degraded_reason=degraded_reason,
        pool_saturation=(pool_state or {}).get("saturation"),
        circuit_state=circuit_breaker.get("state"),
        restart_churn_exceeded=bool(restart_churn.get("exceeded")),
        cooldown_active=in_cooldown,
    )
    alerts = _build_restart_alerts(
        pool_saturation=(pool_state or {}).get("saturation"),
        circuit_state=circuit_breaker.get("state"),
        route_cache_degraded=bool(route_cache.get("degraded")),
        policy_state=policy_state,
        thresholds=thresholds,
    )
    runtime_contract = build_runtime_contract_diagnostics(strict=False)
    # Cache status
    from mes_dashboard.routes.health_routes import (
        get_cache_status,
        get_resource_cache_status,
        get_equipment_status_cache_status
    )
    return jsonify({
        "success": True,
        "data": {
            "database": {
                "status": db_status,
                "error": db_error
            },
            "redis": {
                "status": redis_status,
                "enabled": REDIS_ENABLED
            },
            "circuit_breaker": circuit_breaker,
            "cache": {
                "wip": get_cache_status(),
                "resource": get_resource_cache_status(),
                "equipment": get_equipment_status_cache_status()
            },
            "runtime_resilience": {
                "degraded_reason": degraded_reason,
                "pool_runtime": pool_runtime,
                "pool_state": pool_state,
                "route_cache": route_cache,
                "thresholds": thresholds,
                "alerts": alerts,
                "restart_churn": restart_churn,
                # Flattened subset of policy_state the dashboard consumes.
                "policy_state": {
                    "state": policy_state.get("state"),
                    "allowed": policy_state.get("allowed"),
                    "cooldown": policy_state.get("cooldown"),
                    "blocked": policy_state.get("blocked"),
                    "cooldown_remaining_seconds": remaining,
                },
                "recovery_recommendation": recommendation,
                "restart_cooldown": {
                    "active": in_cooldown,
                    "remaining_seconds": remaining if in_cooldown else 0,
                },
            },
            "runtime_contract": runtime_contract,
            "single_port_bind": GUNICORN_BIND,
            "worker_pid": os.getpid()
        }
    })
@admin_bp.route("/api/metrics", methods=["GET"])
@admin_required
def api_metrics():
    """API: Get performance metrics for dashboard."""
    from mes_dashboard.core.metrics import get_metrics_summary, get_query_metrics
    summary = get_metrics_summary()
    metrics = get_query_metrics()
    # Copy the summary fields the dashboard consumes, in display order.
    summary_fields = (
        "p50_ms", "p95_ms", "p99_ms", "count",
        "slow_count", "slow_rate", "worker_pid", "collected_at",
    )
    data = {field: summary.get(field) for field in summary_fields}
    # Latency distribution for charts: only the last 100 samples.
    data["latencies"] = metrics.get_latencies()[-100:]
    return jsonify({"success": True, "data": data})
@admin_bp.route("/api/logs", methods=["GET"])
@admin_required
def api_logs():
    """API: Get recent logs from SQLite log store."""
    from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
    if not LOG_STORE_ENABLED:
        # Store disabled: return an empty but well-formed payload.
        return jsonify({
            "success": True,
            "data": {
                "logs": [],
                "enabled": False,
                "total": 0
            }
        })
    # Filtering / pagination parameters supplied by the dashboard UI.
    level = request.args.get("level")
    q = request.args.get("q")
    limit = request.args.get("limit", 50, type=int)
    offset = request.args.get("offset", 0, type=int)
    since = request.args.get("since")
    store = get_log_store()
    # Total matching rows, needed by the client for pagination controls.
    total = store.count_logs(level=level, q=q, since=since)
    page_size = min(limit, 100)  # hard cap of 100 rows per page
    entries = store.query_logs(
        level=level,
        q=q,
        limit=page_size,
        offset=offset,
        since=since
    )
    return jsonify({
        "success": True,
        "data": {
            "logs": entries,
            "count": len(entries),
            "total": total,
            "enabled": True,
            "stats": store.get_stats()
        }
    })
@admin_bp.route("/api/logs/cleanup", methods=["POST"])
@admin_required
def api_logs_cleanup():
    """API: Manually trigger log cleanup.

    Runs the log store's configured retention cleanup and reports the
    entry count and size before and after.

    NOTE(review): an earlier docstring advertised `older_than_days` and
    `keep_count` request parameters, but this handler never reads the
    request body — cleanup_old_logs() is called with no arguments.
    Confirm whether those parameters should be wired through or the
    API docs updated.
    """
    from mes_dashboard.core.log_store import get_log_store, LOG_STORE_ENABLED
    if not LOG_STORE_ENABLED:
        return jsonify({
            "success": False,
            "error": "Log store is disabled"
        }), 400
    log_store = get_log_store()
    # Get current stats before cleanup
    stats_before = log_store.get_stats()
    # Perform cleanup
    deleted = log_store.cleanup_old_logs()
    # Get stats after cleanup
    stats_after = log_store.get_stats()
    # Audit trail: record which admin triggered the cleanup.
    user = getattr(g, "username", "unknown")
    logger.info(f"Log cleanup triggered by {user}: deleted {deleted} entries")
    return jsonify({
        "success": True,
        "data": {
            "deleted": deleted,
            "before": {
                "count": stats_before.get("count", 0),
                "size_bytes": stats_before.get("size_bytes", 0)
            },
            "after": {
                "count": stats_after.get("count", 0),
                "size_bytes": stats_after.get("size_bytes", 0)
            }
        }
    })
# ============================================================
# Worker Restart Control Routes
# ============================================================
def _get_restart_state() -> dict:
    """Load the watchdog's persisted restart state from disk."""
    state_path = RESTART_STATE_PATH
    return load_restart_state(state_path)
def _iso_from_epoch(ts: float) -> str | None:
if ts <= 0:
return None
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
def _check_restart_cooldown() -> tuple[bool, float]:
    """Check whether restart requests are currently throttled.

    Returns:
        Tuple of (is_in_cooldown, remaining_seconds).
    """
    policy = _get_restart_policy_state()
    if not policy.get("cooldown"):
        return False, 0.0
    remaining = policy.get("cooldown_remaining_seconds") or 0.0
    return True, float(remaining)
def _get_restart_history(state: dict | None = None) -> list[dict]:
    """Return the most recent restart entries (at most 20) for telemetry."""
    if state is None:
        state = _get_restart_state()
    history = extract_restart_history(state)
    return history[-20:]
def _get_restart_churn_summary(state: dict | None = None) -> dict:
    """Summarize restart churn within the active resilience window."""
    return summarize_restart_history(_get_restart_history(state))
def _get_restart_policy_state(state: dict | None = None) -> dict[str, Any]:
    """Return effective worker restart policy state.

    Merges the persisted restart state with this worker's in-memory
    last-request timestamp (whichever is more recent wins) and delegates
    to the shared recovery-policy evaluator.
    """
    payload = state if state is not None else _get_restart_state()
    history = _get_restart_history(payload)
    last_requested = extract_last_requested_at(payload)
    in_memory_requested = _iso_from_epoch(_last_restart_request)
    if in_memory_requested:
        try:
            in_memory_dt = datetime.fromisoformat(in_memory_requested)
            persisted_dt = datetime.fromisoformat(last_requested) if last_requested else None
            # BUGFIX: the in-memory timestamp is always UTC-aware, so comparing
            # it against a naive persisted timestamp raised TypeError (naive and
            # aware datetimes are not comparable) outside any handler.
            # NOTE(review): assumes naive persisted timestamps are UTC —
            # confirm with the watchdog writer.
            if persisted_dt is not None and persisted_dt.tzinfo is None:
                persisted_dt = persisted_dt.replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # Unparseable timestamps: fall back to the persisted value as-is.
            in_memory_dt = None
            persisted_dt = None
        if in_memory_dt and (persisted_dt is None or in_memory_dt > persisted_dt):
            last_requested = in_memory_requested
    return evaluate_worker_recovery_state(
        history,
        last_requested_at=last_requested,
    )
def _build_restart_alerts(
*,
pool_saturation: float | None,
circuit_state: str | None,
route_cache_degraded: bool,
policy_state: dict[str, Any],
thresholds: dict[str, Any],
) -> dict[str, Any]:
saturation = float(pool_saturation or 0.0)
warning = float(thresholds.get("pool_saturation_warning", 0.9))
critical = float(thresholds.get("pool_saturation_critical", 1.0))
return {
"pool_warning": saturation >= warning,
"pool_critical": saturation >= critical,
"circuit_open": circuit_state == "OPEN",
"route_cache_degraded": bool(route_cache_degraded),
"restart_churn_exceeded": bool(policy_state.get("churn_exceeded")),
"restart_blocked": bool(policy_state.get("blocked")),
}
def _log_restart_audit(event: str, payload: dict[str, Any]) -> None:
    """Emit a structured JSON audit line for restart-related events."""
    base = {
        "event": event,
        "timestamp": datetime.now(tz=timezone.utc).isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
    }
    # Payload keys intentionally override the base fields on collision,
    # matching the original {**base, **payload} merge precedence.
    merged = {**base, **payload}
    logger.info("worker_restart_audit %s", json.dumps(merged, ensure_ascii=False))
@admin_bp.route("/api/worker/restart", methods=["POST"])
@admin_required
def api_worker_restart():
    """API: Request worker restart.

    Writes a restart flag file that the watchdog process monitors.
    Enforces a 60-second cooldown between restart requests.

    Body (JSON, optional):
        manual_override: request bypass of a guarded/blocked policy state
        override_acknowledged: must accompany manual_override
        override_reason: required free-text audit reason when overriding
    """
    global _last_restart_request
    payload = request.get_json(silent=True) or {}
    manual_override = bool(payload.get("manual_override"))
    override_acknowledged = bool(payload.get("override_acknowledged"))
    override_reason = str(payload.get("override_reason") or "").strip()
    # Get request metadata
    user = getattr(g, "username", "unknown")
    ip = request.remote_addr or "unknown"
    timestamp = datetime.now(tz=timezone.utc).isoformat()
    state = _get_restart_state()
    policy_state = _get_restart_policy_state(state)
    # Shared policy decides whether this request may proceed.
    decide_restart_request
    decision = decide_restart_request(
        policy_state,
        source="manual",
        manual_override=manual_override,
        override_acknowledged=override_acknowledged,
    )
    # Overrides must carry an audit reason regardless of the decision outcome.
    if manual_override and not override_reason:
        return error_response(
            "RESTART_OVERRIDE_REASON_REQUIRED",
            "Manual override requires non-empty override_reason for audit traceability.",
            status_code=400,
        )
    if not decision["allowed"]:
        # 429 for cooldown throttling, 409 for guarded-mode blocks.
        status_code = 429 if policy_state.get("cooldown") else 409
        if status_code == 429:
            message = (
                f"Restart in cooldown. Please wait "
                f"{int(policy_state.get('cooldown_remaining_seconds') or 0)} seconds."
            )
            code = TOO_MANY_REQUESTS
        else:
            message = (
                "Restart blocked by guarded mode. "
                "Set manual_override=true and override_acknowledged=true to proceed."
            )
            code = "RESTART_POLICY_BLOCKED"
        # Blocked attempts are audited too, with full decision context.
        _log_restart_audit(
            "restart_request_blocked",
            {
                "actor": user,
                "ip": ip,
                "decision": decision,
                "policy_state": policy_state,
            },
        )
        return error_response(
            code,
            message,
            status_code=status_code,
        )
    # Write restart flag file
    flag_path = Path(RESTART_FLAG_PATH)
    flag_data = {
        "user": user,
        "ip": ip,
        "timestamp": timestamp,
        "worker_pid": os.getpid(),
        "source": "manual",
        # Override only counts when both flags were supplied together.
        "manual_override": bool(manual_override and override_acknowledged),
        "override_acknowledged": override_acknowledged,
        "override_reason": override_reason or None,
        "policy_state": policy_state,
        "policy_decision": decision["decision"],
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
    }
    try:
        flag_path.parent.mkdir(parents=True, exist_ok=True)
        # Write-then-rename so the watchdog never observes a partial file.
        tmp_path = flag_path.with_suffix(flag_path.suffix + ".tmp")
        tmp_path.write_text(json.dumps(flag_data, ensure_ascii=False))
        tmp_path.replace(flag_path)
    except IOError as e:
        logger.error(f"Failed to write restart flag: {e}")
        return error_response(
            "RESTART_FAILED",
            f"Failed to request restart: {e}",
            status_code=500
        )
    # Update in-memory cooldown
    _last_restart_request = time.time()
    _log_restart_audit(
        "restart_request_accepted",
        {
            "actor": user,
            "ip": ip,
            "decision": decision,
            "policy_state": policy_state,
            "override_reason": override_reason or None,
        },
    )
    return jsonify({
        "success": True,
        "data": {
            "message": "Restart requested. Workers will reload shortly.",
            "requested_by": user,
            "requested_at": timestamp,
            "policy_state": {
                "state": policy_state.get("state"),
                "allowed": policy_state.get("allowed"),
                "cooldown": policy_state.get("cooldown"),
                "blocked": policy_state.get("blocked"),
                "cooldown_remaining_seconds": policy_state.get("cooldown_remaining_seconds"),
            },
            "decision": decision,
            "single_port_bind": GUNICORN_BIND,
            # Watchdog file paths, echoed for operator diagnostics.
            "watchdog": {
                "runtime_dir": WATCHDOG_RUNTIME_DIR,
                "flag_path": RESTART_FLAG_PATH,
                "pid_path": WATCHDOG_PID_PATH,
                "state_path": RESTART_STATE_PATH,
            },
        }
    })
@admin_bp.route("/api/worker/status", methods=["GET"])
@admin_required
def api_worker_status():
    """API: Get worker status and restart information.

    Reports worker PID and start time, runtime-contract validation,
    cooldown/policy state, restart churn telemetry, and the last
    restart record persisted by the watchdog.
    """
    # Get last restart info
    state = _get_restart_state()
    last_restart = state.get("last_restart", {})
    history = _get_restart_history(state)
    churn = _get_restart_churn_summary(state)
    policy_state = _get_restart_policy_state(state)
    thresholds = get_resilience_thresholds()
    recommendation = build_recovery_recommendation(
        degraded_reason="db_pool_saturated" if policy_state.get("blocked") else None,
        pool_saturation=None,
        circuit_state=None,
        restart_churn_exceeded=bool(churn.get("exceeded")),
        cooldown_active=bool(policy_state.get("cooldown")),
    )
    runtime_contract = build_runtime_contract_diagnostics(strict=False)
    # Get worker start time (psutil is optional)
    worker_start_time = None
    try:
        import psutil
        process = psutil.Process(os.getpid())
        # NOTE(review): fromtimestamp without tz yields a naive local-time
        # ISO string, unlike the UTC-aware timestamps used elsewhere in
        # this module — confirm whether the dashboard expects local time.
        worker_start_time = datetime.fromtimestamp(
            process.create_time()
        ).isoformat()
    except ImportError:
        # psutil not installed, try /proc on Linux
        try:
            stat_path = f"/proc/{os.getpid()}/stat"
            with open(stat_path) as f:
                stat = f.read().split()
            # Field 22 is starttime in clock ticks since boot
            # This is a simplified fallback
            # NOTE(review): this fallback reads /proc but never converts the
            # value, so worker_start_time stays None on the psutil-less path.
            pass
        except Exception:
            pass
    except Exception:
        # Any other psutil failure: leave worker_start_time as None.
        pass
    return jsonify({
        "success": True,
        "data": {
            "worker_pid": os.getpid(),
            "worker_start_time": worker_start_time,
            "runtime_contract": {
                "version": runtime_contract["contract"]["version"],
                "validation": {
                    "valid": runtime_contract["valid"],
                    "errors": runtime_contract["errors"],
                },
                "single_port_bind": GUNICORN_BIND,
                # Watchdog file paths plus on-disk existence checks.
                "watchdog": {
                    "runtime_dir": WATCHDOG_RUNTIME_DIR,
                    "flag_path": RESTART_FLAG_PATH,
                    "flag_exists": Path(RESTART_FLAG_PATH).exists(),
                    "pid_path": WATCHDOG_PID_PATH,
                    "pid_exists": Path(WATCHDOG_PID_PATH).exists(),
                    "state_path": RESTART_STATE_PATH,
                    "state_exists": Path(RESTART_STATE_PATH).exists(),
                },
            },
            "cooldown": {
                "active": bool(policy_state.get("cooldown")),
                "remaining_seconds": int(policy_state.get("cooldown_remaining_seconds") or 0)
            },
            "resilience": {
                "thresholds": thresholds,
                "alerts": {
                    "restart_churn_exceeded": bool(churn.get("exceeded")),
                    "restart_blocked": bool(policy_state.get("blocked")),
                },
                "restart_churn": churn,
                "policy_state": {
                    "state": policy_state.get("state"),
                    "allowed": policy_state.get("allowed"),
                    "cooldown": policy_state.get("cooldown"),
                    "blocked": policy_state.get("blocked"),
                    "cooldown_remaining_seconds": policy_state.get("cooldown_remaining_seconds"),
                    "attempts_in_window": policy_state.get("attempts_in_window"),
                    "retry_budget": policy_state.get("retry_budget"),
                    "churn_threshold": policy_state.get("churn_threshold"),
                    "window_seconds": policy_state.get("window_seconds"),
                },
                "recovery_recommendation": recommendation,
            },
            "restart_history": history,
            "last_restart": {
                "requested_by": last_restart.get("requested_by"),
                "requested_at": last_restart.get("requested_at"),
                "requested_ip": last_restart.get("requested_ip"),
                "completed_at": last_restart.get("completed_at"),
                "success": last_restart.get("success")
            }
        }
    })
# ============================================================
# Page Management Routes
# ============================================================
@admin_bp.route("/pages")
@admin_required
def pages():
    """Render the page-management admin interface."""
    template_name = "admin/pages.html"
    return render_template(template_name)
@admin_bp.route("/api/pages", methods=["GET"])
@admin_required
def api_get_pages():
    """API: Return every registered page configuration."""
    page_list = get_all_pages()
    return jsonify({"success": True, "pages": page_list})
@admin_bp.route("/api/pages/<path:route>", methods=["PUT"])
@admin_required
def api_update_page(route: str):
    """API: Update a page's release status (and optionally its display name).

    Body (JSON):
        status: "released" or "dev" (required)
        name: optional display name passed through to the registry

    Returns 400 for a missing/invalid body or status, 500 if the
    registry update fails.
    """
    # BUGFIX: get_json() returns None for a missing or non-JSON body, which
    # previously crashed on data.get(...) with an unhandled AttributeError
    # (HTTP 500). silent=True plus the `or {}` guard turns it into a 400.
    data = request.get_json(silent=True) or {}
    status = data.get("status")
    name = data.get("name")
    if status not in ("released", "dev"):
        return jsonify({"success": False, "error": "Invalid status"}), 400
    # Registry keys are absolute routes; normalize the URL fragment.
    if not route.startswith("/"):
        route = "/" + route
    try:
        set_page_status(route, status, name)
        return jsonify({"success": True})
    except Exception as e:
        # Surface the failure reason to the admin UI rather than a bare 500.
        return jsonify({"success": False, "error": str(e)}), 500