chore: finalize vite migration hardening and archive openspec changes

This commit is contained in:
beabigegg
2026-02-08 20:03:36 +08:00
parent b56e80381b
commit c8e225101e
119 changed files with 6547 additions and 1301 deletions

223
scripts/run_cache_benchmarks.py Executable file
View File

@@ -0,0 +1,223 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Benchmark cache query baseline vs indexed selection.
This benchmark is used as a repeatable governance harness for P1 cache/query
efficiency work. It focuses on deterministic synthetic workloads so operators
can compare relative latency and memory amplification over time.
"""
from __future__ import annotations
import argparse
import json
import math
import random
import statistics
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
# Repository root (this script lives in <root>/scripts/).
ROOT = Path(__file__).resolve().parents[1]
# Default fixture carrying workload sizing and governance thresholds.
FIXTURE_PATH = ROOT / "tests" / "fixtures" / "cache_benchmark_fixture.json"


def load_fixture(path: Path = FIXTURE_PATH) -> dict[str, Any]:
    """Load and minimally validate the benchmark fixture JSON.

    Args:
        path: Fixture file location; defaults to the repo-level fixture.

    Returns:
        The parsed fixture payload.

    Raises:
        ValueError: If the payload lacks the mandatory ``rows`` key.
        FileNotFoundError: If ``path`` does not exist.
    """
    # Read as UTF-8 explicitly: fixtures may contain non-ASCII labels and the
    # platform default encoding is not guaranteed to be UTF-8.
    payload = json.loads(path.read_text(encoding="utf-8"))
    if "rows" not in payload:
        raise ValueError("fixture requires rows")
    return payload
def build_dataset(rows: int, seed: int) -> pd.DataFrame:
    """Create a deterministic synthetic WIP dataset with *rows* records.

    Both the stdlib and NumPy global RNGs are seeded so repeated runs with
    the same seed yield byte-identical frames (the legacy global-seed API is
    kept on purpose for reproducibility with historical benchmark reports).
    """
    random.seed(seed)
    np.random.seed(seed)

    # Candidate value pools for each categorical column.
    wc_pool = [f"WC-{idx:02d}" for idx in range(1, 31)]
    pkg_pool = ["QFN", "DFN", "SOT", "SOP", "BGA", "TSOP"]
    type_pool = ["TYPE-A", "TYPE-B", "TYPE-C", "TYPE-D"]
    status_pool = ["RUN", "QUEUE", "HOLD"]
    # Empty strings dominate so most lots carry no hold reason.
    reason_pool = ["", "", "", "YieldLimit", "特殊需求管控", "PM Hold"]

    # NOTE: the np.random.choice/randint calls below must stay in this exact
    # order to reproduce the same RNG stream as earlier benchmark runs.
    columns = {
        "WORKCENTER_GROUP": np.random.choice(wc_pool, rows),
        "PACKAGE_LEF": np.random.choice(pkg_pool, rows),
        "PJ_TYPE": np.random.choice(type_pool, rows),
        "WIP_STATUS": np.random.choice(status_pool, rows, p=[0.45, 0.35, 0.20]),
        "HOLDREASONNAME": np.random.choice(reason_pool, rows),
        "QTY": np.random.randint(1, 500, rows),
        "WORKORDER": [f"WO-{i:06d}" for i in range(rows)],
        "LOTID": [f"LOT-{i:07d}" for i in range(rows)],
    }
    return pd.DataFrame(columns)
def _build_index(df: pd.DataFrame) -> dict[str, dict[str, set[int]]]:
def by_column(column: str) -> dict[str, set[int]]:
grouped = df.groupby(column, dropna=True, sort=False).indices
return {str(k): {int(i) for i in v} for k, v in grouped.items()}
return {
"workcenter": by_column("WORKCENTER_GROUP"),
"package": by_column("PACKAGE_LEF"),
"type": by_column("PJ_TYPE"),
"status": by_column("WIP_STATUS"),
}
def _baseline_query(df: pd.DataFrame, query: dict[str, str]) -> int:
subset = df
if query.get("workcenter"):
subset = subset[subset["WORKCENTER_GROUP"] == query["workcenter"]]
if query.get("package"):
subset = subset[subset["PACKAGE_LEF"] == query["package"]]
if query.get("type"):
subset = subset[subset["PJ_TYPE"] == query["type"]]
if query.get("status"):
subset = subset[subset["WIP_STATUS"] == query["status"]]
return int(len(subset))
def _indexed_query(_df: pd.DataFrame, indexes: dict[str, dict[str, set[int]]], query: dict[str, str]) -> int:
selected: set[int] | None = None
for key, bucket in (
("workcenter", "workcenter"),
("package", "package"),
("type", "type"),
("status", "status"),
):
current = indexes[bucket].get(query.get(key, ""))
if current is None:
return 0
if selected is None:
selected = set(current)
else:
selected.intersection_update(current)
if not selected:
return 0
return len(selected or ())
def _build_queries(df: pd.DataFrame, query_count: int, seed: int) -> list[dict[str, str]]:
random.seed(seed + 17)
workcenters = sorted(df["WORKCENTER_GROUP"].dropna().astype(str).unique().tolist())
packages = sorted(df["PACKAGE_LEF"].dropna().astype(str).unique().tolist())
types = sorted(df["PJ_TYPE"].dropna().astype(str).unique().tolist())
statuses = sorted(df["WIP_STATUS"].dropna().astype(str).unique().tolist())
queries: list[dict[str, str]] = []
for _ in range(query_count):
queries.append(
{
"workcenter": random.choice(workcenters),
"package": random.choice(packages),
"type": random.choice(types),
"status": random.choice(statuses),
}
)
return queries
def _p95(values: list[float]) -> float:
if not values:
return 0.0
sorted_values = sorted(values)
index = min(max(math.ceil(0.95 * len(sorted_values)) - 1, 0), len(sorted_values) - 1)
return sorted_values[index]
def run_benchmark(rows: int, query_count: int, seed: int) -> dict[str, Any]:
    """Execute baseline and indexed query workloads and report metrics.

    Returns a JSON-serializable report with per-strategy latency stats
    (avg / p95, milliseconds) and a memory amplification estimate for
    carrying the secondary indexes alongside the frame.

    Raises:
        AssertionError: If the two strategies disagree on any result count.
    """
    frame = build_dataset(rows=rows, seed=seed)
    workload = _build_queries(frame, query_count=query_count, seed=seed)
    indexes = _build_index(frame)

    base_ms: list[float] = []
    idx_ms: list[float] = []
    base_counts: list[int] = []
    idx_counts: list[int] = []
    for query in workload:
        started = time.perf_counter()
        base_counts.append(_baseline_query(frame, query))
        base_ms.append((time.perf_counter() - started) * 1000)

        started = time.perf_counter()
        idx_counts.append(_indexed_query(frame, indexes, query))
        idx_ms.append((time.perf_counter() - started) * 1000)

    # Guard rail: both strategies must agree row-for-row, otherwise the
    # latency comparison below is meaningless.
    if base_counts != idx_counts:
        raise AssertionError("benchmark correctness drift: indexed result mismatch")

    frame_bytes = int(frame.memory_usage(index=True, deep=True).sum())
    # Rough index footprint estimate: ~16 bytes per stored row position.
    entry_total = sum(
        len(positions) for buckets in indexes.values() for positions in buckets.values()
    )
    index_bytes_estimate = int(entry_total * 16)

    base_p95 = _p95(base_ms)
    idx_p95 = _p95(idx_ms)
    return {
        "rows": rows,
        "query_count": query_count,
        "seed": seed,
        "latency_ms": {
            "baseline_avg": round(statistics.fmean(base_ms), 4),
            "baseline_p95": round(base_p95, 4),
            "indexed_avg": round(statistics.fmean(idx_ms), 4),
            "indexed_p95": round(idx_p95, 4),
            "p95_ratio_indexed_vs_baseline": round(
                (idx_p95 / base_p95) if base_p95 > 0 else 0.0,
                4,
            ),
        },
        "memory_bytes": {
            "frame": frame_bytes,
            "index_estimate": index_bytes_estimate,
            "amplification_ratio": round(
                (frame_bytes + index_bytes_estimate) / max(frame_bytes, 1),
                4,
            ),
        },
    }
def main() -> int:
    """CLI entry point: print the benchmark report, optionally enforce gates.

    Returns 0 on success. With ``--enforce``, threshold breaches abort via
    SystemExit carrying a descriptive message.
    """
    fixture = load_fixture()

    parser = argparse.ArgumentParser(description="Run cache baseline vs indexed benchmark")
    parser.add_argument("--rows", type=int, default=int(fixture.get("rows", 30000)))
    parser.add_argument("--queries", type=int, default=int(fixture.get("query_count", 400)))
    parser.add_argument("--seed", type=int, default=int(fixture.get("seed", 42)))
    parser.add_argument("--enforce", action="store_true")
    args = parser.parse_args()

    report = run_benchmark(rows=args.rows, query_count=args.queries, seed=args.seed)
    print(json.dumps(report, ensure_ascii=False, indent=2))
    if not args.enforce:
        return 0

    # Governance gates sourced from the fixture, with conservative fallbacks.
    thresholds = fixture.get("thresholds") or {}
    latency_limit = float(thresholds.get("max_p95_ratio_indexed_vs_baseline", 1.25))
    memory_limit = float(thresholds.get("max_memory_amplification_ratio", 1.8))

    latency_ratio = float(report["latency_ms"]["p95_ratio_indexed_vs_baseline"])
    memory_ratio = float(report["memory_bytes"]["amplification_ratio"])
    if latency_ratio > latency_limit:
        raise SystemExit(
            f"Latency regression: {latency_ratio:.4f} > max allowed {latency_limit:.4f}"
        )
    if memory_ratio > memory_limit:
        raise SystemExit(
            f"Memory amplification regression: {memory_ratio:.4f} > max allowed {memory_limit:.4f}"
        )
    return 0
# Script entry point: propagate main()'s integer status as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())

40
scripts/start_server.sh Normal file → Executable file
View File

@@ -9,7 +9,7 @@ set -uo pipefail
# Configuration
# ============================================================
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
CONDA_ENV="mes-dashboard"
CONDA_ENV="${CONDA_ENV_NAME:-mes-dashboard}"
APP_NAME="mes-dashboard"
PID_FILE_DEFAULT="${ROOT}/tmp/gunicorn.pid"
PID_FILE="${WATCHDOG_PID_FILE:-${PID_FILE_DEFAULT}}"
@@ -56,7 +56,7 @@ timestamp() {
resolve_runtime_paths() {
WATCHDOG_RUNTIME_DIR="${WATCHDOG_RUNTIME_DIR:-${ROOT}/tmp}"
WATCHDOG_RESTART_FLAG="${WATCHDOG_RESTART_FLAG:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart.flag}"
WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${PID_FILE_DEFAULT}}"
WATCHDOG_PID_FILE="${WATCHDOG_PID_FILE:-${WATCHDOG_RUNTIME_DIR}/gunicorn.pid}"
WATCHDOG_STATE_FILE="${WATCHDOG_STATE_FILE:-${WATCHDOG_RUNTIME_DIR}/mes_dashboard_restart_state.json}"
PID_FILE="${WATCHDOG_PID_FILE}"
export WATCHDOG_RUNTIME_DIR WATCHDOG_RESTART_FLAG WATCHDOG_PID_FILE WATCHDOG_STATE_FILE
@@ -81,8 +81,14 @@ check_conda() {
return 1
fi
if [ -n "${CONDA_BIN:-}" ] && [ ! -x "${CONDA_BIN}" ]; then
log_error "CONDA_BIN is set but not executable: ${CONDA_BIN}"
return 1
fi
# Source conda
source "$(conda info --base)/etc/profile.d/conda.sh"
local conda_cmd="${CONDA_BIN:-$(command -v conda)}"
source "$(${conda_cmd} info --base)/etc/profile.d/conda.sh"
# Check if environment exists
if ! conda env list | grep -q "^${CONDA_ENV} "; then
@@ -95,6 +101,33 @@ check_conda() {
return 0
}
# Validate the runtime contract inside the target conda environment.
# Returns 0 when diagnostics pass; logs remediation hints and returns 1 otherwise.
validate_runtime_contract() {
    # Activate the env so the heredoc's import resolves against the same
    # interpreter/site-packages the server will use.
    conda activate "$CONDA_ENV"
    export PYTHONPATH="${ROOT}/src:${PYTHONPATH:-}"
    # The inline Python exits non-zero on contract violations, which drives
    # this if/else. Heredoc body is quoted ('PY') so nothing is expanded here.
    if python - <<'PY'
import os
import sys
from mes_dashboard.core.runtime_contract import build_runtime_contract_diagnostics
strict = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true").strip().lower() in {"1", "true", "yes", "on"}
diag = build_runtime_contract_diagnostics(strict=strict)
if not diag["valid"]:
    for error in diag["errors"]:
        print(f"RUNTIME_CONTRACT_ERROR: {error}")
    raise SystemExit(1)
PY
    then
        log_success "Runtime contract validation passed"
        return 0
    fi
    log_error "Runtime contract validation failed"
    log_info "Fix env vars: WATCHDOG_RUNTIME_DIR / WATCHDOG_RESTART_FLAG / WATCHDOG_PID_FILE / WATCHDOG_STATE_FILE / CONDA_BIN"
    return 1
}
check_dependencies() {
conda activate "$CONDA_ENV"
@@ -329,6 +362,7 @@ run_all_checks() {
check_env_file
load_env
resolve_runtime_paths
validate_runtime_contract || return 1
check_port || return 1
check_database
check_redis

177
scripts/worker_watchdog.py Normal file → Executable file
View File

@@ -31,6 +31,23 @@ import time
from datetime import datetime
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
sys.path.insert(0, str(SRC_ROOT))
from mes_dashboard.core.runtime_contract import ( # noqa: E402
build_runtime_contract_diagnostics,
load_runtime_contract,
)
from mes_dashboard.core.worker_recovery_policy import ( # noqa: E402
decide_restart_request,
evaluate_worker_recovery_state,
extract_last_requested_at,
extract_restart_history,
get_worker_recovery_policy_config,
)
# Configure logging
logging.basicConfig(
level=logging.INFO,
@@ -45,7 +62,10 @@ logger = logging.getLogger('mes_dashboard.watchdog')
# Configuration
# ============================================================
CHECK_INTERVAL = int(os.getenv('WATCHDOG_CHECK_INTERVAL', '5'))
_RUNTIME_CONTRACT = load_runtime_contract(project_root=PROJECT_ROOT)
CHECK_INTERVAL = int(
os.getenv('WATCHDOG_CHECK_INTERVAL', str(_RUNTIME_CONTRACT['watchdog_check_interval']))
)
def _env_int(name: str, default: int) -> int:
@@ -55,22 +75,11 @@ def _env_int(name: str, default: int) -> int:
return default
PROJECT_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_RUNTIME_DIR = Path(
os.getenv('WATCHDOG_RUNTIME_DIR', str(PROJECT_ROOT / 'tmp'))
)
RESTART_FLAG_PATH = os.getenv(
'WATCHDOG_RESTART_FLAG',
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart.flag')
)
GUNICORN_PID_FILE = os.getenv(
'WATCHDOG_PID_FILE',
str(DEFAULT_RUNTIME_DIR / 'gunicorn.pid')
)
RESTART_STATE_FILE = os.getenv(
'WATCHDOG_STATE_FILE',
str(DEFAULT_RUNTIME_DIR / 'mes_dashboard_restart_state.json')
)
DEFAULT_RUNTIME_DIR = Path(_RUNTIME_CONTRACT['watchdog_runtime_dir'])
RESTART_FLAG_PATH = _RUNTIME_CONTRACT['watchdog_restart_flag']
GUNICORN_PID_FILE = _RUNTIME_CONTRACT['watchdog_pid_file']
RESTART_STATE_FILE = _RUNTIME_CONTRACT['watchdog_state_file']
RUNTIME_CONTRACT_VERSION = _RUNTIME_CONTRACT['version']
RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
@@ -78,6 +87,32 @@ RESTART_HISTORY_MAX = _env_int('WATCHDOG_RESTART_HISTORY_MAX', 50)
# Watchdog Implementation
# ============================================================
def validate_runtime_contract_or_raise() -> None:
    """Fail fast if runtime contract is inconsistent.

    Raises:
        RuntimeError: With all diagnostic errors joined into one message.
    """
    # RUNTIME_CONTRACT_ENFORCE defaults to strict; any of these truthy
    # tokens (case-insensitive, whitespace-trimmed) keeps strict mode on.
    strict = os.getenv("RUNTIME_CONTRACT_ENFORCE", "true").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    diagnostics = build_runtime_contract_diagnostics(strict=strict)
    if diagnostics["valid"]:
        return
    # Surface every contract error at once so operators fix them in one pass.
    details = "; ".join(diagnostics["errors"])
    raise RuntimeError(f"Runtime contract validation failed: {details}")
def log_restart_audit(event: str, payload: dict) -> None:
    """Emit one structured (JSON) audit log line for a watchdog restart event.

    The payload is merged on top of the standard envelope, so callers can
    attach source/actor/decision context per event.
    """
    entry = {
        "event": event,
        # NOTE(review): datetime.utcnow() yields a *naive* UTC timestamp and is
        # deprecated since Python 3.12 — consider datetime.now(timezone.utc).
        "timestamp": datetime.utcnow().isoformat(),
        "runtime_contract_version": RUNTIME_CONTRACT_VERSION,
        **payload,
    }
    logger.info("worker_watchdog_audit %s", json.dumps(entry, ensure_ascii=False))
def get_gunicorn_pid() -> int | None:
"""Get Gunicorn master PID from PID file.
@@ -155,7 +190,12 @@ def save_restart_state(
requested_at: str | None = None,
requested_ip: str | None = None,
completed_at: str | None = None,
success: bool = True
success: bool = True,
source: str = "manual",
decision: str = "allowed",
decision_reason: str | None = None,
manual_override: bool = False,
policy_state: dict | None = None,
) -> None:
"""Save restart state for status queries.
@@ -173,7 +213,12 @@ def save_restart_state(
"requested_at": requested_at,
"requested_ip": requested_ip,
"completed_at": completed_at,
"success": success
"success": success,
"source": source,
"decision": decision,
"decision_reason": decision_reason,
"manual_override": manual_override,
"policy_state": policy_state or {},
}
current_state = load_restart_state()
history = current_state.get("history", [])
@@ -229,6 +274,47 @@ def process_restart_request() -> bool:
return False
logger.info(f"Restart flag detected: {flag_data}")
source = str(flag_data.get("source") or "manual").strip().lower()
manual_override = bool(flag_data.get("manual_override"))
override_ack = bool(flag_data.get("override_acknowledged"))
restart_state = load_restart_state()
restart_history = extract_restart_history(restart_state)
policy_state = evaluate_worker_recovery_state(
restart_history,
last_requested_at=extract_last_requested_at(restart_state),
)
decision = decide_restart_request(
policy_state,
source=source,
manual_override=manual_override,
override_acknowledged=override_ack,
)
if not decision["allowed"]:
remove_restart_flag()
save_restart_state(
requested_by=flag_data.get("user"),
requested_at=flag_data.get("timestamp"),
requested_ip=flag_data.get("ip"),
completed_at=datetime.now().isoformat(),
success=False,
source=source,
decision=decision["decision"],
decision_reason=decision["reason"],
manual_override=manual_override,
policy_state=policy_state,
)
log_restart_audit(
"restart_blocked",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"decision": decision,
"policy_state": policy_state,
},
)
return True
# Get Gunicorn master PID
pid = get_gunicorn_pid()
@@ -242,7 +328,22 @@ def process_restart_request() -> bool:
requested_at=flag_data.get("timestamp"),
requested_ip=flag_data.get("ip"),
completed_at=datetime.now().isoformat(),
success=False
success=False,
source=source,
decision="failed",
decision_reason="gunicorn_pid_unavailable",
manual_override=manual_override,
policy_state=policy_state,
)
log_restart_audit(
"restart_failed",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"decision_reason": "gunicorn_pid_unavailable",
"policy_state": policy_state,
},
)
return True
@@ -258,7 +359,12 @@ def process_restart_request() -> bool:
requested_at=flag_data.get("timestamp"),
requested_ip=flag_data.get("ip"),
completed_at=datetime.now().isoformat(),
success=success
success=success,
source=source,
decision="executed" if success else "failed",
decision_reason="signal_sighup" if success else "signal_failed",
manual_override=manual_override,
policy_state=policy_state,
)
if success:
@@ -267,17 +373,44 @@ def process_restart_request() -> bool:
f"Requested by: {flag_data.get('user', 'unknown')}, "
f"IP: {flag_data.get('ip', 'unknown')}"
)
log_restart_audit(
"restart_executed",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"manual_override": manual_override,
"policy_state": policy_state,
},
)
else:
log_restart_audit(
"restart_failed",
{
"source": source,
"actor": flag_data.get("user"),
"ip": flag_data.get("ip"),
"decision_reason": "signal_failed",
"policy_state": policy_state,
},
)
return True
def run_watchdog() -> None:
"""Main watchdog loop."""
validate_runtime_contract_or_raise()
policy = get_worker_recovery_policy_config()
logger.info(
f"Worker watchdog started - "
f"Check interval: {CHECK_INTERVAL}s, "
f"Flag path: {RESTART_FLAG_PATH}, "
f"PID file: {GUNICORN_PID_FILE}"
f"PID file: {GUNICORN_PID_FILE}, "
f"Policy(cooldown={policy['cooldown_seconds']}s, "
f"retry_budget={policy['retry_budget']}, "
f"window={policy['window_seconds']}s, "
f"guarded={policy['guarded_mode_enabled']})"
)
while True: