Files
the-nexus/bin/nexus_watchdog.py
Alexander Whitestone ed4c5da3cb
Some checks failed
CI / test (pull_request) Failing after 8s
CI / validate (pull_request) Failing after 13s
Review Approval Gate / verify-review (pull_request) Failing after 3s
fix: closes #865
2026-04-12 19:28:17 -04:00

705 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Nexus Watchdog — The Eye That Never Sleeps
Monitors the health of the Nexus consciousness loop and WebSocket
gateway, raising Gitea issues when components go dark.
The nexus was dead for hours after a syntax error crippled
nexus_think.py. Nobody knew. The gateway kept running, but the
consciousness loop — the only part that matters — was silent.
This watchdog ensures that never happens again.
HOW IT WORKS
============
1. Probes the WebSocket gateway (ws://localhost:8765)
→ Can Timmy hear the world?
2. Checks for a running nexus_think.py process
→ Is Timmy's mind awake?
3. Reads the heartbeat file (~/.nexus/heartbeat.json)
→ When did Timmy last think?
4. If any check fails, opens a Gitea issue (or updates an existing one)
with the exact failure mode, timestamp, and diagnostic info.
5. If all checks pass after a previous failure, closes the issue
with a recovery note.
USAGE
=====
# One-shot check (good for cron)
python bin/nexus_watchdog.py
# Continuous monitoring (every 60s)
python bin/nexus_watchdog.py --watch --interval 60
# Dry-run (print diagnostics, don't touch Gitea)
python bin/nexus_watchdog.py --dry-run
# Crontab entry (every 5 minutes)
*/5 * * * * cd /path/to/the-nexus && python bin/nexus_watchdog.py
HEARTBEAT PROTOCOL
==================
The consciousness loop (nexus_think.py) writes a heartbeat file
after each think cycle:
~/.nexus/heartbeat.json
{
"pid": 12345,
"timestamp": 1711843200.0,
"cycle": 42,
"model": "timmy:v0.1-q4",
"status": "thinking"
}
If the heartbeat is older than --stale-threshold seconds, the
mind is considered dead even if the process is still running
(e.g., hung on a blocking call).
KIMI HEARTBEAT
==============
The Kimi triage pipeline writes a cron heartbeat file after each run:
/var/run/bezalel/heartbeats/kimi-heartbeat.last
(fallback: ~/.bezalel/heartbeats/kimi-heartbeat.last)
{
"job": "kimi-heartbeat",
"timestamp": 1711843200.0,
"interval_seconds": 900,
"pid": 12345,
"status": "ok"
}
If the heartbeat is stale (>2x declared interval), the watchdog reports
a Kimi Heartbeat failure alongside the other checks.
ZERO DEPENDENCIES
=================
Pure stdlib. No pip installs. Same machine as the nexus.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import signal
import socket
import subprocess
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
# Poka-yoke: write a cron heartbeat so check_cron_heartbeats.py can detect
# if *this* watchdog stops running. Import lazily to stay zero-dep if the
# nexus package is unavailable (e.g. very minimal test environments).
try:
from nexus.cron_heartbeat import write_cron_heartbeat as _write_cron_heartbeat
_HAS_CRON_HEARTBEAT = True
except ImportError:
_HAS_CRON_HEARTBEAT = False
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-7s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("nexus.watchdog")
# ── Configuration ────────────────────────────────────────────────────
DEFAULT_WS_HOST = "localhost"
DEFAULT_WS_PORT = 8765
DEFAULT_HEARTBEAT_PATH = Path.home() / ".nexus" / "heartbeat.json"
DEFAULT_STALE_THRESHOLD = 300 # 5 minutes without a heartbeat = dead
DEFAULT_INTERVAL = 60 # seconds between checks in watch mode
# Kimi Heartbeat — cron job heartbeat file written by the triage pipeline
KIMI_HEARTBEAT_JOB = "kimi-heartbeat"
KIMI_HEARTBEAT_STALE_MULTIPLIER = 2.0 # stale at 2x declared interval
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
WATCHDOG_LABEL = "watchdog"
WATCHDOG_TITLE_PREFIX = "[watchdog]"
# ── Health check results ─────────────────────────────────────────────
@dataclass
class CheckResult:
"""Result of a single health check."""
name: str
healthy: bool
message: str
details: Dict[str, Any] = field(default_factory=dict)
@dataclass
class HealthReport:
"""Aggregate health report from all checks."""
timestamp: float
checks: List[CheckResult]
overall_healthy: bool = True
def __post_init__(self):
self.overall_healthy = all(c.healthy for c in self.checks)
@property
def failed_checks(self) -> List[CheckResult]:
return [c for c in self.checks if not c.healthy]
def to_markdown(self) -> str:
"""Format as a Gitea issue body."""
ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
status = "🟢 ALL SYSTEMS OPERATIONAL" if self.overall_healthy else "🔴 FAILURES DETECTED"
lines = [
f"## Nexus Health Report — {ts}",
f"**Status:** {status}",
"",
"| Check | Status | Details |",
"|:------|:------:|:--------|",
]
for c in self.checks:
icon = "" if c.healthy else ""
lines.append(f"| {c.name} | {icon} | {c.message} |")
if self.failed_checks:
lines.append("")
lines.append("### Failure Diagnostics")
for c in self.failed_checks:
lines.append(f"\n**{c.name}:**")
lines.append(f"```")
lines.append(c.message)
if c.details:
lines.append(json.dumps(c.details, indent=2))
lines.append(f"```")
lines.append("")
lines.append(f"*Generated by `nexus_watchdog.py` at {ts}*")
return "\n".join(lines)
# ── Health checks ────────────────────────────────────────────────────
def check_ws_gateway(host: str = DEFAULT_WS_HOST, port: int = DEFAULT_WS_PORT) -> CheckResult:
"""Check if the WebSocket gateway is accepting connections.
Uses a raw TCP socket probe (not a full WebSocket handshake) to avoid
depending on the websockets library. If TCP connects, the gateway
process is alive and listening.
"""
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(5)
result = sock.connect_ex((host, port))
sock.close()
if result == 0:
return CheckResult(
name="WebSocket Gateway",
healthy=True,
message=f"Listening on {host}:{port}",
)
else:
return CheckResult(
name="WebSocket Gateway",
healthy=False,
message=f"Connection refused on {host}:{port} (errno={result})",
details={"host": host, "port": port, "errno": result},
)
except Exception as e:
return CheckResult(
name="WebSocket Gateway",
healthy=False,
message=f"Probe failed: {e}",
details={"host": host, "port": port, "error": str(e)},
)
def check_mind_process() -> CheckResult:
"""Check if nexus_think.py is running as a process.
Uses `pgrep -f` to find processes matching the script name.
This catches both `python nexus_think.py` and `python -m nexus.nexus_think`.
"""
try:
result = subprocess.run(
["pgrep", "-f", "nexus_think"],
capture_output=True, text=True, timeout=5,
)
if result.returncode == 0:
pids = [p.strip() for p in result.stdout.strip().split("\n") if p.strip()]
# Filter out our own watchdog process
own_pid = str(os.getpid())
pids = [p for p in pids if p != own_pid]
if pids:
return CheckResult(
name="Consciousness Loop",
healthy=True,
message=f"Running (PID: {', '.join(pids)})",
details={"pids": pids},
)
return CheckResult(
name="Consciousness Loop",
healthy=False,
message="nexus_think.py is not running — Timmy's mind is dark",
details={"pgrep_returncode": result.returncode},
)
except FileNotFoundError:
# pgrep not available (unlikely on Linux/macOS but handle gracefully)
return CheckResult(
name="Consciousness Loop",
healthy=True, # Can't check — don't raise false alarms
message="pgrep not available, skipping process check",
)
except Exception as e:
return CheckResult(
name="Consciousness Loop",
healthy=False,
message=f"Process check failed: {e}",
details={"error": str(e)},
)
def check_heartbeat(
path: Path = DEFAULT_HEARTBEAT_PATH,
stale_threshold: int = DEFAULT_STALE_THRESHOLD,
) -> CheckResult:
"""Check if the heartbeat file exists and is recent.
The consciousness loop should write this file after each think
cycle. If it's missing or stale, the mind has stopped thinking
even if the process is technically alive.
"""
if not path.exists():
return CheckResult(
name="Heartbeat",
healthy=False,
message=f"No heartbeat file at {path} — mind has never reported",
details={"path": str(path)},
)
try:
data = json.loads(path.read_text())
except (json.JSONDecodeError, OSError) as e:
return CheckResult(
name="Heartbeat",
healthy=False,
message=f"Heartbeat file corrupt: {e}",
details={"path": str(path), "error": str(e)},
)
timestamp = data.get("timestamp", 0)
age = time.time() - timestamp
cycle = data.get("cycle", "?")
model = data.get("model", "unknown")
status = data.get("status", "unknown")
if age > stale_threshold:
return CheckResult(
name="Heartbeat",
healthy=False,
message=(
f"Stale heartbeat — last pulse {int(age)}s ago "
f"(threshold: {stale_threshold}s). "
f"Cycle #{cycle}, model={model}, status={status}"
),
details=data,
)
return CheckResult(
name="Heartbeat",
healthy=True,
message=f"Alive — cycle #{cycle}, {int(age)}s ago, model={model}",
details=data,
)
def check_syntax_health() -> CheckResult:
"""Verify nexus_think.py can be parsed by Python.
This catches the exact failure mode that killed the nexus: a syntax
error introduced by a bad commit. Python's compile() is a fast,
zero-import check that catches SyntaxErrors before they hit runtime.
"""
script_path = Path(__file__).parent.parent / "nexus" / "nexus_think.py"
if not script_path.exists():
return CheckResult(
name="Syntax Health",
healthy=True,
message="nexus_think.py not found at expected path, skipping",
)
try:
source = script_path.read_text()
compile(source, str(script_path), "exec")
return CheckResult(
name="Syntax Health",
healthy=True,
message=f"nexus_think.py compiles cleanly ({len(source)} bytes)",
)
except SyntaxError as e:
return CheckResult(
name="Syntax Health",
healthy=False,
message=f"SyntaxError at line {e.lineno}: {e.msg}",
details={
"file": str(script_path),
"line": e.lineno,
"offset": e.offset,
"text": (e.text or "").strip(),
},
)
def check_kimi_heartbeat(
job: str = KIMI_HEARTBEAT_JOB,
stale_multiplier: float = KIMI_HEARTBEAT_STALE_MULTIPLIER,
) -> CheckResult:
"""Check if the Kimi Heartbeat cron job is alive.
Reads the ``<job>.last`` file from the standard Bezalel heartbeat
directory (``/var/run/bezalel/heartbeats/`` or fallback
``~/.bezalel/heartbeats/``). The file is written atomically by the
cron_heartbeat module after each successful triage pipeline run.
A job is stale when:
``time.time() - timestamp > stale_multiplier * interval_seconds``
(same rule used by ``check_cron_heartbeats.py``).
"""
# Resolve heartbeat directory — same logic as cron_heartbeat._resolve
primary = Path("/var/run/bezalel/heartbeats")
fallback = Path.home() / ".bezalel" / "heartbeats"
env_dir = os.environ.get("BEZALEL_HEARTBEAT_DIR")
if env_dir:
hb_dir = Path(env_dir)
elif primary.exists():
hb_dir = primary
elif fallback.exists():
hb_dir = fallback
else:
return CheckResult(
name="Kimi Heartbeat",
healthy=False,
message="Heartbeat directory not found — no triage pipeline deployed yet",
details={"searched": [str(primary), str(fallback)]},
)
hb_file = hb_dir / f"{job}.last"
if not hb_file.exists():
return CheckResult(
name="Kimi Heartbeat",
healthy=False,
message=f"No heartbeat file at {hb_file} — Kimi triage pipeline has never reported",
details={"path": str(hb_file)},
)
try:
data = json.loads(hb_file.read_text())
except (json.JSONDecodeError, OSError) as e:
return CheckResult(
name="Kimi Heartbeat",
healthy=False,
message=f"Heartbeat file corrupt: {e}",
details={"path": str(hb_file), "error": str(e)},
)
timestamp = float(data.get("timestamp", 0))
interval = int(data.get("interval_seconds", 0))
raw_status = data.get("status", "unknown")
age = time.time() - timestamp
if interval <= 0:
# No declared interval — use raw timestamp age (30 min default)
interval = 1800
threshold = stale_multiplier * interval
is_stale = age > threshold
age_str = f"{int(age)}s" if age < 3600 else f"{int(age // 3600)}h {int((age % 3600) // 60)}m"
interval_str = f"{int(interval)}s" if interval < 3600 else f"{int(interval // 3600)}h {int((interval % 3600) // 60)}m"
if is_stale:
return CheckResult(
name="Kimi Heartbeat",
healthy=False,
message=(
f"Silent for {age_str} "
f"(threshold: {stale_multiplier}x {interval_str} = {int(threshold)}s). "
f"Status: {raw_status}"
),
details=data,
)
return CheckResult(
name="Kimi Heartbeat",
healthy=True,
message=f"Alive — last beat {age_str} ago (interval {interval_str}, status={raw_status})",
details=data,
)
# ── Gitea alerting ───────────────────────────────────────────────────
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
"""Make a Gitea API request. Returns parsed JSON or empty dict."""
import urllib.request
import urllib.error
url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(url, data=body, method=method)
if GITEA_TOKEN:
req.add_header("Authorization", f"token {GITEA_TOKEN}")
req.add_header("Content-Type", "application/json")
req.add_header("Accept", "application/json")
try:
with urllib.request.urlopen(req, timeout=15) as resp:
raw = resp.read().decode()
return json.loads(raw) if raw.strip() else {}
except urllib.error.HTTPError as e:
logger.warning("Gitea %d: %s", e.code, e.read().decode()[:200])
return None
except Exception as e:
logger.warning("Gitea request failed: %s", e)
return None
def find_open_watchdog_issue() -> Optional[dict]:
"""Find an existing open watchdog issue, if any."""
issues = _gitea_request(
"GET",
f"/repos/{GITEA_REPO}/issues?state=open&type=issues&limit=20",
)
if not issues or not isinstance(issues, list):
return None
for issue in issues:
title = issue.get("title", "")
if title.startswith(WATCHDOG_TITLE_PREFIX):
return issue
return None
def create_alert_issue(report: HealthReport) -> Optional[dict]:
"""Create a Gitea issue for a health failure."""
failed = report.failed_checks
components = ", ".join(c.name for c in failed)
title = f"{WATCHDOG_TITLE_PREFIX} Nexus health failure: {components}"
return _gitea_request(
"POST",
f"/repos/{GITEA_REPO}/issues",
data={
"title": title,
"body": report.to_markdown(),
"assignees": ["Timmy"],
},
)
def update_alert_issue(issue_number: int, report: HealthReport) -> Optional[dict]:
"""Add a comment to an existing watchdog issue with new findings."""
return _gitea_request(
"POST",
f"/repos/{GITEA_REPO}/issues/{issue_number}/comments",
data={"body": report.to_markdown()},
)
def close_alert_issue(issue_number: int, report: HealthReport) -> None:
"""Close a watchdog issue when health is restored."""
_gitea_request(
"POST",
f"/repos/{GITEA_REPO}/issues/{issue_number}/comments",
data={"body": (
"## 🟢 Recovery Confirmed\n\n"
+ report.to_markdown()
+ "\n\n*Closing — all systems operational.*"
)},
)
_gitea_request(
"PATCH",
f"/repos/{GITEA_REPO}/issues/{issue_number}",
data={"state": "closed"},
)
# ── Orchestration ────────────────────────────────────────────────────
def run_health_checks(
ws_host: str = DEFAULT_WS_HOST,
ws_port: int = DEFAULT_WS_PORT,
heartbeat_path: Path = DEFAULT_HEARTBEAT_PATH,
stale_threshold: int = DEFAULT_STALE_THRESHOLD,
) -> HealthReport:
"""Run all health checks and return the aggregate report."""
checks = [
check_ws_gateway(ws_host, ws_port),
check_mind_process(),
check_heartbeat(heartbeat_path, stale_threshold),
check_syntax_health(),
check_kimi_heartbeat(),
]
return HealthReport(timestamp=time.time(), checks=checks)
def alert_on_failure(report: HealthReport, dry_run: bool = False) -> None:
"""Create, update, or close Gitea issues based on health status."""
if dry_run:
logger.info("DRY RUN — would %s Gitea issue",
"close" if report.overall_healthy else "create/update")
return
if not GITEA_TOKEN:
logger.warning("GITEA_TOKEN not set — cannot create issues")
return
existing = find_open_watchdog_issue()
if report.overall_healthy:
if existing:
logger.info("Health restored — closing issue #%d", existing["number"])
close_alert_issue(existing["number"], report)
else:
if existing:
logger.info("Still unhealthy — updating issue #%d", existing["number"])
update_alert_issue(existing["number"], report)
else:
result = create_alert_issue(report)
if result and result.get("number"):
logger.info("Created alert issue #%d", result["number"])
def run_once(args: argparse.Namespace) -> tuple:
"""Run one health check cycle. Returns (healthy, report)."""
report = run_health_checks(
ws_host=args.ws_host,
ws_port=args.ws_port,
heartbeat_path=Path(args.heartbeat_path),
stale_threshold=args.stale_threshold,
)
# Log results
for check in report.checks:
level = logging.INFO if check.healthy else logging.ERROR
icon = "" if check.healthy else ""
logger.log(level, "%s %s: %s", icon, check.name, check.message)
if not report.overall_healthy:
alert_on_failure(report, dry_run=args.dry_run)
elif not args.dry_run:
alert_on_failure(report, dry_run=args.dry_run)
# Poka-yoke: stamp our own heartbeat so the meta-checker can detect
# if this watchdog cron job itself goes silent. Runs every 5 minutes
# by convention (*/5 * * * *).
if _HAS_CRON_HEARTBEAT:
try:
_write_cron_heartbeat("nexus_watchdog", interval_seconds=300)
except Exception:
pass # never crash the watchdog over its own heartbeat
return report.overall_healthy, report
def main():
parser = argparse.ArgumentParser(
description="Nexus Watchdog — monitors consciousness loop health",
)
parser.add_argument(
"--ws-host", default=DEFAULT_WS_HOST,
help="WebSocket gateway host (default: localhost)",
)
parser.add_argument(
"--ws-port", type=int, default=DEFAULT_WS_PORT,
help="WebSocket gateway port (default: 8765)",
)
parser.add_argument(
"--heartbeat-path", default=str(DEFAULT_HEARTBEAT_PATH),
help="Path to heartbeat file",
)
parser.add_argument(
"--stale-threshold", type=int, default=DEFAULT_STALE_THRESHOLD,
help="Seconds before heartbeat is considered stale (default: 300)",
)
parser.add_argument(
"--watch", action="store_true",
help="Run continuously instead of one-shot",
)
parser.add_argument(
"--interval", type=int, default=DEFAULT_INTERVAL,
help="Seconds between checks in watch mode (default: 60)",
)
parser.add_argument(
"--dry-run", action="store_true",
help="Print diagnostics without creating Gitea issues",
)
parser.add_argument(
"--json", action="store_true", dest="output_json",
help="Output results as JSON (for integration with other tools)",
)
parser.add_argument(
"--kimi-job", default=KIMI_HEARTBEAT_JOB,
help=f"Kimi heartbeat job name (default: {KIMI_HEARTBEAT_JOB})",
)
parser.add_argument(
"--kimi-stale-multiplier", type=float, default=KIMI_HEARTBEAT_STALE_MULTIPLIER,
help=f"Kimi heartbeat staleness multiplier (default: {KIMI_HEARTBEAT_STALE_MULTIPLIER})",
)
args = parser.parse_args()
if args.watch:
logger.info("Watchdog starting in continuous mode (interval: %ds)", args.interval)
_running = True
def _handle_sigterm(signum, frame):
nonlocal _running
_running = False
logger.info("Received signal %d, shutting down", signum)
signal.signal(signal.SIGTERM, _handle_sigterm)
signal.signal(signal.SIGINT, _handle_sigterm)
while _running:
run_once(args) # (healthy, report) — not needed in watch mode
for _ in range(args.interval):
if not _running:
break
time.sleep(1)
else:
healthy, report = run_once(args)
if args.output_json:
print(json.dumps({
"healthy": report.overall_healthy,
"timestamp": report.timestamp,
"checks": [
{"name": c.name, "healthy": c.healthy,
"message": c.message, "details": c.details}
for c in report.checks
],
}, indent=2))
sys.exit(0 if healthy else 1)
if __name__ == "__main__":
main()