Files
the-nexus/bin/nexus_watchdog.py
Google AI Agent d32baa696b
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
[watchdog] The Eye That Never Sleeps — Nexus Health Monitor (#794)
Co-authored-by: Google AI Agent <gemini@hermes.local>
Co-committed-by: Google AI Agent <gemini@hermes.local>
2026-04-04 01:36:56 +00:00

576 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Nexus Watchdog — The Eye That Never Sleeps
Monitors the health of the Nexus consciousness loop and WebSocket
gateway, raising Gitea issues when components go dark.
The nexus was dead for hours after a syntax error crippled
nexus_think.py. Nobody knew. The gateway kept running, but the
consciousness loop — the only part that matters — was silent.
This watchdog ensures that never happens again.
HOW IT WORKS
============
1. Probes the WebSocket gateway (ws://localhost:8765)
→ Can Timmy hear the world?
2. Checks for a running nexus_think.py process
→ Is Timmy's mind awake?
3. Reads the heartbeat file (~/.nexus/heartbeat.json)
→ When did Timmy last think?
4. If any check fails, opens a Gitea issue (or updates an existing one)
with the exact failure mode, timestamp, and diagnostic info.
5. If all checks pass after a previous failure, closes the issue
with a recovery note.
USAGE
=====
# One-shot check (good for cron)
python bin/nexus_watchdog.py
# Continuous monitoring (every 60s)
python bin/nexus_watchdog.py --watch --interval 60
# Dry-run (print diagnostics, don't touch Gitea)
python bin/nexus_watchdog.py --dry-run
# Crontab entry (every 5 minutes)
*/5 * * * * cd /path/to/the-nexus && python bin/nexus_watchdog.py
HEARTBEAT PROTOCOL
==================
The consciousness loop (nexus_think.py) writes a heartbeat file
after each think cycle:
~/.nexus/heartbeat.json
{
"pid": 12345,
"timestamp": 1711843200.0,
"cycle": 42,
"model": "timmy:v0.1-q4",
"status": "thinking"
}
If the heartbeat is older than --stale-threshold seconds, the
mind is considered dead even if the process is still running
(e.g., hung on a blocking call).
ZERO DEPENDENCIES
=================
Pure stdlib. No pip installs. Same machine as the nexus.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import signal
import socket
import subprocess
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-7s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("nexus.watchdog")
# ── Configuration ────────────────────────────────────────────────────
DEFAULT_WS_HOST = "localhost"
DEFAULT_WS_PORT = 8765
DEFAULT_HEARTBEAT_PATH = Path.home() / ".nexus" / "heartbeat.json"
DEFAULT_STALE_THRESHOLD = 300 # 5 minutes without a heartbeat = dead
DEFAULT_INTERVAL = 60 # seconds between checks in watch mode
GITEA_URL = os.environ.get("GITEA_URL", "http://143.198.27.163:3000")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
WATCHDOG_LABEL = "watchdog"
WATCHDOG_TITLE_PREFIX = "[watchdog]"
# ── Health check results ─────────────────────────────────────────────
@dataclass
class CheckResult:
"""Result of a single health check."""
name: str
healthy: bool
message: str
details: Dict[str, Any] = field(default_factory=dict)
@dataclass
class HealthReport:
"""Aggregate health report from all checks."""
timestamp: float
checks: List[CheckResult]
overall_healthy: bool = True
def __post_init__(self):
self.overall_healthy = all(c.healthy for c in self.checks)
@property
def failed_checks(self) -> List[CheckResult]:
return [c for c in self.checks if not c.healthy]
def to_markdown(self) -> str:
"""Format as a Gitea issue body."""
ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
status = "🟢 ALL SYSTEMS OPERATIONAL" if self.overall_healthy else "🔴 FAILURES DETECTED"
lines = [
f"## Nexus Health Report — {ts}",
f"**Status:** {status}",
"",
"| Check | Status | Details |",
"|:------|:------:|:--------|",
]
for c in self.checks:
icon = "" if c.healthy else ""
lines.append(f"| {c.name} | {icon} | {c.message} |")
if self.failed_checks:
lines.append("")
lines.append("### Failure Diagnostics")
for c in self.failed_checks:
lines.append(f"\n**{c.name}:**")
lines.append(f"```")
lines.append(c.message)
if c.details:
lines.append(json.dumps(c.details, indent=2))
lines.append(f"```")
lines.append("")
lines.append(f"*Generated by `nexus_watchdog.py` at {ts}*")
return "\n".join(lines)
# ── Health checks ────────────────────────────────────────────────────
def check_ws_gateway(host: str = DEFAULT_WS_HOST, port: int = DEFAULT_WS_PORT) -> CheckResult:
"""Check if the WebSocket gateway is accepting connections.
Uses a raw TCP socket probe (not a full WebSocket handshake) to avoid
depending on the websockets library. If TCP connects, the gateway
process is alive and listening.
"""
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(5)
result = sock.connect_ex((host, port))
sock.close()
if result == 0:
return CheckResult(
name="WebSocket Gateway",
healthy=True,
message=f"Listening on {host}:{port}",
)
else:
return CheckResult(
name="WebSocket Gateway",
healthy=False,
message=f"Connection refused on {host}:{port} (errno={result})",
details={"host": host, "port": port, "errno": result},
)
except Exception as e:
return CheckResult(
name="WebSocket Gateway",
healthy=False,
message=f"Probe failed: {e}",
details={"host": host, "port": port, "error": str(e)},
)
def check_mind_process() -> CheckResult:
"""Check if nexus_think.py is running as a process.
Uses `pgrep -f` to find processes matching the script name.
This catches both `python nexus_think.py` and `python -m nexus.nexus_think`.
"""
try:
result = subprocess.run(
["pgrep", "-f", "nexus_think"],
capture_output=True, text=True, timeout=5,
)
if result.returncode == 0:
pids = [p.strip() for p in result.stdout.strip().split("\n") if p.strip()]
# Filter out our own watchdog process
own_pid = str(os.getpid())
pids = [p for p in pids if p != own_pid]
if pids:
return CheckResult(
name="Consciousness Loop",
healthy=True,
message=f"Running (PID: {', '.join(pids)})",
details={"pids": pids},
)
return CheckResult(
name="Consciousness Loop",
healthy=False,
message="nexus_think.py is not running — Timmy's mind is dark",
details={"pgrep_returncode": result.returncode},
)
except FileNotFoundError:
# pgrep not available (unlikely on Linux/macOS but handle gracefully)
return CheckResult(
name="Consciousness Loop",
healthy=True, # Can't check — don't raise false alarms
message="pgrep not available, skipping process check",
)
except Exception as e:
return CheckResult(
name="Consciousness Loop",
healthy=False,
message=f"Process check failed: {e}",
details={"error": str(e)},
)
def check_heartbeat(
path: Path = DEFAULT_HEARTBEAT_PATH,
stale_threshold: int = DEFAULT_STALE_THRESHOLD,
) -> CheckResult:
"""Check if the heartbeat file exists and is recent.
The consciousness loop should write this file after each think
cycle. If it's missing or stale, the mind has stopped thinking
even if the process is technically alive.
"""
if not path.exists():
return CheckResult(
name="Heartbeat",
healthy=False,
message=f"No heartbeat file at {path} — mind has never reported",
details={"path": str(path)},
)
try:
data = json.loads(path.read_text())
except (json.JSONDecodeError, OSError) as e:
return CheckResult(
name="Heartbeat",
healthy=False,
message=f"Heartbeat file corrupt: {e}",
details={"path": str(path), "error": str(e)},
)
timestamp = data.get("timestamp", 0)
age = time.time() - timestamp
cycle = data.get("cycle", "?")
model = data.get("model", "unknown")
status = data.get("status", "unknown")
if age > stale_threshold:
return CheckResult(
name="Heartbeat",
healthy=False,
message=(
f"Stale heartbeat — last pulse {int(age)}s ago "
f"(threshold: {stale_threshold}s). "
f"Cycle #{cycle}, model={model}, status={status}"
),
details=data,
)
return CheckResult(
name="Heartbeat",
healthy=True,
message=f"Alive — cycle #{cycle}, {int(age)}s ago, model={model}",
details=data,
)
def check_syntax_health() -> CheckResult:
"""Verify nexus_think.py can be parsed by Python.
This catches the exact failure mode that killed the nexus: a syntax
error introduced by a bad commit. Python's compile() is a fast,
zero-import check that catches SyntaxErrors before they hit runtime.
"""
script_path = Path(__file__).parent.parent / "nexus" / "nexus_think.py"
if not script_path.exists():
return CheckResult(
name="Syntax Health",
healthy=True,
message="nexus_think.py not found at expected path, skipping",
)
try:
source = script_path.read_text()
compile(source, str(script_path), "exec")
return CheckResult(
name="Syntax Health",
healthy=True,
message=f"nexus_think.py compiles cleanly ({len(source)} bytes)",
)
except SyntaxError as e:
return CheckResult(
name="Syntax Health",
healthy=False,
message=f"SyntaxError at line {e.lineno}: {e.msg}",
details={
"file": str(script_path),
"line": e.lineno,
"offset": e.offset,
"text": (e.text or "").strip(),
},
)
# ── Gitea alerting ───────────────────────────────────────────────────
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
"""Make a Gitea API request. Returns parsed JSON or empty dict."""
import urllib.request
import urllib.error
url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(url, data=body, method=method)
if GITEA_TOKEN:
req.add_header("Authorization", f"token {GITEA_TOKEN}")
req.add_header("Content-Type", "application/json")
req.add_header("Accept", "application/json")
try:
with urllib.request.urlopen(req, timeout=15) as resp:
raw = resp.read().decode()
return json.loads(raw) if raw.strip() else {}
except urllib.error.HTTPError as e:
logger.warning("Gitea %d: %s", e.code, e.read().decode()[:200])
return None
except Exception as e:
logger.warning("Gitea request failed: %s", e)
return None
def find_open_watchdog_issue() -> Optional[dict]:
"""Find an existing open watchdog issue, if any."""
issues = _gitea_request(
"GET",
f"/repos/{GITEA_REPO}/issues?state=open&type=issues&limit=20",
)
if not issues or not isinstance(issues, list):
return None
for issue in issues:
title = issue.get("title", "")
if title.startswith(WATCHDOG_TITLE_PREFIX):
return issue
return None
def create_alert_issue(report: HealthReport) -> Optional[dict]:
"""Create a Gitea issue for a health failure."""
failed = report.failed_checks
components = ", ".join(c.name for c in failed)
title = f"{WATCHDOG_TITLE_PREFIX} Nexus health failure: {components}"
return _gitea_request(
"POST",
f"/repos/{GITEA_REPO}/issues",
data={
"title": title,
"body": report.to_markdown(),
"assignees": ["Timmy"],
},
)
def update_alert_issue(issue_number: int, report: HealthReport) -> Optional[dict]:
"""Add a comment to an existing watchdog issue with new findings."""
return _gitea_request(
"POST",
f"/repos/{GITEA_REPO}/issues/{issue_number}/comments",
data={"body": report.to_markdown()},
)
def close_alert_issue(issue_number: int, report: HealthReport) -> None:
"""Close a watchdog issue when health is restored."""
_gitea_request(
"POST",
f"/repos/{GITEA_REPO}/issues/{issue_number}/comments",
data={"body": (
"## 🟢 Recovery Confirmed\n\n"
+ report.to_markdown()
+ "\n\n*Closing — all systems operational.*"
)},
)
_gitea_request(
"PATCH",
f"/repos/{GITEA_REPO}/issues/{issue_number}",
data={"state": "closed"},
)
# ── Orchestration ────────────────────────────────────────────────────
def run_health_checks(
ws_host: str = DEFAULT_WS_HOST,
ws_port: int = DEFAULT_WS_PORT,
heartbeat_path: Path = DEFAULT_HEARTBEAT_PATH,
stale_threshold: int = DEFAULT_STALE_THRESHOLD,
) -> HealthReport:
"""Run all health checks and return the aggregate report."""
checks = [
check_ws_gateway(ws_host, ws_port),
check_mind_process(),
check_heartbeat(heartbeat_path, stale_threshold),
check_syntax_health(),
]
return HealthReport(timestamp=time.time(), checks=checks)
def alert_on_failure(report: HealthReport, dry_run: bool = False) -> None:
"""Create, update, or close Gitea issues based on health status."""
if dry_run:
logger.info("DRY RUN — would %s Gitea issue",
"close" if report.overall_healthy else "create/update")
return
if not GITEA_TOKEN:
logger.warning("GITEA_TOKEN not set — cannot create issues")
return
existing = find_open_watchdog_issue()
if report.overall_healthy:
if existing:
logger.info("Health restored — closing issue #%d", existing["number"])
close_alert_issue(existing["number"], report)
else:
if existing:
logger.info("Still unhealthy — updating issue #%d", existing["number"])
update_alert_issue(existing["number"], report)
else:
result = create_alert_issue(report)
if result and result.get("number"):
logger.info("Created alert issue #%d", result["number"])
def run_once(args: argparse.Namespace) -> bool:
"""Run one health check cycle. Returns True if healthy."""
report = run_health_checks(
ws_host=args.ws_host,
ws_port=args.ws_port,
heartbeat_path=Path(args.heartbeat_path),
stale_threshold=args.stale_threshold,
)
# Log results
for check in report.checks:
level = logging.INFO if check.healthy else logging.ERROR
icon = "" if check.healthy else ""
logger.log(level, "%s %s: %s", icon, check.name, check.message)
if not report.overall_healthy:
alert_on_failure(report, dry_run=args.dry_run)
elif not args.dry_run:
alert_on_failure(report, dry_run=args.dry_run)
return report.overall_healthy
def main():
parser = argparse.ArgumentParser(
description="Nexus Watchdog — monitors consciousness loop health",
)
parser.add_argument(
"--ws-host", default=DEFAULT_WS_HOST,
help="WebSocket gateway host (default: localhost)",
)
parser.add_argument(
"--ws-port", type=int, default=DEFAULT_WS_PORT,
help="WebSocket gateway port (default: 8765)",
)
parser.add_argument(
"--heartbeat-path", default=str(DEFAULT_HEARTBEAT_PATH),
help="Path to heartbeat file",
)
parser.add_argument(
"--stale-threshold", type=int, default=DEFAULT_STALE_THRESHOLD,
help="Seconds before heartbeat is considered stale (default: 300)",
)
parser.add_argument(
"--watch", action="store_true",
help="Run continuously instead of one-shot",
)
parser.add_argument(
"--interval", type=int, default=DEFAULT_INTERVAL,
help="Seconds between checks in watch mode (default: 60)",
)
parser.add_argument(
"--dry-run", action="store_true",
help="Print diagnostics without creating Gitea issues",
)
parser.add_argument(
"--json", action="store_true", dest="output_json",
help="Output results as JSON (for integration with other tools)",
)
args = parser.parse_args()
if args.watch:
logger.info("Watchdog starting in continuous mode (interval: %ds)", args.interval)
_running = True
def _handle_sigterm(signum, frame):
nonlocal _running
_running = False
logger.info("Received signal %d, shutting down", signum)
signal.signal(signal.SIGTERM, _handle_sigterm)
signal.signal(signal.SIGINT, _handle_sigterm)
while _running:
run_once(args)
for _ in range(args.interval):
if not _running:
break
time.sleep(1)
else:
healthy = run_once(args)
if args.output_json:
report = run_health_checks(
ws_host=args.ws_host,
ws_port=args.ws_port,
heartbeat_path=Path(args.heartbeat_path),
stale_threshold=args.stale_threshold,
)
print(json.dumps({
"healthy": report.overall_healthy,
"timestamp": report.timestamp,
"checks": [
{"name": c.name, "healthy": c.healthy,
"message": c.message, "details": c.details}
for c in report.checks
],
}, indent=2))
sys.exit(0 if healthy else 1)
if __name__ == "__main__":
main()