diff --git a/bin/__pycache__/nexus_watchdog.cpython-312.pyc b/bin/__pycache__/nexus_watchdog.cpython-312.pyc
index 20fd6b1b..e2870006 100644
Binary files a/bin/__pycache__/nexus_watchdog.cpython-312.pyc and b/bin/__pycache__/nexus_watchdog.cpython-312.pyc differ
diff --git a/bin/nexus_watchdog.py b/bin/nexus_watchdog.py
index 28df7a83..f2f5cfa3 100644
--- a/bin/nexus_watchdog.py
+++ b/bin/nexus_watchdog.py
@@ -60,6 +60,23 @@ If the heartbeat is older than --stale-threshold seconds, the mind
 is considered dead even if the process is still running (e.g., hung
 on a blocking call).
 
+KIMI HEARTBEAT
+==============
+The Kimi triage pipeline writes a cron heartbeat file after each run:
+
+    /var/run/bezalel/heartbeats/kimi-heartbeat.last
+    (fallback: ~/.bezalel/heartbeats/kimi-heartbeat.last)
+    {
+      "job": "kimi-heartbeat",
+      "timestamp": 1711843200.0,
+      "interval_seconds": 900,
+      "pid": 12345,
+      "status": "ok"
+    }
+
+If the heartbeat is stale (>2x declared interval), the watchdog reports
+a Kimi Heartbeat failure alongside the other checks.
+
 ZERO DEPENDENCIES
 =================
 Pure stdlib. No pip installs. Same machine as the nexus.
@@ -104,6 +121,10 @@ DEFAULT_HEARTBEAT_PATH = Path.home() / ".nexus" / "heartbeat.json"
 DEFAULT_STALE_THRESHOLD = 300  # 5 minutes without a heartbeat = dead
 DEFAULT_INTERVAL = 60  # seconds between checks in watch mode
 
+# Kimi Heartbeat — cron job heartbeat file written by the triage pipeline
+KIMI_HEARTBEAT_JOB = "kimi-heartbeat"
+KIMI_HEARTBEAT_STALE_MULTIPLIER = 2.0  # stale at 2x declared interval
+
 GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
 GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
 GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
@@ -345,6 +366,93 @@ def check_syntax_health() -> CheckResult:
     )
 
 
+def check_kimi_heartbeat(
+    job: str = KIMI_HEARTBEAT_JOB,
+    stale_multiplier: float = KIMI_HEARTBEAT_STALE_MULTIPLIER,
+) -> CheckResult:
+    """Check if the Kimi Heartbeat cron job is alive.
+
+    Reads the ``.last`` file from the standard Bezalel heartbeat
+    directory (``/var/run/bezalel/heartbeats/`` or fallback
+    ``~/.bezalel/heartbeats/``). The file is written atomically by the
+    cron_heartbeat module after each successful triage pipeline run.
+
+    A job is stale when:
+    ``time.time() - timestamp > stale_multiplier * interval_seconds``
+    (same rule used by ``check_cron_heartbeats.py``).
+ """ + # Resolve heartbeat directory — same logic as cron_heartbeat._resolve + primary = Path("/var/run/bezalel/heartbeats") + fallback = Path.home() / ".bezalel" / "heartbeats" + env_dir = os.environ.get("BEZALEL_HEARTBEAT_DIR") + if env_dir: + hb_dir = Path(env_dir) + elif primary.exists(): + hb_dir = primary + elif fallback.exists(): + hb_dir = fallback + else: + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message="Heartbeat directory not found — no triage pipeline deployed yet", + details={"searched": [str(primary), str(fallback)]}, + ) + + hb_file = hb_dir / f"{job}.last" + if not hb_file.exists(): + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message=f"No heartbeat file at {hb_file} — Kimi triage pipeline has never reported", + details={"path": str(hb_file)}, + ) + + try: + data = json.loads(hb_file.read_text()) + except (json.JSONDecodeError, OSError) as e: + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message=f"Heartbeat file corrupt: {e}", + details={"path": str(hb_file), "error": str(e)}, + ) + + timestamp = float(data.get("timestamp", 0)) + interval = int(data.get("interval_seconds", 0)) + raw_status = data.get("status", "unknown") + age = time.time() - timestamp + + if interval <= 0: + # No declared interval — use raw timestamp age (30 min default) + interval = 1800 + + threshold = stale_multiplier * interval + is_stale = age > threshold + + age_str = f"{int(age)}s" if age < 3600 else f"{int(age // 3600)}h {int((age % 3600) // 60)}m" + interval_str = f"{int(interval)}s" if interval < 3600 else f"{int(interval // 3600)}h {int((interval % 3600) // 60)}m" + + if is_stale: + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message=( + f"Silent for {age_str} " + f"(threshold: {stale_multiplier}x {interval_str} = {int(threshold)}s). " + f"Status: {raw_status}" + ), + details=data, + ) + + return CheckResult( + name="Kimi Heartbeat", + healthy=True, + message=f"Alive — last beat {age_str} ago (interval {interval_str}, status={raw_status})", + details=data, + ) + + # ── Gitea alerting ─────────────────────────────────────────────────── def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any: @@ -446,6 +554,7 @@ def run_health_checks( check_mind_process(), check_heartbeat(heartbeat_path, stale_threshold), check_syntax_health(), + check_kimi_heartbeat(), ] return HealthReport(timestamp=time.time(), checks=checks) @@ -545,6 +654,14 @@ def main(): "--json", action="store_true", dest="output_json", help="Output results as JSON (for integration with other tools)", ) + parser.add_argument( + "--kimi-job", default=KIMI_HEARTBEAT_JOB, + help=f"Kimi heartbeat job name (default: {KIMI_HEARTBEAT_JOB})", + ) + parser.add_argument( + "--kimi-stale-multiplier", type=float, default=KIMI_HEARTBEAT_STALE_MULTIPLIER, + help=f"Kimi heartbeat staleness multiplier (default: {KIMI_HEARTBEAT_STALE_MULTIPLIER})", + ) args = parser.parse_args()