Compare commits
1 Commits
feat/mnemo
...
mimo/creat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c7dfb8a5e6 |
Binary file not shown.
@@ -60,6 +60,23 @@ If the heartbeat is older than --stale-threshold seconds, the
|
||||
mind is considered dead even if the process is still running
|
||||
(e.g., hung on a blocking call).
|
||||
|
||||
KIMI HEARTBEAT
|
||||
==============
|
||||
The Kimi triage pipeline writes a cron heartbeat file after each run:
|
||||
|
||||
/var/run/bezalel/heartbeats/kimi-heartbeat.last
|
||||
(fallback: ~/.bezalel/heartbeats/kimi-heartbeat.last)
|
||||
{
|
||||
"job": "kimi-heartbeat",
|
||||
"timestamp": 1711843200.0,
|
||||
"interval_seconds": 900,
|
||||
"pid": 12345,
|
||||
"status": "ok"
|
||||
}
|
||||
|
||||
If the heartbeat is stale (>2x declared interval), the watchdog reports
|
||||
a Kimi Heartbeat failure alongside the other checks.
|
||||
|
||||
ZERO DEPENDENCIES
|
||||
=================
|
||||
Pure stdlib. No pip installs. Same machine as the nexus.
|
||||
@@ -104,6 +121,10 @@ DEFAULT_HEARTBEAT_PATH = Path.home() / ".nexus" / "heartbeat.json"
|
||||
DEFAULT_STALE_THRESHOLD = 300 # 5 minutes without a heartbeat = dead
|
||||
DEFAULT_INTERVAL = 60 # seconds between checks in watch mode
|
||||
|
||||
# Kimi Heartbeat — cron job heartbeat file written by the triage pipeline
|
||||
KIMI_HEARTBEAT_JOB = "kimi-heartbeat"
|
||||
KIMI_HEARTBEAT_STALE_MULTIPLIER = 2.0 # stale at 2x declared interval
|
||||
|
||||
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
|
||||
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
|
||||
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
|
||||
@@ -345,6 +366,93 @@ def check_syntax_health() -> CheckResult:
|
||||
)
|
||||
|
||||
|
||||
def check_kimi_heartbeat(
|
||||
job: str = KIMI_HEARTBEAT_JOB,
|
||||
stale_multiplier: float = KIMI_HEARTBEAT_STALE_MULTIPLIER,
|
||||
) -> CheckResult:
|
||||
"""Check if the Kimi Heartbeat cron job is alive.
|
||||
|
||||
Reads the ``<job>.last`` file from the standard Bezalel heartbeat
|
||||
directory (``/var/run/bezalel/heartbeats/`` or fallback
|
||||
``~/.bezalel/heartbeats/``). The file is written atomically by the
|
||||
cron_heartbeat module after each successful triage pipeline run.
|
||||
|
||||
A job is stale when:
|
||||
``time.time() - timestamp > stale_multiplier * interval_seconds``
|
||||
(same rule used by ``check_cron_heartbeats.py``).
|
||||
"""
|
||||
# Resolve heartbeat directory — same logic as cron_heartbeat._resolve
|
||||
primary = Path("/var/run/bezalel/heartbeats")
|
||||
fallback = Path.home() / ".bezalel" / "heartbeats"
|
||||
env_dir = os.environ.get("BEZALEL_HEARTBEAT_DIR")
|
||||
if env_dir:
|
||||
hb_dir = Path(env_dir)
|
||||
elif primary.exists():
|
||||
hb_dir = primary
|
||||
elif fallback.exists():
|
||||
hb_dir = fallback
|
||||
else:
|
||||
return CheckResult(
|
||||
name="Kimi Heartbeat",
|
||||
healthy=False,
|
||||
message="Heartbeat directory not found — no triage pipeline deployed yet",
|
||||
details={"searched": [str(primary), str(fallback)]},
|
||||
)
|
||||
|
||||
hb_file = hb_dir / f"{job}.last"
|
||||
if not hb_file.exists():
|
||||
return CheckResult(
|
||||
name="Kimi Heartbeat",
|
||||
healthy=False,
|
||||
message=f"No heartbeat file at {hb_file} — Kimi triage pipeline has never reported",
|
||||
details={"path": str(hb_file)},
|
||||
)
|
||||
|
||||
try:
|
||||
data = json.loads(hb_file.read_text())
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
return CheckResult(
|
||||
name="Kimi Heartbeat",
|
||||
healthy=False,
|
||||
message=f"Heartbeat file corrupt: {e}",
|
||||
details={"path": str(hb_file), "error": str(e)},
|
||||
)
|
||||
|
||||
timestamp = float(data.get("timestamp", 0))
|
||||
interval = int(data.get("interval_seconds", 0))
|
||||
raw_status = data.get("status", "unknown")
|
||||
age = time.time() - timestamp
|
||||
|
||||
if interval <= 0:
|
||||
# No declared interval — use raw timestamp age (30 min default)
|
||||
interval = 1800
|
||||
|
||||
threshold = stale_multiplier * interval
|
||||
is_stale = age > threshold
|
||||
|
||||
age_str = f"{int(age)}s" if age < 3600 else f"{int(age // 3600)}h {int((age % 3600) // 60)}m"
|
||||
interval_str = f"{int(interval)}s" if interval < 3600 else f"{int(interval // 3600)}h {int((interval % 3600) // 60)}m"
|
||||
|
||||
if is_stale:
|
||||
return CheckResult(
|
||||
name="Kimi Heartbeat",
|
||||
healthy=False,
|
||||
message=(
|
||||
f"Silent for {age_str} "
|
||||
f"(threshold: {stale_multiplier}x {interval_str} = {int(threshold)}s). "
|
||||
f"Status: {raw_status}"
|
||||
),
|
||||
details=data,
|
||||
)
|
||||
|
||||
return CheckResult(
|
||||
name="Kimi Heartbeat",
|
||||
healthy=True,
|
||||
message=f"Alive — last beat {age_str} ago (interval {interval_str}, status={raw_status})",
|
||||
details=data,
|
||||
)
|
||||
|
||||
|
||||
# ── Gitea alerting ───────────────────────────────────────────────────
|
||||
|
||||
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
|
||||
@@ -446,6 +554,7 @@ def run_health_checks(
|
||||
check_mind_process(),
|
||||
check_heartbeat(heartbeat_path, stale_threshold),
|
||||
check_syntax_health(),
|
||||
check_kimi_heartbeat(),
|
||||
]
|
||||
return HealthReport(timestamp=time.time(), checks=checks)
|
||||
|
||||
@@ -545,6 +654,14 @@ def main():
|
||||
"--json", action="store_true", dest="output_json",
|
||||
help="Output results as JSON (for integration with other tools)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--kimi-job", default=KIMI_HEARTBEAT_JOB,
|
||||
help=f"Kimi heartbeat job name (default: {KIMI_HEARTBEAT_JOB})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--kimi-stale-multiplier", type=float, default=KIMI_HEARTBEAT_STALE_MULTIPLIER,
|
||||
help=f"Kimi heartbeat staleness multiplier (default: {KIMI_HEARTBEAT_STALE_MULTIPLIER})",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user