Compare commits
1 Commits
mimo/code/
...
mimo/creat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c7dfb8a5e6 |
Binary file not shown.
@@ -60,6 +60,23 @@ If the heartbeat is older than --stale-threshold seconds, the
|
|||||||
mind is considered dead even if the process is still running
|
mind is considered dead even if the process is still running
|
||||||
(e.g., hung on a blocking call).
|
(e.g., hung on a blocking call).
|
||||||
|
|
||||||
|
KIMI HEARTBEAT
|
||||||
|
==============
|
||||||
|
The Kimi triage pipeline writes a cron heartbeat file after each run:
|
||||||
|
|
||||||
|
/var/run/bezalel/heartbeats/kimi-heartbeat.last
|
||||||
|
(fallback: ~/.bezalel/heartbeats/kimi-heartbeat.last)
|
||||||
|
{
|
||||||
|
"job": "kimi-heartbeat",
|
||||||
|
"timestamp": 1711843200.0,
|
||||||
|
"interval_seconds": 900,
|
||||||
|
"pid": 12345,
|
||||||
|
"status": "ok"
|
||||||
|
}
|
||||||
|
|
||||||
|
If the heartbeat is stale (>2x declared interval), the watchdog reports
|
||||||
|
a Kimi Heartbeat failure alongside the other checks.
|
||||||
|
|
||||||
ZERO DEPENDENCIES
|
ZERO DEPENDENCIES
|
||||||
=================
|
=================
|
||||||
Pure stdlib. No pip installs. Same machine as the nexus.
|
Pure stdlib. No pip installs. Same machine as the nexus.
|
||||||
@@ -104,6 +121,10 @@ DEFAULT_HEARTBEAT_PATH = Path.home() / ".nexus" / "heartbeat.json"
|
|||||||
DEFAULT_STALE_THRESHOLD = 300 # 5 minutes without a heartbeat = dead
|
DEFAULT_STALE_THRESHOLD = 300 # 5 minutes without a heartbeat = dead
|
||||||
DEFAULT_INTERVAL = 60 # seconds between checks in watch mode
|
DEFAULT_INTERVAL = 60 # seconds between checks in watch mode
|
||||||
|
|
||||||
|
# Kimi Heartbeat — cron job heartbeat file written by the triage pipeline
|
||||||
|
KIMI_HEARTBEAT_JOB = "kimi-heartbeat"
|
||||||
|
KIMI_HEARTBEAT_STALE_MULTIPLIER = 2.0 # stale at 2x declared interval
|
||||||
|
|
||||||
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
|
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
|
||||||
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
|
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
|
||||||
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
|
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
|
||||||
@@ -345,6 +366,93 @@ def check_syntax_health() -> CheckResult:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def check_kimi_heartbeat(
    job: str = KIMI_HEARTBEAT_JOB,
    stale_multiplier: float = KIMI_HEARTBEAT_STALE_MULTIPLIER,
) -> CheckResult:
    """Check if the Kimi Heartbeat cron job is alive.

    Reads the ``<job>.last`` file from the Bezalel heartbeat directory.
    The directory is resolved in this order:

    1. ``$BEZALEL_HEARTBEAT_DIR`` if set (used as-is, even if missing),
    2. ``/var/run/bezalel/heartbeats/`` if it exists,
    3. ``~/.bezalel/heartbeats/`` if it exists.

    A job is stale when:
        ``time.time() - timestamp > stale_multiplier * interval_seconds``
    (same rule used by ``check_cron_heartbeats.py``).

    Args:
        job: Heartbeat job name; the file read is ``<job>.last``.
        stale_multiplier: Number of declared intervals that may elapse
            before the job is reported as stale.

    Returns:
        A ``CheckResult`` named ``"Kimi Heartbeat"``. It is unhealthy when
        the directory or file is missing, the file cannot be read or
        parsed, its fields are malformed, or the heartbeat is stale.
        This function never raises on bad heartbeat content, so one
        broken file cannot take down the whole watchdog run.
    """
    import math  # local import: only needed for the finiteness check below

    # Resolve heartbeat directory — same logic as cron_heartbeat._resolve
    primary = Path("/var/run/bezalel/heartbeats")
    fallback = Path.home() / ".bezalel" / "heartbeats"
    env_dir = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if env_dir:
        hb_dir = Path(env_dir)
    elif primary.exists():
        hb_dir = primary
    elif fallback.exists():
        hb_dir = fallback
    else:
        return CheckResult(
            name="Kimi Heartbeat",
            healthy=False,
            message="Heartbeat directory not found — no triage pipeline deployed yet",
            details={"searched": [str(primary), str(fallback)]},
        )

    hb_file = hb_dir / f"{job}.last"
    if not hb_file.exists():
        return CheckResult(
            name="Kimi Heartbeat",
            healthy=False,
            message=f"No heartbeat file at {hb_file} — Kimi triage pipeline has never reported",
            details={"path": str(hb_file)},
        )

    try:
        data = json.loads(hb_file.read_text())
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError that read_text() raises on non-UTF-8 bytes.
        return CheckResult(
            name="Kimi Heartbeat",
            healthy=False,
            message=f"Heartbeat file corrupt: {e}",
            details={"path": str(hb_file), "error": str(e)},
        )

    # Valid JSON is not necessarily a JSON object (e.g. ``[]`` or ``null``).
    if not isinstance(data, dict):
        error = f"expected a JSON object, got {type(data).__name__}"
        return CheckResult(
            name="Kimi Heartbeat",
            healthy=False,
            message=f"Heartbeat file corrupt: {error}",
            details={"path": str(hb_file), "error": error},
        )

    try:
        timestamp = float(data.get("timestamp", 0))
        interval = int(data.get("interval_seconds", 0))
        # json.loads accepts NaN/Infinity; they would break the age maths.
        if not math.isfinite(timestamp):
            raise ValueError(f"non-finite timestamp: {timestamp}")
    except (TypeError, ValueError, OverflowError) as e:
        return CheckResult(
            name="Kimi Heartbeat",
            healthy=False,
            message=f"Heartbeat file corrupt: {e}",
            details={"path": str(hb_file), "error": str(e)},
        )

    raw_status = data.get("status", "unknown")
    age = time.time() - timestamp

    if interval <= 0:
        # No declared interval — assume a 30-minute cadence, so the stale
        # threshold becomes stale_multiplier * 1800 seconds.
        interval = 1800

    threshold = stale_multiplier * interval
    is_stale = age > threshold

    age_str = f"{int(age)}s" if age < 3600 else f"{int(age // 3600)}h {int((age % 3600) // 60)}m"
    interval_str = f"{int(interval)}s" if interval < 3600 else f"{int(interval // 3600)}h {int((interval % 3600) // 60)}m"

    if is_stale:
        return CheckResult(
            name="Kimi Heartbeat",
            healthy=False,
            message=(
                f"Silent for {age_str} "
                f"(threshold: {stale_multiplier}x {interval_str} = {int(threshold)}s). "
                f"Status: {raw_status}"
            ),
            details=data,
        )

    return CheckResult(
        name="Kimi Heartbeat",
        healthy=True,
        message=f"Alive — last beat {age_str} ago (interval {interval_str}, status={raw_status})",
        details=data,
    )
|
||||||
|
|
||||||
|
|
||||||
# ── Gitea alerting ───────────────────────────────────────────────────
|
# ── Gitea alerting ───────────────────────────────────────────────────
|
||||||
|
|
||||||
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
|
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
|
||||||
@@ -446,6 +554,7 @@ def run_health_checks(
|
|||||||
check_mind_process(),
|
check_mind_process(),
|
||||||
check_heartbeat(heartbeat_path, stale_threshold),
|
check_heartbeat(heartbeat_path, stale_threshold),
|
||||||
check_syntax_health(),
|
check_syntax_health(),
|
||||||
|
check_kimi_heartbeat(),
|
||||||
]
|
]
|
||||||
return HealthReport(timestamp=time.time(), checks=checks)
|
return HealthReport(timestamp=time.time(), checks=checks)
|
||||||
|
|
||||||
@@ -545,6 +654,14 @@ def main():
|
|||||||
"--json", action="store_true", dest="output_json",
|
"--json", action="store_true", dest="output_json",
|
||||||
help="Output results as JSON (for integration with other tools)",
|
help="Output results as JSON (for integration with other tools)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--kimi-job", default=KIMI_HEARTBEAT_JOB,
|
||||||
|
help=f"Kimi heartbeat job name (default: {KIMI_HEARTBEAT_JOB})",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--kimi-stale-multiplier", type=float, default=KIMI_HEARTBEAT_STALE_MULTIPLIER,
|
||||||
|
help=f"Kimi heartbeat staleness multiplier (default: {KIMI_HEARTBEAT_STALE_MULTIPLIER})",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user