450 lines
16 KiB
Python
450 lines
16 KiB
Python
#!/usr/bin/env python3
|
||
"""Meta-heartbeat checker — makes silent cron failures impossible.
|
||
|
||
Reads every ``*.last`` file in the heartbeat directory and verifies that no
|
||
job has been silent for longer than **2× its declared interval**. If any job
|
||
is stale, a Gitea alert issue is created (or an existing one is updated).
|
||
When all jobs recover, the issue is closed automatically.
|
||
|
||
This script itself should be run as a cron job every 15 minutes so the
|
||
meta-level is also covered:
|
||
|
||
*/15 * * * * cd /path/to/the-nexus && \\
|
||
python bin/check_cron_heartbeats.py >> /var/log/bezalel/heartbeat-check.log 2>&1
|
||
|
||
USAGE
|
||
-----
|
||
# Check all jobs; create/update Gitea alert if any stale:
|
||
python bin/check_cron_heartbeats.py
|
||
|
||
# Dry-run (no Gitea writes):
|
||
python bin/check_cron_heartbeats.py --dry-run
|
||
|
||
# Output Night Watch heartbeat panel markdown:
|
||
python bin/check_cron_heartbeats.py --panel
|
||
|
||
# Output JSON (for integration with other tools):
|
||
python bin/check_cron_heartbeats.py --json
|
||
|
||
# Use a custom heartbeat directory:
|
||
python bin/check_cron_heartbeats.py --dir /tmp/test-heartbeats
|
||
|
||
HEARTBEAT DIRECTORY
|
||
-------------------
|
||
Primary: /var/run/bezalel/heartbeats/ (set by ops, writable by cron user)
|
||
Fallback: ~/.bezalel/heartbeats/ (dev machines)
|
||
Override: BEZALEL_HEARTBEAT_DIR env var
|
||
|
||
ZERO DEPENDENCIES
|
||
-----------------
|
||
Pure stdlib. No pip installs required.
|
||
|
||
Refs: #1096
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import os
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
# Module-level logging setup: timestamped lines suitable for the cron log
# file shown in the module docstring (heartbeat-check.log).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.heartbeat_checker")

# ── Configuration ─────────────────────────────────────────────────────

# System-wide heartbeat directory (preferred) and per-user fallback;
# selection logic lives in _resolve_heartbeat_dir().
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")
FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats"

# Gitea connection settings — every one overridable via environment.
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")  # empty token disables alerting
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
# Title prefix used to find (and avoid duplicating) the open alert issue.
ALERT_TITLE_PREFIX = "[heartbeat-checker]"

# A job is stale when its age exceeds this multiple of its declared interval
STALE_RATIO = 2.0
# Never flag a job as stale if it completed less than this many seconds ago
# (prevents noise immediately after deployment)
MIN_STALE_AGE = 60
|
||
|
||
|
||
def _resolve_heartbeat_dir() -> Path:
|
||
"""Return the active heartbeat directory."""
|
||
env = os.environ.get("BEZALEL_HEARTBEAT_DIR")
|
||
if env:
|
||
return Path(env)
|
||
if PRIMARY_HEARTBEAT_DIR.exists():
|
||
return PRIMARY_HEARTBEAT_DIR
|
||
# Try to create it; fall back to home dir if not permitted
|
||
try:
|
||
PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
|
||
probe = PRIMARY_HEARTBEAT_DIR / ".write_probe"
|
||
probe.touch()
|
||
probe.unlink()
|
||
return PRIMARY_HEARTBEAT_DIR
|
||
except (PermissionError, OSError):
|
||
return FALLBACK_HEARTBEAT_DIR
|
||
|
||
|
||
# ── Data model ────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class JobStatus:
    """Health status for a single cron job's heartbeat.

    Produced by ``_read_job_status``. Sentinel values (-1 / 0 / None)
    mark fields that could not be determined because the heartbeat file
    was missing or corrupt.
    """
    job: str  # job name (the heartbeat filename stem)
    path: Path  # the *.last file this status was read from
    healthy: bool  # False when stale, missing, or corrupt
    age_seconds: float  # -1 if unknown (missing/corrupt)
    interval_seconds: int  # 0 if unknown
    staleness_ratio: float  # age / interval; -1 if unknown; >STALE_RATIO = stale
    last_timestamp: Optional[float]  # Unix time of the last beat; None if unknown
    pid: Optional[int]  # PID recorded by the job in the heartbeat file, if any
    raw_status: str  # value from the .last file: "ok" / "warn" / "error"
    message: str  # human-readable summary used in logs, the panel, and alerts
|
||
|
||
|
||
@dataclass
class HeartbeatReport:
    """Aggregate report for all cron job heartbeats in a directory."""
    timestamp: float  # Unix time (seconds) at which the scan ran
    heartbeat_dir: Path  # directory whose *.last files were scanned
    jobs: List[JobStatus] = field(default_factory=list)  # one entry per file

    @property
    def stale_jobs(self) -> List[JobStatus]:
        """Jobs whose heartbeat is missing, corrupt, or too old."""
        return [j for j in self.jobs if not j.healthy]

    @property
    def overall_healthy(self) -> bool:
        """True when every scanned job is healthy (also true for zero jobs)."""
        return len(self.stale_jobs) == 0

    # ── Rendering ─────────────────────────────────────────────────

    @staticmethod
    def _table_cells(j: JobStatus) -> "tuple[str, str, str]":
        """Return the (age, interval, ratio) markdown cells for one job.

        Extracted because this formatting was previously duplicated in
        ``to_panel_markdown`` and ``to_alert_body``; centralising it
        guarantees the two tables render sentinel values identically
        (age/ratio of -1 and interval of 0 mean "unknown" → "N/A").
        """
        age_str = _fmt_duration(j.age_seconds) if j.age_seconds >= 0 else "N/A"
        interval_str = _fmt_duration(j.interval_seconds) if j.interval_seconds > 0 else "N/A"
        ratio_str = f"{j.staleness_ratio:.1f}x" if j.staleness_ratio >= 0 else "N/A"
        return age_str, interval_str, ratio_str

    def to_panel_markdown(self) -> str:
        """Night Watch heartbeat panel — a table of all jobs with their status."""
        ts = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime(self.timestamp))
        overall = "OK" if self.overall_healthy else "ALERT"

        lines = [
            f"## Heartbeat Panel — {ts}",
            "",
            f"**Overall:** {overall}",
            "",
            "| Job | Status | Age | Interval | Ratio |",
            "|-----|--------|-----|----------|-------|",
        ]

        if not self.jobs:
            lines.append("| *(no heartbeat files found)* | — | — | — | — |")
        else:
            for j in self.jobs:
                icon = "OK" if j.healthy else "STALE"
                age_str, interval_str, ratio_str = self._table_cells(j)
                lines.append(
                    f"| `{j.job}` | {icon} | {age_str} | {interval_str} | {ratio_str} |"
                )

        if self.stale_jobs:
            lines += ["", "**Stale jobs:**"]
            for j in self.stale_jobs:
                lines.append(f"- `{j.job}`: {j.message}")

        lines += [
            "",
            f"*Heartbeat dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)

    def to_alert_body(self) -> str:
        """Gitea issue body when stale jobs are detected."""
        ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
        stale = self.stale_jobs

        lines = [
            f"## Cron Heartbeat Alert — {ts}",
            "",
            f"**{len(stale)} job(s) have gone silent** (stale > {STALE_RATIO}x interval).",
            "",
            "| Job | Age | Interval | Ratio | Detail |",
            "|-----|-----|----------|-------|--------|",
        ]

        for j in stale:
            age_str, interval_str, ratio_str = self._table_cells(j)
            lines.append(
                f"| `{j.job}` | {age_str} | {interval_str} | {ratio_str} | {j.message} |"
            )

        lines += [
            "",
            "### What to do",
            "1. `crontab -l` — confirm the job is still scheduled",
            "2. Check the job's log for errors",
            "3. Restart the job if needed",
            "4. Close this issue once fresh heartbeats appear",
            "",
            f"*Generated by `check_cron_heartbeats.py` — dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)

    def to_json(self) -> Dict[str, Any]:
        """Machine-readable summary (stable key set for external tools)."""
        return {
            "healthy": self.overall_healthy,
            "timestamp": self.timestamp,
            "heartbeat_dir": str(self.heartbeat_dir),
            "jobs": [
                {
                    "job": j.job,
                    "healthy": j.healthy,
                    "age_seconds": j.age_seconds,
                    "interval_seconds": j.interval_seconds,
                    "staleness_ratio": j.staleness_ratio,
                    "raw_status": j.raw_status,
                    "message": j.message,
                }
                for j in self.jobs
            ],
        }
|
||
|
||
|
||
def _fmt_duration(seconds: float) -> str:
|
||
"""Format a duration in seconds as a human-readable string."""
|
||
s = int(seconds)
|
||
if s < 60:
|
||
return f"{s}s"
|
||
if s < 3600:
|
||
return f"{s // 60}m {s % 60}s"
|
||
return f"{s // 3600}h {(s % 3600) // 60}m"
|
||
|
||
|
||
# ── Job scanning ──────────────────────────────────────────────────────
|
||
|
||
def scan_heartbeats(directory: Path) -> List[JobStatus]:
    """Collect a ``JobStatus`` for every ``*.last`` file under *directory*.

    Returns an empty list when the directory does not exist. Files are
    processed in sorted order for deterministic output; each job's name
    is the heartbeat filename stem.
    """
    if not directory.exists():
        return []
    beat_files = sorted(directory.glob("*.last"))
    return [_read_job_status(f.stem, f) for f in beat_files]
|
||
|
||
|
||
def _read_job_status(job: str, path: Path) -> JobStatus:
    """Parse one ``.last`` heartbeat file and produce a ``JobStatus``.

    The file is expected to hold a JSON object with ``timestamp`` (Unix
    seconds) and ``interval_seconds``; ``pid`` and ``status`` are
    optional. Missing, unreadable, or malformed files yield an unhealthy
    status instead of raising, so one bad file can never abort the scan.
    """
    now = time.time()

    def _broken(kind: str, detail: str) -> JobStatus:
        # Shared constructor for every "cannot evaluate this heartbeat"
        # outcome: sentinel age/interval/ratio, healthy=False.
        return JobStatus(
            job=job, path=path,
            healthy=False,
            age_seconds=-1,
            interval_seconds=0,
            staleness_ratio=-1,
            last_timestamp=None,
            pid=None,
            raw_status=kind,
            message=detail,
        )

    if not path.exists():
        return _broken("missing", f"Heartbeat file missing: {path}")

    try:
        data = json.loads(path.read_text())
    except (json.JSONDecodeError, OSError) as exc:
        return _broken("corrupt", f"Corrupt heartbeat: {exc}")

    # BUG FIX: previously a JSON payload that was not an object (e.g. a
    # bare list) raised AttributeError on .get, and non-numeric
    # timestamp/interval values raised ValueError/TypeError — crashing
    # the entire scan. Treat all of those as corrupt heartbeats.
    if not isinstance(data, dict):
        return _broken("corrupt", "Corrupt heartbeat: payload is not a JSON object")
    try:
        timestamp = float(data.get("timestamp", 0))
        interval = int(data.get("interval_seconds", 0))
    except (TypeError, ValueError) as exc:
        return _broken("corrupt", f"Corrupt heartbeat: {exc}")

    pid = data.get("pid")
    raw_status = data.get("status", "ok")

    age = now - timestamp
    # No declared interval means freshness cannot be judged — treat as
    # infinitely stale so the job is flagged rather than silently passed.
    ratio = age / interval if interval > 0 else float("inf")
    # MIN_STALE_AGE suppresses false alarms right after deployment.
    stale = ratio > STALE_RATIO and age > MIN_STALE_AGE

    if stale:
        message = (
            f"Silent for {_fmt_duration(age)} "
            f"({ratio:.1f}x interval of {_fmt_duration(interval)})"
        )
    else:
        message = f"Last beat {_fmt_duration(age)} ago (ratio {ratio:.1f}x)"

    return JobStatus(
        job=job, path=path,
        healthy=not stale,
        age_seconds=age,
        interval_seconds=interval,
        staleness_ratio=ratio,
        last_timestamp=timestamp,
        pid=pid,
        raw_status=raw_status if not stale else "stale",
        message=message,
    )
|
||
|
||
|
||
# ── Gitea alerting ────────────────────────────────────────────────────
|
||
|
||
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
    """Make a Gitea API request; return parsed JSON or None on error.

    An empty response body parses to ``{}``. All failures (HTTP errors,
    timeouts, connection problems) are logged at WARNING and swallowed,
    so a Gitea outage can never crash the checker itself.
    """
    import urllib.request
    import urllib.error

    url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
    # BUG FIX: compare against None rather than truthiness — an
    # explicitly supplied empty payload ({}) must still be serialized
    # and sent as the request body.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    if GITEA_TOKEN:
        req.add_header("Authorization", f"token {GITEA_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read().decode()
            return json.loads(raw) if raw.strip() else {}
    except urllib.error.HTTPError as exc:
        logger.warning("Gitea %d: %s", exc.code, exc.read().decode()[:200])
        return None
    except Exception as exc:
        logger.warning("Gitea request failed: %s", exc)
        return None
|
||
|
||
|
||
def _find_open_alert_issue() -> Optional[dict]:
    """Return the open issue whose title carries the alert prefix, if any.

    Returns None when no such issue exists or the API call failed
    (``_gitea_request`` yields a non-list on error).
    """
    response = _gitea_request(
        "GET",
        f"/repos/{GITEA_REPO}/issues?state=open&type=issues&limit=20",
    )
    if not isinstance(response, list):
        return None
    return next(
        (iss for iss in response if iss.get("title", "").startswith(ALERT_TITLE_PREFIX)),
        None,
    )
|
||
|
||
|
||
def alert_on_stale(report: HeartbeatReport, dry_run: bool = False) -> None:
    """Create, update, or close a Gitea alert issue based on report health.

    Healthy report + open alert issue → comment and close the issue.
    Unhealthy report → update the existing issue or open a new one.
    With *dry_run* set, only log what would have happened.
    """
    if dry_run:
        action = "close" if report.overall_healthy else "create/update"
        logger.info("DRY RUN — would %s Gitea issue", action)
        return

    if not GITEA_TOKEN:
        logger.warning("GITEA_TOKEN not set — skipping Gitea alert")
        return

    existing = _find_open_alert_issue()

    # Recovery path: everything is fresh again, close any open alert.
    if report.overall_healthy:
        if not existing:
            return
        number = existing["number"]
        logger.info("All heartbeats healthy — closing issue #%d", number)
        _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues/{number}/comments",
            data={"body": "All cron heartbeats are now fresh. Closing."},
        )
        _gitea_request(
            "PATCH",
            f"/repos/{GITEA_REPO}/issues/{number}",
            data={"state": "closed"},
        )
        return

    # Alert path: at least one job has gone silent.
    body = report.to_alert_body()

    if existing:
        number = existing["number"]
        logger.info("Still stale — updating issue #%d", number)
        _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues/{number}/comments",
            data={"body": body},
        )
        return

    stale_names = ", ".join(j.job for j in report.stale_jobs)
    title = f"{ALERT_TITLE_PREFIX} Stale cron heartbeats: {stale_names}"
    created = _gitea_request(
        "POST",
        f"/repos/{GITEA_REPO}/issues",
        data={"title": title, "body": body, "assignees": ["Timmy"]},
    )
    if created and created.get("number"):
        logger.info("Created alert issue #%d", created["number"])
|
||
|
||
|
||
# ── Entry point ───────────────────────────────────────────────────────
|
||
|
||
def build_report(directory: Optional[Path] = None) -> HeartbeatReport:
    """Scan heartbeats and return a report. Exposed for Night Watch import.

    When *directory* is None the heartbeat directory is auto-detected
    (env override → system dir → per-user fallback).
    """
    if directory is None:
        directory = _resolve_heartbeat_dir()
    return HeartbeatReport(
        timestamp=time.time(),
        heartbeat_dir=directory,
        jobs=scan_heartbeats(directory),
    )
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: parse arguments, build the report, render or alert.

    Exit status is 0 when every heartbeat is healthy, 1 otherwise —
    except in --panel mode, which is a pure rendering aid and returns 0.
    """
    parser = argparse.ArgumentParser(
        description="Meta-heartbeat checker — detects silent cron failures",
    )
    parser.add_argument(
        "--dir", default=None,
        help="Heartbeat directory (default: auto-detect)",
    )
    parser.add_argument(
        "--panel", action="store_true",
        help="Output Night Watch heartbeat panel markdown and exit",
    )
    parser.add_argument(
        "--json", action="store_true", dest="output_json",
        help="Output results as JSON and exit",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Log results without writing Gitea issues",
    )
    args = parser.parse_args()

    report = build_report(Path(args.dir) if args.dir else None)
    exit_code = 0 if report.overall_healthy else 1

    if args.panel:
        print(report.to_panel_markdown())
        return

    if args.output_json:
        print(json.dumps(report.to_json(), indent=2))
        sys.exit(exit_code)

    # Default mode: log every job's state, then reconcile the Gitea alert.
    if not report.jobs:
        logger.info("No heartbeat files found in %s", report.heartbeat_dir)
    else:
        for status in report.jobs:
            ok = status.healthy
            logger.log(
                logging.INFO if ok else logging.ERROR,
                "[%s] %s: %s",
                "OK " if ok else "STALE",
                status.job,
                status.message,
            )

    alert_on_stale(report, dry_run=args.dry_run)
    sys.exit(exit_code)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|