450 lines
16 KiB
Python
450 lines
16 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""Meta-heartbeat checker — makes silent cron failures impossible.
|
|||
|
|
|
|||
|
|
Reads every ``*.last`` file in the heartbeat directory and verifies that no
job has been silent for longer than **2× its declared interval**.  If any job
is stale, a Gitea alert issue is created (or an existing one is updated).
When all jobs recover, the issue is closed automatically.

This script itself should be run as a cron job every 15 minutes so the
meta-level is also covered:

    */15 * * * * cd /path/to/the-nexus && \\
        python bin/check_cron_heartbeats.py >> /var/log/bezalel/heartbeat-check.log 2>&1

USAGE
-----
    # Check all jobs; create/update Gitea alert if any stale:
    python bin/check_cron_heartbeats.py

    # Dry-run (no Gitea writes):
    python bin/check_cron_heartbeats.py --dry-run

    # Output Night Watch heartbeat panel markdown:
    python bin/check_cron_heartbeats.py --panel

    # Output JSON (for integration with other tools):
    python bin/check_cron_heartbeats.py --json

    # Use a custom heartbeat directory:
    python bin/check_cron_heartbeats.py --dir /tmp/test-heartbeats

HEARTBEAT DIRECTORY
-------------------
    Primary:   /var/run/bezalel/heartbeats/   (set by ops, writable by cron user)
    Fallback:  ~/.bezalel/heartbeats/          (dev machines)
    Override:  BEZALEL_HEARTBEAT_DIR env var

ZERO DEPENDENCIES
-----------------
Pure stdlib.  No pip installs required.

Refs: #1096
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import time
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any, Dict, List, Optional
|
|||
|
|
|
|||
|
|
# Logging is configured at import time; every record is prefixed with a
# timestamp and a fixed-width level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.heartbeat_checker")

# ── Configuration ─────────────────────────────────────────────────────

# Candidate heartbeat directories; see _resolve_heartbeat_dir() for the
# order in which they are tried.
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")
FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats"

# Gitea connection settings, each overridable through the environment.
# An empty GITEA_TOKEN makes alert_on_stale() skip all Gitea writes.
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
# Title prefix used both when creating an alert issue and when searching
# for an already-open one.
ALERT_TITLE_PREFIX = "[heartbeat-checker]"

# A job is stale when its age exceeds this multiple of its declared interval
STALE_RATIO = 2.0
# Never flag a job as stale if it completed less than this many seconds ago
# (prevents noise immediately after deployment)
MIN_STALE_AGE = 60
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _resolve_heartbeat_dir() -> Path:
    """Pick the heartbeat directory to scan.

    Resolution order:

    1. ``BEZALEL_HEARTBEAT_DIR`` from the environment, when set and non-empty.
    2. ``PRIMARY_HEARTBEAT_DIR`` when it already exists.
    3. ``PRIMARY_HEARTBEAT_DIR`` after creating it and confirming that a
       probe file can be written and removed.
    4. ``FALLBACK_HEARTBEAT_DIR`` when step 3 fails with an ``OSError``.
    """
    override = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if override:
        return Path(override)

    primary = PRIMARY_HEARTBEAT_DIR
    if not primary.exists():
        # Try to create it; fall back to home dir if not permitted.
        # PermissionError is a subclass of OSError, so one clause covers both.
        try:
            primary.mkdir(parents=True, exist_ok=True)
            write_probe = primary / ".write_probe"
            write_probe.touch()
            write_probe.unlink()
        except OSError:
            return FALLBACK_HEARTBEAT_DIR
    return primary
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Data model ────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
@dataclass
class JobStatus:
    """Health status for a single cron job's heartbeat.

    One instance is produced per ``*.last`` file by ``_read_job_status``.
    """

    job: str                   # job name (the heartbeat file's stem)
    path: Path                 # heartbeat file this status was read from
    healthy: bool              # False when stale, missing, or corrupt
    age_seconds: float         # -1 if unknown (missing/corrupt)
    interval_seconds: int      # 0 if unknown
    staleness_ratio: float     # age / interval; -1 if unknown; >STALE_RATIO = stale
    last_timestamp: Optional[float]  # "timestamp" value from the file; None if unreadable
    pid: Optional[int]         # "pid" value from the file, if any
    raw_status: str            # value from the .last file: "ok" / "warn" / "error"
    message: str               # human-readable summary used in logs and reports
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class HeartbeatReport:
    """Aggregate report for all cron job heartbeats in a directory.

    Attributes:
        timestamp: Unix time (seconds) at which the report was built.
        heartbeat_dir: Directory the heartbeat files were read from.
        jobs: One ``JobStatus`` per heartbeat file.
    """

    timestamp: float
    heartbeat_dir: Path
    jobs: List[JobStatus] = field(default_factory=list)

    @property
    def stale_jobs(self) -> List[JobStatus]:
        """Jobs whose ``healthy`` flag is false."""
        return [j for j in self.jobs if not j.healthy]

    @property
    def overall_healthy(self) -> bool:
        """True when no job is unhealthy (including when there are no jobs)."""
        return not self.stale_jobs

    # ── Helpers ───────────────────────────────────────────────────────

    @staticmethod
    def _is_finite(value: float) -> bool:
        """Return True for ordinary numbers; False for ``inf``/``-inf``/``nan``."""
        # Every comparison against NaN is False, so NaN is rejected as well.
        return float("-inf") < value < float("inf")

    @classmethod
    def _finite_or_none(cls, value: float) -> Optional[float]:
        """Return *value* unchanged when finite, otherwise ``None``."""
        return value if cls._is_finite(value) else None

    @classmethod
    def _metric_cells(cls, job: JobStatus) -> tuple:
        """Return ``(age, interval, ratio)`` display strings for one job.

        Unknown values (negative sentinels, a non-positive interval) and
        non-finite values render as ``"N/A"``.
        """
        age, interval, ratio = job.age_seconds, job.interval_seconds, job.staleness_ratio
        age_str = _fmt_duration(age) if cls._is_finite(age) and age >= 0 else "N/A"
        interval_str = _fmt_duration(interval) if interval > 0 else "N/A"
        ratio_str = f"{ratio:.1f}x" if cls._is_finite(ratio) and ratio >= 0 else "N/A"
        return age_str, interval_str, ratio_str

    # ── Rendering ─────────────────────────────────────────────────────

    def to_panel_markdown(self) -> str:
        """Night Watch heartbeat panel — a table of all jobs with their status."""
        ts = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime(self.timestamp))
        overall = "OK" if self.overall_healthy else "ALERT"

        lines = [
            f"## Heartbeat Panel — {ts}",
            "",
            f"**Overall:** {overall}",
            "",
            "| Job | Status | Age | Interval | Ratio |",
            "|-----|--------|-----|----------|-------|",
        ]

        if not self.jobs:
            lines.append("| *(no heartbeat files found)* | — | — | — | — |")
        else:
            for j in self.jobs:
                icon = "OK" if j.healthy else "STALE"
                age_str, interval_str, ratio_str = self._metric_cells(j)
                lines.append(
                    f"| `{j.job}` | {icon} | {age_str} | {interval_str} | {ratio_str} |"
                )

        stale = self.stale_jobs
        if stale:
            lines += ["", "**Stale jobs:**"]
            lines.extend(f"- `{j.job}`: {j.message}" for j in stale)

        lines += [
            "",
            f"*Heartbeat dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)

    def to_alert_body(self) -> str:
        """Gitea issue body when stale jobs are detected."""
        ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
        stale = self.stale_jobs

        lines = [
            f"## Cron Heartbeat Alert — {ts}",
            "",
            f"**{len(stale)} job(s) have gone silent** (stale > {STALE_RATIO}x interval).",
            "",
            "| Job | Age | Interval | Ratio | Detail |",
            "|-----|-----|----------|-------|--------|",
        ]

        for j in stale:
            age_str, interval_str, ratio_str = self._metric_cells(j)
            lines.append(
                f"| `{j.job}` | {age_str} | {interval_str} | {ratio_str} | {j.message} |"
            )

        lines += [
            "",
            "### What to do",
            "1. `crontab -l` — confirm the job is still scheduled",
            "2. Check the job's log for errors",
            "3. Restart the job if needed",
            "4. Close this issue once fresh heartbeats appear",
            "",
            f"*Generated by `check_cron_heartbeats.py` — dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)

    def to_json(self) -> Dict[str, Any]:
        """Return a JSON-serialisable summary of the report.

        Non-finite ``age_seconds`` / ``staleness_ratio`` values are emitted
        as ``None`` because ``json.dumps`` would otherwise write
        ``Infinity`` / ``NaN``, which strict JSON parsers reject.
        """
        return {
            "healthy": self.overall_healthy,
            "timestamp": self.timestamp,
            "heartbeat_dir": str(self.heartbeat_dir),
            "jobs": [
                {
                    "job": j.job,
                    "healthy": j.healthy,
                    "age_seconds": self._finite_or_none(j.age_seconds),
                    "interval_seconds": j.interval_seconds,
                    "staleness_ratio": self._finite_or_none(j.staleness_ratio),
                    "raw_status": j.raw_status,
                    "message": j.message,
                }
                for j in self.jobs
            ],
        }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _fmt_duration(seconds: float) -> str:
    """Render *seconds* compactly as ``"Ns"``, ``"Mm Ss"`` or ``"Hh Mm"``.

    The value is truncated to whole seconds first. Anything below one minute
    (including negative input) is shown in seconds only; anything from one
    hour up drops the seconds component.
    """
    total = int(seconds)
    if total < 60:
        return f"{total}s"

    minutes, secs = divmod(total, 60)
    if total < 3600:
        return f"{minutes}m {secs}s"

    hours, rem_minutes = divmod(minutes, 60)
    return f"{hours}h {rem_minutes}m"
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Job scanning ──────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def scan_heartbeats(directory: Path) -> List[JobStatus]:
    """Collect a ``JobStatus`` for each ``*.last`` file in *directory*.

    Files are processed in sorted path order and the job name is the file's
    stem. A directory that does not exist yields an empty list.
    """
    statuses: List[JobStatus] = []
    if directory.exists():
        for heartbeat_file in sorted(directory.glob("*.last")):
            statuses.append(_read_job_status(heartbeat_file.stem, heartbeat_file))
    return statuses
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _read_job_status(job: str, path: Path) -> JobStatus:
    """Parse one ``.last`` file and produce a ``JobStatus``.

    The file is expected to hold a JSON object with ``timestamp`` (Unix
    seconds), ``interval_seconds`` and optionally ``pid`` and ``status``.

    This function never raises for bad file content: a single unreadable or
    malformed heartbeat must not abort the scan of the remaining jobs.
    Instead it returns an unhealthy status with ``raw_status`` set to
    ``"missing"`` or ``"corrupt"``.

    Args:
        job: Job name to report (normally the file's stem).
        path: Heartbeat file to read.

    Returns:
        A ``JobStatus``. It is unhealthy when the file is missing or corrupt,
        or when the heartbeat is older than ``MIN_STALE_AGE`` seconds and
        either exceeds ``STALE_RATIO`` times its interval or declares no
        positive interval.
    """
    now = time.time()

    def _unreadable(raw_status: str, message: str) -> JobStatus:
        # Shared shape for every "no usable data" outcome.
        return JobStatus(
            job=job, path=path,
            healthy=False,
            age_seconds=-1,
            interval_seconds=0,
            staleness_ratio=-1,
            last_timestamp=None,
            pid=None,
            raw_status=raw_status,
            message=message,
        )

    if not path.exists():
        return _unreadable("missing", f"Heartbeat file missing: {path}")

    try:
        data = json.loads(path.read_text())
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return _unreadable("corrupt", f"Corrupt heartbeat: {exc}")

    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list or a bare number).
        return _unreadable(
            "corrupt",
            f"Corrupt heartbeat: expected a JSON object, got {type(data).__name__}",
        )

    try:
        timestamp = float(data.get("timestamp", 0))
        interval = int(data.get("interval_seconds", 0))
    except (TypeError, ValueError, OverflowError) as exc:
        # null, non-numeric strings, nested containers, or an infinite interval.
        return _unreadable("corrupt", f"Corrupt heartbeat: {exc}")

    if not float("-inf") < timestamp < float("inf"):
        # json.loads accepts NaN/Infinity; neither is a usable timestamp.
        return _unreadable("corrupt", f"Corrupt heartbeat: non-finite timestamp {timestamp!r}")

    pid = data.get("pid")
    raw_status = data.get("status", "ok")

    # NOTE(review): a timestamp in the future gives a negative age, which is
    # never considered stale — confirm whether clock skew should be flagged.
    age = now - timestamp
    old_enough = age > MIN_STALE_AGE

    if interval > 0:
        ratio = age / interval
        stale = ratio > STALE_RATIO and old_enough
        if stale:
            message = (
                f"Silent for {_fmt_duration(age)} "
                f"({ratio:.1f}x interval of {_fmt_duration(interval)})"
            )
        else:
            message = f"Last beat {_fmt_duration(age)} ago (ratio {ratio:.1f}x)"
    else:
        # No usable interval: the ratio is unknown (-1), not infinite. The
        # job is still flagged once it is older than MIN_STALE_AGE, exactly
        # as an infinite ratio would have flagged it.
        ratio = -1
        stale = old_enough
        if stale:
            message = f"Silent for {_fmt_duration(age)} (no valid interval declared)"
        else:
            message = f"Last beat {_fmt_duration(age)} ago (interval unknown)"

    return JobStatus(
        job=job, path=path,
        healthy=not stale,
        age_seconds=age,
        interval_seconds=interval,
        staleness_ratio=ratio,
        last_timestamp=timestamp,
        pid=pid,
        raw_status=raw_status if not stale else "stale",
        message=message,
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Gitea alerting ────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
    """Make a Gitea API request; return parsed JSON or None on error.

    Args:
        method: HTTP method, e.g. ``"GET"``, ``"POST"``, ``"PATCH"``.
        path: API path appended to ``{GITEA_URL}/api/v1`` (leading slash
            included by the caller).
        data: Optional JSON-serialisable payload. ``None`` means no body.

    Returns:
        The decoded JSON response, ``{}`` when the response body is blank,
        or ``None`` when the request failed for any reason. Failures are
        logged as warnings and never raised.
    """
    import urllib.request
    import urllib.error

    url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
    # Compare with None so an explicit empty payload ({}) is still sent.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    if GITEA_TOKEN:
        req.add_header("Authorization", f"token {GITEA_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read().decode()
        return json.loads(raw) if raw.strip() else {}
    except urllib.error.HTTPError as exc:
        # Reading or decoding the error body must not raise out of this
        # handler, otherwise the "return None on error" contract is broken.
        try:
            detail = exc.read().decode(errors="replace")[:200]
        except Exception:
            detail = "<unreadable error body>"
        logger.warning("Gitea %d: %s", exc.code, detail)
        return None
    except Exception as exc:
        logger.warning("Gitea request failed: %s", exc)
        return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _find_open_alert_issue() -> Optional[dict]:
    """Return the first open issue whose title starts with ``ALERT_TITLE_PREFIX``.

    Open issues are fetched page by page until a match is found, an empty
    page is returned, or a request fails. Looking only at the first page
    would miss the alert issue on a repository with many open issues and
    cause a duplicate alert to be created on every run.

    Returns:
        The matching issue as returned by the Gitea API, or ``None`` when no
        open alert issue was found (or the listing request failed).
    """
    page_size = 50
    max_pages = 40  # hard stop so a misbehaving server cannot loop us forever

    for page in range(1, max_pages + 1):
        issues = _gitea_request(
            "GET",
            f"/repos/{GITEA_REPO}/issues"
            f"?state=open&type=issues&limit={page_size}&page={page}",
        )
        # None/dict means the request failed; an empty list means we have
        # walked past the last page.
        if not isinstance(issues, list) or not issues:
            return None
        for issue in issues:
            if not isinstance(issue, dict):
                continue
            # "title" may be present but null; treat that as an empty title.
            title = issue.get("title") or ""
            if title.startswith(ALERT_TITLE_PREFIX):
                return issue
    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def alert_on_stale(report: HeartbeatReport, dry_run: bool = False) -> None:
    """Sync the Gitea alert issue with the health of *report*.

    * Healthy report and an open alert issue: post a recovery comment, then
      close the issue.
    * Unhealthy report and an open alert issue: append the current alert body
      as a comment.
    * Unhealthy report and no open alert issue: create one.

    With *dry_run* set, only the intended action is logged. Without a
    ``GITEA_TOKEN`` a warning is logged and nothing is sent.
    """
    if dry_run:
        logger.info(
            "DRY RUN — would %s Gitea issue",
            "close" if report.overall_healthy else "create/update",
        )
        return

    if not GITEA_TOKEN:
        logger.warning("GITEA_TOKEN not set — skipping Gitea alert")
        return

    existing = _find_open_alert_issue()
    issues_path = f"/repos/{GITEA_REPO}/issues"

    if report.overall_healthy:
        if not existing:
            return
        logger.info("All heartbeats healthy — closing issue #%d", existing["number"])
        issue_path = f"{issues_path}/{existing['number']}"
        _gitea_request(
            "POST",
            f"{issue_path}/comments",
            data={"body": "All cron heartbeats are now fresh. Closing."},
        )
        _gitea_request("PATCH", issue_path, data={"state": "closed"})
        return

    stale_names = ", ".join(status.job for status in report.stale_jobs)
    title = f"{ALERT_TITLE_PREFIX} Stale cron heartbeats: {stale_names}"
    body = report.to_alert_body()

    if existing:
        logger.info("Still stale — updating issue #%d", existing["number"])
        _gitea_request(
            "POST",
            f"{issues_path}/{existing['number']}/comments",
            data={"body": body},
        )
        return

    created = _gitea_request(
        "POST",
        issues_path,
        data={"title": title, "body": body, "assignees": ["Timmy"]},
    )
    if created and created.get("number"):
        logger.info("Created alert issue #%d", created["number"])
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Entry point ───────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def build_report(directory: Optional[Path] = None) -> HeartbeatReport:
    """Scan heartbeats and return a report.  Exposed for Night Watch import.

    When *directory* is ``None`` the directory is chosen by
    ``_resolve_heartbeat_dir()``; otherwise *directory* is scanned as given.
    """
    if directory is None:
        directory = _resolve_heartbeat_dir()
    statuses = scan_heartbeats(directory)
    return HeartbeatReport(
        timestamp=time.time(),
        heartbeat_dir=directory,
        jobs=statuses,
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> None:
    """Command-line entry point.

    Modes:

    * ``--panel``: print the markdown panel and return.
    * ``--json``: print the JSON report, then exit 0 if healthy, else 1.
    * default: log one line per job, sync the Gitea alert issue (honouring
      ``--dry-run``), then exit 0 if healthy, else 1.
    """
    parser = argparse.ArgumentParser(
        description="Meta-heartbeat checker — detects silent cron failures",
    )
    parser.add_argument(
        "--dir", default=None,
        help="Heartbeat directory (default: auto-detect)",
    )
    # The remaining options are plain boolean switches.
    switches = (
        ("--panel", {"help": "Output Night Watch heartbeat panel markdown and exit"}),
        ("--json", {"dest": "output_json", "help": "Output results as JSON and exit"}),
        ("--dry-run", {"help": "Log results without writing Gitea issues"}),
    )
    for flag, options in switches:
        parser.add_argument(flag, action="store_true", **options)
    args = parser.parse_args()

    chosen_dir = Path(args.dir) if args.dir else None
    report = build_report(chosen_dir)

    if args.panel:
        print(report.to_panel_markdown())
        return

    if args.output_json:
        print(json.dumps(report.to_json(), indent=2))
    else:
        # Default: log + alert
        if report.jobs:
            for status in report.jobs:
                if status.healthy:
                    level, icon = logging.INFO, "OK   "
                else:
                    level, icon = logging.ERROR, "STALE"
                logger.log(level, "[%s] %s: %s", icon, status.job, status.message)
        else:
            logger.info("No heartbeat files found in %s", report.heartbeat_dir)

        alert_on_stale(report, dry_run=args.dry_run)

    sys.exit(0 if report.overall_healthy else 1)


if __name__ == "__main__":
    main()
|