Compare commits

...

3 Commits

Author SHA1 Message Date
ea3cc6b393 [claude] Poka-yoke cron heartbeats — make silent failures impossible (#1096) (#1102)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
2026-04-07 14:38:55 +00:00
caa7823cdd [claude] Poka-yoke: make test skips/flakes impossible to ignore (#1094) (#1104)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
2026-04-07 14:38:49 +00:00
d0d655b42a [claude] Poka-yoke runner health: provision + health probe scripts (#1097) (#1101)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
2026-04-07 14:33:35 +00:00
15 changed files with 2199 additions and 0 deletions

View File

@@ -0,0 +1,449 @@
#!/usr/bin/env python3
"""Meta-heartbeat checker — makes silent cron failures impossible.
Reads every ``*.last`` file in the heartbeat directory and verifies that no
job has been silent for longer than **2× its declared interval**. If any job
is stale, a Gitea alert issue is created (or an existing one is updated).
When all jobs recover, the issue is closed automatically.
This script itself should be run as a cron job every 15 minutes so the
meta-level is also covered:
*/15 * * * * cd /path/to/the-nexus && \\
python bin/check_cron_heartbeats.py >> /var/log/bezalel/heartbeat-check.log 2>&1
USAGE
-----
# Check all jobs; create/update Gitea alert if any stale:
python bin/check_cron_heartbeats.py
# Dry-run (no Gitea writes):
python bin/check_cron_heartbeats.py --dry-run
# Output Night Watch heartbeat panel markdown:
python bin/check_cron_heartbeats.py --panel
# Output JSON (for integration with other tools):
python bin/check_cron_heartbeats.py --json
# Use a custom heartbeat directory:
python bin/check_cron_heartbeats.py --dir /tmp/test-heartbeats
HEARTBEAT DIRECTORY
-------------------
Primary: /var/run/bezalel/heartbeats/ (set by ops, writable by cron user)
Fallback: ~/.bezalel/heartbeats/ (dev machines)
Override: BEZALEL_HEARTBEAT_DIR env var
ZERO DEPENDENCIES
-----------------
Pure stdlib. No pip installs required.
Refs: #1096
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
# Root logging config: timestamped, aligned level names.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.heartbeat_checker")
# ── Configuration ─────────────────────────────────────────────────────
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")  # shared ops location
FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats"  # per-user dev fallback
# Gitea connection settings come from the environment so no secrets live in code.
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
# Open alert issues are identified by this title prefix (see _find_open_alert_issue).
ALERT_TITLE_PREFIX = "[heartbeat-checker]"
# A job is stale when its age exceeds this multiple of its declared interval
STALE_RATIO = 2.0
# Never flag a job as stale if it completed less than this many seconds ago
# (prevents noise immediately after deployment)
MIN_STALE_AGE = 60
def _resolve_heartbeat_dir() -> Path:
    """Return the active heartbeat directory.

    Resolution order: BEZALEL_HEARTBEAT_DIR env var wins outright; an
    existing primary dir is used as-is; otherwise we try to create and
    write-probe the primary, falling back to the home-dir location when
    that is not permitted.
    """
    override = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if override:
        return Path(override)
    if PRIMARY_HEARTBEAT_DIR.exists():
        return PRIMARY_HEARTBEAT_DIR
    # Attempt to create the primary dir and prove it is writable; any
    # permission/OS failure sends us to the per-user fallback.
    try:
        PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
        marker = PRIMARY_HEARTBEAT_DIR / ".write_probe"
        marker.touch()
        marker.unlink()
    except (PermissionError, OSError):
        return FALLBACK_HEARTBEAT_DIR
    return PRIMARY_HEARTBEAT_DIR
# ── Data model ────────────────────────────────────────────────────────
@dataclass
class JobStatus:
    """Health status for a single cron job's heartbeat.

    One instance per ``<job>.last`` file (including missing/corrupt ones).
    ``healthy`` is False when the file is missing, unparsable, or the beat
    is older than STALE_RATIO times its declared interval.
    """
    job: str  # job name (stem of the .last file)
    path: Path  # full path to the heartbeat file
    healthy: bool  # False when missing, corrupt, or stale
    age_seconds: float  # -1 if unknown (missing/corrupt)
    interval_seconds: int  # 0 if unknown
    staleness_ratio: float  # age / interval; -1 if unknown; inf when interval is 0; >STALE_RATIO = stale
    last_timestamp: Optional[float]  # epoch seconds of the last beat; None if unknown
    pid: Optional[int]  # writer's PID as recorded in the file, if any
    raw_status: str  # value from the .last file: "ok" / "warn" / "error"
    message: str  # human-readable summary used in panels and alerts
@dataclass
class HeartbeatReport:
    """Aggregate report for all cron job heartbeats in a directory."""
    timestamp: float  # epoch seconds when the scan ran
    heartbeat_dir: Path  # directory that was scanned
    jobs: List[JobStatus] = field(default_factory=list)  # one entry per *.last file
    @property
    def stale_jobs(self) -> List[JobStatus]:
        # "Stale" here means any unhealthy job: missing, corrupt, or old.
        return [j for j in self.jobs if not j.healthy]
    @property
    def overall_healthy(self) -> bool:
        return len(self.stale_jobs) == 0
    # ── Rendering ─────────────────────────────────────────────────────
    def to_panel_markdown(self) -> str:
        """Night Watch heartbeat panel — a table of all jobs with their status."""
        ts = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime(self.timestamp))
        overall = "OK" if self.overall_healthy else "ALERT"
        lines = [
            f"## Heartbeat Panel — {ts}",
            "",
            f"**Overall:** {overall}",
            "",
            "| Job | Status | Age | Interval | Ratio |",
            "|-----|--------|-----|----------|-------|",
        ]
        if not self.jobs:
            lines.append("| *(no heartbeat files found)* | — | — | — | — |")
        else:
            for j in self.jobs:
                icon = "OK" if j.healthy else "STALE"
                # -1 / 0 are the "unknown" sentinels set for missing/corrupt files.
                age_str = _fmt_duration(j.age_seconds) if j.age_seconds >= 0 else "N/A"
                interval_str = _fmt_duration(j.interval_seconds) if j.interval_seconds > 0 else "N/A"
                ratio_str = f"{j.staleness_ratio:.1f}x" if j.staleness_ratio >= 0 else "N/A"
                lines.append(
                    f"| `{j.job}` | {icon} | {age_str} | {interval_str} | {ratio_str} |"
                )
        if self.stale_jobs:
            lines += ["", "**Stale jobs:**"]
            for j in self.stale_jobs:
                lines.append(f"- `{j.job}`: {j.message}")
        lines += [
            "",
            f"*Heartbeat dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)
    def to_alert_body(self) -> str:
        """Gitea issue body when stale jobs are detected."""
        ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
        stale = self.stale_jobs
        lines = [
            f"## Cron Heartbeat Alert — {ts}",
            "",
            f"**{len(stale)} job(s) have gone silent** (stale > {STALE_RATIO}x interval).",
            "",
            "| Job | Age | Interval | Ratio | Detail |",
            "|-----|-----|----------|-------|--------|",
        ]
        for j in stale:
            age_str = _fmt_duration(j.age_seconds) if j.age_seconds >= 0 else "N/A"
            interval_str = _fmt_duration(j.interval_seconds) if j.interval_seconds > 0 else "N/A"
            ratio_str = f"{j.staleness_ratio:.1f}x" if j.staleness_ratio >= 0 else "N/A"
            lines.append(
                f"| `{j.job}` | {age_str} | {interval_str} | {ratio_str} | {j.message} |"
            )
        lines += [
            "",
            "### What to do",
            "1. `crontab -l` — confirm the job is still scheduled",
            "2. Check the job's log for errors",
            "3. Restart the job if needed",
            "4. Close this issue once fresh heartbeats appear",
            "",
            f"*Generated by `check_cron_heartbeats.py` — dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)
    def to_json(self) -> Dict[str, Any]:
        """JSON-serializable summary (per-job path, pid, last_timestamp omitted)."""
        return {
            "healthy": self.overall_healthy,
            "timestamp": self.timestamp,
            "heartbeat_dir": str(self.heartbeat_dir),
            "jobs": [
                {
                    "job": j.job,
                    "healthy": j.healthy,
                    "age_seconds": j.age_seconds,
                    "interval_seconds": j.interval_seconds,
                    "staleness_ratio": j.staleness_ratio,
                    "raw_status": j.raw_status,
                    "message": j.message,
                }
                for j in self.jobs
            ],
        }
def _fmt_duration(seconds: float) -> str:
    """Render *seconds* as a compact human-readable duration (s, m s, or h m)."""
    total = int(seconds)
    if total < 60:
        return f"{total}s"
    if total < 3600:
        minutes, secs = divmod(total, 60)
        return f"{minutes}m {secs}s"
    hours, remainder = divmod(total, 3600)
    return f"{hours}h {remainder // 60}m"
# ── Job scanning ──────────────────────────────────────────────────────
def scan_heartbeats(directory: Path) -> List[JobStatus]:
    """Collect a JobStatus for every ``*.last`` file in *directory*.

    A missing directory yields an empty list rather than an error so the
    checker degrades gracefully on fresh machines.
    """
    if not directory.exists():
        return []
    statuses: List[JobStatus] = []
    for hb_file in sorted(directory.glob("*.last")):
        statuses.append(_read_job_status(hb_file.stem, hb_file))
    return statuses
def _read_job_status(job: str, path: Path) -> JobStatus:
    """Parse one ``.last`` file and produce a ``JobStatus``.

    Never raises: a missing file, unreadable JSON, a non-object payload,
    or malformed field values all come back as an unhealthy status with
    an explanatory message, so one bad heartbeat cannot crash the scan.
    (Previously a heartbeat containing e.g. ``"timestamp": "abc"`` raised
    an uncaught ValueError.)
    """
    now = time.time()

    def _unhealthy(raw_status: str, message: str) -> JobStatus:
        # Shared shape for every "cannot evaluate" outcome: sentinel values
        # -1 / 0 / None mark the fields as unknown.
        return JobStatus(
            job=job, path=path,
            healthy=False,
            age_seconds=-1,
            interval_seconds=0,
            staleness_ratio=-1,
            last_timestamp=None,
            pid=None,
            raw_status=raw_status,
            message=message,
        )

    if not path.exists():
        return _unhealthy("missing", f"Heartbeat file missing: {path}")
    try:
        data = json.loads(path.read_text())
    except (json.JSONDecodeError, OSError) as exc:
        return _unhealthy("corrupt", f"Corrupt heartbeat: {exc}")
    if not isinstance(data, dict):
        return _unhealthy("corrupt", f"Corrupt heartbeat: expected JSON object, got {type(data).__name__}")
    try:
        # Values may be hand-edited or produced by a buggy writer; treat
        # non-numeric content the same as unreadable JSON.
        timestamp = float(data.get("timestamp", 0))
        interval = int(data.get("interval_seconds", 0))
    except (TypeError, ValueError) as exc:
        return _unhealthy("corrupt", f"Corrupt heartbeat: {exc}")
    pid = data.get("pid")
    raw_status = data.get("status", "ok")
    age = now - timestamp
    # Unknown interval -> infinite ratio, so an existing file with no
    # declared cadence is flagged once it is older than MIN_STALE_AGE.
    ratio = age / interval if interval > 0 else float("inf")
    stale = ratio > STALE_RATIO and age > MIN_STALE_AGE
    if stale:
        message = (
            f"Silent for {_fmt_duration(age)} "
            f"({ratio:.1f}x interval of {_fmt_duration(interval)})"
        )
    else:
        message = f"Last beat {_fmt_duration(age)} ago (ratio {ratio:.1f}x)"
    return JobStatus(
        job=job, path=path,
        healthy=not stale,
        age_seconds=age,
        interval_seconds=interval,
        staleness_ratio=ratio,
        last_timestamp=timestamp,
        pid=pid,
        raw_status=raw_status if not stale else "stale",
        message=message,
    )
# ── Gitea alerting ────────────────────────────────────────────────────
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
    """Make a Gitea API request; return parsed JSON or None on error."""
    # Imported lazily so the module stays import-cheap for panel/JSON modes.
    import urllib.error
    import urllib.request

    url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
    payload = json.dumps(data).encode() if data else None
    request = urllib.request.Request(url, data=payload, method=method)
    if GITEA_TOKEN:
        request.add_header("Authorization", f"token {GITEA_TOKEN}")
    request.add_header("Content-Type", "application/json")
    request.add_header("Accept", "application/json")
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            raw = resp.read().decode()
            # Empty bodies (e.g. some PATCH responses) come back as {}.
            return json.loads(raw) if raw.strip() else {}
    except urllib.error.HTTPError as exc:
        logger.warning("Gitea %d: %s", exc.code, exc.read().decode()[:200])
        return None
    except Exception as exc:
        logger.warning("Gitea request failed: %s", exc)
        return None
def _find_open_alert_issue() -> Optional[dict]:
    """Return the first open issue whose title carries the alert prefix, else None."""
    response = _gitea_request(
        "GET",
        f"/repos/{GITEA_REPO}/issues?state=open&type=issues&limit=20",
    )
    # A failed request returns None; anything that is not a list is unusable.
    if not isinstance(response, list):
        return None
    return next(
        (item for item in response if item.get("title", "").startswith(ALERT_TITLE_PREFIX)),
        None,
    )
def alert_on_stale(report: HeartbeatReport, dry_run: bool = False) -> None:
    """Create, update, or close a Gitea alert issue based on report health.

    Behavior:
      * dry_run: log the would-be action and return (no reads or writes).
      * healthy + open alert issue: comment on it, then close it.
      * stale + open alert issue: append the fresh alert body as a comment.
      * stale + no open issue: create a new issue assigned to "Timmy".

    Requires GITEA_TOKEN; without it the function logs a warning and returns.
    """
    if dry_run:
        action = "close" if report.overall_healthy else "create/update"
        logger.info("DRY RUN — would %s Gitea issue", action)
        return
    if not GITEA_TOKEN:
        logger.warning("GITEA_TOKEN not set — skipping Gitea alert")
        return
    existing = _find_open_alert_issue()
    if report.overall_healthy:
        if existing:
            logger.info("All heartbeats healthy — closing issue #%d", existing["number"])
            # Leave an audit-trail comment before flipping the state.
            _gitea_request(
                "POST",
                f"/repos/{GITEA_REPO}/issues/{existing['number']}/comments",
                data={"body": "All cron heartbeats are now fresh. Closing."},
            )
            _gitea_request(
                "PATCH",
                f"/repos/{GITEA_REPO}/issues/{existing['number']}",
                data={"state": "closed"},
            )
        return
    stale_names = ", ".join(j.job for j in report.stale_jobs)
    title = f"{ALERT_TITLE_PREFIX} Stale cron heartbeats: {stale_names}"
    body = report.to_alert_body()
    if existing:
        # One rolling issue per outage: comment instead of opening duplicates.
        # (The existing issue keeps its original title.)
        logger.info("Still stale — updating issue #%d", existing["number"])
        _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues/{existing['number']}/comments",
            data={"body": body},
        )
    else:
        result = _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues",
            data={"title": title, "body": body, "assignees": ["Timmy"]},
        )
        if result and result.get("number"):
            logger.info("Created alert issue #%d", result["number"])
# ── Entry point ───────────────────────────────────────────────────────
def build_report(directory: Optional[Path] = None) -> HeartbeatReport:
    """Scan heartbeats and return a report. Exposed for Night Watch import.

    Args:
        directory: Explicit heartbeat directory; when None the standard
            resolution order (env var, primary, fallback) is used.
    """
    if directory is None:
        directory = _resolve_heartbeat_dir()
    return HeartbeatReport(
        timestamp=time.time(),
        heartbeat_dir=directory,
        jobs=scan_heartbeats(directory),
    )
def main() -> None:
    """CLI entry point: scan heartbeats, render or alert, exit 0/1 by health."""
    parser = argparse.ArgumentParser(
        description="Meta-heartbeat checker — detects silent cron failures",
    )
    parser.add_argument(
        "--dir", default=None,
        help="Heartbeat directory (default: auto-detect)",
    )
    parser.add_argument(
        "--panel", action="store_true",
        help="Output Night Watch heartbeat panel markdown and exit",
    )
    parser.add_argument(
        "--json", action="store_true", dest="output_json",
        help="Output results as JSON and exit",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Log results without writing Gitea issues",
    )
    args = parser.parse_args()

    report = build_report(Path(args.dir) if args.dir else None)

    if args.panel:
        # Panel mode is for embedding in other reports; always exits 0.
        print(report.to_panel_markdown())
        return
    if args.output_json:
        print(json.dumps(report.to_json(), indent=2))
        sys.exit(0 if report.overall_healthy else 1)

    # Default mode: log every job, then create/update/close the Gitea alert.
    if report.jobs:
        for status in report.jobs:
            marker = "OK " if status.healthy else "STALE"
            logger.log(
                logging.INFO if status.healthy else logging.ERROR,
                "[%s] %s: %s", marker, status.job, status.message,
            )
    else:
        logger.info("No heartbeat files found in %s", report.heartbeat_dir)
    alert_on_stale(report, dry_run=args.dry_run)
    sys.exit(0 if report.overall_healthy else 1)


if __name__ == "__main__":
    main()

View File

@@ -80,6 +80,15 @@ from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
# Poka-yoke: write a cron heartbeat so check_cron_heartbeats.py can detect
# if *this* watchdog stops running. Import lazily to stay zero-dep if the
# nexus package is unavailable (e.g. very minimal test environments).
try:
from nexus.cron_heartbeat import write_cron_heartbeat as _write_cron_heartbeat
_HAS_CRON_HEARTBEAT = True
except ImportError:
_HAS_CRON_HEARTBEAT = False
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-7s %(message)s",
@@ -488,6 +497,15 @@ def run_once(args: argparse.Namespace) -> bool:
elif not args.dry_run:
alert_on_failure(report, dry_run=args.dry_run)
# Poka-yoke: stamp our own heartbeat so the meta-checker can detect
# if this watchdog cron job itself goes silent. Runs every 5 minutes
# by convention (*/5 * * * *).
if _HAS_CRON_HEARTBEAT:
try:
_write_cron_heartbeat("nexus_watchdog", interval_seconds=300)
except Exception:
pass # never crash the watchdog over its own heartbeat
return report.overall_healthy

247
bin/night_watch.py Normal file
View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""Night Watch — Bezalel nightly report generator.
Runs once per night (typically at 03:00 local time via cron) and writes a
markdown report to ``reports/bezalel/nightly/<YYYY-MM-DD>.md``.
The report always includes a **Heartbeat Panel** (acceptance criterion #3 of
issue #1096) so silent cron failures are visible in the morning brief.
USAGE
-----
python bin/night_watch.py # write today's report
python bin/night_watch.py --dry-run # print to stdout, don't write file
python bin/night_watch.py --date 2026-04-08 # specific date
CRONTAB
-------
0 3 * * * cd /path/to/the-nexus && python bin/night_watch.py \\
>> /var/log/bezalel/night-watch.log 2>&1
ZERO DEPENDENCIES
-----------------
Pure stdlib, plus ``check_cron_heartbeats`` from this repo (also stdlib).
Refs: #1096
"""
from __future__ import annotations
import argparse
import importlib.util
import json
import logging
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
# Root logging config: timestamped, aligned level names.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.night_watch")
PROJECT_ROOT = Path(__file__).parent.parent  # repo root (this file lives in bin/)
REPORTS_DIR = PROJECT_ROOT / "reports" / "bezalel" / "nightly"  # nightly report output dir
# ── Load check_cron_heartbeats without relying on sys.path hacks ──────
def _load_checker():
    """Import bin/check_cron_heartbeats.py as a module.

    Loaded from its file location so the script works regardless of CWD and
    without sys.path hacks.

    Raises:
        ImportError: if the checker script is missing or cannot be loaded.
    """
    checker_path = PROJECT_ROOT / "bin" / "check_cron_heartbeats.py"
    spec = importlib.util.spec_from_file_location(
        "_check_cron_heartbeats",
        checker_path,
    )
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None rather than raising when the
        # file cannot be located; fail loudly instead of with an opaque
        # AttributeError on module_from_spec(None).
        raise ImportError(f"Cannot load heartbeat checker from {checker_path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
# ── System checks ─────────────────────────────────────────────────────
def _check_service(service_name: str) -> tuple[str, str]:
    """Return (status, detail) for a systemd service."""
    try:
        proc = subprocess.run(
            ["systemctl", "is-active", service_name],
            capture_output=True, text=True, timeout=5,
        )
    except FileNotFoundError:
        # No systemctl on this host (e.g. macOS dev box) — not a failure.
        return "OK", f"{service_name} status unknown (systemctl not available)"
    except Exception as exc:
        return "WARN", f"systemctl error: {exc}"
    state = proc.stdout.strip()
    if state == "active":
        return "OK", f"{service_name} is active"
    return "WARN", f"{service_name} is {state}"
def _check_disk(threshold_pct: int = 90) -> tuple[str, str]:
    """Return (status, detail) for disk usage on /."""
    try:
        du = shutil.disk_usage("/")
    except Exception as exc:
        return "WARN", f"disk check failed: {exc}"
    percent_used = int(du.used / du.total * 100)
    verdict = "OK" if percent_used < threshold_pct else "WARN"
    return verdict, f"disk usage {percent_used}%"
def _check_memory(threshold_pct: int = 90) -> tuple[str, str]:
    """Return (status, detail) for memory usage, read from /proc/meminfo."""
    try:
        fields: dict[str, int] = {}
        for line in Path("/proc/meminfo").read_text().splitlines():
            parts = line.split()
            if len(parts) >= 2:
                fields[parts[0].rstrip(":")] = int(parts[1])
        total = fields.get("MemTotal", 0)
        available = fields.get("MemAvailable", 0)
        if total == 0:
            return "OK", "memory info unavailable"
        used_pct = int((total - available) / total * 100)
        verdict = "OK" if used_pct < threshold_pct else "WARN"
        return verdict, f"memory usage {used_pct}%"
    except FileNotFoundError:
        # Not Linux (e.g. macOS dev machine)
        return "OK", "memory check skipped (not Linux)"
    except Exception as exc:
        return "WARN", f"memory check failed: {exc}"
def _check_gitea_reachability(gitea_url: str = "https://forge.alexanderwhitestone.com") -> tuple[str, str]:
    """Return (status, detail) for Gitea HTTPS reachability.

    Note: ``urlopen`` raises ``HTTPError`` for most non-2xx responses, so
    the non-200 branch is mainly reachable for unusual 2xx codes.
    """
    import urllib.error
    import urllib.request
    try:
        with urllib.request.urlopen(gitea_url, timeout=10) as resp:
            code = resp.status
        if code == 200:
            # Fixed: the previous detail string talked about SSH
            # configuration, which this check never probes — it is an
            # HTTPS reachability test only.
            return "OK", f"Gitea HTTPS is responding ({code})"
        return "WARN", f"Gitea returned HTTP {code}"
    except Exception as exc:
        return "WARN", f"Gitea unreachable: {exc}"
def _check_world_readable_secrets() -> tuple[str, str]:
    """Return (status, detail) for world-readable sensitive files under the repo."""
    sensitive_patterns = ["*.key", "*.pem", "*.secret", ".env", "*.token"]
    found = []
    try:
        for pattern in sensitive_patterns:
            for path in PROJECT_ROOT.rglob(pattern):
                try:
                    mode = path.stat().st_mode
                    if mode & 0o004:  # world-readable bit (S_IROTH)
                        found.append(str(path.relative_to(PROJECT_ROOT)))
                except OSError:
                    pass  # broken symlink / unreadable entry — skip it
        if found:
            # Cap at three offenders to keep the report row readable.
            return "WARN", f"world-readable sensitive files: {', '.join(found[:3])}"
        # Fixed: the old message claimed a "recently-modified" filter that
        # this check never applied; describe what was actually checked.
        return "OK", "no world-readable sensitive files found"
    except Exception as exc:
        return "WARN", f"security check failed: {exc}"
# ── Report generation ─────────────────────────────────────────────────
def generate_report(date_str: str, checker_mod) -> str:
    """Build the full nightly report markdown string.

    Args:
        date_str: Report date (YYYY-MM-DD); the caller uses it for the output
            filename. The header timestamp is always "now" in UTC.
        checker_mod: Loaded check_cron_heartbeats module (see _load_checker),
            providing ``build_report()`` for the heartbeat panel.
    """
    now_utc = datetime.now(timezone.utc)
    # Fixed: "%02H" is not a valid strftime directive ("%H" is already
    # zero-padded); on some platforms it was emitted literally in the header.
    ts = now_utc.strftime("%Y-%m-%d %H:%M UTC")
    rows: list[tuple[str, str, str]] = []
    service_status, service_detail = _check_service("hermes-bezalel")
    rows.append(("Service", service_status, service_detail))
    disk_status, disk_detail = _check_disk()
    rows.append(("Disk", disk_status, disk_detail))
    mem_status, mem_detail = _check_memory()
    rows.append(("Memory", mem_status, mem_detail))
    gitea_status, gitea_detail = _check_gitea_reachability()
    rows.append(("Alpha VPS", gitea_status, gitea_detail))
    sec_status, sec_detail = _check_world_readable_secrets()
    rows.append(("Security", sec_status, sec_detail))
    # Overall is WARN if any single row warned.
    overall = "OK" if all(r[1] == "OK" for r in rows) else "WARN"
    lines = [
        f"# Bezalel Night Watch — {ts}",
        "",
        f"**Overall:** {overall}",
        "",
        "| Check | Status | Detail |",
        "|-------|--------|--------|",
    ]
    for check, status, detail in rows:
        lines.append(f"| {check} | {status} | {detail} |")
    lines.append("")
    lines.append("---")
    lines.append("")
    # ── Heartbeat Panel (acceptance criterion #3 of #1096) ────────────
    try:
        hb_report = checker_mod.build_report()
        lines.append(hb_report.to_panel_markdown())
    except Exception as exc:
        # The panel must never take down the whole report — degrade to a note.
        lines += [
            "## Heartbeat Panel",
            "",
            f"*(heartbeat check failed: {exc})*",
        ]
    lines += [
        "",
        "---",
        "",
        "*Automated by Bezalel Night Watch*",
        "",
    ]
    return "\n".join(lines)
# ── Entry point ───────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point: build the nightly report and print or persist it."""
    parser = argparse.ArgumentParser(
        description="Bezalel Night Watch — nightly report generator",
    )
    parser.add_argument(
        "--date", default=None,
        help="Report date as YYYY-MM-DD (default: today UTC)",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Print report to stdout instead of writing to disk",
    )
    args = parser.parse_args()

    date_str = args.date if args.date else datetime.now(timezone.utc).strftime("%Y-%m-%d")
    report_text = generate_report(date_str, _load_checker())

    if args.dry_run:
        print(report_text)
        return

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    destination = REPORTS_DIR / f"{date_str}.md"
    destination.write_text(report_text)
    logger.info("Night Watch report written to %s", destination)


if __name__ == "__main__":
    main()

168
docs/QUARANTINE_PROCESS.md Normal file
View File

@@ -0,0 +1,168 @@
# Quarantine Process
**Poka-yoke principle:** a flaky or broken test must never silently rot in
place. Quarantine is the correction step in the
Prevention → Detection → Correction triad described in issue #1094.
---
## When to quarantine
Quarantine a test when **any** of the following are true:
| Signal | Source |
|--------|--------|
| `flake_detector.py` flags the test at < 95 % consistency | Automated |
| The test fails intermittently in CI over two consecutive runs | Manual observation |
| The test depends on infrastructure that is temporarily unavailable | Manual observation |
| You are fixing a bug and need to defer a related test | Developer judgement |
Do **not** use quarantine as a way to ignore tests indefinitely. The
quarantine directory is a **30-day time-box** — see the escalation rule below.
---
## Step-by-step workflow
### 1 File an issue
Open a Gitea issue with the title prefix `[FLAKY]` or `[BROKEN]`:
```
[FLAKY] test_foo_bar non-deterministically fails with assertion error
```
Note the issue number — you will need it in the next step.
### 2 Move the test file
Move (or copy) the test from `tests/` into `tests/quarantine/`.
```bash
git mv tests/test_my_thing.py tests/quarantine/test_my_thing.py
```
If only individual test functions are flaky, extract them into a new file in
`tests/quarantine/` rather than moving the whole module.
### 3 Annotate the test
Add the `@pytest.mark.quarantine` marker with the issue reference:
```python
import pytest
@pytest.mark.quarantine(reason="Flaky until #NNN is resolved")
def test_my_thing():
...
```
This satisfies the poka-yoke skip-enforcement rule: the test is allowed to
skip/be excluded because it is explicitly linked to a tracking issue.
### 4 Verify CI still passes
```bash
pytest # default run — quarantine tests are excluded
pytest --run-quarantine # optional: run quarantined tests explicitly
```
The main CI run must be green before merging.
### 5 Add to `.test-history.json` exclusions (optional)
If the flake detector is tracking the test, add it to the `quarantine_list` in
`.test-history.json` so it is excluded from the consistency report:
```json
{
"quarantine_list": [
"tests/quarantine/test_my_thing.py::test_my_thing"
]
}
```
---
## Escalation rule
If a quarantined test's tracking issue has had **no activity for 30 days**,
the next developer to touch that file must:
1. Attempt to fix and un-quarantine the test, **or**
2. Delete the test and close the issue with a comment explaining why, **or**
3. Leave a comment on the issue explaining the blocker and reset the 30-day
clock explicitly.
**A test may not stay in quarantine indefinitely without active attention.**
---
## Un-quarantining a test
When the underlying issue is resolved:
1. Remove `@pytest.mark.quarantine` from the test.
2. Move the file back from `tests/quarantine/` to `tests/`.
3. Run the full suite to confirm it passes consistently (at least 3 local runs).
4. Close the tracking issue.
5. Remove any entries from `.test-history.json`'s `quarantine_list`.
---
## Flake detector integration
The flake detector (`scripts/flake_detector.py`) is run after every CI test
execution. It reads `.test-report.json` (produced by `pytest --json-report`)
and updates `.test-history.json`.
**CI integration example (shell script or CI step):**
```bash
pytest --json-report --json-report-file=.test-report.json
python scripts/flake_detector.py
```
If the flake detector exits non-zero, the CI step fails and the output lists
the offending tests with their consistency percentages.
**Local usage:**
```bash
# After running tests with JSON report:
python scripts/flake_detector.py
# Just view current statistics without ingesting a new report:
python scripts/flake_detector.py --no-update
# Lower threshold for local dev:
python scripts/flake_detector.py --threshold 0.90
```
---
## Summary
```
Test fails intermittently
File [FLAKY] issue
git mv test → tests/quarantine/
Add @pytest.mark.quarantine(reason="#NNN")
Main CI green ✓
Fix the root cause (within 30 days)
git mv back → tests/
Remove quarantine marker
Close issue ✓
```

136
nexus/cron_heartbeat.py Normal file
View File

@@ -0,0 +1,136 @@
"""Poka-yoke heartbeat writer for cron jobs.
Every scheduled job calls write_cron_heartbeat() on successful completion so
the meta-heartbeat checker (bin/check_cron_heartbeats.py) can verify that all
jobs are still alive. Absence of a fresh heartbeat = silent failure.
Path convention
---------------
Primary: /var/run/bezalel/heartbeats/<job>.last
Fallback: ~/.bezalel/heartbeats/<job>.last
(used when /var/run/bezalel is not writable, e.g. dev machines)
Override: BEZALEL_HEARTBEAT_DIR environment variable
Heartbeat file format (JSON)
----------------------------
{
"job": "nexus_watchdog",
"timestamp": 1744000000.0,
"interval_seconds": 300,
"pid": 12345,
"status": "ok"
}
Usage in a cron job
-------------------
from nexus.cron_heartbeat import write_cron_heartbeat
def main():
# ... do the work ...
write_cron_heartbeat("my_job_name", interval_seconds=300)
Zero-dependency shell one-liner (for scripts that can't import Python)
-----------------------------------------------------------------------
python -c "
from nexus.cron_heartbeat import write_cron_heartbeat
write_cron_heartbeat('my_job', interval_seconds=300)
"
Refs: #1096
"""
from __future__ import annotations
import json
import os
import tempfile
import time
from pathlib import Path
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")  # shared ops location
FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats"  # per-user dev fallback
def _resolve_heartbeat_dir() -> Path:
    """Return the heartbeat directory, trying primary then fallback.

    If BEZALEL_HEARTBEAT_DIR is set in the environment that wins outright
    (useful for tests and non-standard deployments).
    """
    override = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if override:
        return Path(override)
    # Prove the primary path is writable before committing to it.
    try:
        PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
        marker = PRIMARY_HEARTBEAT_DIR / ".write_probe"
        marker.touch()
        marker.unlink()
    except (PermissionError, OSError):
        # Dev machines: fall back to a per-user directory we can create.
        FALLBACK_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
        return FALLBACK_HEARTBEAT_DIR
    return PRIMARY_HEARTBEAT_DIR
def heartbeat_path(job: str, directory: Path | None = None) -> Path:
    """Return the Path where *job*'s heartbeat file lives.

    Useful for readers (e.g. the Night Watch report) that just need the
    location without writing anything.
    """
    base = _resolve_heartbeat_dir() if directory is None else directory
    return base / f"{job}.last"
def write_cron_heartbeat(
    job: str,
    interval_seconds: int,
    status: str = "ok",
    directory: Path | None = None,
) -> Path:
    """Write a poka-yoke heartbeat file for a cron job.

    Call this at the end of your job's main function. The file is written
    atomically (write-to-temp + rename) so the checker never reads a partial
    file.

    Args:
        job: Unique job name, e.g. ``"nexus_watchdog"``.
        interval_seconds: Expected run cadence, e.g. ``300`` for every 5 min.
        status: Completion status: ``"ok"``, ``"warn"``, or
            ``"error"``. Only ``"ok"`` resets the stale clock.
        directory: Override the heartbeat directory (mainly for tests).

    Returns:
        Path to the heartbeat file. On an internal write failure the path is
        still returned (best-effort contract) but may not have been refreshed.
    """
    d = directory if directory is not None else _resolve_heartbeat_dir()
    d.mkdir(parents=True, exist_ok=True)
    path = d / f"{job}.last"
    data = {
        "job": job,
        "timestamp": time.time(),
        "interval_seconds": interval_seconds,
        "pid": os.getpid(),
        "status": status,
    }
    # Atomic write: temp file in same directory + rename.
    # Guarantees the checker never sees a half-written file.
    fd, tmp = tempfile.mkstemp(dir=str(d), prefix=f".{job}-", suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(data, f)
        # Fixed: mkstemp creates the temp file 0600, so the renamed heartbeat
        # inherited owner-only permissions and a checker running as a
        # different user in the shared /var/run/bezalel dir could not read
        # it. Open it up before publishing.
        os.chmod(tmp, 0o644)
        os.replace(tmp, str(path))
    except Exception:
        # Best-effort — never crash the job over a heartbeat failure
        try:
            os.unlink(tmp)
        except OSError:
            pass
    return path

14
pytest.ini Normal file
View File

@@ -0,0 +1,14 @@
[pytest]
testpaths = tests
asyncio_mode = auto
# Show full diffs and verbose skip/fail reasons
addopts =
-v
--tb=short
--strict-markers
# Markers registered here (also registered in conftest.py for programmatic use)
markers =
integration: mark test as integration test (requires MCP servers)
quarantine: mark test as quarantined (flaky/broken, tracked by issue)

256
scripts/flake_detector.py Executable file
View File

@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""Flake detector for the Nexus test suite.
Reads pytest JSON reports (produced by pytest-json-report) and maintains a
rolling history file at .test-history.json. After each update it prints a
report of any test whose pass rate has dropped below the 95 % consistency
threshold and exits non-zero if any flaky tests are found.
Usage
-----
Install pytest-json-report once::
pip install pytest-json-report
Then run tests with JSON output::
pytest --json-report --json-report-file=.test-report.json
Then call this script::
python scripts/flake_detector.py # uses .test-report.json + .test-history.json
python scripts/flake_detector.py --report path/to/report.json
python scripts/flake_detector.py --history path/to/history.json
python scripts/flake_detector.py --threshold 0.90 # lower threshold for local dev
The script is also safe to call with no report file — it will just print the
current history statistics without updating anything.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import TypedDict
# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
class TestRecord(TypedDict):
    """Per-test rolling history."""
    runs: int  # total times this test appeared across all ingested reports
    passes: int  # runs with outcome "passed"
    failures: int  # runs with outcome "failed" or "error"
    skips: int  # runs with outcome "skipped"
    last_outcome: str  # "passed" | "failed" | "skipped" | "error"
class HistoryFile(TypedDict):
    """Top-level schema of .test-history.json."""
    total_runs: int  # number of reports ingested so far
    tests: dict[str, TestRecord]  # keyed by pytest node id
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
DEFAULT_REPORT = Path(".test-report.json")  # produced by pytest --json-report
DEFAULT_HISTORY = Path(".test-history.json")  # rolling history maintained by this script
DEFAULT_THRESHOLD = 0.95  # 95 % consistency required
# ---------------------------------------------------------------------------
# Core helpers
# ---------------------------------------------------------------------------
def load_history(history_path: Path) -> HistoryFile:
    """Load the rolling history file, or a fresh empty history if absent.

    Hand-maintained history files may lack the core keys (the quarantine
    docs show a file containing only ``quarantine_list``); fill in
    ``total_runs`` and ``tests`` so downstream code can index into them
    without a KeyError.
    """
    if not history_path.exists():
        return {"total_runs": 0, "tests": {}}
    with history_path.open() as fh:
        history = json.load(fh)
    # Defensive: guarantee the keys ingest_report/find_flaky_tests rely on.
    history.setdefault("total_runs", 0)
    history.setdefault("tests", {})
    return history
def save_history(history: HistoryFile, history_path: Path) -> None:
    """Persist *history* as pretty-printed, key-sorted JSON at *history_path*."""
    serialized = json.dumps(history, indent=2, sort_keys=True)
    history_path.write_text(serialized)
    # Progress note goes to stderr so stdout stays clean for report output.
    print(f"[flake-detector] History saved → {history_path}", file=sys.stderr)
# Maps a pytest outcome to the counter it should bump.
_OUTCOME_FIELD = {
    "passed": "passes",
    "failed": "failures",
    "error": "failures",
    "skipped": "skips",
}


def ingest_report(report_path: Path, history: HistoryFile) -> int:
    """Merge a pytest JSON report into *history*. Returns the number of tests updated."""
    report = json.loads(report_path.read_text())
    history["total_runs"] = history.get("total_runs", 0) + 1
    entries = report.get("tests", [])
    for entry in entries:
        node_id: str = entry.get("nodeid", "unknown")
        outcome: str = entry.get("outcome", "unknown")
        rec: TestRecord = history["tests"].setdefault(
            node_id,
            {"runs": 0, "passes": 0, "failures": 0, "skips": 0, "last_outcome": ""},
        )
        rec["runs"] += 1
        rec["last_outcome"] = outcome
        # Unknown outcomes count toward runs but no specific bucket,
        # exactly as in the original if/elif chain.
        counter = _OUTCOME_FIELD.get(outcome)
        if counter is not None:
            rec[counter] += 1
    return len(entries)
def consistency(rec: TestRecord) -> float:
    """Return the fraction of runs that matched the test's dominant outcome.

    A test that always passes → 1.0 (stable green).
    A test that always fails → 1.0 (stable red — broken but *consistent*,
    so it is deliberately NOT flagged as flaky; a stable failure needs a
    fix, not quarantine).
    A test that passes 9 out of 10 times → 0.9 (flaky).

    We define *consistency* as the rate at which the test's outcome matches
    its dominant outcome (pass or fail). Skipped runs count toward ``runs``
    but never toward the dominant outcome, so frequent skipping also lowers
    consistency. A test with fewer than MIN_RUNS runs is not judged
    (enforced by the caller, find_flaky_tests).

    BUGFIX (docs): the previous docstring claimed "always fails → 0.0",
    contradicting both the implementation (max(passes, failures) / runs is
    1.0 for a stable-red test) and its own "broken, not flaky" note.
    """
    runs = rec["runs"]
    if runs == 0:
        # No data yet — treat as perfectly consistent rather than divide by zero.
        return 1.0
    passes = rec["passes"]
    failures = rec["failures"]
    dominant = max(passes, failures)
    return dominant / runs
MIN_RUNS = 5 # need at least this many runs before flagging
def find_flaky_tests(
    history: HistoryFile,
    threshold: float = DEFAULT_THRESHOLD,
) -> list[tuple[str, TestRecord, float]]:
    """Return (node_id, record, consistency_rate) for all tests below threshold."""
    flaky = [
        (node_id, rec, rate)
        for node_id, rec in history["tests"].items()
        # Only judge tests with enough history; bind the rate once via :=.
        if rec["runs"] >= MIN_RUNS and (rate := consistency(rec)) < threshold
    ]
    # Worst offenders (lowest consistency) first.
    flaky.sort(key=lambda entry: entry[2])
    return flaky
# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------
def print_report(
    flaky: list[tuple[str, TestRecord, float]],
    history: HistoryFile,
    threshold: float,
) -> None:
    """Render the human-readable flake report to stdout.

    Args:
        flaky: Output of find_flaky_tests() — worst (lowest consistency) first.
        history: Full rolling history, used for the summary header counts.
        threshold: Consistency threshold this run was judged against.
    """
    total_tests = len(history["tests"])
    total_runs = history.get("total_runs", 0)
    # Summary header — always printed, even when nothing is flaky.
    print(f"\n{'=' * 70}")
    print(" FLAKE DETECTOR REPORT")
    print(f"{'=' * 70}")
    print(f" Total suite runs tracked : {total_runs}")
    print(f" Total distinct tests : {total_tests}")
    print(f" Consistency threshold : {threshold:.0%}")
    print(f" Min runs before judging : {MIN_RUNS}")
    print(f"{'=' * 70}")
    if not flaky:
        print(" ✓ No flaky tests detected — all tests above consistency threshold.")
        print(f"{'=' * 70}\n")
        return
    print(f"{len(flaky)} FLAKY TEST(S) DETECTED:\n")
    for node_id, rec, rate in flaky:
        # One entry per flaky test: consistency %, node id, raw counters.
        print(f" [{rate:.0%}] {node_id}")
        print(
            f" runs={rec['runs']} passes={rec['passes']} "
            f"failures={rec['failures']} skips={rec['skips']} "
            f"last={rec['last_outcome']}"
        )
        print()
    # Point the reader at the quarantine workflow rather than letting the
    # report end without a next step (poka-yoke: no dead-end output).
    print(" ACTION REQUIRED:")
    print(" 1. Move each flaky test to tests/quarantine/")
    print(" 2. File a tracking issue with [FLAKY] in the title")
    print(" 3. Add @pytest.mark.quarantine(reason='#NNN') to the test")
    print(" See docs/QUARANTINE_PROCESS.md for full instructions.")
    print(f"{'=' * 70}\n")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``report``, ``history``, ``threshold``, ``no_update``.
    """
    parser = argparse.ArgumentParser(
        description="Detect flaky tests by analysing pytest JSON report history."
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=DEFAULT_REPORT,
        help=f"Path to pytest JSON report file (default: {DEFAULT_REPORT})",
    )
    parser.add_argument(
        "--history",
        type=Path,
        default=DEFAULT_HISTORY,
        help=f"Path to rolling history JSON file (default: {DEFAULT_HISTORY})",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_THRESHOLD,
        # BUGFIX: help text previously read "Consistency threshold 01" —
        # the range separator was lost; spell the valid range explicitly.
        help=f"Consistency threshold 0-1 (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--no-update",
        action="store_true",
        default=False,
        help="Print current statistics without ingesting a new report",
    )
    return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point. Returns 1 if any flaky tests were found, else 0."""
    args = parse_args(argv)
    history = load_history(args.history)
    if not args.no_update:
        if not args.report.exists():
            # BUGFIX: the two adjacent string literals previously concatenated
            # with no separator ("...at .test-report.jsonrun pytest...").
            print(
                f"[flake-detector] No report file at {args.report} — "
                "run pytest with --json-report first.",
                file=sys.stderr,
            )
            # Not a fatal error; just print current state.
        else:
            n = ingest_report(args.report, history)
            print(
                f"[flake-detector] Ingested {n} test results from {args.report}",
                file=sys.stderr,
            )
            save_history(history, args.history)
    flaky = find_flaky_tests(history, threshold=args.threshold)
    print_report(flaky, history, threshold=args.threshold)
    # Non-zero exit when flaky tests exist so CI pipelines fail loudly.
    return 1 if flaky else 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())

229
scripts/provision-runner.sh Normal file
View File

@@ -0,0 +1,229 @@
#!/usr/bin/env bash
# provision-runner.sh — VPS provisioning script for Gitea act_runner
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Usage (on Bezalel VPS as root):
# bash provision-runner.sh --gitea-url <url> --token <runner-registration-token>
#
# This script:
# 1. Downloads and installs act_runner binary
# 2. Registers the runner with the Gitea instance
# 3. Creates and enables systemd service for act_runner
# 4. Installs the runner-health-probe timer (poka-yoke detection layer)
#
# POKA-YOKE principles applied:
# Prevention: runner registration is mandatory — script exits non-zero if registration fails
# Detection: runner-health-probe.sh installed and enabled as part of this script
# Correction: health probe auto-restarts act_runner on zero-runner detection
set -euo pipefail

# ── Configuration defaults (override via env or flags) ───────────────────────
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
RUNNER_TOKEN="${RUNNER_TOKEN:-}"  # required; validated after flag parsing
RUNNER_NAME="${RUNNER_NAME:-$(hostname)-runner}"
RUNNER_LABELS="${RUNNER_LABELS:-ubuntu-latest,linux,x86_64}"
ACT_RUNNER_VERSION="${ACT_RUNNER_VERSION:-0.2.10}"
INSTALL_DIR="${INSTALL_DIR:-/usr/local/bin}"
CONFIG_DIR="${CONFIG_DIR:-/etc/act_runner}"
DATA_DIR="${DATA_DIR:-/var/lib/act_runner}"
NEXUS_DIR="${NEXUS_DIR:-/root/wizards/the-nexus}"
# Derived from NEXUS_DIR; re-derived if --nexus-dir is passed later.
PROBE_SCRIPT="${NEXUS_DIR}/scripts/runner-health-probe.sh"

# ── Helpers ───────────────────────────────────────────────────────────────────
# Timestamped logging; fail() exits non-zero so errors cannot pass silently.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] PROVISION: $*"; }
fail() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] PROVISION ERROR: $*" >&2; exit 1; }

usage() {
  # Help text; the unquoted heredoc expands the current default values.
  cat <<EOF
Usage: provision-runner.sh [options]
Options:
  --gitea-url <url>    Gitea base URL (default: $GITEA_URL)
  --token <token>      Runner registration token (required)
  --name <name>        Runner name (default: hostname-runner)
  --labels <labels>    Comma-separated labels (default: $RUNNER_LABELS)
  --version <ver>      act_runner version to install (default: $ACT_RUNNER_VERSION)
  --nexus-dir <path>   Path to the-nexus checkout (default: $NEXUS_DIR)
  --help               Show this help
Environment variables: GITEA_URL, RUNNER_TOKEN, RUNNER_NAME, RUNNER_LABELS,
ACT_RUNNER_VERSION, NEXUS_DIR
POKA-YOKE CHECKLIST (enforced by this script):
  [1] act_runner binary installed and executable
  [2] Runner registered with Gitea (non-zero runner count verified)
  [3] act_runner systemd service enabled and running
  [4] runner-health-probe timer installed and enabled
EOF
}
# ── Argument parsing ──────────────────────────────────────────────────────────
# Flags take precedence over the env-var defaults above.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --gitea-url) GITEA_URL="$2"; shift 2 ;;
    --token) RUNNER_TOKEN="$2"; shift 2 ;;
    --name) RUNNER_NAME="$2"; shift 2 ;;
    --labels) RUNNER_LABELS="$2"; shift 2 ;;
    --version) ACT_RUNNER_VERSION="$2"; shift 2 ;;
    # PROBE_SCRIPT must be re-derived when the checkout path changes.
    --nexus-dir) NEXUS_DIR="$2"; PROBE_SCRIPT="${NEXUS_DIR}/scripts/runner-health-probe.sh"; shift 2 ;;
    --help) usage; exit 0 ;;
    *) fail "Unknown argument: $1. Use --help for usage." ;;
  esac
done

# Poka-yoke: a missing token is a hard error, never a silent no-op.
# (Safe under set -e: a false test on the left of && does not abort.)
[[ -z "$RUNNER_TOKEN" ]] && fail "Runner registration token required. Pass --token or set RUNNER_TOKEN env var."
# ── Step 1: Install act_runner binary ─────────────────────────────────────────
log "Step 1/4: Installing act_runner v${ACT_RUNNER_VERSION}..."

# Map uname output to the release artifact's architecture suffix.
ARCH=$(uname -m)
case "$ARCH" in
  x86_64) ARCH_SUFFIX="amd64" ;;
  aarch64) ARCH_SUFFIX="arm64" ;;
  *) fail "Unsupported architecture: $ARCH" ;;
esac

BINARY_URL="https://gitea.com/gitea/act_runner/releases/download/v${ACT_RUNNER_VERSION}/act_runner-${ACT_RUNNER_VERSION}-linux-${ARCH_SUFFIX}"
BINARY_PATH="${INSTALL_DIR}/act_runner"

# Idempotent install: skip the download when the requested version is present.
if [[ -f "$BINARY_PATH" ]]; then
  CURRENT_VER=$("$BINARY_PATH" --version 2>/dev/null | grep -oP '\d+\.\d+\.\d+' || echo "unknown")
  if [[ "$CURRENT_VER" == "$ACT_RUNNER_VERSION" ]]; then
    log "act_runner v${ACT_RUNNER_VERSION} already installed — skipping download."
  else
    log "Upgrading act_runner from v${CURRENT_VER} to v${ACT_RUNNER_VERSION}..."
    curl -fsSL "$BINARY_URL" -o "$BINARY_PATH"
    chmod +x "$BINARY_PATH"
  fi
else
  curl -fsSL "$BINARY_URL" -o "$BINARY_PATH"
  chmod +x "$BINARY_PATH"
fi

# Poka-yoke gate [1]: abort unless the installed binary actually executes.
"$BINARY_PATH" --version >/dev/null 2>&1 || fail "act_runner binary not functional after install."
log "act_runner binary OK: $($BINARY_PATH --version 2>/dev/null || echo 'installed')"
# ── Step 2: Register runner with Gitea ────────────────────────────────────────
log "Step 2/4: Registering runner with Gitea at ${GITEA_URL}..."
mkdir -p "$CONFIG_DIR" "$DATA_DIR"
CONFIG_FILE="${CONFIG_DIR}/config.yaml"

# Generate config and register
"$BINARY_PATH" register \
  --no-interactive \
  --instance "$GITEA_URL" \
  --token "$RUNNER_TOKEN" \
  --name "$RUNNER_NAME" \
  --labels "$RUNNER_LABELS" \
  --config "$CONFIG_FILE" \
  2>&1 | tee /tmp/act_runner_register.log

# NOTE(review): under `set -euo pipefail` a non-zero exit from `register`
# aborts the script at the pipeline above, so this fallback check only runs
# when register exited 0 but printed unexpected output — confirm intended.
if ! grep -q "Runner registered" /tmp/act_runner_register.log 2>/dev/null && \
   ! grep -q "registered" /tmp/act_runner_register.log 2>/dev/null; then
  # Registration output varies — check if config was written as a fallback signal
  if [[ ! -f "$CONFIG_FILE" ]]; then
    fail "Runner registration failed. Check token and Gitea URL. Log: /tmp/act_runner_register.log"
  fi
fi
log "Runner registered. Config written to ${CONFIG_FILE}"
# ── Step 3: Create and enable systemd service ─────────────────────────────────
log "Step 3/4: Installing act_runner systemd service..."
# Unquoted EOF: the ${...} paths below expand at provision time.
cat > /etc/systemd/system/act_runner.service <<EOF
[Unit]
Description=Gitea Actions Runner (act_runner)
Documentation=https://gitea.com/gitea/act_runner
After=network.target
Wants=network-online.target
[Service]
Type=simple
User=root
WorkingDirectory=${DATA_DIR}
ExecStart=${INSTALL_DIR}/act_runner daemon --config ${CONFIG_FILE}
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable act_runner
systemctl restart act_runner
sleep 3  # give the daemon a moment to settle before probing its state

# Poka-yoke gate [3]: provisioning fails loudly if the service is not up.
if ! systemctl is-active --quiet act_runner; then
  fail "act_runner service failed to start. Check: journalctl -u act_runner -n 50"
fi
log "act_runner service running."
# ── Step 4: Install runner health probe ───────────────────────────────────────
log "Step 4/4: Installing runner-health-probe systemd timer..."
# The detection layer lives in the-nexus checkout, which may not exist yet
# at provision time — degrade to a loud warning with manual follow-up steps.
if [[ ! -f "$PROBE_SCRIPT" ]]; then
  log "WARNING: probe script not found at ${PROBE_SCRIPT}. Skipping timer install."
  log " Re-run after the-nexus is checked out to: ${NEXUS_DIR}"
  log " Then manually: systemctl enable --now runner-health-probe.timer"
else
  chmod +x "$PROBE_SCRIPT"
  # Install service unit
  cat > /etc/systemd/system/runner-health-probe.service <<EOF
[Unit]
Description=Gitea Runner Health Probe (poka-yoke zero-runner detection)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
After=network.target act_runner.service
[Service]
Type=oneshot
ExecStart=${PROBE_SCRIPT}
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
EOF
  # Install timer unit (every 5 minutes)
  cat > /etc/systemd/system/runner-health-probe.timer <<EOF
[Unit]
Description=Gitea Runner Health Probe — every 5 minutes (poka-yoke #1097)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Persistent=true
[Install]
WantedBy=timers.target
EOF
  systemctl daemon-reload
  systemctl enable --now runner-health-probe.timer
  log "runner-health-probe.timer enabled (fires every 5 minutes)."
fi
# ── Poka-yoke checklist summary ───────────────────────────────────────────────
# Re-probe every gate live at the end so the operator gets one consolidated
# view instead of trusting scattered log lines above.
echo ""
echo "══════════════════════════════════════════════════════════"
echo " POKA-YOKE PROVISIONING CHECKLIST — $(hostname)"
echo "══════════════════════════════════════════════════════════"
printf " [1] act_runner binary : "
"$BINARY_PATH" --version >/dev/null 2>&1 && echo "OK" || echo "FAIL"
printf " [2] act_runner registered : "
[[ -f "$CONFIG_FILE" ]] && echo "OK (config exists)" || echo "FAIL (no config)"
printf " [3] act_runner service : "
systemctl is-active --quiet act_runner && echo "RUNNING" || echo "FAIL"
printf " [4] health-probe timer : "
systemctl is-active --quiet runner-health-probe.timer 2>/dev/null && echo "ACTIVE" || echo "NOT INSTALLED (re-run after nexus checkout)"
echo "══════════════════════════════════════════════════════════"
echo ""
log "Provisioning complete. Runner '${RUNNER_NAME}' registered at ${GITEA_URL}"

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env bash
# runner-health-probe.sh — Gitea Runner Health Probe (poka-yoke detection layer)
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Called every 5 minutes by runner-health-probe.timer (systemd).
# Can also be run manually for immediate status.
#
# POKA-YOKE detection + correction:
# 1. Queries Gitea API for active runner count
# 2. Reports count to Timmy Time via journal/log every run
# 3. On ZERO active runners:
# a. Logs P1 alert to journal
# b. Creates alert marker file for external watchers
# c. Attempts to restart act_runner service (auto-correction)
# d. Re-queries after restart to verify recovery
#
# Exit codes:
# 0 — runners healthy (≥1 online runner)
# 1 — zero runners detected (P1 alert fired)
# 2 — Gitea API unreachable (network/config error)
# Deliberately -u/-o pipefail WITHOUT -e: every failure mode below must fall
# through to its own logging/alerting branch rather than abort the probe.
set -uo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
GITEA_TOKEN="${GITEA_TOKEN:-}"
GITEA_TOKEN_FILE="${GITEA_TOKEN_FILE:-/etc/act_runner/gitea-probe-token}"
ALERT_DIR="${ALERT_DIR:-/var/lib/act_runner/alerts}"
RUNNER_SERVICE="${RUNNER_SERVICE:-act_runner}"
# Restart cooldown: don't restart more than once per 10 minutes
COOLDOWN_FILE="${ALERT_DIR}/.last_restart"
COOLDOWN_SECS=600

# ── Helpers ───────────────────────────────────────────────────────────────────
# Three timestamped log levels; warn/alert go to stderr for the journal.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] RUNNER-PROBE: $*"; }
warn() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] RUNNER-PROBE WARNING: $*" >&2; }
alert(){ echo "[$(date '+%Y-%m-%d %H:%M:%S')] RUNNER-PROBE P1-ALERT: $*" >&2; }

# Load token from file if not set via env
if [[ -z "$GITEA_TOKEN" && -f "$GITEA_TOKEN_FILE" ]]; then
  GITEA_TOKEN=$(cat "$GITEA_TOKEN_FILE")
fi
if [[ -z "$GITEA_TOKEN" ]]; then
  warn "No Gitea API token configured. Set GITEA_TOKEN env var or write to ${GITEA_TOKEN_FILE}"
  warn "Cannot query runner health without API token. Exiting."
  exit 2  # exit 2 = probe itself broken, distinct from zero-runner exit 1
fi

mkdir -p "$ALERT_DIR"
# ── Query Gitea runner count ───────────────────────────────────────────────────
# Prints the number of online/idle/active runners to stdout.
# Returns 0 on success, 2 when the Gitea API cannot be queried.
query_active_runners() {
  local response http_code runner_count
  # Fetch runners list — Gitea admin endpoint
  # -w appends the HTTP status on a marker line so one request yields both
  # body and status; -f makes curl itself fail on HTTP errors.
  response=$(curl -sf \
    --max-time 15 \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -w "\n__HTTP_STATUS__%{http_code}" \
    "${GITEA_URL}/api/v1/admin/runners?limit=50" 2>/dev/null) || {
    warn "Gitea API request failed (curl error). URL: ${GITEA_URL}/api/v1/admin/runners"
    return 2
  }
  # Split the status marker line back out of the combined response.
  http_code=$(echo "$response" | grep -oP '(?<=__HTTP_STATUS__)\d+')
  response=$(echo "$response" | sed '/^__HTTP_STATUS__/d')
  if [[ "$http_code" != "200" ]]; then
    warn "Gitea API returned HTTP ${http_code}. Check token permissions (requires admin)."
    return 2
  fi
  # Count runners that are "online" or "active"
  # Gitea runner status field: "online", "offline", "idle", "active"
  runner_count=$(echo "$response" | \
    python3 -c "
import sys, json
data = json.load(sys.stdin)
runners = data if isinstance(data, list) else data.get('runners', data.get('data', []))
online = [r for r in runners if r.get('status') in ('online', 'idle', 'active')]
print(len(online))
" 2>/dev/null) || {
    # Fallback: count all runners if status parse fails
    runner_count=$(echo "$response" | \
      python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d) if isinstance(d,list) else len(d.get('runners',d.get('data',[]))))" 2>/dev/null || echo "0")
    warn "Could not parse runner status — counting all runners: ${runner_count}"
  }
  # Default to 0 so callers can always do arithmetic on the output.
  echo "${runner_count:-0}"
  return 0
}
# ── Cooldown check ────────────────────────────────────────────────────────────
# Returns 0 (true) while inside the restart-cooldown window, 1 otherwise,
# preventing a restart loop when recovery keeps failing.
in_cooldown() {
  if [[ -f "$COOLDOWN_FILE" ]]; then
    local last_restart now age
    last_restart=$(cat "$COOLDOWN_FILE" 2>/dev/null || echo 0)
    now=$(date +%s)
    age=$(( now - last_restart ))
    if (( age < COOLDOWN_SECS )); then
      log "Restart cooldown active (${age}s < ${COOLDOWN_SECS}s). Skipping restart attempt."
      return 0
    fi
  fi
  return 1
}

# Stamp the cooldown file with the current epoch time.
record_restart() {
  date +%s > "$COOLDOWN_FILE"
}
# ── Main probe logic ───────────────────────────────────────────────────────────
log "Querying Gitea runner health at ${GITEA_URL}..."
# Capturing $? works here because -e is NOT set (see top of script).
RUNNER_COUNT=$(query_active_runners)
QUERY_EXIT=$?

if [[ $QUERY_EXIT -eq 2 ]]; then
  warn "API unreachable — cannot assess runner health. Check network and token."
  # Write an "unknown" alert marker so monitoring can see the probe itself is broken
  echo "$(date -Iseconds) PROBE_ERROR: API unreachable" >> "${ALERT_DIR}/probe-errors.log"
  exit 2
fi

log "Active runner count: ${RUNNER_COUNT}"

# ── Healthy path ──────────────────────────────────────────────────────────────
if (( RUNNER_COUNT > 0 )); then
  log "Runners OK. ${RUNNER_COUNT} active runner(s) online."
  # Clear any stale P1 alert marker
  rm -f "${ALERT_DIR}/p1-zero-runners.alert"
  exit 0
fi

# ── Zero-runner P1 alert path ─────────────────────────────────────────────────
alert "ZERO active runners detected on ${GITEA_URL}!"
alert "All CI jobs will queue silently. Attempting auto-correction."

# Write P1 alert marker (watched by external monitoring, logs, etc.)
# ALERT_EOF is unquoted so the $(...) stamps and variables expand now.
ALERT_FILE="${ALERT_DIR}/p1-zero-runners.alert"
cat > "$ALERT_FILE" <<ALERT_EOF
P1 ALERT — ZERO GITEA RUNNERS
Detected : $(date -Iseconds)
Host : $(hostname)
Gitea : ${GITEA_URL}
Impact : ALL CI jobs queuing silently — no runners available
Action : Auto-restart of ${RUNNER_SERVICE} attempted (see below)
ALERT_EOF
log "P1 alert written to ${ALERT_FILE}"
# ── Auto-correction: restart act_runner ───────────────────────────────────────
# Restart is rate-limited by in_cooldown() so a flapping service cannot be
# restarted more than once per COOLDOWN_SECS.
if in_cooldown; then
  alert "Cannot attempt restart — cooldown active. Manual intervention may be required."
  alert "Check: systemctl status ${RUNNER_SERVICE}"
  alert "See alert file: ${ALERT_FILE}"
  exit 1
fi

log "Attempting to restart ${RUNNER_SERVICE} service..."
if systemctl restart "$RUNNER_SERVICE" 2>&1; then
  record_restart
  log "Service restart issued. Waiting 15s for runner to register..."
  sleep 15
  # Re-query to verify recovery
  RUNNER_COUNT_AFTER=$(query_active_runners 2>/dev/null || echo "0")
  if (( RUNNER_COUNT_AFTER > 0 )); then
    log "Recovery SUCCESS: ${RUNNER_COUNT_AFTER} runner(s) online after restart."
    # Append recovery note to alert file (leave file as audit trail)
    # BUGFIX: the timestamp and runner count were previously fused together
    # ("...T14:00:00+00:003 runner(s)"); separate them explicitly.
    echo "Recovered : $(date -Iseconds) — ${RUNNER_COUNT_AFTER} runner(s) online after restart" >> "$ALERT_FILE"
    exit 0
  else
    alert "Recovery FAILED: still zero runners after restart."
    alert "Manual intervention required."
    alert "Next steps:"
    alert " 1. ssh root@$(hostname) 'journalctl -u ${RUNNER_SERVICE} -n 100'"
    alert " 2. Verify registration token: ${GITEA_URL}/user/settings/applications"
    alert " 3. Re-run: /root/wizards/the-nexus/scripts/provision-runner.sh --token <new-token>"
    echo "AutoRestart: FAILED at $(date -Iseconds)" >> "$ALERT_FILE"
    exit 1
  fi
else
  alert "systemctl restart ${RUNNER_SERVICE} failed — service may not exist on this host."
  alert "Verify act_runner is installed via provision-runner.sh."
  echo "AutoRestart: systemctl failed at $(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi

View File

@@ -0,0 +1,16 @@
[Unit]
Description=Gitea Runner Health Probe (poka-yoke zero-runner detection)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
After=network.target act_runner.service
[Service]
Type=oneshot
ExecStart=/root/wizards/the-nexus/scripts/runner-health-probe.sh
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
# Token can be set here or via /etc/act_runner/gitea-probe-token file
# EnvironmentFile=/etc/act_runner/probe.env
# NOTE(review): runner-health-probe.timer already drives this oneshot unit;
# enabling the service directly via this [Install] section also runs it once
# at boot. The equivalent unit generated inline by provision-runner.sh has no
# [Install] section — confirm which behaviour is intended.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,14 @@
[Unit]
Description=Gitea Runner Health Probe — fires every 5 minutes (poka-yoke #1097)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
[Timer]
# Start 2 minutes after boot (let network and act_runner settle)
OnBootSec=2min
# Then fire every 5 minutes
OnUnitActiveSec=5min
# Re-fire missed runs after downtime
Persistent=true
[Install]
WantedBy=timers.target

View File

@@ -1,15 +1,43 @@
"""Pytest configuration for the test suite."""
import re
import pytest
# Configure pytest-asyncio mode
pytest_plugins = ["pytest_asyncio"]
# Pattern that constitutes a valid issue link in a skip reason.
# Accepts: #NNN, https?://..., or JIRA-NNN style keys.
_ISSUE_LINK_RE = re.compile(
r"(#\d+|https?://\S+|[A-Z]+-\d+)",
re.IGNORECASE,
)
def _has_issue_link(reason: str) -> bool:
"""Return True if *reason* contains a recognisable issue reference."""
return bool(_ISSUE_LINK_RE.search(reason or ""))
def _skip_reason(report) -> str:
"""Extract the human-readable skip reason from a pytest report."""
longrepr = getattr(report, "longrepr", None)
if longrepr is None:
return ""
if isinstance(longrepr, tuple) and len(longrepr) >= 3:
# (filename, lineno, "Skipped: <reason>")
return str(longrepr[2])
return str(longrepr)
def pytest_configure(config):
    """Configure pytest: register the custom markers used by this suite."""
    marker_lines = (
        "integration: mark test as integration test (requires MCP servers)",
        "quarantine: mark test as quarantined (flaky/broken, tracked by issue)",
    )
    for line in marker_lines:
        config.addinivalue_line("markers", line)
def pytest_addoption(parser):
@@ -20,6 +48,12 @@ def pytest_addoption(parser):
default=False,
help="Run integration tests that require MCP servers",
)
parser.addoption(
"--no-skip-enforcement",
action="store_true",
default=False,
help="Disable poka-yoke enforcement of issue-linked skip reasons (CI escape hatch)",
)
def pytest_collection_modifyitems(config, items):
@@ -31,3 +65,60 @@ def pytest_collection_modifyitems(config, items):
for item in items:
if "integration" in item.keywords:
item.add_marker(skip_integration)
# ---------------------------------------------------------------------------
# POKA-YOKE: Treat skipped tests as failures unless they carry an issue link.
# ---------------------------------------------------------------------------
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Intercept skipped reports and fail them if they lack an issue link.

    Exceptions:
      * Tests in tests/quarantine/ — explicitly quarantined, issue link required
        on the quarantine marker, not the skip marker.
      * Tests using environment-variable-based ``skipif`` conditions — these are
        legitimate CI gates (RUN_INTEGRATION_TESTS, RUN_LIVE_TESTS, etc.) where
        the *condition* is the gate, not a developer opt-out. We allow these
        only when the skip reason mentions a recognised env-var pattern.
      * --no-skip-enforcement flag set (emergency escape hatch).
    """
    # Hookwrapper: let pytest build the report first, then post-process it.
    outcome = yield
    report = outcome.get_result()
    # Runs for every phase (setup/call/teardown); only skipped reports matter.
    # NOTE(review): strict-xfail results may also surface as skipped in some
    # pytest versions — confirm they should be exempt before tightening.
    if not report.skipped:
        return
    # Escape hatch for emergency use.
    if item.config.getoption("--no-skip-enforcement", default=False):
        return
    reason = _skip_reason(report)
    # Allow quarantined tests — they are tracked by their quarantine marker.
    if "quarantine" in item.keywords:
        return
    # Allow env-var-gated skipif conditions. These come from the
    # pytest_collection_modifyitems integration gate above, or from
    # explicit @pytest.mark.skipif(..., reason="... requires ENV=1 ...")
    # (Compiled per call; cheap enough for a per-report hook.)
    _ENV_GATE_RE = re.compile(r"(require|needs|set)\s+\w+=[^\s]+", re.IGNORECASE)
    if _ENV_GATE_RE.search(reason):
        return
    # Allow skips added by the integration gate in this very conftest.
    if "require --run-integration" in reason:
        return
    # Anything else needs an issue link.
    if not _has_issue_link(reason):
        # Flip the skip into a failure with an actionable remediation message.
        report.outcome = "failed"
        report.longrepr = (
            "[POKA-YOKE] Skip without issue link is not allowed.\n"
            f" Reason given: {reason!r}\n"
            " Fix: add an issue reference to the skip reason, e.g.:\n"
            " @pytest.mark.skip(reason='Broken until #NNN is resolved')\n"
            " Or quarantine the test: move it to tests/quarantine/ and\n"
            " file an issue — see docs/QUARANTINE_PROCESS.md"
        )

View File

@@ -0,0 +1,28 @@
# tests/quarantine/
This directory holds tests that have been **quarantined** because they are
flaky or temporarily broken. Quarantine keeps the main test suite green while
ensuring no test is silently deleted or forgotten.
## Rules
1. Every file here must correspond to an open issue.
2. Every test here must carry `@pytest.mark.quarantine(reason="#NNN")`.
3. Quarantined tests are **excluded from the default CI run** but are included
when you pass `--run-quarantine`.
4. The quarantine is a **temporary holding area**, not a graveyard. If a
quarantined test's issue has been open for more than 30 days with no
progress, escalate it.
## Adding a test
```python
# tests/quarantine/test_my_flaky_thing.py
import pytest
@pytest.mark.quarantine(reason="Flaky: #1234")
def test_my_flaky_thing():
...
```
See `docs/QUARANTINE_PROCESS.md` for the complete workflow.

View File

@@ -0,0 +1,2 @@
# Quarantined tests live here.
# See docs/QUARANTINE_PROCESS.md for the full quarantine workflow.

View File

@@ -0,0 +1,341 @@
"""Tests for the poka-yoke cron heartbeat system.
Covers:
- nexus/cron_heartbeat.py (write utility)
- bin/check_cron_heartbeats.py (meta-heartbeat checker)
Refs: #1096
"""
from __future__ import annotations
import importlib.util
import json
import os
import sys
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
# ── Load modules under test ───────────────────────────────────────────
# The scripts live outside an importable package (nexus/, bin/), so they
# are loaded by file path and registered under private aliases in
# sys.modules (required so dataclasses/pickling inside them resolve).
PROJECT_ROOT = Path(__file__).parent.parent

_hb_spec = importlib.util.spec_from_file_location(
    "_cron_heartbeat",
    PROJECT_ROOT / "nexus" / "cron_heartbeat.py",
)
_hb = importlib.util.module_from_spec(_hb_spec)
sys.modules["_cron_heartbeat"] = _hb
_hb_spec.loader.exec_module(_hb)

_chk_spec = importlib.util.spec_from_file_location(
    "_check_cron_heartbeats",
    PROJECT_ROOT / "bin" / "check_cron_heartbeats.py",
)
_chk = importlib.util.module_from_spec(_chk_spec)
sys.modules["_check_cron_heartbeats"] = _chk
_chk_spec.loader.exec_module(_chk)

# Re-export the names exercised by the tests below.
write_cron_heartbeat = _hb.write_cron_heartbeat
heartbeat_path = _hb.heartbeat_path
scan_heartbeats = _chk.scan_heartbeats
build_report = _chk.build_report
HeartbeatReport = _chk.HeartbeatReport
JobStatus = _chk.JobStatus
_read_job_status = _chk._read_job_status
_fmt_duration = _chk._fmt_duration
# ── nexus/cron_heartbeat.py ───────────────────────────────────────────
class TestWriteCronHeartbeat:
    """Behavioural tests for nexus/cron_heartbeat.write_cron_heartbeat."""

    def test_creates_file(self, tmp_path):
        """write_cron_heartbeat creates <job>.last in the given directory."""
        written = write_cron_heartbeat("my_job", interval_seconds=300, directory=tmp_path)
        expected = tmp_path / "my_job.last"
        assert written == expected
        assert expected.exists()

    def test_file_content(self, tmp_path):
        """Written file has all required fields."""
        write_cron_heartbeat("my_job", interval_seconds=600, status="ok", directory=tmp_path)
        payload = json.loads((tmp_path / "my_job.last").read_text())
        assert payload["job"] == "my_job"
        assert payload["interval_seconds"] == 600
        assert payload["status"] == "ok"
        assert payload["pid"] == os.getpid()
        assert abs(payload["timestamp"] - time.time()) < 2

    def test_atomic_write_no_temp_files(self, tmp_path):
        """No temporary files remain after a successful write."""
        write_cron_heartbeat("my_job", interval_seconds=300, directory=tmp_path)
        leftovers = list(tmp_path.iterdir())
        assert [p.name for p in leftovers] == ["my_job.last"]

    def test_overwrites_cleanly(self, tmp_path):
        """Successive writes update, not append."""
        for beat_status in ("ok", "warn"):
            write_cron_heartbeat("j", interval_seconds=60, status=beat_status, directory=tmp_path)
        payload = json.loads((tmp_path / "j.last").read_text())
        assert payload["status"] == "warn"

    def test_creates_parent_dirs(self, tmp_path):
        """Parent directories are created as needed."""
        nested = tmp_path / "a" / "b" / "c"
        write_cron_heartbeat("j", interval_seconds=60, directory=nested)
        assert (nested / "j.last").exists()

    def test_heartbeat_path_helper(self, tmp_path):
        """heartbeat_path() returns the correct path without writing."""
        candidate = heartbeat_path("myjob", directory=tmp_path)
        assert candidate == tmp_path / "myjob.last"
        assert not candidate.exists()

    def test_env_var_override(self, tmp_path, monkeypatch):
        """BEZALEL_HEARTBEAT_DIR env var overrides the directory."""
        monkeypatch.setenv("BEZALEL_HEARTBEAT_DIR", str(tmp_path))
        # Call without directory= so it uses the env var
        written = write_cron_heartbeat("env_job", interval_seconds=120)
        assert written.parent == tmp_path
# ── bin/check_cron_heartbeats.py ─────────────────────────────────────
class TestScanHeartbeats:
    """Behavioural tests for bin/check_cron_heartbeats.scan_heartbeats."""

    @staticmethod
    def _write_beat(directory, name, *, age=0.0, interval=300, pid=1):
        """Drop a well-formed <name>.last heartbeat file into *directory*."""
        (directory / f"{name}.last").write_text(json.dumps({
            "job": name,
            "timestamp": time.time() - age,
            "interval_seconds": interval,
            "pid": pid,
            "status": "ok",
        }))

    def test_empty_dir(self, tmp_path):
        """No .last files → empty list."""
        assert scan_heartbeats(tmp_path) == []

    def test_nonexistent_dir(self, tmp_path):
        """Missing directory → empty list (no exception)."""
        assert scan_heartbeats(tmp_path / "nope") == []

    def test_healthy_job(self, tmp_path):
        """Fresh heartbeat → healthy."""
        self._write_beat(tmp_path, "myjob")
        statuses = scan_heartbeats(tmp_path)
        assert len(statuses) == 1
        assert statuses[0].healthy is True
        assert statuses[0].job == "myjob"

    def test_stale_job(self, tmp_path):
        """Heartbeat older than 2× interval → stale."""
        # 700s of silence against a 300s interval → staleness ratio ≈ 2.33.
        self._write_beat(tmp_path, "slow", age=700, interval=300)
        statuses = scan_heartbeats(tmp_path)
        assert statuses[0].healthy is False
        assert statuses[0].staleness_ratio > 2.0

    def test_missing_heartbeat_file(self, tmp_path):
        """_read_job_status handles a file that disappears mid-scan."""
        phantom = tmp_path / "ghost.last"
        verdict = _read_job_status("ghost", phantom)
        assert verdict.healthy is False
        assert "missing" in verdict.raw_status

    def test_corrupt_heartbeat(self, tmp_path):
        """Corrupt JSON → unhealthy with 'corrupt' status."""
        mangled = tmp_path / "bad.last"
        mangled.write_text("{not valid json")
        verdict = _read_job_status("bad", mangled)
        assert verdict.healthy is False
        assert "corrupt" in verdict.raw_status

    def test_multiple_jobs(self, tmp_path):
        """Multiple .last files are all reported."""
        for offset, name in enumerate(["alpha", "beta", "gamma"]):
            self._write_beat(tmp_path, name, age=offset * 10, pid=offset + 1)
        statuses = scan_heartbeats(tmp_path)
        assert len(statuses) == 3
        assert {s.job for s in statuses} == {"alpha", "beta", "gamma"}

    def test_non_last_files_ignored(self, tmp_path):
        """.json and other extensions are ignored."""
        (tmp_path / "other.json").write_text("{}")
        (tmp_path / "notes.txt").write_text("hello")
        assert scan_heartbeats(tmp_path) == []
class TestHeartbeatReport:
    """Exercises HeartbeatReport aggregation, panel rendering, and alerting."""

    def _fresh_job(self, name="j"):
        """A JobStatus whose last beat is well inside its interval."""
        now = time.time()
        return JobStatus(
            job=name,
            path=Path(f"/tmp/{name}.last"),
            healthy=True,
            age_seconds=30,
            interval_seconds=300,
            staleness_ratio=0.1,
            last_timestamp=now - 30,
            pid=1,
            raw_status="ok",
            message="Last beat 30s ago (ratio 0.1x)",
        )

    def _stale_job(self, name="s"):
        """A JobStatus that has been silent past the 2x-interval cutoff."""
        now = time.time()
        return JobStatus(
            job=name,
            path=Path(f"/tmp/{name}.last"),
            healthy=False,
            age_seconds=700,
            interval_seconds=300,
            staleness_ratio=2.33,
            last_timestamp=now - 700,
            pid=1,
            raw_status="stale",
            message="Silent for 11m 40s (2.3x interval of 5m 0s)",
        )

    def _report(self, jobs):
        """Wrap *jobs* in a HeartbeatReport stamped with the current time."""
        return HeartbeatReport(
            timestamp=time.time(),
            heartbeat_dir=Path("/tmp"),
            jobs=jobs,
        )

    def test_overall_healthy(self):
        report = self._report([self._fresh_job()])
        assert report.overall_healthy is True

    def test_overall_unhealthy(self):
        report = self._report([self._fresh_job(), self._stale_job()])
        assert report.overall_healthy is False
        assert len(report.stale_jobs) == 1

    def test_panel_markdown_contains_table(self):
        report = self._report([self._fresh_job("alpha"), self._stale_job("beta")])
        panel = report.to_panel_markdown()
        # The panel must carry the header, one table row per job, both
        # status labels, and the overall verdict.
        for expected in (
            "## Heartbeat Panel",
            "| `alpha` |",
            "| `beta` |",
            "STALE",
            "OK",
            "**Overall:** ALERT",
        ):
            assert expected in panel

    def test_panel_markdown_no_jobs(self):
        panel = self._report([]).to_panel_markdown()
        assert "no heartbeat files found" in panel

    def test_panel_overall_ok(self):
        panel = self._report([self._fresh_job()]).to_panel_markdown()
        assert "**Overall:** OK" in panel

    def test_alert_body_lists_stale_jobs(self):
        body = self._report([self._stale_job("slow")]).to_alert_body()
        assert "slow" in body
        lowered = body.lower()
        # ASCII-equivalent of the original's case-insensitive checks.
        assert "stale" in lowered or "silent" in lowered
        assert "crontab" in lowered

    def test_to_json(self):
        data = self._report([self._fresh_job()]).to_json()
        assert data["healthy"] is True
        assert len(data["jobs"]) == 1
        assert data["jobs"][0]["job"] == "j"
class TestFmtDuration:
    """_fmt_duration renders a second count as its two largest units."""

    def test_seconds(self):
        rendered = _fmt_duration(45)
        assert rendered == "45s"

    def test_minutes(self):
        rendered = _fmt_duration(90)
        assert rendered == "1m 30s"

    def test_hours(self):
        rendered = _fmt_duration(3661)
        assert rendered == "1h 1m"
class TestBuildReport:
    """build_report() end-to-end against a real temp directory."""

    def test_build_report_with_dir(self, tmp_path):
        """build_report() uses the given directory."""
        beat = {
            "job": "myjob",
            "timestamp": time.time(),
            "interval_seconds": 300,
            "pid": 1,
            "status": "ok",
        }
        (tmp_path / "myjob.last").write_text(json.dumps(beat))
        report = build_report(directory=tmp_path)
        assert len(report.jobs) == 1
        assert report.overall_healthy is True

    def test_build_report_empty_dir(self, tmp_path):
        """An empty directory yields an empty but healthy report."""
        report = build_report(directory=tmp_path)
        assert report.jobs == []
        # Nothing stale counts as healthy overall.
        assert report.overall_healthy is True
# ── Integration: nexus_watchdog writes its own heartbeat ─────────────
class TestWatchdogHeartbeatIntegration:
    """Verify nexus_watchdog.py writes a cron heartbeat after run_once()."""
    def test_watchdog_writes_cron_heartbeat(self, tmp_path, monkeypatch):
        """After run_once, nexus_watchdog writes nexus_watchdog.last."""
        # Point the watchdog's heartbeat output at the pytest temp dir so
        # the test never touches /var/run.
        monkeypatch.setenv("BEZALEL_HEARTBEAT_DIR", str(tmp_path))
        # Load watchdog module
        # Loaded by path (not a normal import) so the bin/ script can be
        # exercised without it being on sys.path as a package.
        spec = importlib.util.spec_from_file_location(
            "_watchdog_hb_test",
            PROJECT_ROOT / "bin" / "nexus_watchdog.py",
        )
        wd = importlib.util.module_from_spec(spec)
        # Registered in sys.modules so relative machinery inside the module
        # (dataclasses, patch targets) resolves normally.
        sys.modules["_watchdog_hb_test"] = wd
        # Patch out network calls
        # The patches wrap exec_module too: any socket/subprocess use at
        # import time is also mocked, not just run_once().
        with patch("socket.socket") as mock_sock, \
             patch("subprocess.run") as mock_run:
            mock_sock.return_value.connect_ex.return_value = 111  # port closed
            mock_run.return_value = MagicMock(returncode=1, stdout="")
            spec.loader.exec_module(wd)
            # Minimal stand-in for the watchdog's parsed CLI args.
            args = MagicMock()
            args.ws_host = "localhost"
            args.ws_port = 8765
            args.heartbeat_path = str(tmp_path / "nexus_heartbeat.json")
            args.stale_threshold = 300
            args.dry_run = True  # don't touch Gitea
            wd.run_once(args)
        # Even on an unhealthy run (port closed above), the watchdog must
        # still record its own heartbeat — that is the poka-yoke guarantee.
        hb_file = tmp_path / "nexus_watchdog.last"
        assert hb_file.exists(), "nexus_watchdog.last was not written"
        data = json.loads(hb_file.read_text())
        assert data["job"] == "nexus_watchdog"
        # NOTE(review): 300s matches the watchdog's assumed 5-min cron
        # cadence; the method may continue past this chunk — confirm.
        assert data["interval_seconds"] == 300