diff --git a/cron/audit-report.json b/cron/audit-report.json new file mode 100644 index 00000000..71314191 --- /dev/null +++ b/cron/audit-report.json @@ -0,0 +1,104 @@ +{ + "audit_time": "2026-04-15T01:13:31.126215+00:00", + "total_jobs": 7, + "summary": { + "healthy": 7, + "transient_errors": 0, + "systemic_failures": 0 + }, + "systemic_jobs": [], + "transient_jobs": [], + "all_jobs": [ + { + "id": "9e0624269ba7", + "name": "Triage Heartbeat", + "schedule": "every 15m", + "state": "paused", + "enabled": false, + "last_status": "ok", + "last_error": null, + "last_run_at": "2026-03-24T15:33:57.749458-04:00", + "category": "healthy", + "reason": "Dashboard repo frozen - loops redirected to the-nexus", + "action": "none \u2014 paused intentionally" + }, + { + "id": "e29eda4a8548", + "name": "PR Review Sweep", + "schedule": "every 30m", + "state": "paused", + "enabled": false, + "last_status": "ok", + "last_error": null, + "last_run_at": "2026-03-24T15:21:42.995715-04:00", + "category": "healthy", + "reason": "Dashboard repo frozen - loops redirected to the-nexus", + "action": "none \u2014 paused intentionally" + }, + { + "id": "a77a87392582", + "name": "Health Monitor", + "schedule": "every 5m", + "state": "scheduled", + "enabled": true, + "last_status": "ok", + "last_error": null, + "last_run_at": "2026-03-24T15:34:39.045945-04:00", + "category": "healthy", + "reason": "Last run succeeded", + "action": "" + }, + { + "id": "36fb2f630a17", + "name": "Hermes Philosophy Loop", + "schedule": "every 1440m", + "state": "unknown", + "enabled": false, + "last_status": null, + "last_error": null, + "last_run_at": null, + "category": "healthy", + "reason": "Never run, no errors", + "action": "" + }, + { + "id": "muda-audit-weekly", + "name": "Muda Audit", + "schedule": "0 21 * * 0", + "state": "scheduled", + "enabled": true, + "last_status": null, + "last_error": null, + "last_run_at": null, + "category": "healthy", + "reason": "Never run, no errors", + "action": "" + }, + { + "id": "kaizen-retro-349", + "name": "Kaizen Retro", + "schedule": "daily at 07:30", + "state": "scheduled", + "enabled": true, + "last_status": null, + "last_error": null, + "last_run_at": null, + "category": "healthy", + "reason": "Never run, no errors", + "action": "" + }, + { + "id": "overnight-rd-nightly", + "name": "Overnight R&D Loop", + "schedule": "Nightly at 10 PM EDT", + "state": "scheduled", + "enabled": true, + "last_status": null, + "last_error": null, + "last_run_at": null, + "category": "healthy", + "reason": "Never run, no errors", + "action": "" + } + ] +} \ No newline at end of file diff --git a/scripts/cron-audit-662.py b/scripts/cron-audit-662.py new file mode 100644 index 00000000..9582b71f --- /dev/null +++ b/scripts/cron-audit-662.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +""" +Cron Fleet Audit Script — #662 + +Reads hermes cron job state, categorizes all jobs into: + - healthy: last_status=ok or never-run-and-enabled + - transient: recent errors (likely network/timeout) + - systemic: repeated errors over 48+ hours + +Outputs a JSON report and optionally: + --disable Disable systemic jobs erroring 48+ hours + --issues File Gitea issues for systemic failures +""" + +import json +import sys +import os +import argparse +from datetime import datetime, timezone, timedelta +from pathlib import Path +from typing import List, Dict, Any + +# --- Config --- +ERROR_THRESHOLD_HOURS = 48 +CRON_STATE_PATHS = [ + Path.home() / ".hermes" / "cron" / "jobs.json", + Path.home() / ".hermes" / "cron" / "state.json", + Path("/root/.hermes/cron/jobs.json"), + Path("/root/.hermes/cron/state.json"), +] + +def load_cron_state() -> List[Dict[str, Any]]: + """Load cron job state from known locations.""" + for path in CRON_STATE_PATHS: + if path.exists(): + try: + with open(path) as f: + data = json.load(f) + if isinstance(data, dict) and "jobs" in data: + return data["jobs"] + if isinstance(data, list): + return data + except (json.JSONDecodeError, IOError): + continue + + # Fallback: try hermes cron list CLI + try: + import subprocess + result = subprocess.run( + ["hermes", "cron", "list", "--json"], + capture_output=True, text=True, timeout=30 + ) + if result.returncode == 0: + data = json.loads(result.stdout) + if isinstance(data, dict) and "jobs" in data: + return data["jobs"] + if isinstance(data, list): + return data + except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError): + pass + + return [] + + +def parse_timestamp(ts: str) -> datetime: + """Parse ISO timestamp, handle various formats.""" + if not ts: + return None + # Normalize timezone + ts = ts.replace("+00:00", "+00:00") + try: + dt = datetime.fromisoformat(ts) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except ValueError: + return None + + +def categorize_job(job: Dict[str, Any], now: datetime) -> Dict[str, Any]: + """Categorize a single job.""" + status = job.get("last_status", "") + last_error = job.get("last_error", "") + last_run = parse_timestamp(job.get("last_run_at")) + enabled = job.get("enabled", False) + state = job.get("state", "unknown") + name = job.get("name", job.get("id", "unknown")) + + entry = { + "id": job.get("id", ""), + "name": name, + "schedule": job.get("schedule_display", str(job.get("schedule", ""))), + "state": state, + "enabled": enabled, + "last_status": status, + "last_error": last_error, + "last_run_at": job.get("last_run_at"), + "category": "healthy", + "reason": "", + "action": "", + } + + # Never run / no error + if status is None and not last_error: + entry["category"] = "healthy" + entry["reason"] = "Never run, no errors" + return entry + + # Explicitly paused with reason + if state == "paused": + entry["category"] = "healthy" + entry["reason"] = job.get("paused_reason", "Manually paused") + entry["action"] = "none — paused intentionally" + return entry + + # Completed jobs + if state == "completed": + entry["category"] = "healthy" + entry["reason"] = "Completed (one-shot)" + return entry + + # Error status + if status == "error" and last_error: + age_hours = None + if last_run: + age_hours = (now - last_run).total_seconds() / 3600 + + if age_hours is not None and age_hours >= ERROR_THRESHOLD_HOURS: + entry["category"] = "systemic" + entry["reason"] = f"Erroring for {age_hours:.1f}h (>{ERROR_THRESHOLD_HOURS}h threshold)" + entry["action"] = "disable" + else: + entry["category"] = "transient" + age_str = f"{age_hours:.1f}h ago" if age_hours is not None else "unknown age" + entry["reason"] = f"Recent error ({age_str}), may be transient" + entry["action"] = "monitor" + return entry + + # OK status + if status == "ok": + entry["category"] = "healthy" + entry["reason"] = "Last run succeeded" + return entry + + # Scheduled but never errored + if state == "scheduled" and enabled: + entry["category"] = "healthy" + entry["reason"] = "Scheduled and running" + return entry + + # Unknown state + entry["category"] = "transient" + entry["reason"] = f"Unknown state: {state}, status: {status}" + entry["action"] = "investigate" + return entry + + +def audit_jobs(jobs: List[Dict[str, Any]]) -> Dict[str, Any]: + """Run full audit on job list.""" + now = datetime.now(timezone.utc) + categorized = [categorize_job(j, now) for j in jobs] + + healthy = [c for c in categorized if c["category"] == "healthy"] + transient = [c for c in categorized if c["category"] == "transient"] + systemic = [c for c in categorized if c["category"] == "systemic"] + + report = { + "audit_time": now.isoformat(), + "total_jobs": len(jobs), + "summary": { + "healthy": len(healthy), + "transient_errors": len(transient), + "systemic_failures": len(systemic), + }, + "systemic_jobs": [ + { + "id": j["id"], + "name": j["name"], + "reason": j["reason"], + "last_error": j["last_error"], + } + for j in systemic + ], + "transient_jobs": [ + { + "id": j["id"], + "name": j["name"], + "reason": j["reason"], + } + for j in transient + ], + "all_jobs": categorized, + } + + return report + + +def generate_issue_body(job: Dict[str, Any]) -> str: + """Generate a Gitea issue body for a systemic cron failure.""" + return f"""## Systemic Cron Failure — Auto-Filed by Audit #662 + +**Job:** {job['name']} (`{job['id']}`) +**Schedule:** {job['schedule']} +**State:** {job['state']} +**Last Error:** +``` +{job['last_error'] or 'No error details available'} +``` + +**Audit Finding:** {job['reason']} + +### Action Required +- [ ] Diagnose root cause of repeated failure +- [ ] Fix configuration or remove broken job +- [ ] Verify job resumes healthy after fix + +*Auto-generated by cron-audit-662.py* +""" + + +def main(): + parser = argparse.ArgumentParser(description="Cron fleet audit (#662)") + parser.add_argument("--jobs-file", help="Path to jobs.json override") + parser.add_argument("--disable", action="store_true", + help="Disable systemic jobs (requires hermes CLI)") + parser.add_argument("--issues", action="store_true", + help="File Gitea issues for systemic failures") + parser.add_argument("--output", help="Write report to file") + parser.add_argument("--json", action="store_true", help="JSON output only") + args = parser.parse_args() + + # Load jobs + jobs = [] + if args.jobs_file: + with open(args.jobs_file) as f: + data = json.load(f) + jobs = data.get("jobs", data) if isinstance(data, dict) else data + else: + jobs = load_cron_state() + + if not jobs: + print("ERROR: No cron jobs found. Check ~/.hermes/cron/ or run 'hermes cron list'.") + sys.exit(1) + + # Run audit + report = audit_jobs(jobs) + + # Output + if args.json: + print(json.dumps(report, indent=2)) + else: + print(f"\n{'='*60}") + print(f" CRON FLEET AUDIT — {report['total_jobs']} jobs") + print(f"{'='*60}") + print(f" Healthy: {report['summary']['healthy']}") + print(f" Transient errors: {report['summary']['transient_errors']}") + print(f" Systemic failures: {report['summary']['systemic_failures']}") + print(f"{'='*60}") + + if report["systemic_jobs"]: + print(f"\n SYSTEMIC FAILURES (>{ERROR_THRESHOLD_HOURS}h):") + for j in report["systemic_jobs"]: + print(f" - {j['name']} ({j['id']}): {j['reason']}") + if j["last_error"]: + print(f" Error: {j['last_error'][:100]}") + + if report["transient_jobs"]: + print(f"\n TRANSIENT ERRORS:") + for j in report["transient_jobs"]: + print(f" - {j['name']} ({j['id']}): {j['reason']}") + + print() + + # Write report file + if args.output: + with open(args.output, "w") as f: + json.dump(report, f, indent=2) + print(f"Report written to {args.output}") + + # Disable systemic jobs + if args.disable and report["systemic_jobs"]: + import subprocess + for j in report["systemic_jobs"]: + print(f"Disabling: {j['name']} ({j['id']})") + try: + subprocess.run( + ["hermes", "cron", "pause", j["id"]], + capture_output=True, text=True, timeout=10 + ) + print(f" → Disabled") + except Exception as e: + print(f" → Failed: {e}") + + # File issues for systemic failures + if args.issues and report["systemic_jobs"]: + gitea_token = os.environ.get("GITEA_TOKEN") or "" + if not gitea_token: + token_path = Path.home() / ".config" / "gitea" / "token" + if token_path.exists(): + gitea_token = token_path.read_text().strip() + + if not gitea_token: + print("ERROR: No Gitea token found. Set GITEA_TOKEN or ~/.config/gitea/token") + sys.exit(1) + + import urllib.request + base = "https://forge.alexanderwhitestone.com/api/v1" + headers = { + "Authorization": f"token {gitea_token}", + "Content-Type": "application/json", + } + + for j in report["systemic_jobs"]: + title = f"CRON FAIL: {j['name']} — systemic error ({j['id']})" + body = generate_issue_body(j) + data = json.dumps({"title": title, "body": body}).encode() + req = urllib.request.Request( + f"{base}/repos/Timmy_Foundation/timmy-config/issues", + data=data, headers=headers, method="POST" + ) + try: + resp = urllib.request.urlopen(req) + result = json.loads(resp.read()) + print(f"Issued #{result['number']}: {title}") + except Exception as e: + print(f"Failed to file issue for {j['name']}: {e}") + + # Exit code: non-zero if systemic failures found + sys.exit(1 if report["systemic_jobs"] else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/cron_audit_662.py b/scripts/cron_audit_662.py new file mode 120000 index 00000000..f26ef96c --- /dev/null +++ b/scripts/cron_audit_662.py @@ -0,0 +1 @@ +cron-audit-662.py \ No newline at end of file diff --git a/tests/test_cron_audit.py b/tests/test_cron_audit.py new file mode 100644 index 00000000..a820129a --- /dev/null +++ b/tests/test_cron_audit.py @@ -0,0 +1,109 @@ +""" +Tests for scripts/cron-audit-662.py — cron fleet audit. +""" + +import json +import sys +import unittest +from datetime import datetime, timezone, timedelta +from pathlib import Path + +# Add scripts to path +sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) +from cron_audit_662 import categorize_job, audit_jobs + + +class TestCategorizeJob(unittest.TestCase): + def setUp(self): + self.now = datetime(2026, 4, 14, 20, 0, 0, tzinfo=timezone.utc) + + def test_healthy_ok(self): + job = {"id": "a1", "name": "Test", "last_status": "ok", "enabled": True, "state": "scheduled"} + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "healthy") + + def test_healthy_never_run(self): + job = {"id": "a2", "name": "Never", "last_status": None, "last_error": None} + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "healthy") + + def test_healthy_paused(self): + job = {"id": "a3", "name": "Paused", "state": "paused", "paused_reason": "intentional"} + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "healthy") + + def test_healthy_completed(self): + job = {"id": "a4", "name": "Done", "state": "completed"} + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "healthy") + + def test_transient_recent_error(self): + recent = (self.now - timedelta(hours=2)).isoformat() + job = { + "id": "t1", "name": "RecentErr", + "last_status": "error", + "last_error": "Connection timeout", + "last_run_at": recent, + "enabled": True, + "state": "scheduled", + } + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "transient") + self.assertIn("transient", result["reason"].lower()) + + def test_systemic_old_error(self): + old = (self.now - timedelta(hours=72)).isoformat() + job = { + "id": "s1", "name": "OldErr", + "last_status": "error", + "last_error": "ConfigError: bad config", + "last_run_at": old, + "enabled": True, + "state": "scheduled", + } + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "systemic") + self.assertEqual(result["action"], "disable") + + def test_systemic_boundary(self): + """48.1 hours should be systemic.""" + boundary = (self.now - timedelta(hours=48, minutes=6)).isoformat() + job = { + "id": "s2", "name": "Boundary", + "last_status": "error", + "last_error": "fail", + "last_run_at": boundary, + "enabled": True, + "state": "scheduled", + } + result = categorize_job(job, self.now) + self.assertEqual(result["category"], "systemic") + + +class TestAuditJobs(unittest.TestCase): + def test_empty(self): + report = audit_jobs([]) + self.assertEqual(report["total_jobs"], 0) + self.assertEqual(report["summary"]["healthy"], 0) + + def test_mixed_report(self): + now = datetime(2026, 4, 14, 20, 0, 0, tzinfo=timezone.utc) + old = (now - timedelta(hours=72)).isoformat() + recent = (now - timedelta(hours=1)).isoformat() + + jobs = [ + {"id": "h1", "name": "Healthy", "last_status": "ok", "enabled": True, "state": "scheduled"}, + {"id": "t1", "name": "Transient", "last_status": "error", "last_error": "timeout", "last_run_at": recent, "enabled": True, "state": "scheduled"}, + {"id": "s1", "name": "Systemic", "last_status": "error", "last_error": "config bad", "last_run_at": old, "enabled": True, "state": "scheduled"}, + {"id": "p1", "name": "Paused", "state": "paused", "paused_reason": "frozen"}, + ] + report = audit_jobs(jobs) + self.assertEqual(report["summary"]["healthy"], 2) + self.assertEqual(report["summary"]["transient_errors"], 1) + self.assertEqual(report["summary"]["systemic_failures"], 1) + self.assertEqual(len(report["systemic_jobs"]), 1) + self.assertEqual(report["systemic_jobs"][0]["name"], "Systemic") + + +if __name__ == "__main__": + unittest.main()