#!/usr/bin/env python3
"""
Cron Job Audit — Identify erroring jobs, categorize health, recommend actions.

Usage:
    python scripts/cron_audit.py                      # Full audit (Markdown)
    python scripts/cron_audit.py --json               # JSON output
    python scripts/cron_audit.py --disable-stale 48   # Also list error jobs whose
                                                      # last run is 48+ hours old
                                                      # (reported on stderr only;
                                                      # this script does not
                                                      # modify any job)
"""

import argparse
import json
import os
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Optional

# An erroring job whose last run is older than this many hours is treated as systemic.
STALE_AFTER_HOURS = 48

# Substrings (matched case-insensitively) used to classify the last error message.
TRANSIENT_SIGNALS = ("timeout", "connection", "network", "temporary", "rate limit", "429", "503")
SYSTEMIC_SIGNALS = ("not found", "import", "module", "attribute", "syntax", "permission", "404", "401")


def _jobs_from_payload(data: Any) -> list[dict]:
    """Extract the job list from decoded JSON: either ``{"jobs": [...]}`` or a bare list."""
    jobs = data.get("jobs", []) if isinstance(data, dict) else data
    return jobs if isinstance(jobs, list) else []


def get_cron_jobs() -> list[dict]:
    """Get all cron jobs from hermes.

    Runs ``hermes cron list --all`` and parses its stdout as JSON. If the
    command cannot be run (missing binary, timeout) or its output is not JSON,
    falls back to reading ``~/.hermes/cron/jobs.json``.

    Returns:
        The list of job dicts, or ``[]`` when neither source yields jobs.
        Failures are reported on stderr; this function does not raise for them.
    """
    cli_error: Optional[Exception] = None
    try:
        result = subprocess.run(
            ["hermes", "cron", "list", "--all"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # hermes missing, timed out, or produced undecodable output.
        cli_error = exc
    else:
        try:
            return _jobs_from_payload(json.loads(result.stdout))
        except json.JSONDecodeError:
            # Output was not JSON; fall through to the jobs file.
            cli_error = None

    try:
        jobs_file = Path.home() / ".hermes" / "cron" / "jobs.json"
        if jobs_file.exists():
            return _jobs_from_payload(json.loads(jobs_file.read_text()))
    except (OSError, ValueError, RuntimeError) as exc:
        print(f"Error fetching jobs: {exc}", file=sys.stderr)
        return []

    if cli_error is not None:
        print(f"Error fetching jobs: {cli_error}", file=sys.stderr)
    return []


def _staleness_hours(last_run: Any) -> float:
    """Return hours elapsed since the ISO-8601 timestamp *last_run* (0.0 if missing or unparseable)."""
    if not last_run:
        return 0.0
    try:
        # Older fromisoformat() implementations reject a trailing "Z".
        last_dt = datetime.fromisoformat(last_run.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        return 0.0
    if last_dt.tzinfo is None:
        # A naive timestamp cannot be subtracted from an aware "now";
        # it is assumed here to be UTC.
        last_dt = last_dt.replace(tzinfo=timezone.utc)
    return (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600


def categorize_job(job: dict) -> dict:
    """Categorize a job as healthy, transient error, or systemic error.

    Reads ``enabled``, ``last_status``, ``last_error``, ``last_run_at``,
    ``name`` and ``id`` from *job*.

    Returns:
        A dict with ``name``, ``id``, ``health`` and ``action``. ``health`` is
        one of ``disabled``, ``healthy``, ``systemic``, ``transient``,
        ``unknown_error`` or ``unknown``. Jobs whose ``last_status`` is
        ``"error"`` additionally carry ``staleness_hours`` (hours since
        ``last_run_at``, rounded to one decimal) and ``error`` (first 200
        characters of the error text, or ``"unknown"``).
    """
    base = {"name": job.get("name", job.get("id", "?")), "id": job.get("id")}

    if not job.get("enabled", True):
        return {**base, "health": "disabled", "action": "none"}

    last_status = job.get("last_status", "unknown")
    # An explicit None status is reported as healthy, same as "ok".
    if last_status == "ok" or last_status is None:
        return {**base, "health": "healthy", "action": "none"}
    if last_status != "error":
        return {**base, "health": "unknown", "action": "investigate"}

    raw_error = job.get("last_error") or ""
    # Coerce structured errors to text so matching and slicing cannot fail.
    error_text = raw_error if isinstance(raw_error, str) else str(raw_error)
    error_lower = error_text.lower()
    is_transient = any(signal in error_lower for signal in TRANSIENT_SIGNALS)
    is_systemic = any(signal in error_lower for signal in SYSTEMIC_SIGNALS)

    staleness = _staleness_hours(job.get("last_run_at", ""))

    # Systemic wins over transient when both kinds of signal are present.
    if is_systemic or staleness > STALE_AFTER_HOURS:
        health, action = "systemic", "disable"
    elif is_transient:
        health, action = "transient", "monitor"
    else:
        health, action = "unknown_error", "investigate"

    return {
        **base,
        "health": health,
        "action": action,
        "staleness_hours": round(staleness, 1),
        "error": error_text[:200] if error_text else "unknown",
    }


def run_audit(jobs: Optional[list[dict]] = None) -> dict:
    """Run full cron audit.

    Args:
        jobs: Job dicts to audit. When ``None`` (the default) they are
            fetched with ``get_cron_jobs()``.

    Returns:
        A report dict with ``generated_at`` (UTC ISO timestamp), ``total``,
        per-health counters, ``categories`` (health -> count), ``jobs`` (the
        categorized entries) and ``recommendations``. The same keys are
        present when there are no jobs, so the report can always be rendered.
    """
    if jobs is None:
        jobs = get_cron_jobs()

    categorized = [categorize_job(job) for job in jobs]

    by_health: dict[str, list[dict]] = {}
    for entry in categorized:
        by_health.setdefault(entry["health"], []).append(entry)

    def count(health: str) -> int:
        return len(by_health.get(health, []))

    recommendations = []
    if not jobs:
        recommendations.append("No jobs found or hermes not available")
    if count("systemic"):
        recommendations.append(f"DISABLE {count('systemic')} systemic error jobs (erroring 48+ hours)")
    if count("transient"):
        recommendations.append(f"MONITOR {count('transient')} transient error jobs (network/timeout)")
    if count("unknown_error"):
        recommendations.append(f"INVESTIGATE {count('unknown_error')} jobs with unclassified errors")

    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total": len(jobs),
        "healthy": count("healthy"),
        "transient_errors": count("transient"),
        "systemic_errors": count("systemic"),
        "disabled": count("disabled"),
        "unknown": count("unknown_error") + count("unknown"),
        "categories": {health: len(entries) for health, entries in by_health.items()},
        "jobs": categorized,
        "recommendations": recommendations,
    }


def to_markdown(audit: dict) -> str:
    """Render an audit report (as returned by ``run_audit``) as Markdown.

    Missing summary keys are rendered as zero / empty rather than raising, so
    a reduced report without counters can still be printed.
    """
    jobs = audit.get("jobs", [])
    lines = [
        "# Cron Job Audit Report",
        "",
        f"Generated: {audit.get('generated_at', '')[:16]}",
        "",
        "## Summary",
        "",
        "| Health | Count |",
        "|--------|-------|",
        f"| Healthy | {audit.get('healthy', 0)} |",
        f"| Transient errors | {audit.get('transient_errors', 0)} |",
        f"| Systemic errors | {audit.get('systemic_errors', 0)} |",
        f"| Disabled | {audit.get('disabled', 0)} |",
        f"| Unknown | {audit.get('unknown', 0)} |",
        f"| **Total** | **{audit.get('total', 0)}** |",
        "",
    ]

    recommendations = audit.get("recommendations", [])
    if recommendations:
        lines.extend(["## Recommendations", ""])
        lines.extend(f"- {rec}" for rec in recommendations)
        lines.append("")

    if audit.get("systemic_errors", 0) > 0:
        lines.extend(["## Systemic Errors (Recommend Disable)", ""])
        for j in jobs:
            if j["health"] == "systemic":
                lines.append(f"- `{j['id']}`: {j['name']} (stale {j.get('staleness_hours', '?')}h)")
                lines.append(f" Error: {j.get('error', 'unknown')}")

    if audit.get("transient_errors", 0) > 0:
        lines.extend(["", "## Transient Errors (Monitor)", ""])
        for j in jobs:
            if j["health"] == "transient":
                lines.append(f"- `{j['id']}`: {j['name']} — {j.get('error', 'unknown')[:100]}")

    # One Markdown line per entry; a space separator would flatten the table.
    return "\n".join(lines)


def main():
    """Parse CLI arguments, run the audit and print it as Markdown or JSON."""
    parser = argparse.ArgumentParser(description="Cron job audit")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--disable-stale", type=int, default=0, help="Disable jobs stale N+ hours")
    args = parser.parse_args()

    audit = run_audit()

    if args.json:
        print(json.dumps(audit, indent=2))
    else:
        print(to_markdown(audit))

    if args.disable_stale > 0:
        # Disabling is not implemented here; report the candidates on stderr
        # (keeping stdout clean) instead of silently ignoring the flag.
        stale_ids = [
            str(j.get("id"))
            for j in audit["jobs"]
            if j.get("staleness_hours", 0) >= args.disable_stale
        ]
        print(
            f"--disable-stale {args.disable_stale}: disabling is not implemented; "
            f"no jobs were modified. {len(stale_ids)} erroring job(s) last ran "
            f"{args.disable_stale}+ hours ago: {', '.join(stale_ids) or 'none'}",
            file=sys.stderr,
        )


if __name__ == "__main__":
    main()