scripts/cron_audit.py

#!/usr/bin/env python3
"""
Cron Job Audit — Identify erroring jobs, categorize health, recommend actions.

Usage:
    python scripts/cron_audit.py                    # Full audit
    python scripts/cron_audit.py --disable-stale 48 # Disable jobs erroring 48+ hours
    python scripts/cron_audit.py --json             # JSON output
"""

import json
import os
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any


def _jobs_from_payload(data: object) -> list[dict]:
    """Extract the job list from a decoded JSON payload.

    Accepts either a bare list of jobs or a dict carrying the list under the
    ``"jobs"`` key. Any other shape yields an empty list. Entries that are
    not dicts are dropped, because callers read job fields with ``.get``.
    """
    if isinstance(data, dict):
        data = data.get("jobs", [])
    if not isinstance(data, list):
        return []
    return [job for job in data if isinstance(job, dict)]


def get_cron_jobs() -> list[dict]:
    """Get all cron jobs from hermes.

    Runs ``hermes cron list --all`` and parses its stdout as JSON. If the
    command cannot be run (binary missing, timeout, undecodable output) or
    its output is not JSON, falls back to ``~/.hermes/cron/jobs.json``.

    Returns:
        A list of job dicts; empty when neither source yields any jobs.
        Failures are reported on stderr rather than raised.
    """
    payload = None
    try:
        result = subprocess.run(
            ["hermes", "cron", "list", "--all"],
            capture_output=True, text=True, timeout=30,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # A missing binary or a timeout must not prevent the file fallback.
        print(f"Error fetching jobs: {e}", file=sys.stderr)
    else:
        try:
            payload = json.loads(result.stdout)
        except json.JSONDecodeError:
            # Non-JSON output (e.g. empty stdout): use the jobs file instead.
            payload = None

    if payload is not None:
        return _jobs_from_payload(payload)

    # Fall back to parsing the jobs file directly.
    try:
        jobs_file = Path.home() / ".hermes" / "cron" / "jobs.json"
        return _jobs_from_payload(json.loads(jobs_file.read_text(encoding="utf-8")))
    except FileNotFoundError:
        return []
    except (OSError, ValueError, RuntimeError) as e:
        print(f"Error fetching jobs: {e}", file=sys.stderr)
        return []


# Substrings (matched case-insensitively) that mark an error as transient.
_TRANSIENT_SIGNALS = ("timeout", "connection", "network", "temporary", "rate limit", "429", "503")
# Substrings (matched case-insensitively) that mark an error as systemic.
_SYSTEMIC_SIGNALS = ("not found", "import", "module", "attribute", "syntax", "permission", "404", "401")


def _hours_since(timestamp: object) -> float:
    """Return the hours elapsed since an ISO-8601 timestamp.

    Returns 0 when the value is empty, not a string, or not parseable, so an
    unreadable timestamp never marks a job as stale by itself.
    """
    if not isinstance(timestamp, str) or not timestamp:
        return 0
    try:
        moment = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
    except ValueError:
        return 0
    if moment.tzinfo is None:
        # Subtracting a naive datetime from an aware one raises TypeError.
        # NOTE(review): naive timestamps are assumed to be UTC — confirm
        # against whatever writes ``last_run_at``.
        moment = moment.replace(tzinfo=timezone.utc)
    return (datetime.now(timezone.utc) - moment).total_seconds() / 3600


def categorize_job(job: dict) -> dict:
    """Categorize a job as healthy, transient error, or systemic error.

    Args:
        job: A job record. Reads ``id``, ``name``, ``enabled``,
            ``last_status``, ``last_error`` and ``last_run_at``.

    Returns:
        A dict with ``name``, ``id``, ``health`` and ``action``. Jobs whose
        last status is ``"error"`` also carry ``staleness_hours`` (hours
        since ``last_run_at``, rounded to one decimal) and ``error`` (the
        first 200 characters of the error, or ``"unknown"``).

        ``health`` is one of ``disabled``, ``healthy``, ``systemic``,
        ``transient``, ``unknown_error`` or ``unknown``.
    """
    summary = {"name": job.get("name", job.get("id", "?")), "id": job.get("id")}

    if not job.get("enabled", True):
        return {**summary, "health": "disabled", "action": "none"}

    last_status = job.get("last_status", "unknown")
    # A status of None is treated the same as "ok".
    if last_status == "ok" or last_status is None:
        return {**summary, "health": "healthy", "action": "none"}

    if last_status != "error":
        return {**summary, "health": "unknown", "action": "investigate"}

    last_error = job.get("last_error", "")
    # Coerce to text so a structured (non-string) error cannot crash the audit.
    error_text = str(last_error) if last_error else ""
    error_lower = error_text.lower()
    is_transient = any(signal in error_lower for signal in _TRANSIENT_SIGNALS)
    is_systemic = any(signal in error_lower for signal in _SYSTEMIC_SIGNALS)

    staleness_hours = _hours_since(job.get("last_run_at", ""))

    # Systemic wins over transient; an error older than 48 hours is treated
    # as systemic regardless of its message.
    if is_systemic or staleness_hours > 48:
        health, action = "systemic", "disable"
    elif is_transient:
        health, action = "transient", "monitor"
    else:
        health, action = "unknown_error", "investigate"

    return {
        **summary,
        "health": health,
        "action": action,
        "staleness_hours": round(staleness_hours, 1),
        "error": error_text[:200] if error_text else "unknown",
    }


def run_audit() -> dict:
    """Run full cron audit.

    Fetches jobs with ``get_cron_jobs``, classifies each one with
    ``categorize_job`` and aggregates the results.

    Returns:
        A dict with ``generated_at`` (UTC ISO-8601 string), ``total``, the
        per-health counts ``healthy``, ``transient_errors``,
        ``systemic_errors``, ``disabled`` and ``unknown``, a ``categories``
        mapping of health label to count, the categorized ``jobs`` list and
        a list of ``recommendations``. The same keys are present when no
        jobs were found, so consumers can index them unconditionally.
    """
    jobs = get_cron_jobs()
    categorized = [categorize_job(job) for job in jobs]

    # Group categorized jobs by health label.
    by_health: dict[str, list[dict]] = {}
    for entry in categorized:
        by_health.setdefault(entry["health"], []).append(entry)

    def count(health: str) -> int:
        return len(by_health.get(health, []))

    recommendations = []
    if not jobs:
        recommendations.append("No jobs found or hermes not available")
    if count("systemic"):
        recommendations.append(f"DISABLE {count('systemic')} systemic error jobs (erroring 48+ hours)")
    if count("transient"):
        recommendations.append(f"MONITOR {count('transient')} transient error jobs (network/timeout)")
    if count("unknown_error"):
        recommendations.append(f"INVESTIGATE {count('unknown_error')} jobs with unclassified errors")

    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total": len(jobs),
        "healthy": count("healthy"),
        "transient_errors": count("transient"),
        "systemic_errors": count("systemic"),
        "disabled": count("disabled"),
        # "unknown" merges unclassified errors with unrecognised statuses.
        "unknown": count("unknown_error") + count("unknown"),
        "categories": {health: len(entries) for health, entries in by_health.items()},
        "jobs": categorized,
        "recommendations": recommendations,
    }


def to_markdown(audit: dict) -> str:
    """Render an audit result as a Markdown report.

    Args:
        audit: A dict shaped like the return value of ``run_audit``. Missing
            count keys are rendered as 0 and a missing ``generated_at`` as
            ``unknown``, so a minimal audit still renders.

    Returns:
        The report as a single newline-joined string.
    """
    # Keep only "YYYY-MM-DDTHH:MM" from the ISO timestamp.
    generated = str(audit.get("generated_at", "unknown"))[:16]
    jobs = audit.get("jobs", [])

    lines = [
        "# Cron Job Audit Report",
        "",
        f"Generated: {generated}",
        "",
        "## Summary",
        "",
        "| Health | Count |",
        "|--------|-------|",
        f"| Healthy | {audit.get('healthy', 0)} |",
        f"| Transient errors | {audit.get('transient_errors', 0)} |",
        f"| Systemic errors | {audit.get('systemic_errors', 0)} |",
        f"| Disabled | {audit.get('disabled', 0)} |",
        f"| Unknown | {audit.get('unknown', 0)} |",
        f"| **Total** | **{audit.get('total', 0)}** |",
        "",
    ]

    recommendations = audit.get("recommendations", [])
    if recommendations:
        lines.extend(["## Recommendations", ""])
        lines.extend(f"- {item}" for item in recommendations)
        lines.append("")

    if audit.get("systemic_errors", 0) > 0:
        lines.extend(["## Systemic Errors (Recommend Disable)", ""])
        for j in jobs:
            if j["health"] == "systemic":
                lines.append(f"- `{j['id']}`: {j['name']} (stale {j.get('staleness_hours', '?')}h)")
                lines.append(f"  Error: {j.get('error', 'unknown')}")

    if audit.get("transient_errors", 0) > 0:
        lines.extend(["", "## Transient Errors (Monitor)", ""])
        for j in jobs:
            if j["health"] == "transient":
                # The error is cut to 100 characters to keep the bullet on one line.
                lines.append(f"- `{j['id']}`: {j['name']} — {str(j.get('error', 'unknown'))[:100]}")

    return "\n".join(lines)


def main():
    """Parse command-line options, run the audit and print the report.

    Prints JSON when ``--json`` is given, otherwise the Markdown report.
    With ``--disable-stale N`` (N > 0) it lists on stderr the jobs whose
    ``staleness_hours`` is at least N. No job is disabled by this script.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Cron job audit")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--disable-stale", type=int, default=0, help="Disable jobs stale N+ hours")
    args = parser.parse_args()

    audit = run_audit()

    if args.disable_stale > 0:
        # The flag used to be accepted and silently ignored. Surface that
        # instead of letting the caller believe jobs were disabled. The
        # notice goes to stderr so the --json output on stdout stays valid.
        # NOTE(review): wire this to the real hermes disable command once
        # its CLI syntax is confirmed.
        stale_ids = [
            str(job.get("id"))
            for job in audit.get("jobs", [])
            if job.get("staleness_hours", 0) >= args.disable_stale
        ]
        listing = ", ".join(stale_ids) if stale_ids else "none"
        print(
            f"--disable-stale is not implemented; no jobs were disabled. "
            f"Jobs erroring {args.disable_stale}+ hours: {listing}",
            file=sys.stderr,
        )

    if args.json:
        print(json.dumps(audit, indent=2))
    else:
        print(to_markdown(audit))


# Run the audit only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Merge PR #772: scripts/cron_audit.py (added) 2026-04-16 04:58:59 +00:00			`#!/usr/bin/env python3`
			`"""`
			`Cron Job Audit — Identify erroring jobs, categorize health, recommend actions.`

			`Usage:`
			`python scripts/cron_audit.py # Full audit`
			`python scripts/cron_audit.py --disable-stale 48 # Disable jobs erroring 48+ hours`
			`python scripts/cron_audit.py --json # JSON output`
			`"""`

			`import json`
			`import os`
			`import subprocess`
			`import sys`
			`from datetime import datetime, timedelta, timezone`
			`from pathlib import Path`
			`from typing import Any`


			`def get_cron_jobs() -> list[dict]:`
			`"""Get all cron jobs from hermes."""`
			`try:`
			`result = subprocess.run(`
			`["hermes", "cron", "list", "--all"],`
			`capture_output=True, text=True, timeout=30,`
			`)`
			`# Parse hermes cron list output`
			`jobs = []`
			`# hermes cron list outputs JSON via the tool`
			`try:`
			`data = json.loads(result.stdout)`
			`if isinstance(data, dict) and "jobs" in data:`
			`jobs = data["jobs"]`
			`elif isinstance(data, list):`
			`jobs = data`
			`except json.JSONDecodeError:`
			`# Fall back to parsing the jobs file directly`
			`jobs_file = Path.home() / ".hermes" / "cron" / "jobs.json"`
			`if jobs_file.exists():`
			`data = json.loads(jobs_file.read_text())`
			`jobs = data.get("jobs", [])`
			`return jobs`
			`except Exception as e:`
			`print(f"Error fetching jobs: {e}", file=sys.stderr)`
			`return []`


			`def categorize_job(job: dict) -> dict:`
			`"""Categorize a job as healthy, transient error, or systemic error."""`
			`last_status = job.get("last_status", "unknown")`
			`last_run = job.get("last_run_at", "")`
			`last_error = job.get("last_error", "")`
			`enabled = job.get("enabled", True)`
			`name = job.get("name", job.get("id", "?"))`

			`if not enabled:`
			`return {"name": name, "id": job.get("id"), "health": "disabled", "action": "none"}`

			`if last_status == "ok" or last_status is None:`
			`return {"name": name, "id": job.get("id"), "health": "healthy", "action": "none"}`

			`if last_status == "error":`
			`# Check if transient (network) or systemic (bad config)`
			`error_lower = (last_error or "").lower()`
			`transient_signals = ["timeout", "connection", "network", "temporary", "rate limit", "429", "503"]`
			`systemic_signals = ["not found", "import", "module", "attribute", "syntax", "permission", "404", "401"]`

			`is_transient = any(s in error_lower for s in transient_signals)`
			`is_systemic = any(s in error_lower for s in systemic_signals)`

			`# Check staleness`
			`staleness_hours = 0`
			`if last_run:`
			`try:`
			`last_dt = datetime.fromisoformat(last_run.replace("Z", "+00:00"))`
			`staleness_hours = (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600`
			`except Exception:`
			`pass`

			`if is_systemic or staleness_hours > 48:`
			`return {`
			`"name": name, "id": job.get("id"), "health": "systemic",`
			`"action": "disable", "staleness_hours": round(staleness_hours, 1),`
			`"error": last_error[:200] if last_error else "unknown",`
			`}`
			`elif is_transient:`
			`return {`
			`"name": name, "id": job.get("id"), "health": "transient",`
			`"action": "monitor", "staleness_hours": round(staleness_hours, 1),`
			`"error": last_error[:200] if last_error else "unknown",`
			`}`
			`else:`
			`return {`
			`"name": name, "id": job.get("id"), "health": "unknown_error",`
			`"action": "investigate", "staleness_hours": round(staleness_hours, 1),`
			`"error": last_error[:200] if last_error else "unknown",`
			`}`

			`return {"name": name, "id": job.get("id"), "health": "unknown", "action": "investigate"}`


			`def run_audit() -> dict:`
			`"""Run full cron audit."""`
			`jobs = get_cron_jobs()`

			`if not jobs:`
			`return {"total": 0, "categories": {}, "jobs": [], "recommendations": ["No jobs found or hermes not available"]}`

			`categorized = [categorize_job(j) for j in jobs]`

			`categories = {}`
			`for c in categorized:`
			`health = c["health"]`
			`categories.setdefault(health, []).append(c)`

			`recommendations = []`
			`if categories.get("systemic"):`
			`recommendations.append(f"DISABLE {len(categories['systemic'])} systemic error jobs (erroring 48+ hours)")`
			`if categories.get("transient"):`
			`recommendations.append(f"MONITOR {len(categories['transient'])} transient error jobs (network/timeout)")`
			`if categories.get("unknown_error"):`
			`recommendations.append(f"INVESTIGATE {len(categories['unknown_error'])} jobs with unclassified errors")`

			`return {`
			`"generated_at": datetime.now(timezone.utc).isoformat(),`
			`"total": len(jobs),`
			`"healthy": len(categories.get("healthy", [])),`
			`"transient_errors": len(categories.get("transient", [])),`
			`"systemic_errors": len(categories.get("systemic", [])),`
			`"disabled": len(categories.get("disabled", [])),`
			`"unknown": len(categories.get("unknown_error", [])) + len(categories.get("unknown", [])),`
			`"categories": {k: len(v) for k, v in categories.items()},`
			`"jobs": categorized,`
			`"recommendations": recommendations,`
			`}`


			`def to_markdown(audit: dict) -> str:`
			`lines = [`
			`"# Cron Job Audit Report",`
			`"",`
			`f"Generated: {audit['generated_at'][:16]}",`
			`"",`
			`"## Summary",`
			`"",`
			`f"\| Health \| Count \|",`
			`f"\|--------\|-------\|",`
			`f"\| Healthy \| {audit['healthy']} \|",`
			`f"\| Transient errors \| {audit['transient_errors']} \|",`
			`f"\| Systemic errors \| {audit['systemic_errors']} \|",`
			`f"\| Disabled \| {audit['disabled']} \|",`
			`f"\| Unknown \| {audit['unknown']} \|",`
			`f"\| Total \| {audit['total']} \|",`
			`"",`
			`]`

			`if audit["recommendations"]:`
			`lines.extend(["## Recommendations", ""])`
			`for r in audit["recommendations"]:`
			`lines.append(f"- {r}")`
			`lines.append("")`

			`if audit.get("systemic_errors", 0) > 0:`
			`lines.extend(["## Systemic Errors (Recommend Disable)", ""])`
			`for j in audit["jobs"]:`
			`if j["health"] == "systemic":`
			lines.append(f"- `{j['id']}`: {j['name']} (stale {j.get('staleness_hours', '?')}h)")
			`lines.append(f" Error: {j.get('error', 'unknown')}")`

			`if audit.get("transient_errors", 0) > 0:`
			`lines.extend(["", "## Transient Errors (Monitor)", ""])`
			`for j in audit["jobs"]:`
			`if j["health"] == "transient":`
			lines.append(f"- `{j['id']}`: {j['name']} — {j.get('error', 'unknown')[:100]}")

			`return "`
			`".join(lines)`


			`def main():`
			`import argparse`
			`parser = argparse.ArgumentParser(description="Cron job audit")`
			`parser.add_argument("--json", action="store_true")`
			`parser.add_argument("--disable-stale", type=int, default=0, help="Disable jobs stale N+ hours")`
			`args = parser.parse_args()`

			`audit = run_audit()`

			`if args.json:`
			`print(json.dumps(audit, indent=2))`
			`else:`
			`print(to_markdown(audit))`


			`if __name__ == "__main__":`
			`main()`