fix: harden cron audit workflow for #662

Split the audit into an importable cron_audit_662 module plus a CLI wrapper, classify recent systemic failures by error signature instead of age alone, and include enough metadata for issue filing and delivery-failure reporting. Add regression tests for import-path loading, systemic vs transient classification, and issue body generation.
2026-04-22 14:48:42 -04:00
4 changed files with 691 additions and 421 deletions
--- a/scripts/ci-cron-validate.py
+++ b/scripts/ci-cron-validate.py
--- a/scripts/cron-audit-662.py
+++ b/scripts/cron-audit-662.py
@@ -1,428 +1,11 @@
 #!/usr/bin/env python3
-"""
+"""CLI wrapper for the importable cron_audit_662 module."""
 Cron Fleet Audit Script — #662
 Reads hermes cron job state, categorizes all jobs into:
  - healthy: last_status=ok or never-run-and-enabled
  - transient: recent errors (likely network/timeout)
  - systemic: repeated errors over 48+ hours
 Outputs a JSON report and optionally:
  --disable  Disable systemic jobs erroring 48+ hours
  --issues   File Gitea issues for systemic failures
 """
 import json
 import sys
 import os
 import argparse
 from datetime import datetime, timezone, timedelta
 from pathlib import Path
-from typing import List, Dict, Any
+import sys
-# --- Config ---
+sys.path.insert(0, str(Path(__file__).resolve().parent))
-ERROR_THRESHOLD_HOURS = 48
+from cron_audit_662 import main
 CRON_STATE_PATHS = [
    Path.home() / ".hermes" / "cron" / "jobs.json",
    Path.home() / ".hermes" / "cron" / "state.json",
    Path("/root/.hermes/cron/jobs.json"),
    Path("/root/.hermes/cron/state.json"),
 ]
 def load_cron_state() -> List[Dict[str, Any]]:
    """Load cron job state from the first usable known location.

    Each path in ``CRON_STATE_PATHS`` is tried in order. A file counts as
    usable when it parses as JSON and is either a list or a dict holding a
    ``"jobs"`` key. If none qualifies, ``hermes cron list --json`` is
    invoked as a fallback.

    Returns:
        The job list, or ``[]`` when no source yields one.
    """
    for path in CRON_STATE_PATHS:
        # Open directly instead of calling path.exists() first: probing an
        # unreadable location (e.g. /root/... as a non-root user) can raise
        # PermissionError from exists(), which used to abort the whole scan.
        try:
            with open(path) as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, or invalid JSON / undecodable bytes.
            continue
        if isinstance(data, dict) and "jobs" in data:
            return data["jobs"]
        if isinstance(data, list):
            return data
    # Fallback: try hermes cron list CLI
    import subprocess
    try:
        result = subprocess.run(
            ["hermes", "cron", "list", "--json"],
            capture_output=True, text=True, timeout=30
        )
        if result.returncode == 0:
            data = json.loads(result.stdout)
            if isinstance(data, dict) and "jobs" in data:
                return data["jobs"]
            if isinstance(data, list):
                return data
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # Best effort: CLI missing, not executable, hung, or printed non-JSON.
        pass
    return []
 def parse_timestamp(ts: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    Args:
        ts: Timestamp text. A trailing ``Z`` (UTC designator) is accepted.
            Falsy values (``None``, ``""``) are treated as "no timestamp".

    Returns:
        A timezone-aware ``datetime`` (naive inputs are assumed to be UTC),
        or ``None`` when ``ts`` is empty or cannot be parsed.
    """
    if not ts:
        return None
    text = str(ts).strip()
    # Normalize timezone: datetime.fromisoformat() only understands a
    # trailing "Z" on newer Python versions, so spell it as an offset.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
 def categorize_job(job: Dict[str, Any], now: datetime) -> Dict[str, Any]:
    """Classify one cron job as healthy, transient, or systemic.

    Args:
        job: Raw job record; every field is read with ``.get`` so missing
            keys are tolerated.
        now: Timezone-aware reference time used to age the last run.

    Returns:
        A report entry with the job's identifying fields plus ``category``,
        ``reason`` and ``action``.
    """
    status = job.get("last_status", "")
    last_error = job.get("last_error", "")
    enabled = job.get("enabled", False)
    state = job.get("state", "unknown")
    entry = {
        "id": job.get("id", ""),
        "name": job.get("name", job.get("id", "unknown")),
        "schedule": job.get("schedule_display", str(job.get("schedule", ""))),
        "state": state,
        "enabled": enabled,
        "last_status": status,
        "last_error": last_error,
        "last_run_at": job.get("last_run_at"),
        "category": "healthy",
        "reason": "",
        "action": "",
    }
    # Never run / no error
    if status is None and not last_error:
        entry["reason"] = "Never run, no errors"
        return entry
    # Explicitly paused with reason
    if state == "paused":
        entry["reason"] = job.get("paused_reason", "Manually paused")
        entry["action"] = "none — paused intentionally"
        return entry
    # Completed jobs
    if state == "completed":
        entry["reason"] = "Completed (one-shot)"
        return entry
    # Error status. An error without a recorded message is still an error:
    # requiring last_error here let such jobs fall through to the
    # "Scheduled and running" branch and be reported as healthy.
    if status == "error":
        last_run = parse_timestamp(job.get("last_run_at"))
        age_hours = None
        if last_run:
            age_hours = (now - last_run).total_seconds() / 3600
        if age_hours is not None and age_hours >= ERROR_THRESHOLD_HOURS:
            entry["category"] = "systemic"
            entry["reason"] = f"Erroring for {age_hours:.1f}h (>{ERROR_THRESHOLD_HOURS}h threshold)"
            entry["action"] = "disable"
        else:
            entry["category"] = "transient"
            age_str = f"{age_hours:.1f}h ago" if age_hours is not None else "unknown age"
            entry["reason"] = f"Recent error ({age_str}), may be transient"
            entry["action"] = "monitor"
        return entry
    # OK status
    if status == "ok":
        entry["reason"] = "Last run succeeded"
        return entry
    # Scheduled but never errored
    if state == "scheduled" and enabled:
        entry["reason"] = "Scheduled and running"
        return entry
    # Unknown state
    entry["category"] = "transient"
    entry["reason"] = f"Unknown state: {state}, status: {status}"
    entry["action"] = "investigate"
    return entry
 def audit_jobs(jobs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Categorize every job and assemble the audit report.

    Args:
        jobs: Raw job records, each passed to ``categorize_job``.

    Returns:
        A report dict with the audit time, per-category counts, short
        listings of systemic and transient jobs, and every categorized entry.
    """
    stamp = datetime.now(timezone.utc)
    entries = []
    for job in jobs:
        entries.append(categorize_job(job, stamp))
    # Bucket entries by category in a single pass, preserving input order.
    by_category = {"healthy": [], "transient": [], "systemic": []}
    for item in entries:
        bucket = by_category.get(item["category"])
        if bucket is not None:
            bucket.append(item)
    systemic_rows = []
    for item in by_category["systemic"]:
        systemic_rows.append({
            "id": item["id"],
            "name": item["name"],
            "reason": item["reason"],
            "last_error": item["last_error"],
        })
    transient_rows = []
    for item in by_category["transient"]:
        transient_rows.append({
            "id": item["id"],
            "name": item["name"],
            "reason": item["reason"],
        })
    return {
        "audit_time": stamp.isoformat(),
        "total_jobs": len(jobs),
        "summary": {
            "healthy": len(by_category["healthy"]),
            "transient_errors": len(by_category["transient"]),
            "systemic_failures": len(by_category["systemic"]),
        },
        "systemic_jobs": systemic_rows,
        "transient_jobs": transient_rows,
        "all_jobs": entries,
    }
 def generate_issue_body(job: Dict[str, Any]) -> str:
    """Render the Markdown body of a Gitea issue for a systemic failure.

    Args:
        job: Report entry; must contain ``name``, ``id``, ``schedule``,
            ``state``, ``last_error`` and ``reason`` (a missing key raises
            ``KeyError``).

    Returns:
        The issue body text.
    """
    job_name = job['name']
    job_id = job['id']
    schedule = job['schedule']
    state = job['state']
    # An empty/None error is replaced by a placeholder line.
    error_text = job['last_error'] or 'No error details available'
    finding = job['reason']
    return f"""## Systemic Cron Failure — Auto-Filed by Audit #662
 **Job:** {job_name} (`{job_id}`)
 **Schedule:** {schedule}
 **State:** {state}
 **Last Error:**
 ```
 {error_text}
 ```
 **Audit Finding:** {finding}
 ### Action Required
 - [ ] Diagnose root cause of repeated failure
 - [ ] Fix configuration or remove broken job
 - [ ] Verify job resumes healthy after fix
 *Auto-generated by cron-audit-662.py*
 """
 # --- Crontab Parsing ---
 def parse_crontab(text: str, source: str = "unknown") -> list:
    """Parse crontab text into job-like dicts for the audit.

    Recognized lines start with either an ``@keyword`` schedule or five
    schedule fields made of digits and ``* , / -``, followed by a command.
    Blank lines, comment lines and anything else are skipped.

    Args:
        text: Contents of a crontab file.
        source: Label for where the crontab came from; embedded in each
            job's ``id`` and ``_source``.

    Returns:
        One dict per recognized entry, with no run history
        (``last_status``/``last_error``/``last_run_at`` are ``None``).
    """
    import re
    import zlib
    entry_pattern = re.compile(
        r'^(@\w+|[\d*,/\-]+\s+[\d*,/\-]+\s+[\d*,/\-]+\s+[\d*,/\-]+\s+[\d*,/\-]+)\s+(.*)'
    )
    jobs = []
    for line in text.split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        found = entry_pattern.match(line)
        if not found:
            continue
        schedule_raw = found.group(1).strip()
        command = found.group(2).strip()
        # Prefer a trailing "# name" comment as the job name; otherwise use
        # the basename of the last token before any ">>" redirection.
        name_part = command.split("#")[-1].strip() if "#" in command else ""
        if not name_part:
            cmd_base = command.split(">>")[0].strip().split()
            name_part = os.path.basename(cmd_base[-1]) if cmd_base else "unnamed"
        clean_cmd = command.split(">>")[0].split("#")[0].strip()[:200]
        # Builtin hash() of a str is salted per process, so it produced a
        # different id on every run; CRC32 keeps ids stable across audits.
        digest = zlib.crc32(command.encode("utf-8")) % 10000
        jobs.append({
            "id": f"crontab-{source}-{digest:04x}",
            "name": name_part,
            "schedule_display": schedule_raw,
            "schedule": schedule_raw,
            "state": "scheduled",
            "enabled": True,
            "last_status": None,
            "last_error": None,
            "last_run_at": None,
            "_source": f"crontab:{source}",
            "_command": clean_cmd,
        })
    return jobs
 def load_crontab_backups(backup_dir) -> list:
    """Collect jobs from every ``*-crontab-backup.txt`` file in a directory.

    Args:
        backup_dir: Directory to scan (string or path-like).

    Returns:
        Jobs parsed from all matching files, visited in sorted path order;
        ``[]`` when the directory does not exist.
    """
    from pathlib import Path
    root = Path(backup_dir)
    collected = []
    if root.exists():
        for backup in sorted(root.glob("*-crontab-backup.txt")):
            # The file name minus the fixed suffix labels the source host.
            label = backup.name.replace("-crontab-backup.txt", "")
            contents = backup.read_text()
            collected.extend(parse_crontab(contents, source=label))
    return collected
 def audit_fleet(hermes_jobs: list, crontab_jobs: list) -> dict:
    """Audit hermes cron jobs together with VPS crontab entries.

    Hermes jobs go through ``categorize_job``. Crontab entries carry no run
    history, so each is reported as healthy with a ``verify-logs`` action.

    Args:
        hermes_jobs: Raw hermes job records.
        crontab_jobs: Job-like dicts with at least ``id`` and ``name``.

    Returns:
        A report dict with counts per source and category, short systemic
        and transient listings, and all categorized entries.
    """
    stamp = datetime.now(timezone.utc)
    from_hermes = [categorize_job(job, stamp) for job in hermes_jobs]
    from_crontab = [
        {
            "id": job["id"],
            "name": job["name"],
            "schedule": job.get("schedule_display", ""),
            "state": "scheduled",
            "enabled": True,
            "last_status": None,
            "last_error": None,
            "last_run_at": None,
            "category": "healthy",
            "reason": f"Crontab ({job.get('_source', '?')}) — verify logs manually",
            "action": "verify-logs",
        }
        for job in crontab_jobs
    ]
    combined = from_hermes + from_crontab

    def _in_category(label):
        return [item for item in combined if item["category"] == label]

    healthy = _in_category("healthy")
    transient = _in_category("transient")
    systemic = _in_category("systemic")
    return {
        "audit_time": stamp.isoformat(),
        "total_jobs": len(combined),
        "hermes_jobs": len(from_hermes),
        "crontab_jobs": len(from_crontab),
        "summary": {
            "healthy": len(healthy),
            "transient_errors": len(transient),
            "systemic_failures": len(systemic),
        },
        "systemic_jobs": [
            {"id": item["id"], "name": item["name"], "reason": item["reason"], "last_error": item.get("last_error", "")}
            for item in systemic
        ],
        "transient_jobs": [
            {"id": item["id"], "name": item["name"], "reason": item["reason"]}
            for item in transient
        ],
        "all_jobs": combined,
    }
 def main():
    """Command-line entry point for the cron fleet audit.

    Loads hermes jobs (from ``--jobs-file`` or the default locations) and
    any VPS crontab backups next to the script, runs the audit, prints a
    summary or JSON, and optionally writes the report, pauses systemic jobs
    and files Gitea issues.

    Exits with status 1 when no jobs are found, when ``--issues`` is given
    without a Gitea token, or when any systemic failure is reported;
    otherwise exits 0.
    """
    parser = argparse.ArgumentParser(description="Cron fleet audit (#662)")
    parser.add_argument("--jobs-file", help="Path to jobs.json override")
    parser.add_argument("--disable", action="store_true",
                        help="Disable systemic jobs (requires hermes CLI)")
    parser.add_argument("--issues", action="store_true",
                        help="File Gitea issues for systemic failures")
    parser.add_argument("--output", help="Write report to file")
    parser.add_argument("--json", action="store_true", help="JSON output only")
    args = parser.parse_args()
    # Load jobs
    jobs = []
    if args.jobs_file:
        with open(args.jobs_file) as f:
            data = json.load(f)
        # A dict without a "jobs" key holds no job list; falling back to the
        # dict itself would make the audit iterate over its keys.
        jobs = data.get("jobs", []) if isinstance(data, dict) else data
    else:
        jobs = load_cron_state()
    # Also load VPS crontab backups
    crontab_dir = Path(__file__).parent.parent / "cron" / "vps"
    crontab_jobs = load_crontab_backups(crontab_dir)
    if not jobs:
        print("ERROR: No cron jobs found. Check ~/.hermes/cron/ or run 'hermes cron list'.")
        sys.exit(1)
    # Run audit
    if crontab_jobs:
        report = audit_fleet(jobs, crontab_jobs)
    else:
        report = audit_jobs(jobs)
    # Output
    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(f"\n{'='*60}")
        print(f"  CRON FLEET AUDIT — {report['total_jobs']} jobs")
        print(f"{'='*60}")
        print(f"  Healthy:           {report['summary']['healthy']}")
        print(f"  Transient errors:  {report['summary']['transient_errors']}")
        print(f"  Systemic failures: {report['summary']['systemic_failures']}")
        print(f"{'='*60}")
        if report["systemic_jobs"]:
            print(f"\n  SYSTEMIC FAILURES (>{ERROR_THRESHOLD_HOURS}h):")
            for j in report["systemic_jobs"]:
                print(f"    - {j['name']} ({j['id']}): {j['reason']}")
                if j["last_error"]:
                    print(f"      Error: {j['last_error'][:100]}")
        if report["transient_jobs"]:
            print(f"\n  TRANSIENT ERRORS:")
            for j in report["transient_jobs"]:
                print(f"    - {j['name']} ({j['id']}): {j['reason']}")
        print()
    # Write report file
    if args.output:
        with open(args.output, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Report written to {args.output}")
    # Disable systemic jobs
    if args.disable and report["systemic_jobs"]:
        import subprocess
        for j in report["systemic_jobs"]:
            print(f"Disabling: {j['name']} ({j['id']})")
            try:
                proc = subprocess.run(
                    ["hermes", "cron", "pause", j["id"]],
                    capture_output=True, text=True, timeout=10
                )
            except Exception as e:
                print(f"  → Failed: {e}")
                continue
            # Only claim success when the CLI actually reported it.
            if proc.returncode == 0:
                print(f"  → Disabled")
            else:
                detail = (proc.stderr or proc.stdout or "").strip()
                print(f"  → Failed: exit code {proc.returncode}" + (f" ({detail})" if detail else ""))
    # File issues for systemic failures
    if args.issues and report["systemic_jobs"]:
        gitea_token = os.environ.get("GITEA_TOKEN") or ""
        if not gitea_token:
            token_path = Path.home() / ".config" / "gitea" / "token"
            if token_path.exists():
                gitea_token = token_path.read_text().strip()
        if not gitea_token:
            print("ERROR: No Gitea token found. Set GITEA_TOKEN or ~/.config/gitea/token")
            sys.exit(1)
        import urllib.request
        base = "https://forge.alexanderwhitestone.com/api/v1"
        headers = {
            "Authorization": f"token {gitea_token}",
            "Content-Type": "application/json",
        }
        for j in report["systemic_jobs"]:
            title = f"CRON FAIL: {j['name']} — systemic error ({j['id']})"
            body = generate_issue_body(j)
            data = json.dumps({"title": title, "body": body}).encode()
            req = urllib.request.Request(
                f"{base}/repos/Timmy_Foundation/timmy-config/issues",
                data=data, headers=headers, method="POST"
            )
            try:
                # Bounded wait so an unresponsive forge cannot hang the
                # audit; the context manager closes the connection.
                with urllib.request.urlopen(req, timeout=30) as resp:
                    result = json.loads(resp.read())
                print(f"Issued #{result['number']}: {title}")
            except Exception as e:
                print(f"Failed to file issue for {j['name']}: {e}")
    # Exit code: non-zero if systemic failures found
    sys.exit(1 if report["systemic_jobs"] else 0)
 if __name__ == "__main__":
--- a/scripts/cron_audit_662.py
+++ b/scripts/cron_audit_662.py
@@ -0,0 +1,630 @@
 #!/usr/bin/env python3
 """
 Cron Fleet Audit Script — #662
 Reads hermes cron job state, categorizes all jobs into:
  - healthy: last_status=ok or never-run-and-enabled
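  - transient: recent errors with a transient or unrecognized signature (e.g. network/timeout/delivery)
  - systemic: errors older than 48 hours, or recent errors matching a systemic signature (e.g. import/attribute/permission)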
 Outputs a JSON report and optionally:
  --disable  Disable systemic jobs erroring 48+ hours
  --issues   File Gitea issues for systemic failures
 """
 import argparse
 import json
 import os
 import re
 import subprocess
 import sys
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 # --- Config ---
 # Age (in hours) of the last failing run at or above which classify_error()
 # reports the failure as systemic regardless of its error text.
 ERROR_THRESHOLD_HOURS = 48
 # Substrings that mark an error as transient. classify_error() lowercases the
 # combined error text before matching, so every marker must be lowercase.
 TRANSIENT_ERROR_MARKERS = (
    "timeout",
    "timed out",
    "rate limit",
    "429",
    "503",
    "temporary",
    "temporarily",
    "auth store lock",
    "connection reset",
    "connection refused",
    "connection aborted",
    "delivery error",
    "telegram send failed",
    "matrix send failed",
    "nodename nor servname provided",
    "name or service not known",
    "empty response",
 )
 # Substrings that mark an error as systemic. classify_error() checks these
 # before the transient markers, so a systemic match wins when both appear.
 SYSTEMIC_ERROR_MARKERS = (
    "cannot import",
    "modulenotfounderror",
    "importerror",
    "attributeerror",
    "syntaxerror",
    "permission denied",
    "no such file",
    "file not found",
    "dict' object has no attribute",
    'dict" object has no attribute',
 )
 # Candidate job-state files, tried in order by load_cron_state(); the first
 # one that yields a non-empty job list is used.
 CRON_STATE_PATHS = [
    Path.home() / ".hermes" / "cron" / "jobs.json",
    Path.home() / ".hermes" / "cron" / "state.json",
    Path("/root/.hermes/cron/jobs.json"),
    Path("/root/.hermes/cron/state.json"),
 ]
 def load_jobs_file(path: Path) -> List[Dict[str, Any]]:
    """Read a cron jobs/state JSON file and return its job list.

    Args:
        path: File to read.

    Returns:
        The top-level list, or the value under ``"jobs"`` when the document
        is a dict containing that key; ``[]`` for any other JSON shape.
    """
    with path.open() as handle:
        payload = json.load(handle)
    if isinstance(payload, list):
        return payload
    has_jobs_key = isinstance(payload, dict) and "jobs" in payload
    return payload["jobs"] if has_jobs_key else []
 def parse_cron_list_output(text: str) -> List[Dict[str, Any]]:
    """Turn the text output of `hermes cron list --all` into job dicts.

    A job starts at a header line of the form ``  <id> [<state>]`` (exactly
    two leading whitespace characters). The ``Name:``, ``Schedule:`` and
    ``Last run:`` lines that follow fill in that job. Lines before the first
    header are ignored.

    Args:
        text: Raw command output.

    Returns:
        One dict per job header, in the order encountered.
    """
    header_re = re.compile(r"^\s{2}(?P<id>\S+) \[(?P<state>[^\]]+)\]\s*$")
    parsed: List[Dict[str, Any]] = []
    job: Optional[Dict[str, Any]] = None
    for raw_line in text.splitlines():
        header = header_re.match(raw_line.rstrip())
        if header is not None:
            # A new header closes the job collected so far.
            if job:
                parsed.append(job)
            job_id = header.group("id")
            listed_state = header.group("state").strip().lower()
            job = {
                "id": job_id,
                "name": job_id,
                "schedule_display": "",
                "schedule": "",
                "state": "paused" if listed_state == "paused" else "scheduled",
                "enabled": listed_state == "active",
                "last_status": None,
                "last_error": None,
                "last_delivery_error": None,
                "last_run_at": None,
            }
            continue
        if not job:
            continue
        field = raw_line.strip()
        if field.startswith("Name:"):
            job["name"] = field[len("Name:"):].strip()
        elif field.startswith("Schedule:"):
            expr = field[len("Schedule:"):].strip()
            job["schedule_display"] = expr
            job["schedule"] = expr
        elif field.startswith("Last run:"):
            last_run = field[len("Last run:"):].strip()
            if last_run in ("-", ""):
                continue
            # "<timestamp>  error: <message>" or "<timestamp>  ok"; anything
            # else is kept verbatim as the timestamp.
            stamp, marker, message = last_run.partition("  error: ")
            if marker:
                job["last_run_at"] = stamp.strip()
                job["last_status"] = "error"
                job["last_error"] = message.strip()
            elif last_run.endswith("  ok"):
                job["last_run_at"] = last_run[:-4].strip()
                job["last_status"] = "ok"
            else:
                job["last_run_at"] = last_run
    if job:
        parsed.append(job)
    return parsed
 def load_cron_state() -> List[Dict[str, Any]]:
    """Load cron job state from the first known location that has jobs.

    Each path in ``CRON_STATE_PATHS`` is tried in order via
    ``load_jobs_file``. If none yields a non-empty job list, the text output
    of ``hermes cron list --all`` is parsed as a last resort.

    Returns:
        The job list, or ``[]`` when no source yields any jobs.
    """
    for path in CRON_STATE_PATHS:
        # Load directly instead of calling path.exists() first: probing an
        # unreadable location (e.g. /root/... as a non-root user) can raise
        # PermissionError from exists(), which used to abort the whole scan.
        try:
            jobs = load_jobs_file(path)
        except (OSError, ValueError):
            # Missing/unreadable file, or invalid JSON / undecodable bytes.
            continue
        if jobs:
            return jobs
    try:
        result = subprocess.run(
            ["hermes", "cron", "list", "--all"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError):
        # Best effort: CLI missing, not executable, or hung.
        return []
    jobs = parse_cron_list_output(result.stdout)
    if jobs:
        return jobs
    return []
 def parse_timestamp(ts: Optional[str]) -> Optional[datetime]:
    """Convert an ISO-8601 timestamp into a timezone-aware datetime.

    Args:
        ts: Timestamp text; ``Z`` is rewritten to ``+00:00`` before parsing.

    Returns:
        The parsed datetime (naive values are tagged as UTC), or ``None``
        when ``ts`` is falsy or not parseable.
    """
    if not ts:
        return None
    normalized = str(ts).strip().replace("Z", "+00:00")
    try:
        moment = datetime.fromisoformat(normalized)
    except ValueError:
        return None
    if moment.tzinfo is not None:
        return moment
    return moment.replace(tzinfo=timezone.utc)
 def format_schedule(job: Dict[str, Any]) -> str:
    """Produce a readable schedule string from a job's schedule fields.

    ``schedule_display`` wins when truthy. Otherwise ``schedule`` is used:
    for a dict, the first truthy value among ``display``, ``expr`` and
    ``value``, falling back to its sorted-key JSON dump; for anything else,
    its ``str()`` (empty string when falsy).
    """
    shown = job.get("schedule_display")
    if shown:
        return str(shown)
    raw = job.get("schedule")
    if not isinstance(raw, dict):
        return str(raw or "")
    for key in ("display", "expr", "value"):
        candidate = raw.get(key)
        if candidate:
            return candidate
    return json.dumps(raw, sort_keys=True)
 def find_error_marker(text: str, markers: tuple[str, ...]) -> Optional[str]:
    """Return the first marker (in ``markers`` order) found in ``text``.

    Matching is a plain, case-sensitive substring test. Returns ``None``
    when no marker occurs in ``text``.
    """
    return next((candidate for candidate in markers if candidate in text), None)
 def classify_error(last_error: str, last_delivery_error: str, age_hours: Optional[float]) -> tuple[str, str, str]:
    """Decide whether a failing job is transient or systemic.

    Checks run in this order: age at or above ``ERROR_THRESHOLD_HOURS``,
    then a systemic marker, then a transient marker. Markers are matched
    against the lowercased run and delivery errors joined by a newline.

    Args:
        last_error: Error text from the job's last run (may be empty).
        last_delivery_error: Error text from delivering the result (may be
            empty).
        age_hours: Hours since the last run, or ``None`` when unknown.

    Returns:
        A ``(category, reason, action)`` tuple.
    """
    present = [part for part in (last_error, last_delivery_error) if part]
    haystack = "\n".join(present).lower()
    if age_hours is None:
        age_text = "unknown age"
    else:
        age_text = f"{age_hours:.1f}h ago"
        if age_hours >= ERROR_THRESHOLD_HOURS:
            reason = f"Error persisted for {age_hours:.1f}h (>= {ERROR_THRESHOLD_HOURS}h threshold)"
            return ("systemic", reason, "disable")
    hit = find_error_marker(haystack, SYSTEMIC_ERROR_MARKERS)
    if hit:
        return ("systemic", f"Systemic error signature: {hit} ({age_text})", "disable")
    hit = find_error_marker(haystack, TRANSIENT_ERROR_MARKERS)
    if hit:
        return ("transient", f"Transient error signature: {hit} ({age_text})", "monitor")
    # No known signature: treat as transient but ask for a human look.
    return ("transient", f"Unclassified recent error ({age_text})", "investigate")
 def categorize_job(job: Dict[str, Any], now: datetime) -> Dict[str, Any]:
    """Build the audit entry for one job and assign its category.

    Checks are applied in order and the first match wins: never run, paused,
    completed, error status, ok status (with or without a delivery error),
    scheduled-and-enabled, and finally "unknown".

    Args:
        job: Raw job record; every field is read with ``.get``.
        now: Reference time subtracted from the parsed ``last_run_at``.

    Returns:
        The entry dict, including ``category``, ``reason`` and ``action``.
    """
    status = job.get("last_status", "")
    last_error = job.get("last_error", "")
    last_delivery_error = job.get("last_delivery_error", "")
    last_run = parse_timestamp(job.get("last_run_at"))
    enabled = job.get("enabled", False)
    state = job.get("state", "unknown")
    entry = {
        "id": job.get("id", ""),
        "name": job.get("name", job.get("id", "unknown")),
        "schedule": format_schedule(job),
        "state": state,
        "enabled": enabled,
        "last_status": status,
        "last_error": last_error,
        "last_delivery_error": last_delivery_error,
        "last_run_at": job.get("last_run_at"),
        "category": "healthy",
        "reason": "",
        "action": "",
    }

    def _verdict(category, reason, action=""):
        # Record the outcome on the entry and hand it back to the caller.
        entry["category"] = category
        entry["reason"] = reason
        entry["action"] = action
        return entry

    if status is None and not last_error:
        return _verdict("healthy", "Never run, no errors")
    if state == "paused":
        return _verdict(
            "healthy",
            job.get("paused_reason", "Manually paused"),
            "none — paused intentionally",
        )
    if state == "completed":
        return _verdict("healthy", "Completed (one-shot)")
    if status == "error":
        age_hours = (now - last_run).total_seconds() / 3600 if last_run else None
        category, reason, action = classify_error(
            str(last_error or ""),
            str(last_delivery_error or ""),
            age_hours,
        )
        return _verdict(category, reason, action)
    if status == "ok":
        if last_delivery_error:
            # The job itself worked; only delivering its result failed.
            return _verdict(
                "transient",
                f"Job completed but delivery failed: {last_delivery_error}",
                "monitor",
            )
        return _verdict("healthy", "Last run succeeded")
    if state == "scheduled" and enabled:
        return _verdict("healthy", "Scheduled and running")
    return _verdict("transient", f"Unknown state: {state}, status: {status}", "investigate")
 def audit_jobs(jobs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Categorize every job and assemble the audit report.

    Args:
        jobs: Raw job records, each passed to ``categorize_job``.

    Returns:
        A report dict with the audit time, per-category counts, detailed
        listings of systemic and transient jobs, and every categorized entry.
    """
    stamp = datetime.now(timezone.utc)
    entries = [categorize_job(job, stamp) for job in jobs]
    listed_fields = (
        "id",
        "name",
        "schedule",
        "state",
        "reason",
        "last_error",
        "last_delivery_error",
        "last_run_at",
        "action",
    )

    def _select(label):
        return [item for item in entries if item["category"] == label]

    def _rows(items):
        # Project each entry onto the fields shown in the report listings.
        return [{field: item[field] for field in listed_fields} for item in items]

    healthy = _select("healthy")
    transient = _select("transient")
    systemic = _select("systemic")
    return {
        "audit_time": stamp.isoformat(),
        "total_jobs": len(jobs),
        "summary": {
            "healthy": len(healthy),
            "transient_errors": len(transient),
            "systemic_failures": len(systemic),
        },
        "systemic_jobs": _rows(systemic),
        "transient_jobs": _rows(transient),
        "all_jobs": entries,
    }
 def generate_issue_body(job: Dict[str, Any]) -> str:
    """Render the Markdown body of a Gitea issue for a systemic failure.

    Missing fields fall back to placeholder text. The delivery-error
    section is included only when ``last_delivery_error`` is non-empty.

    Args:
        job: Report entry for the failing job.

    Returns:
        The issue body text.
    """
    delivery_error = job.get("last_delivery_error") or ""
    delivery_block = (
        f"\n**Last Delivery Error:**\n```\n{delivery_error}\n```\n"
        if delivery_error
        else ""
    )
    job_name = job.get('name', 'unknown')
    job_id = job.get('id', 'unknown')
    schedule = job.get('schedule', 'unknown')
    state = job.get('state', 'unknown')
    error_text = job.get('last_error') or 'No error details available'
    finding = job.get('reason', 'No audit reason captured')
    return f"""## Systemic Cron Failure — Auto-Filed by Audit #662
 **Job:** {job_name} (`{job_id}`)
 **Schedule:** {schedule}
 **State:** {state}
 **Last Error:**
 ```
 {error_text}
 ```
 {delivery_block}
 **Audit Finding:** {finding}
 ### Action Required
 - [ ] Diagnose root cause of repeated failure
 - [ ] Fix configuration or remove broken job
 - [ ] Verify job resumes healthy after fix
 *Auto-generated by cron-audit-662.py*
 """
 # --- Crontab Parsing ---
 def parse_crontab(text: str, source: str = "unknown") -> list:
    """Parse crontab text into job-like dicts for the audit.

    Recognized lines start with either an ``@keyword`` schedule or five
    schedule fields made of digits and ``* , / -``, followed by a command.
    Blank lines, comment lines and anything else are skipped.

    Args:
        text: Contents of a crontab file.
        source: Label for where the crontab came from; embedded in each
            job's ``id`` and ``_source``.

    Returns:
        One dict per recognized entry, with no run history
        (``last_status``/``last_error``/``last_run_at`` are ``None``).
    """
    import re
    import zlib
    entry_pattern = re.compile(
        r'^(@\w+|[\d*,/\-]+\s+[\d*,/\-]+\s+[\d*,/\-]+\s+[\d*,/\-]+\s+[\d*,/\-]+)\s+(.*)'
    )
    jobs = []
    for line in text.split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        found = entry_pattern.match(line)
        if not found:
            continue
        schedule_raw = found.group(1).strip()
        command = found.group(2).strip()
        # Prefer a trailing "# name" comment as the job name; otherwise use
        # the basename of the last token before any ">>" redirection.
        name_part = command.split("#")[-1].strip() if "#" in command else ""
        if not name_part:
            cmd_base = command.split(">>")[0].strip().split()
            name_part = os.path.basename(cmd_base[-1]) if cmd_base else "unnamed"
        clean_cmd = command.split(">>")[0].split("#")[0].strip()[:200]
        # Builtin hash() of a str is salted per process, so it produced a
        # different id on every run; CRC32 keeps ids stable across audits.
        digest = zlib.crc32(command.encode("utf-8")) % 10000
        jobs.append({
            "id": f"crontab-{source}-{digest:04x}",
            "name": name_part,
            "schedule_display": schedule_raw,
            "schedule": schedule_raw,
            "state": "scheduled",
            "enabled": True,
            "last_status": None,
            "last_error": None,
            "last_run_at": None,
            "_source": f"crontab:{source}",
            "_command": clean_cmd,
        })
    return jobs
 def load_crontab_backups(backup_dir) -> list:
    """Collect jobs from every ``*-crontab-backup.txt`` file in a directory.

    Args:
        backup_dir: Directory to scan (string or path-like).

    Returns:
        Jobs parsed from all matching files, visited in sorted path order;
        ``[]`` when the directory does not exist.
    """
    from pathlib import Path
    root = Path(backup_dir)
    collected = []
    if root.exists():
        for backup in sorted(root.glob("*-crontab-backup.txt")):
            # The file name minus the fixed suffix labels the source host.
            label = backup.name.replace("-crontab-backup.txt", "")
            contents = backup.read_text()
            collected.extend(parse_crontab(contents, source=label))
    return collected
 def audit_fleet(hermes_jobs: list, crontab_jobs: list) -> dict:
    """Audit hermes cron jobs together with VPS crontab entries.

    Hermes jobs go through ``categorize_job``. Crontab entries carry no run
    history, so each is reported as healthy with a ``verify-logs`` action.

    Args:
        hermes_jobs: Raw hermes job records.
        crontab_jobs: Job-like dicts with at least ``id`` and ``name``.

    Returns:
        A report dict with counts per source and category, detailed systemic
        and transient listings, and all categorized entries.
    """
    stamp = datetime.now(timezone.utc)
    from_hermes = [categorize_job(job, stamp) for job in hermes_jobs]
    from_crontab = [
        {
            "id": job["id"],
            "name": job["name"],
            "schedule": job.get("schedule_display", ""),
            "state": "scheduled",
            "enabled": True,
            "last_status": None,
            "last_error": None,
            "last_delivery_error": None,
            "last_run_at": None,
            "category": "healthy",
            "reason": f"Crontab ({job.get('_source', '?')}) — verify logs manually",
            "action": "verify-logs",
        }
        for job in crontab_jobs
    ]
    combined = from_hermes + from_crontab

    def _in_category(label):
        return [item for item in combined if item["category"] == label]

    def _row(item):
        # Listing row: the first five fields are required, the rest optional.
        return {
            "id": item["id"],
            "name": item["name"],
            "schedule": item["schedule"],
            "state": item["state"],
            "reason": item["reason"],
            "last_error": item.get("last_error", ""),
            "last_delivery_error": item.get("last_delivery_error", ""),
            "last_run_at": item.get("last_run_at"),
            "action": item.get("action", ""),
        }

    healthy = _in_category("healthy")
    transient = _in_category("transient")
    systemic = _in_category("systemic")
    return {
        "audit_time": stamp.isoformat(),
        "total_jobs": len(combined),
        "hermes_jobs": len(from_hermes),
        "crontab_jobs": len(from_crontab),
        "summary": {
            "healthy": len(healthy),
            "transient_errors": len(transient),
            "systemic_failures": len(systemic),
        },
        "systemic_jobs": [_row(item) for item in systemic],
        "transient_jobs": [_row(item) for item in transient],
        "all_jobs": combined,
    }
 def main():
    """Run the cron fleet audit from the command line.

    Loads Hermes job state (from ``--jobs-file`` when given, otherwise via
    ``load_cron_state()``) plus any VPS crontab backups found in ``cron/vps``
    under the parent of this file's directory, builds the report, prints it,
    and then optionally writes it to ``--output``, pauses systemic jobs
    (``--disable``) and files Gitea issues for them (``--issues``).

    Always terminates through ``sys.exit``: status 1 when no jobs are found,
    when ``--issues`` is requested without a Gitea token, or when the report
    lists systemic jobs; status 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Cron fleet audit (#662)")
    parser.add_argument("--jobs-file", help="Path to jobs.json override")
    parser.add_argument("--disable", action="store_true",
                        help="Disable systemic jobs (requires hermes CLI)")
    parser.add_argument("--issues", action="store_true",
                        help="File Gitea issues for systemic failures")
    parser.add_argument("--output", help="Write report to file")
    parser.add_argument("--json", action="store_true", help="JSON output only")
    args = parser.parse_args()
    # Load jobs
    if args.jobs_file:
        with open(args.jobs_file, encoding="utf-8") as f:
            data = json.load(f)
        # Accept either {"jobs": [...]} or a bare list of jobs.
        jobs = data.get("jobs", data) if isinstance(data, dict) else data
    else:
        jobs = load_cron_state()
    # Also load VPS crontab backups
    crontab_dir = Path(__file__).parent.parent / "cron" / "vps"
    crontab_jobs = load_crontab_backups(crontab_dir)
    if not jobs:
        print("ERROR: No cron jobs found. Check ~/.hermes/cron/ or run 'hermes cron list'.")
        sys.exit(1)
    # Run audit
    if crontab_jobs:
        report = audit_fleet(jobs, crontab_jobs)
    else:
        report = audit_jobs(jobs)
    # Output
    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(f"\n{'='*60}")
        print(f"  CRON FLEET AUDIT — {report['total_jobs']} jobs")
        print(f"{'='*60}")
        print(f"  Healthy:           {report['summary']['healthy']}")
        print(f"  Transient errors:  {report['summary']['transient_errors']}")
        print(f"  Systemic failures: {report['summary']['systemic_failures']}")
        print(f"{'='*60}")
        if report["systemic_jobs"]:
            print(f"\n  SYSTEMIC FAILURES (>{ERROR_THRESHOLD_HOURS}h):")
            for j in report["systemic_jobs"]:
                print(f"    - {j['name']} ({j['id']}): {j['reason']}")
                if j["last_error"]:
                    print(f"      Error: {j['last_error'][:100]}")
        if report["transient_jobs"]:
            print(f"\n  TRANSIENT ERRORS:")
            for j in report["transient_jobs"]:
                print(f"    - {j['name']} ({j['id']}): {j['reason']}")
        print()
    # Write report file
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"Report written to {args.output}")
    # Disable systemic jobs
    if args.disable and report["systemic_jobs"]:
        import subprocess
        for j in report["systemic_jobs"]:
            print(f"Disabling: {j['name']} ({j['id']})")
            try:
                proc = subprocess.run(
                    ["hermes", "cron", "pause", j["id"]],
                    capture_output=True, text=True, timeout=10
                )
            except Exception as e:
                # Best effort: one job failing to pause must not abort the rest.
                print(f"  → Failed: {e}")
                continue
            # subprocess.run() does not raise on a non-zero exit status, so the
            # return code has to be checked before claiming success.
            if proc.returncode == 0:
                print(f"  → Disabled")
            else:
                detail = (proc.stderr or proc.stdout or "").strip()
                suffix = f": {detail}" if detail else ""
                print(f"  → Failed: exit code {proc.returncode}{suffix}")
    # File issues for systemic failures
    if args.issues and report["systemic_jobs"]:
        # Token lookup order: environment variable first, then the token file.
        gitea_token = os.environ.get("GITEA_TOKEN") or ""
        if not gitea_token:
            token_path = Path.home() / ".config" / "gitea" / "token"
            if token_path.exists():
                gitea_token = token_path.read_text().strip()
        if not gitea_token:
            print("ERROR: No Gitea token found. Set GITEA_TOKEN or ~/.config/gitea/token")
            sys.exit(1)
        import urllib.request
        base = "https://forge.alexanderwhitestone.com/api/v1"
        headers = {
            "Authorization": f"token {gitea_token}",
            "Content-Type": "application/json",
        }
        for j in report["systemic_jobs"]:
            title = f"CRON FAIL: {j['name']} — systemic error ({j['id']})"
            body = generate_issue_body(j)
            data = json.dumps({"title": title, "body": body}).encode()
            req = urllib.request.Request(
                f"{base}/repos/Timmy_Foundation/timmy-config/issues",
                data=data, headers=headers, method="POST"
            )
            try:
                # Bound the request so an unresponsive server cannot hang the
                # audit, and close the response once it has been read.
                with urllib.request.urlopen(req, timeout=30) as resp:
                    result = json.loads(resp.read())
                print(f"Issued #{result['number']}: {title}")
            except Exception as e:
                # Best effort: keep filing the remaining issues.
                print(f"Failed to file issue for {j['name']}: {e}")
    # Exit code: non-zero if systemic failures found
    sys.exit(1 if report["systemic_jobs"] else 0)
 # Script entry point: run the audit only when this file is executed directly,
 # not when it is imported. main() ends the process itself via sys.exit().
 if __name__ == "__main__":
    main()
--- a/tests/test_cron_audit.py
+++ b/tests/test_cron_audit.py
@@ -71,6 +71,43 @@ class TestCategorizeJob:
        r = categorize_job({"name": "t", "state": "paused", "enabled": False}, datetime.now(timezone.utc))
        assert r["category"] == "healthy"
    def test_import_error_is_systemic_even_when_recent(self):
        """An import failure from one hour ago must be systemic with action "disable"."""
        from cron_audit_662 import categorize_job

        audit_time = datetime.now(timezone.utc)
        failed_run = audit_time - timedelta(hours=1)
        job = {
            "name": "t",
            "last_status": "error",
            "last_error": "cannot import name 'AIAgent' from 'run_agent'",
            "last_run_at": failed_run.isoformat(),
        }

        verdict = categorize_job(job, audit_time)

        assert verdict["category"] == "systemic"
        assert verdict["action"] == "disable"
    def test_empty_response_stays_transient(self):
        """An empty-response error from one hour ago must be categorized as transient."""
        from cron_audit_662 import categorize_job

        audit_time = datetime.now(timezone.utc)
        failed_run = audit_time - timedelta(hours=1)
        job = {
            "name": "t",
            "last_status": "error",
            "last_error": "Agent completed but produced empty response (model error, timeout, or misconfiguration)",
            "last_run_at": failed_run.isoformat(),
        }

        verdict = categorize_job(job, audit_time)

        assert verdict["category"] == "transient"
    def test_delivery_failure_after_success_is_transient(self):
        """A job whose run was ok but whose delivery failed must be transient."""
        from cron_audit_662 import categorize_job

        audit_time = datetime.now(timezone.utc)
        job = {
            "name": "t",
            "last_status": "ok",
            "last_delivery_error": "delivery error: Telegram send failed: Timed out",
            "last_run_at": audit_time.isoformat(),
            "enabled": True,
            "state": "scheduled",
        }

        verdict = categorize_job(job, audit_time)

        assert verdict["category"] == "transient"
        # The reason text has to call out the delivery problem explicitly.
        assert "delivery failed" in verdict["reason"]
 class TestAuditFleet:
    def test_empty(self):
@@ -116,3 +153,23 @@ class TestTimestampParsing:
        from cron_audit_662 import parse_timestamp
        assert parse_timestamp("") is None
        assert parse_timestamp(None) is None
 class TestIssueBody:
    """Checks the issue text produced by generate_issue_body."""

    def test_includes_schedule_state_and_delivery_error(self):
        """The body must mention the job name, schedule, state and delivery-error heading."""
        from cron_audit_662 import generate_issue_body

        systemic_job = {
            "id": "job-1",
            "name": "Health Monitor",
            "schedule": "every 5m",
            "state": "scheduled",
            "last_error": "cannot import name 'tool' from 'tools.registry'",
            "last_delivery_error": "delivery error: Telegram send failed: Timed out",
            "reason": "Systemic error signature: cannot import (1.0h ago)",
        }

        rendered = generate_issue_body(systemic_job)

        # Checked in the same order as before so the first failure is unchanged.
        for fragment in ("Health Monitor", "every 5m", "scheduled", "Last Delivery Error"):
            assert fragment in rendered