Compare commits

..

6 Commits

Author SHA1 Message Date
Alexander Whitestone
aa3d22d89c fix: cron fleet audit script + report (#662)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 33s
PR Checklist / pr-checklist (pull_request) Failing after 4m27s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 16s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m33s
Validate Config / Shell Script Lint (pull_request) Failing after 42s
Validate Config / Cron Syntax Check (pull_request) Successful in 9s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 12s
Validate Config / Playbook Schema Validation (pull_request) Successful in 22s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
- scripts/cron-audit-662.py: Full audit tool that categorizes cron jobs
  into healthy/transient/systemic based on error age (48h threshold)
  Flags: --disable (pause systemic), --issues (file Gitea issues),
  --output (JSON report), --jobs-file (override path)
  Reads from ~/.hermes/cron/ or falls back to 'hermes cron list'

- scripts/cron_audit_662.py: Symlink for importability

- tests/test_cron_audit.py: 9 tests covering categorization logic:
  healthy (ok, never-run, paused, completed), transient (recent error),
  systemic (old error, boundary cases), mixed audit report

- cron/audit-report.json: Initial audit of local jobs.json
  Result: 7/7 healthy, 0 transient, 0 systemic
2026-04-14 21:14:55 -04:00
ad751a6de6 docs: add pipeline scheduler README 2026-04-14 22:47:12 +00:00
130fa40f0c feat: add pipeline-scheduler cron job 2026-04-14 22:46:51 +00:00
82f9810081 feat: add nightly-pipeline-scheduler.sh 2026-04-14 22:46:38 +00:00
2548277137 cleanup test
Some checks failed
Architecture Lint / Linter Tests (push) Successful in 22s
Smoke Test / smoke (push) Failing after 21s
Validate Config / YAML Lint (push) Failing after 13s
Validate Config / JSON Validate (push) Successful in 14s
Validate Config / Python Syntax & Import Check (push) Failing after 1m9s
Validate Config / Shell Script Lint (push) Failing after 31s
Validate Config / Cron Syntax Check (push) Successful in 5s
Validate Config / Deploy Script Dry Run (push) Successful in 7s
Validate Config / Playbook Schema Validation (push) Successful in 16s
Architecture Lint / Linter Tests (pull_request) Successful in 14s
Smoke Test / smoke (pull_request) Failing after 13s
Validate Config / YAML Lint (pull_request) Failing after 12s
Validate Config / JSON Validate (pull_request) Successful in 13s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 54s
Validate Config / Shell Script Lint (pull_request) Failing after 21s
Validate Config / Cron Syntax Check (pull_request) Successful in 5s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 7s
Validate Config / Playbook Schema Validation (pull_request) Successful in 18s
PR Checklist / pr-checklist (pull_request) Failing after 3m27s
Architecture Lint / Lint Repository (push) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (push) Has been cancelled
2026-04-14 22:39:03 +00:00
2b234fde79 test: verify API works
Some checks failed
Architecture Lint / Linter Tests (push) Has been cancelled
Architecture Lint / Lint Repository (push) Has been cancelled
Smoke Test / smoke (push) Failing after 12s
Validate Config / YAML Lint (push) Failing after 11s
Validate Config / JSON Validate (push) Successful in 11s
Validate Config / Python Syntax & Import Check (push) Failing after 47s
Validate Config / Shell Script Lint (push) Failing after 33s
Validate Config / Cron Syntax Check (push) Successful in 10s
Validate Config / Deploy Script Dry Run (push) Successful in 10s
Validate Config / Playbook Schema Validation (push) Successful in 14s
Validate Config / Python Test Suite (push) Has been cancelled
2026-04-14 22:39:02 +00:00
8 changed files with 989 additions and 194 deletions

104
cron/audit-report.json Normal file
View File

@@ -0,0 +1,104 @@
{
"audit_time": "2026-04-15T01:13:31.126215+00:00",
"total_jobs": 7,
"summary": {
"healthy": 7,
"transient_errors": 0,
"systemic_failures": 0
},
"systemic_jobs": [],
"transient_jobs": [],
"all_jobs": [
{
"id": "9e0624269ba7",
"name": "Triage Heartbeat",
"schedule": "every 15m",
"state": "paused",
"enabled": false,
"last_status": "ok",
"last_error": null,
"last_run_at": "2026-03-24T15:33:57.749458-04:00",
"category": "healthy",
"reason": "Dashboard repo frozen - loops redirected to the-nexus",
"action": "none \u2014 paused intentionally"
},
{
"id": "e29eda4a8548",
"name": "PR Review Sweep",
"schedule": "every 30m",
"state": "paused",
"enabled": false,
"last_status": "ok",
"last_error": null,
"last_run_at": "2026-03-24T15:21:42.995715-04:00",
"category": "healthy",
"reason": "Dashboard repo frozen - loops redirected to the-nexus",
"action": "none \u2014 paused intentionally"
},
{
"id": "a77a87392582",
"name": "Health Monitor",
"schedule": "every 5m",
"state": "scheduled",
"enabled": true,
"last_status": "ok",
"last_error": null,
"last_run_at": "2026-03-24T15:34:39.045945-04:00",
"category": "healthy",
"reason": "Last run succeeded",
"action": ""
},
{
"id": "36fb2f630a17",
"name": "Hermes Philosophy Loop",
"schedule": "every 1440m",
"state": "unknown",
"enabled": false,
"last_status": null,
"last_error": null,
"last_run_at": null,
"category": "healthy",
"reason": "Never run, no errors",
"action": ""
},
{
"id": "muda-audit-weekly",
"name": "Muda Audit",
"schedule": "0 21 * * 0",
"state": "scheduled",
"enabled": true,
"last_status": null,
"last_error": null,
"last_run_at": null,
"category": "healthy",
"reason": "Never run, no errors",
"action": ""
},
{
"id": "kaizen-retro-349",
"name": "Kaizen Retro",
"schedule": "daily at 07:30",
"state": "scheduled",
"enabled": true,
"last_status": null,
"last_error": null,
"last_run_at": null,
"category": "healthy",
"reason": "Never run, no errors",
"action": ""
},
{
"id": "overnight-rd-nightly",
"name": "Overnight R&D Loop",
"schedule": "Nightly at 10 PM EDT",
"state": "scheduled",
"enabled": true,
"last_status": null,
"last_error": null,
"last_run_at": null,
"category": "healthy",
"reason": "Never run, no errors",
"action": ""
}
]
}

View File

@@ -0,0 +1,9 @@
- name: Nightly Pipeline Scheduler
schedule: '*/30 18-23,0-8 * * *' # Every 30 min, off-peak hours only
tasks:
- name: Check and start pipelines
shell: "bash scripts/nightly-pipeline-scheduler.sh"
env:
PIPELINE_TOKEN_LIMIT: "500000"
PIPELINE_PEAK_START: "9"
PIPELINE_PEAK_END: "18"

333
scripts/cron-audit-662.py Normal file
View File

@@ -0,0 +1,333 @@
#!/usr/bin/env python3
"""
Cron Fleet Audit Script — #662
Reads hermes cron job state, categorizes all jobs into:
- healthy: last_status=ok or never-run-and-enabled
- transient: recent errors (likely network/timeout)
- systemic: repeated errors over 48+ hours
Outputs a JSON report and optionally:
--disable Disable systemic jobs erroring 48+ hours
--issues File Gitea issues for systemic failures
"""
import json
import sys
import os
import argparse
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import List, Dict, Any
# --- Config ---
ERROR_THRESHOLD_HOURS = 48
CRON_STATE_PATHS = [
Path.home() / ".hermes" / "cron" / "jobs.json",
Path.home() / ".hermes" / "cron" / "state.json",
Path("/root/.hermes/cron/jobs.json"),
Path("/root/.hermes/cron/state.json"),
]
def load_cron_state() -> List[Dict[str, Any]]:
"""Load cron job state from known locations."""
for path in CRON_STATE_PATHS:
if path.exists():
try:
with open(path) as f:
data = json.load(f)
if isinstance(data, dict) and "jobs" in data:
return data["jobs"]
if isinstance(data, list):
return data
except (json.JSONDecodeError, IOError):
continue
# Fallback: try hermes cron list CLI
try:
import subprocess
result = subprocess.run(
["hermes", "cron", "list", "--json"],
capture_output=True, text=True, timeout=30
)
if result.returncode == 0:
data = json.loads(result.stdout)
if isinstance(data, dict) and "jobs" in data:
return data["jobs"]
if isinstance(data, list):
return data
except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError):
pass
return []
def parse_timestamp(ts: str) -> datetime:
"""Parse ISO timestamp, handle various formats."""
if not ts:
return None
# Normalize timezone
ts = ts.replace("+00:00", "+00:00")
try:
dt = datetime.fromisoformat(ts)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt
except ValueError:
return None
def categorize_job(job: Dict[str, Any], now: datetime) -> Dict[str, Any]:
"""Categorize a single job."""
status = job.get("last_status", "")
last_error = job.get("last_error", "")
last_run = parse_timestamp(job.get("last_run_at"))
enabled = job.get("enabled", False)
state = job.get("state", "unknown")
name = job.get("name", job.get("id", "unknown"))
entry = {
"id": job.get("id", ""),
"name": name,
"schedule": job.get("schedule_display", str(job.get("schedule", ""))),
"state": state,
"enabled": enabled,
"last_status": status,
"last_error": last_error,
"last_run_at": job.get("last_run_at"),
"category": "healthy",
"reason": "",
"action": "",
}
# Never run / no error
if status is None and not last_error:
entry["category"] = "healthy"
entry["reason"] = "Never run, no errors"
return entry
# Explicitly paused with reason
if state == "paused":
entry["category"] = "healthy"
entry["reason"] = job.get("paused_reason", "Manually paused")
entry["action"] = "none — paused intentionally"
return entry
# Completed jobs
if state == "completed":
entry["category"] = "healthy"
entry["reason"] = "Completed (one-shot)"
return entry
# Error status
if status == "error" and last_error:
age_hours = None
if last_run:
age_hours = (now - last_run).total_seconds() / 3600
if age_hours is not None and age_hours >= ERROR_THRESHOLD_HOURS:
entry["category"] = "systemic"
entry["reason"] = f"Erroring for {age_hours:.1f}h (>{ERROR_THRESHOLD_HOURS}h threshold)"
entry["action"] = "disable"
else:
entry["category"] = "transient"
age_str = f"{age_hours:.1f}h ago" if age_hours is not None else "unknown age"
entry["reason"] = f"Recent error ({age_str}), may be transient"
entry["action"] = "monitor"
return entry
# OK status
if status == "ok":
entry["category"] = "healthy"
entry["reason"] = "Last run succeeded"
return entry
# Scheduled but never errored
if state == "scheduled" and enabled:
entry["category"] = "healthy"
entry["reason"] = "Scheduled and running"
return entry
# Unknown state
entry["category"] = "transient"
entry["reason"] = f"Unknown state: {state}, status: {status}"
entry["action"] = "investigate"
return entry
def audit_jobs(jobs: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Run full audit on job list."""
now = datetime.now(timezone.utc)
categorized = [categorize_job(j, now) for j in jobs]
healthy = [c for c in categorized if c["category"] == "healthy"]
transient = [c for c in categorized if c["category"] == "transient"]
systemic = [c for c in categorized if c["category"] == "systemic"]
report = {
"audit_time": now.isoformat(),
"total_jobs": len(jobs),
"summary": {
"healthy": len(healthy),
"transient_errors": len(transient),
"systemic_failures": len(systemic),
},
"systemic_jobs": [
{
"id": j["id"],
"name": j["name"],
"reason": j["reason"],
"last_error": j["last_error"],
}
for j in systemic
],
"transient_jobs": [
{
"id": j["id"],
"name": j["name"],
"reason": j["reason"],
}
for j in transient
],
"all_jobs": categorized,
}
return report
def generate_issue_body(job: Dict[str, Any]) -> str:
"""Generate a Gitea issue body for a systemic cron failure."""
return f"""## Systemic Cron Failure — Auto-Filed by Audit #662
**Job:** {job['name']} (`{job['id']}`)
**Schedule:** {job['schedule']}
**State:** {job['state']}
**Last Error:**
```
{job['last_error'] or 'No error details available'}
```
**Audit Finding:** {job['reason']}
### Action Required
- [ ] Diagnose root cause of repeated failure
- [ ] Fix configuration or remove broken job
- [ ] Verify job resumes healthy after fix
*Auto-generated by cron-audit-662.py*
"""
def main():
parser = argparse.ArgumentParser(description="Cron fleet audit (#662)")
parser.add_argument("--jobs-file", help="Path to jobs.json override")
parser.add_argument("--disable", action="store_true",
help="Disable systemic jobs (requires hermes CLI)")
parser.add_argument("--issues", action="store_true",
help="File Gitea issues for systemic failures")
parser.add_argument("--output", help="Write report to file")
parser.add_argument("--json", action="store_true", help="JSON output only")
args = parser.parse_args()
# Load jobs
jobs = []
if args.jobs_file:
with open(args.jobs_file) as f:
data = json.load(f)
jobs = data.get("jobs", data) if isinstance(data, dict) else data
else:
jobs = load_cron_state()
if not jobs:
print("ERROR: No cron jobs found. Check ~/.hermes/cron/ or run 'hermes cron list'.")
sys.exit(1)
# Run audit
report = audit_jobs(jobs)
# Output
if args.json:
print(json.dumps(report, indent=2))
else:
print(f"\n{'='*60}")
print(f" CRON FLEET AUDIT — {report['total_jobs']} jobs")
print(f"{'='*60}")
print(f" Healthy: {report['summary']['healthy']}")
print(f" Transient errors: {report['summary']['transient_errors']}")
print(f" Systemic failures: {report['summary']['systemic_failures']}")
print(f"{'='*60}")
if report["systemic_jobs"]:
print(f"\n SYSTEMIC FAILURES (>{ERROR_THRESHOLD_HOURS}h):")
for j in report["systemic_jobs"]:
print(f" - {j['name']} ({j['id']}): {j['reason']}")
if j["last_error"]:
print(f" Error: {j['last_error'][:100]}")
if report["transient_jobs"]:
print(f"\n TRANSIENT ERRORS:")
for j in report["transient_jobs"]:
print(f" - {j['name']} ({j['id']}): {j['reason']}")
print()
# Write report file
if args.output:
with open(args.output, "w") as f:
json.dump(report, f, indent=2)
print(f"Report written to {args.output}")
# Disable systemic jobs
if args.disable and report["systemic_jobs"]:
import subprocess
for j in report["systemic_jobs"]:
print(f"Disabling: {j['name']} ({j['id']})")
try:
subprocess.run(
["hermes", "cron", "pause", j["id"]],
capture_output=True, text=True, timeout=10
)
print(f" → Disabled")
except Exception as e:
print(f" → Failed: {e}")
# File issues for systemic failures
if args.issues and report["systemic_jobs"]:
gitea_token = os.environ.get("GITEA_TOKEN") or ""
if not gitea_token:
token_path = Path.home() / ".config" / "gitea" / "token"
if token_path.exists():
gitea_token = token_path.read_text().strip()
if not gitea_token:
print("ERROR: No Gitea token found. Set GITEA_TOKEN or ~/.config/gitea/token")
sys.exit(1)
import urllib.request
base = "https://forge.alexanderwhitestone.com/api/v1"
headers = {
"Authorization": f"token {gitea_token}",
"Content-Type": "application/json",
}
for j in report["systemic_jobs"]:
title = f"CRON FAIL: {j['name']} — systemic error ({j['id']})"
body = generate_issue_body(j)
data = json.dumps({"title": title, "body": body}).encode()
req = urllib.request.Request(
f"{base}/repos/Timmy_Foundation/timmy-config/issues",
data=data, headers=headers, method="POST"
)
try:
resp = urllib.request.urlopen(req)
result = json.loads(resp.read())
print(f"Issued #{result['number']}: {title}")
except Exception as e:
print(f"Failed to file issue for {j['name']}: {e}")
# Exit code: non-zero if systemic failures found
sys.exit(1 if report["systemic_jobs"] else 0)
if __name__ == "__main__":
main()

1
scripts/cron_audit_662.py Symbolic link
View File

@@ -0,0 +1 @@
cron-audit-662.py

View File

@@ -0,0 +1,50 @@
# Nightly Pipeline Scheduler
Auto-starts batch pipelines when inference is available.
## What It Does
1. Checks inference provider health (OpenRouter, Ollama, RunPod)
2. Checks if it's off-peak hours (configurable, default: after 6PM)
3. Checks interactive session load (don't fight with live users)
4. Checks daily token budget (configurable limit)
5. Starts the highest-priority incomplete pipeline
## Pipeline Priority Order
| Priority | Pipeline | Deps | Max Tokens |
|----------|----------|------|------------|
| 1 | playground-factory | none | 100,000 |
| 2 | training-factory | none | 150,000 |
| 3 | knowledge-mine | training-factory running | 80,000 |
| 4 | adversary | knowledge-mine running | 50,000 |
| 5 | codebase-genome | none | 120,000 |
## Usage
```bash
# Normal run (used by cron)
./scripts/nightly-pipeline-scheduler.sh
# Dry run (show what would start)
./scripts/nightly-pipeline-scheduler.sh --dry-run
# Status report
./scripts/nightly-pipeline-scheduler.sh --status
# Force start during peak hours
./scripts/nightly-pipeline-scheduler.sh --force
```
## Configuration
Set via environment variables:
- `PIPELINE_TOKEN_LIMIT`: Daily token budget (default: 500,000)
- `PIPELINE_PEAK_START`: Peak hours start (default: 9)
- `PIPELINE_PEAK_END`: Peak hours end (default: 18)
- `HERMES_HOME`: Hermes home directory (default: ~/.hermes)
## Cron
Runs every 30 minutes. Off-peak only (unless --force).
See `cron/pipeline-scheduler.yml`.

View File

@@ -0,0 +1,383 @@
#!/usr/bin/env bash
# nightly-pipeline-scheduler.sh — Auto-start batch pipelines when inference is available.
#
# Checks provider health, pipeline progress, token budget, and interactive load.
# Starts the highest-priority incomplete pipeline that can run.
#
# Usage:
# ./scripts/nightly-pipeline-scheduler.sh # Normal run
# ./scripts/nightly-pipeline-scheduler.sh --dry-run # Show what would start
# ./scripts/nightly-pipeline-scheduler.sh --status # Pipeline status report
set -euo pipefail
# --- Configuration ---
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
BUDGET_FILE="${HERMES_HOME}/pipeline_budget.json"
STATE_FILE="${HERMES_HOME}/pipeline_state.json"
LOG_FILE="${HERMES_HOME}/logs/pipeline-scheduler.log"
TOKEN_DAILY_LIMIT="${PIPELINE_TOKEN_LIMIT:-500000}"
PEAK_HOURS_START="${PIPELINE_PEAK_START:-9}"
PEAK_HOURS_END="${PIPELINE_PEAK_END:-18}"
# Pipeline definitions (priority order)
# Each pipeline: name, script, max_tokens, dependencies
PIPELINES=(
"playground-factory|scripts/pipeline_playground_factory.sh|100000|none"
"training-factory|scripts/pipeline_training_factory.sh|150000|none"
"knowledge-mine|scripts/pipeline_knowledge_mine.sh|80000|training-factory"
"adversary|scripts/pipeline_adversary.sh|50000|knowledge-mine"
"codebase-genome|scripts/pipeline_codebase_genome.sh|120000|none"
)
# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'
# --- Helpers ---
now_hour() { date +%-H; }
is_peak_hours() {
local h=$(now_hour)
[[ $h -ge $PEAK_HOURS_START && $h -lt $PEAK_HOURS_END ]]
}
ensure_dirs() {
mkdir -p "$(dirname "$LOG_FILE")" "$(dirname "$BUDGET_FILE")" "$(dirname "$STATE_FILE")"
}
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"; }
get_budget_used_today() {
if [[ -f "$BUDGET_FILE" ]]; then
local today=$(date +%Y-%m-%d)
python3 -c "
import json, sys
with open('$BUDGET_FILE') as f:
d = json.load(f)
print(d.get('daily', {}).get('$today', {}).get('tokens_used', 0))
" 2>/dev/null || echo 0
else
echo 0
fi
}
get_budget_remaining() {
local used=$(get_budget_used_today)
echo $((TOKEN_DAILY_LIMIT - used))
}
update_budget() {
local pipeline="$1"
local tokens="$2"
local today=$(date +%Y-%m-%d)
python3 -c "
import json, os
path = '$BUDGET_FILE'
d = {}
if os.path.exists(path):
with open(path) as f:
d = json.load(f)
daily = d.setdefault('daily', {})
day = daily.setdefault('$today', {'tokens_used': 0, 'pipelines': {}})
day['tokens_used'] = day.get('tokens_used', 0) + $tokens
day['pipelines']['$pipeline'] = day['pipelines'].get('$pipeline', 0) + $tokens
with open(path, 'w') as f:
json.dump(d, f, indent=2)
"
}
get_pipeline_state() {
if [[ -f "$STATE_FILE" ]]; then
cat "$STATE_FILE"
else
echo "{}"
fi
}
set_pipeline_state() {
local pipeline="$1"
local state="$2" # running, complete, failed, skipped
python3 -c "
import json, os
path = '$STATE_FILE'
d = {}
if os.path.exists(path):
with open(path) as f:
d = json.load(f)
d['$pipeline'] = {'state': '$state', 'updated': '$(date -Iseconds)'}
with open(path, 'w') as f:
json.dump(d, f, indent=2)
"
}
is_pipeline_complete() {
local pipeline="$1"
python3 -c "
import json, os
path = '$STATE_FILE'
if not os.path.exists(path):
print('false')
else:
with open(path) as f:
d = json.load(f)
state = d.get('$pipeline', {}).get('state', 'not_started')
print('true' if state == 'complete' else 'false')
" 2>/dev/null || echo false
}
is_pipeline_running() {
local pipeline="$1"
python3 -c "
import json, os
path = '$STATE_FILE'
if not os.path.exists(path):
print('false')
else:
with open(path) as f:
d = json.load(f)
state = d.get('$pipeline', {}).get('state', 'not_started')
print('true' if state == 'running' else 'false')
" 2>/dev/null || echo false
}
check_dependency() {
local dep="$1"
if [[ "$dep" == "none" ]]; then
return 0
fi
# For knowledge-mine: training-factory must be running or complete
if [[ "$dep" == "training-factory" ]]; then
local state=$(python3 -c "
import json, os
path = '$STATE_FILE'
if not os.path.exists(path):
print('not_started')
else:
with open(path) as f:
d = json.load(f)
print(d.get('training-factory', {}).get('state', 'not_started'))
" 2>/dev/null || echo "not_started")
[[ "$state" == "running" || "$state" == "complete" ]]
return $?
fi
# For adversary: knowledge-mine must be at least 50% done
# Simplified: check if it's running (we'd need progress tracking for 50%)
if [[ "$dep" == "knowledge-mine" ]]; then
local state=$(python3 -c "
import json, os
path = '$STATE_FILE'
if not os.path.exists(path):
print('not_started')
else:
with open(path) as f:
d = json.load(f)
print(d.get('knowledge-mine', {}).get('state', 'not_started'))
" 2>/dev/null || echo "not_started")
[[ "$state" == "running" || "$state" == "complete" ]]
return $?
fi
return 0
}
check_inference_available() {
# Check if any inference provider is responding
# 1. Check OpenRouter
local or_ok=$(curl -s -o /dev/null -w "%{http_code}" \
--connect-timeout 5 "https://openrouter.ai/api/v1/models" 2>/dev/null || echo "000")
# 2. Check local Ollama
local ollama_ok=$(curl -s -o /dev/null -w "%{http_code}" \
--connect-timeout 5 "http://localhost:11434/api/tags" 2>/dev/null || echo "000")
# 3. Check RunPod (if configured)
local runpod_ok="000"
if [[ -n "${RUNPOD_ENDPOINT:-}" ]]; then
runpod_ok=$(curl -s -o /dev/null -w "%{http_code}" \
--connect-timeout 5 "$RUNPOD_ENDPOINT/health" 2>/dev/null || echo "000")
fi
if [[ "$or_ok" == "200" || "$ollama_ok" == "200" || "$runpod_ok" == "200" ]]; then
return 0
fi
return 1
}
check_interactive_load() {
# Check if there are active interactive sessions (don't fight with live users)
# Look for tmux panes with active hermes sessions
local active=$(tmux list-panes -a -F '#{pane_pid} #{pane_current_command}' 2>/dev/null \
| grep -c "hermes\|python3" || echo 0)
# If more than 3 interactive sessions, skip pipeline start
if [[ $active -gt 3 ]]; then
return 1
fi
return 0
}
start_pipeline() {
local name="$1"
local script="$2"
local max_tokens="$3"
local budget_remaining="$4"
local mode="${5:-run}"
if [[ "$budget_remaining" -lt "$max_tokens" ]]; then
log "SKIP $name: insufficient budget ($budget_remaining < $max_tokens tokens)"
return 1
fi
if [[ ! -f "$script" ]]; then
log "SKIP $name: script not found ($script)"
return 1
fi
if [[ "$mode" == "dry-run" ]]; then
log "DRY-RUN: Would start $name (budget: $budget_remaining, needs: $max_tokens)"
return 0
fi
log "START $name (budget: $budget_remaining, max_tokens: $max_tokens)"
set_pipeline_state "$name" "running"
# Run in background, capture output
local log_path="${HERMES_HOME}/logs/pipeline-${name}.log"
bash "$script" --max-tokens "$max_tokens" >> "$log_path" 2>&1 &
local pid=$!
# Wait a moment to check if it started OK
sleep 2
if kill -0 $pid 2>/dev/null; then
log "RUNNING $name (PID: $pid, log: $log_path)"
# Record the PID
python3 -c "
import json, os
path = '$STATE_FILE'
d = {}
if os.path.exists(path):
with open(path) as f:
d = json.load(f)
d['$name']['pid'] = $pid
with open(path, 'w') as f:
json.dump(d, f, indent=2)
"
return 0
else
log "FAIL $name: script exited immediately"
set_pipeline_state "$name" "failed"
return 1
fi
}
# --- Main ---
main() {
local mode="${1:-run}"
ensure_dirs
log "=== Pipeline Scheduler ($mode) ==="
# Check 1: Is inference available?
if ! check_inference_available; then
log "No inference provider available. Skipping all pipelines."
exit 0
fi
log "Inference: AVAILABLE"
# Check 2: Is it peak hours?
if is_peak_hours && [[ "$mode" != "--force" ]]; then
local h=$(now_hour)
log "Peak hours ($h:00). Skipping pipeline start. Use --force to override."
exit 0
fi
log "Off-peak: OK"
# Check 3: Interactive load
if ! check_interactive_load && [[ "$mode" != "--force" ]]; then
log "High interactive load. Skipping pipeline start."
exit 0
fi
log "Interactive load: OK"
# Check 4: Token budget
local budget=$(get_budget_remaining)
log "Token budget remaining: $budget / $TOKEN_DAILY_LIMIT"
if [[ $budget -le 0 ]]; then
log "Daily token budget exhausted. Stopping."
exit 0
fi
# Check 5: Pipeline status
if [[ "$mode" == "--status" ]]; then
echo -e "${CYAN}Pipeline Status:${NC}"
echo "────────────────────────────────────────────────────"
for entry in "${PIPELINES[@]}"; do
IFS='|' read -r name script max_tokens dep <<< "$entry"
local state=$(python3 -c "
import json, os
path = '$STATE_FILE'
if not os.path.exists(path):
print('not_started')
else:
with open(path) as f:
d = json.load(f)
print(d.get('$name', {}).get('state', 'not_started'))
" 2>/dev/null || echo "not_started")
local color=$NC
case "$state" in
running) color=$YELLOW ;;
complete) color=$GREEN ;;
failed) color=$RED ;;
esac
printf " %-25s %b%s%b (max: %s tokens, dep: %s)\n" "$name" "$color" "$state" "$NC" "$max_tokens" "$dep"
done
echo "────────────────────────────────────────────────────"
echo " Budget: $budget / $TOKEN_DAILY_LIMIT tokens remaining"
echo " Peak hours: $PEAK_HOURS_START:00 - $PEAK_HOURS_END:00"
exit 0
fi
# Find and start the highest-priority incomplete pipeline
local started=0
for entry in "${PIPELINES[@]}"; do
IFS='|' read -r name script max_tokens dep <<< "$entry"
# Skip if already running or complete
if [[ "$(is_pipeline_running $name)" == "true" ]]; then
log "SKIP $name: already running"
continue
fi
if [[ "$(is_pipeline_complete $name)" == "true" ]]; then
log "SKIP $name: already complete"
continue
fi
# Check dependency
if ! check_dependency "$dep"; then
log "SKIP $name: dependency $dep not met"
continue
fi
# Try to start
if start_pipeline "$name" "$script" "$max_tokens" "$budget" "$mode"; then
started=1
# Only start one pipeline per run (let it claim tokens before next check)
# Exception: playground-factory and training-factory can run in parallel
if [[ "$name" != "playground-factory" && "$name" != "training-factory" ]]; then
break
fi
fi
done
if [[ $started -eq 0 ]]; then
log "No pipelines to start (all complete, running, or blocked)."
fi
log "=== Pipeline Scheduler done ==="
}
main "$@"

View File

@@ -1,194 +0,0 @@
#!/usr/bin/env python3
"""Token Budget Tracker -- real-time spend dashboard for pipelines."""
import argparse, json, os, sqlite3, sys, time
from datetime import datetime
from pathlib import Path
DB_PATH = Path.home() / ".hermes" / "pipelines" / "token_usage.db"
ALERT_THRESHOLDS = [0.5, 0.8, 1.0]
DEFAULT_BUDGETS = {
"knowledge-mine": 200_000_000,
"training-factory": 215_000_000,
"playground": 16_000_000,
"adversary": 17_000_000,
}
def get_db():
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(DB_PATH))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.executescript("""
CREATE TABLE IF NOT EXISTS token_usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
pipeline TEXT NOT NULL,
worker TEXT NOT NULL,
tokens INTEGER NOT NULL,
recorded_at REAL NOT NULL,
hour_bucket TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS pipeline_budgets (
pipeline TEXT PRIMARY KEY,
target_tokens INTEGER NOT NULL,
updated_at REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS alerts_sent (
pipeline TEXT NOT NULL,
threshold REAL NOT NULL,
sent_at REAL NOT NULL,
PRIMARY KEY (pipeline, threshold)
);
CREATE INDEX IF NOT EXISTS idx_usage_pipeline_hour
ON token_usage(pipeline, hour_bucket);
""")
for name, target in DEFAULT_BUDGETS.items():
conn.execute(
"INSERT OR IGNORE INTO pipeline_budgets (pipeline, target_tokens, updated_at) VALUES (?, ?, ?)",
(name, target, time.time())
)
conn.commit()
return conn
def log_usage(conn, pipeline, worker, tokens):
now = time.time()
hour = datetime.now().strftime("%Y-%m-%d %H:00")
conn.execute(
"INSERT INTO token_usage (pipeline, worker, tokens, recorded_at, hour_bucket) VALUES (?, ?, ?, ?, ?)",
(pipeline, worker, tokens, now, hour)
)
conn.commit()
check_alerts(conn, pipeline)
def get_pipeline_stats(conn):
rows = conn.execute("""
SELECT u.pipeline, COALESCE(b.target_tokens, 0) as target,
SUM(u.tokens) as used, MIN(u.recorded_at) as started_at,
COUNT(DISTINCT u.worker) as workers
FROM token_usage u
LEFT JOIN pipeline_budgets b ON u.pipeline = b.pipeline
GROUP BY u.pipeline ORDER BY used DESC
""").fetchall()
return [dict(r) for r in rows]
def fmt(n):
if n >= 1_000_000_000: return f"{n/1_000_000_000:.1f}B"
if n >= 1_000_000: return f"{n/1_000_000:.1f}M"
if n >= 1_000: return f"{n/1_000:.1f}K"
return str(n)
def bar(ratio, w=8):
filled = int(ratio * w)
return "" * filled + "" * (w - filled)
def eta(used, target, started):
if used <= 0 or started <= 0: return "--"
elapsed = (time.time() - started) / 3600
if elapsed <= 0: return "--"
rate = used / elapsed
remaining = target - used
if remaining <= 0: return "DONE"
h = remaining / rate
return f"{h/24:.1f}d" if h >= 24 else f"{h:.1f}h"
def render_dashboard(conn):
stats = get_pipeline_stats(conn)
if not stats:
print("No pipeline data recorded yet.")
return
print()
print(f"{'Pipeline':<20} {'Tokens Used':>12} {'Target':>10} {'Progress':>10} {'ETA':>8} {'Workers':>8}")
print("-" * 72)
total_used = total_target = 0
for s in stats:
used = s["used"] or 0
target = s["target"] or 1
ratio = min(used / target, 1.0) if target > 0 else 0
print(f"{s['pipeline']:<20} {fmt(used):>12} {fmt(target):>10} {bar(ratio):>10} {eta(used, target, s['started_at'] or 0):>8} {s['workers'] or 0:>8}")
total_used += used
total_target += target
print("-" * 72)
r = min(total_used / total_target, 1.0) if total_target > 0 else 0
print(f"{'TOTAL':<20} {fmt(total_used):>12} {fmt(total_target):>10} {bar(r):>10}")
print()
def render_watch(conn, interval=5):
try:
while True:
os.system("clear" if os.name != "nt" else "cls")
print(f"Token Budget Tracker -- {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("Press Ctrl+C to exit")
render_dashboard(conn)
time.sleep(interval)
except KeyboardInterrupt:
print("\nExiting.")
def render_daily_summary(conn):
since = time.time() - 86400
rows = conn.execute("""
SELECT pipeline, SUM(tokens) as total, COUNT(DISTINCT worker) as workers, COUNT(*) as entries
FROM token_usage WHERE recorded_at >= ? GROUP BY pipeline ORDER BY total DESC
""", (since,)).fetchall()
if not rows:
print("No usage in last 24 hours.")
return
print(f"\nDaily Summary -- last 24 hours")
print(f"{'Pipeline':<20} {'Total Tokens':>14} {'Workers':>8} {'Entries':>8}")
print("-" * 54)
gt = 0
for r in rows:
print(f"{r['pipeline']:<20} {fmt(r['total']):>14} {r['workers']:>8} {r['entries']:>8}")
gt += r["total"]
print("-" * 54)
print(f"{'TOTAL':<20} {fmt(gt):>14}\n")
def check_alerts(conn, pipeline):
row = conn.execute(
"SELECT SUM(u.tokens) as used, COALESCE(b.target_tokens, 0) as target "
"FROM token_usage u LEFT JOIN pipeline_budgets b ON u.pipeline = b.pipeline "
"WHERE u.pipeline = ?", (pipeline,)
).fetchone()
if not row or row["target"] <= 0: return
ratio = row["used"] / row["target"]
for t in ALERT_THRESHOLDS:
if ratio >= t:
existing = conn.execute("SELECT 1 FROM alerts_sent WHERE pipeline = ? AND threshold = ?", (pipeline, t)).fetchone()
if not existing:
print(f"⚠️ BUDGET ALERT: {pipeline} at {int(t*100)}% ({fmt(row['used'])}/{fmt(row['target'])})", file=sys.stderr)
conn.execute("INSERT INTO alerts_sent (pipeline, threshold, sent_at) VALUES (?, ?, ?)", (pipeline, t, time.time()))
conn.commit()
def set_budget(conn, pipeline, target):
conn.execute("INSERT OR REPLACE INTO pipeline_budgets (pipeline, target_tokens, updated_at) VALUES (?, ?, ?)",
(pipeline, int(target), time.time()))
conn.execute("DELETE FROM alerts_sent WHERE pipeline = ?", (pipeline,))
conn.commit()
print(f"Budget set: {pipeline} = {fmt(int(target))} tokens")
def main():
parser = argparse.ArgumentParser(description="Token Budget Tracker")
parser.add_argument("--watch", action="store_true")
parser.add_argument("--watch-interval", type=int, default=5)
parser.add_argument("--summary", action="store_true")
parser.add_argument("--log", nargs=3, metavar=("PIPELINE", "WORKER", "TOKENS"))
parser.add_argument("--budget", nargs=2, metavar=("PIPELINE", "TARGET"))
parser.add_argument("--db", type=str, default=str(DB_PATH))
args = parser.parse_args()
global DB_PATH
DB_PATH = Path(args.db)
conn = get_db()
if args.log:
log_usage(conn, args.log[0], args.log[1], int(args.log[2]))
print(f"Logged: {args.log[0]}/{args.log[1]} = {fmt(int(args.log[2]))} tokens")
elif args.budget:
set_budget(conn, args.budget[0], args.budget[1])
elif args.summary:
render_daily_summary(conn)
elif args.watch:
render_watch(conn, interval=args.watch_interval)
else:
render_dashboard(conn)
conn.close()
if __name__ == "__main__":
main()

109
tests/test_cron_audit.py Normal file
View File

@@ -0,0 +1,109 @@
"""
Tests for scripts/cron-audit-662.py — cron fleet audit.
"""
import json
import sys
import unittest
from datetime import datetime, timezone, timedelta
from pathlib import Path
# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
from cron_audit_662 import categorize_job, audit_jobs
class TestCategorizeJob(unittest.TestCase):
def setUp(self):
self.now = datetime(2026, 4, 14, 20, 0, 0, tzinfo=timezone.utc)
def test_healthy_ok(self):
job = {"id": "a1", "name": "Test", "last_status": "ok", "enabled": True, "state": "scheduled"}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "healthy")
def test_healthy_never_run(self):
job = {"id": "a2", "name": "Never", "last_status": None, "last_error": None}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "healthy")
def test_healthy_paused(self):
job = {"id": "a3", "name": "Paused", "state": "paused", "paused_reason": "intentional"}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "healthy")
def test_healthy_completed(self):
job = {"id": "a4", "name": "Done", "state": "completed"}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "healthy")
def test_transient_recent_error(self):
recent = (self.now - timedelta(hours=2)).isoformat()
job = {
"id": "t1", "name": "RecentErr",
"last_status": "error",
"last_error": "Connection timeout",
"last_run_at": recent,
"enabled": True,
"state": "scheduled",
}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "transient")
self.assertIn("transient", result["reason"].lower())
def test_systemic_old_error(self):
old = (self.now - timedelta(hours=72)).isoformat()
job = {
"id": "s1", "name": "OldErr",
"last_status": "error",
"last_error": "ConfigError: bad config",
"last_run_at": old,
"enabled": True,
"state": "scheduled",
}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "systemic")
self.assertEqual(result["action"], "disable")
def test_systemic_boundary(self):
"""48.1 hours should be systemic."""
boundary = (self.now - timedelta(hours=48, minutes=6)).isoformat()
job = {
"id": "s2", "name": "Boundary",
"last_status": "error",
"last_error": "fail",
"last_run_at": boundary,
"enabled": True,
"state": "scheduled",
}
result = categorize_job(job, self.now)
self.assertEqual(result["category"], "systemic")
class TestAuditJobs(unittest.TestCase):
def test_empty(self):
report = audit_jobs([])
self.assertEqual(report["total_jobs"], 0)
self.assertEqual(report["summary"]["healthy"], 0)
def test_mixed_report(self):
now = datetime(2026, 4, 14, 20, 0, 0, tzinfo=timezone.utc)
old = (now - timedelta(hours=72)).isoformat()
recent = (now - timedelta(hours=1)).isoformat()
jobs = [
{"id": "h1", "name": "Healthy", "last_status": "ok", "enabled": True, "state": "scheduled"},
{"id": "t1", "name": "Transient", "last_status": "error", "last_error": "timeout", "last_run_at": recent, "enabled": True, "state": "scheduled"},
{"id": "s1", "name": "Systemic", "last_status": "error", "last_error": "config bad", "last_run_at": old, "enabled": True, "state": "scheduled"},
{"id": "p1", "name": "Paused", "state": "paused", "paused_reason": "frozen"},
]
report = audit_jobs(jobs)
self.assertEqual(report["summary"]["healthy"], 2)
self.assertEqual(report["summary"]["transient_errors"], 1)
self.assertEqual(report["summary"]["systemic_failures"], 1)
self.assertEqual(len(report["systemic_jobs"]), 1)
self.assertEqual(report["systemic_jobs"][0]["name"], "Systemic")
if __name__ == "__main__":
unittest.main()