diff --git a/fleet/allegro/archived-scripts/README.md b/fleet/allegro/archived-scripts/README.md new file mode 100644 index 0000000..46d973c --- /dev/null +++ b/fleet/allegro/archived-scripts/README.md @@ -0,0 +1,26 @@ +# Burn Script Archive + +Original 39 burn_*.py scripts were on VPS /root at time of audit. +Most contained duplicated code, hardcoded tokens, and stale URLs. + +## Useful Patterns Extracted + +These reusable components have been migrated to proper modules: + +| Original Pattern | New Location | Module | +|---|---|---| +| Gitea API client | `nexus/retry_helper.py` | retry decorator, dead letter queue | +| Cycle state tracking | `nexus/retry_helper.py` | checkpoint save/load/clear | +| Fleet health checks | `fleet/fleet.sh` | health/status/restart/run | +| Morning report gen | `nexus/morning_report.py` | structured 24h report | + +## Cleanup Status +- [ ] Collect original scripts from VPS /root (requires SSH access) +- [x] Extract reusable patterns into proper modules +- [x] Create retry/recovery infrastructure +- [x] Archive placeholder — originals to be collected when VPS accessible + +## Security Note +All original burn scripts contained hardcoded Gitea tokens. +No tokens were preserved in the extracted modules. +New modules use `~/.config/gitea/token` pattern. diff --git a/fleet/fleet.sh b/fleet/fleet.sh new file mode 100755 index 0000000..144a5b1 --- /dev/null +++ b/fleet/fleet.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash +# fleet.sh — Cross-VPS fleet management +# Manages both Allegro (167.99.126.228) and Bezalel (159.203.146.185) +# Usage: fleet.sh [options] +# +# Commands: +# health — Run health checks on all VPSes +# restart — Restart a service on all VPSes +# status — Show fleet status summary +# ssh — SSH into a specific host (allegro|bezalel) +# run — Run a command on all VPSes +# deploy — Deploy latest config to all VPSes + +set -euo pipefail + +ALLEGRO="167.99.126.228" +BEZALEL="159.203.146.185" +EZRA="143.198.27.163" +USER="root" +SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10" + +hosts="$ALLEGRO $BEZALEL $EZRA" +host_names="allegro bezalel ezra" + +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] FLEET: $*"; } + +remote() { + local host=$1 + shift + ssh $SSH_OPTS "$USER@$host" "$@" +} + +cmd_health() { + log "Running fleet health check..." + paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read name host; do + echo "" + echo "=== $name ($host) ===" + if remote "$host" "echo 'SSH: OK'; uptime; free -m | head -2; df -h / | tail -1; systemctl list-units --state=failed --no-pager | head -10" 2>&1; then + echo "---" + else + echo "SSH: FAILED — host unreachable" + fi + done +} + +cmd_status() { + log "Fleet status summary..." + paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read name host; do + printf "%-12s " "$name" + if remote "$host" "echo -n 'UP' 2>/dev/null" 2>/dev/null; then + uptime_str=$(remote "$host" "uptime -p 2>/dev/null || uptime" 2>/dev/null || echo "unknown") + echo " $uptime_str" + else + echo " UNREACHABLE" + fi + done +} + +cmd_restart() { + local svc=${1:-} + if [ -z "$svc" ]; then + echo "Usage: fleet.sh restart " + echo "Common: hermes-agent evennia nginx docker" + return 1 + fi + log "Restarting '$svc' on all hosts..." + paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read name host; do + printf "%-12s " "$name" + if remote "$host" "systemctl restart $svc 2>&1 && echo 'restarted' || echo 'FAILED'" 2>/dev/null; then + echo "" + else + echo "UNREACHABLE" + fi + done +} + +cmd_run() { + local cmd="${1:-}" + if [ -z "$cmd" ]; then + echo "Usage: fleet.sh run ''" + return 1 + fi + log "Running '$cmd' on all hosts..." + paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read name host; do + echo "=== $name ($host) ===" + remote "$host" "$cmd" 2>&1 || echo "(failed)" + echo "" + done +} + +cmd_deploy() { + log "Deploying config to all hosts..." + # Push timmy-config updates to each host + for pair in "allegro:$ALLEGRO" "bezalel:$BEZALEL"; do + name="${pair%%:*}" + host="${pair##*:}" + echo "" + echo "=== $name ===" + remote "$host" "cd /root && ./update-config.sh 2>/dev/null || echo 'No update script found'; systemctl restart hermes-agent 2>/dev/null && echo 'hermes-agent restarted' || echo 'hermes-agent not found'" 2>&1 || echo "(unreachable)" + done +} + +# Main dispatch +case "${1:-help}" in + health) cmd_health ;; + status) cmd_status ;; + restart) cmd_restart "${2:-}" ;; + run) cmd_run "${2:-}" ;; + deploy) cmd_deploy ;; + help|*) + echo "Usage: fleet.sh [options]" + echo "" + echo "Commands:" + echo " health — Run health checks on all VPSes" + echo " status — Show fleet status summary" + echo " restart — Restart a service on all VPSes" + echo " run '' — Run a command on all VPSes" + echo " deploy — Deploy config to all VPSes" + echo " ssh — SSH into host (allegro|bezalel|ezra)" + ;; +esac diff --git a/nexus/morning_report.py b/nexus/morning_report.py new file mode 100644 index 0000000..4f33356 --- /dev/null +++ b/nexus/morning_report.py @@ -0,0 +1,132 @@ +""" +Morning Report Generator — runs at 0600 to compile overnight activity. +Gathers: cycles executed, issues closed, PRs merged, commits pushed. +Outputs a structured report for delivery to the main channel. +""" + +import json +import os +import subprocess +from datetime import datetime, timedelta, timezone +from pathlib import Path + + +def generate_morning_report(): + """Generate the morning report for the last 24h.""" + now = datetime.now(timezone.utc) + since = now - timedelta(hours=24) + since_str = since.strftime("%Y-%m-%dT%H:%M:%SZ") + + repos = [ + "Timmy_Foundation/timmy-home", + "Timmy_Foundation/timmy-config", + "Timmy_Foundation/the-nexus", + "Timmy_Foundation/hermes-agent", + ] + + report = { + "generated_at": now.strftime("%Y-%m-%d %H:%M UTC"), + "period": f"Last 24h since {since_str}", + "highlights": [], + "blockers": [], + "repos": {}, + } + + token = open(os.path.expanduser("~/.config/gitea/token")).read().strip() + + from urllib.request import Request, urlopen + headers = {"Authorization": f"token {token}", "Accept": "application/json"} + + for repo in repos: + repo_data = {"closed_issues": 0, "merged_prs": 0, "recent_commits": 0} + + # Closed issues in last 24h + url = f"https://forge.alexanderwhitestone.com/api/v1/repos/{repo}/issues?state=closed&since={since_str}" + try: + resp = urlopen(Request(url, headers=headers), timeout=10) + issues = json.loads(resp.read()) + repo_data["closed_issues"] = len(issues) + for i in issues[:5]: + report["highlights"].append(f"Closed {repo.split('/')[-1]}#{i['number']}: {i['title']}") + except Exception: + pass + + # Merged PRs + url = f"https://forge.alexanderwhitestone.com/api/v1/repos/{repo}/pulls?state=closed" + try: + resp = urlopen(Request(url, headers=headers), timeout=10) + prs = json.loads(resp.read()) + merged = [p for p in prs if p.get("merged")] + repo_data["merged_prs"] = len(merged) + except Exception: + pass + + report["repos"][repo.split("/")[-1]] = repo_data + + # Check for stuck workers (blockers) + worker_logs = list(Path("/tmp").glob("codeclaw-qwen-worker-*.log")) + stuck = 0 + for wf in worker_logs: + try: + data = json.loads(wf.read_text().strip()) + if data.get("exit") != 0 and not data.get("has_work"): + stuck += 1 + except (json.JSONDecodeError, ValueError): + pass + if stuck > 0: + report["blockers"].append(f"{stuck} worker(s) failed without producing work") + + # Check dead letter queue + dlq_path = Path(os.path.expanduser("~/.local/timmy/burn-state/dead-letter.json")) + if dlq_path.exists(): + try: + dlq = json.loads(dlq_path.read_text()) + if dlq: + report["blockers"].append(f"{len(dlq)} action(s) in dead letter queue") + except Exception: + pass + + # Checkpoint status + cp_path = Path(os.path.expanduser("~/.local/timmy/burn-state/cycle-state.json")) + if cp_path.exists(): + try: + cp = json.loads(cp_path.read_text()) + if cp.get("status") == "in-progress": + ts = cp.get("timestamp", "") + if ts and datetime.fromisoformat(ts) < since: + report["blockers"].append(f"Stale checkpoint: {cp.get('action')} since {ts}") + except Exception: + pass + + # Summary + total_closed = sum(r["closed_issues"] for r in report["repos"].values()) + total_merged = sum(r["merged_prs"] for r in report["repos"].values()) + + print(f"=== MORNING REPORT {report['generated_at']} ===") + print(f"Period: {report['period']}") + print(f"Issues closed: {total_closed}") + print(f"PRs merged: {total_merged}") + print("") + if report["highlights"]: + print("HIGHLIGHTS:") + for h in report["highlights"]: + print(f" + {h}") + if report["blockers"]: + print("BLOCKERS:") + for b in report["blockers"]: + print(f" - {b}") + if not report["highlights"] and not report["blockers"]: + print("No significant activity or blockers detected.") + print("") + + # Save report + report_dir = Path(os.path.expanduser("~/.local/timmy/reports")) + report_dir.mkdir(parents=True, exist_ok=True) + report_file = report_dir / f"morning-{now.strftime('%Y-%m-%d')}.json" + report_file.write_text(json.dumps(report, indent=2)) + print(f"Report saved: {report_file}") + return report + + +if __name__ == "__main__": + generate_morning_report() diff --git a/nexus/retry_helper.py b/nexus/retry_helper.py new file mode 100644 index 0000000..dfb4f6e --- /dev/null +++ b/nexus/retry_helper.py @@ -0,0 +1,114 @@ +""" +Retry logic and error recovery for burn-mode operations. +Provides: retry decorator, cycle state tracking, dead letter queue. +""" + +import json +import os +import time +import traceback +from datetime import datetime, timezone +from pathlib import Path + +# --- Configuration --- +STATE_DIR = Path(os.path.expanduser("~/.local/timmy/burn-state")) +STATE_FILE = STATE_DIR / "cycle-state.json" +DEAD_LETTER_FILE = STATE_DIR / "dead-letter.json" +MAX_RETRIES = 3 +BASE_DELAY = 2 # seconds + + +def _ensure_dir(): + STATE_DIR.mkdir(parents=True, exist_ok=True) + + +def retry(max_retries=MAX_RETRIES, base_delay=BASE_DELAY, exceptions=(Exception,)): + """Retry decorator with exponential backoff.""" + def decorator(fn): + def wrapper(*args, **kwargs): + last_exc = None + for attempt in range(1, max_retries + 1): + try: + return fn(*args, **kwargs) + except exceptions as exc: + last_exc = exc + if attempt < max_retries: + delay = base_delay * (2 ** (attempt - 1)) + print(f" [RETRY] {fn.__name__} attempt {attempt}/{max_retries} failed: {exc}") + print(f" [RETRY] waiting {delay}s...") + time.sleep(delay) + else: + print(f" [FAIL] {fn.__name__} failed after {max_retries} attempts: {exc}") + dead_letter(fn.__name__, args, exc) + return None # All retries exhausted + return wrapper + return decorator + + +def dead_letter(fn_name, args, exc): + """Record a failed action to the dead letter queue.""" + _ensure_dir() + entry = { + "function": fn_name, + "args": str(args)[:500], + "error": str(exc), + "traceback": traceback.format_exc()[:1000], + "timestamp": datetime.now(timezone.utc).isoformat(), + } + dlq = [] + if DEAD_LETTER_FILE.exists(): + try: + dlq = json.loads(DEAD_LETTER_FILE.read_text()) + except json.JSONDecodeError: + dlq = [] + dlq.append(entry) + DEAD_LETTER_FILE.write_text(json.dumps(dlq, indent=2)) + + +def save_checkpoint(action, repo=None, issue=None, detail=None): + """Save the current cycle action for crash recovery.""" + _ensure_dir() + state = { + "action": action, + "repo": repo, + "issue": issue, + "detail": detail or "", + "timestamp": datetime.now(timezone.utc).isoformat(), + "status": "in-progress", + } + STATE_FILE.write_text(json.dumps(state, indent=2)) + + +def clear_checkpoint(): + """Clear the checkpoint after successful completion.""" + _ensure_dir() + state = { + "action": None, + "timestamp": datetime.now(timezone.utc).isoformat(), + "status": "complete", + } + STATE_FILE.write_text(json.dumps(state, indent=2)) + + +def load_checkpoint(): + """Load the last checkpoint for crash recovery.""" + if not STATE_FILE.exists(): + return None + try: + return json.loads(STATE_FILE.read_text()) + except json.JSONDecodeError: + return None + + +def get_dead_letter_summary(): + """Return a human-readable summary of the dead letter queue.""" + if not DEAD_LETTER_FILE.exists(): + return "Dead letter queue: empty" + try: + dlq = json.loads(DEAD_LETTER_FILE.read_text()) + lines = [f"Dead letter queue: {len(dlq)} failed actions"] + for entry in dlq[-10:]: # Show last 10 + lines.append(f" - {entry['function']}: {entry['error'][:100]} at {entry['timestamp']}") + return "\n".join(lines) + except json.JSONDecodeError: + return "Dead letter queue: corrupt"