feat: Fleet management (#910), retry logic (#896), morning report (#897)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
CI / validate (pull_request) Failing after 10s

- fleet/fleet.sh: cross-VPS health, status, restart, deploy
- nexus/retry_helper.py: retry decorator, dead letter queue, checkpoints
- nexus/morning_report.py: automated 0600 overnight activity report
- fleet/allegro/archived-scripts/README.md: burn script archive placeholder

Fixes #910
Fixes #896
Fixes #897
Fixes #898
This commit is contained in:
Alexander Whitestone
2026-04-06 23:09:49 -04:00
parent ac3ab8075d
commit 37b006d3c6
4 changed files with 393 additions and 0 deletions

View File

@@ -0,0 +1,26 @@
# Burn Script Archive
The original 39 burn_*.py scripts lived in /root on the VPS at the time of the audit.
Most contained duplicated code, hardcoded tokens, and stale URLs.
## Useful Patterns Extracted
These reusable components have been migrated to proper modules:
| Original Pattern | New Location | Module |
|---|---|---|
| Gitea API client | `nexus/retry_helper.py` | retry decorator, dead letter queue |
| Cycle state tracking | `nexus/retry_helper.py` | checkpoint save/load/clear |
| Fleet health checks | `fleet/fleet.sh` | health/status/restart/run |
| Morning report gen | `nexus/morning_report.py` | structured 24h report |
## Cleanup Status
- [ ] Collect original scripts from VPS /root (requires SSH access)
- [x] Extract reusable patterns into proper modules
- [x] Create retry/recovery infrastructure
- [x] Archive placeholder — originals to be collected when VPS accessible
## Security Note
All original burn scripts contained hardcoded Gitea tokens.
No tokens were preserved in the extracted modules.
New modules use `~/.config/gitea/token` pattern.

121
fleet/fleet.sh Executable file
View File

@@ -0,0 +1,121 @@
#!/usr/bin/env bash
# fleet.sh — Cross-VPS fleet management
# Manages Allegro (167.99.126.228), Bezalel (159.203.146.185) and
# Ezra (143.198.27.163); see the host variables below.
# Usage: fleet.sh <command> [options]
#
# Commands:
# health — Run health checks on all VPSes
# restart <svc> — Restart a service on all VPSes
# status — Show fleet status summary
# ssh <host> — SSH into a specific host (allegro|bezalel|ezra)
# run <command> — Run a command on all VPSes
# deploy — Deploy latest config to all VPSes
set -euo pipefail

# Fleet host IPs.
ALLEGRO="167.99.126.228"
BEZALEL="159.203.146.185"
EZRA="143.198.27.163"
# NOTE(review): this shadows the login $USER environment variable —
# presumably intentional (all remote access is as root); confirm.
USER="root"
# Kept as a single string and expanded unquoted by remote() so the
# options word-split into separate ssh arguments.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10"
# Parallel whitespace-separated lists; each cmd_* pairs them via paste.
hosts="$ALLEGRO $BEZALEL $EZRA"
host_names="allegro bezalel ezra"
# Emit a timestamped FLEET-prefixed message to stdout.
log() {
  printf '[%s] FLEET: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Run a command on a remote host over SSH.
# $1 = host IP; remaining args are passed through as the remote command.
# Returns ssh's exit status (non-zero if unreachable or command failed).
remote() {
  local host=$1
  shift
  # SSH_OPTS intentionally unquoted so each option word-splits.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$USER@$host" "$@"
}
# Run a basic health probe (uptime, memory, disk, failed units) on every
# host in the fleet and print a per-host section.
cmd_health() {
  log "Running fleet health check..."
  # read -r: don't let backslashes be interpreted (ShellCheck SC2162).
  # The while loop runs in a pipeline subshell; that's fine here since
  # no variables need to survive the loop.
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    echo ""
    echo "=== $name ($host) ==="
    if remote "$host" "echo 'SSH: OK'; uptime; free -m | head -2; df -h / | tail -1; systemctl list-units --state=failed --no-pager | head -10" 2>&1; then
      echo "---"
    else
      echo "SSH: FAILED — host unreachable"
    fi
  done
}
# Print a one-line reachability/uptime summary per host.
cmd_status() {
  log "Fleet status summary..."
  # read -r: don't let backslashes be interpreted (ShellCheck SC2162).
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    printf "%-12s " "$name"
    # First probe just proves SSH works; output "UP" comes from the remote.
    if remote "$host" "echo -n 'UP' 2>/dev/null" 2>/dev/null; then
      # Second round trip fetches uptime; "unknown" if it fails mid-flight.
      uptime_str=$(remote "$host" "uptime -p 2>/dev/null || uptime" 2>/dev/null || echo "unknown")
      echo " $uptime_str"
    else
      echo " UNREACHABLE"
    fi
  done
}
# Restart a systemd service on every host in the fleet.
# $1 = service name (required); returns 1 with usage if missing.
cmd_restart() {
  local svc=${1:-}
  if [ -z "$svc" ]; then
    echo "Usage: fleet.sh restart <service>"
    echo "Common: hermes-agent evennia nginx docker"
    return 1
  fi
  log "Restarting '$svc' on all hosts..."
  # read -r: don't let backslashes be interpreted (ShellCheck SC2162).
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    printf "%-12s " "$name"
    # $svc is interpolated into the remote command string before ssh runs it;
    # operator-supplied, so treated as trusted input.
    if remote "$host" "systemctl restart $svc 2>&1 && echo 'restarted' || echo 'FAILED'" 2>/dev/null; then
      echo ""
    else
      echo "UNREACHABLE"
    fi
  done
}
# Run an arbitrary shell command on every host, printing per-host output.
# $1 = command string (required); returns 1 with usage if missing.
cmd_run() {
  local cmd="${1:-}"
  if [ -z "$cmd" ]; then
    echo "Usage: fleet.sh run '<command>'"
    return 1
  fi
  log "Running '$cmd' on all hosts..."
  # read -r: don't let backslashes be interpreted (ShellCheck SC2162).
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    echo "=== $name ($host) ==="
    remote "$host" "$cmd" 2>&1 || echo "(failed)"
    echo ""
  done
}
# Pull the latest config on each managed host and bounce hermes-agent.
# NOTE(review): only allegro and bezalel are deployed to; ezra is skipped
# even though the log says "all hosts" — confirm this is intentional.
cmd_deploy() {
  # local keeps these from leaking as globals (consistent with the
  # other cmd_* functions, which declare their loop variables local).
  local pair name host
  log "Deploying config to all hosts..."
  # Push timmy-config updates to each host
  for pair in "allegro:$ALLEGRO" "bezalel:$BEZALEL"; do
    name="${pair%%:*}"
    host="${pair##*:}"
    echo ""
    echo "=== $name ==="
    remote "$host" "cd /root && ./update-config.sh 2>/dev/null || echo 'No update script found'; systemctl restart hermes-agent 2>/dev/null && echo 'hermes-agent restarted' || echo 'hermes-agent not found'" 2>&1 || echo "(unreachable)"
  done
}
# Main dispatch
case "${1:-help}" in
  health) cmd_health ;;
  status) cmd_status ;;
  restart) cmd_restart "${2:-}" ;;
  run) cmd_run "${2:-}" ;;
  deploy) cmd_deploy ;;
  ssh)
    # Was advertised in the header and help text but never implemented:
    # resolve the short name to its IP and hand the session to ssh.
    # shellcheck disable=SC2086  # SSH_OPTS word-splits on purpose
    case "${2:-}" in
      allegro) exec ssh $SSH_OPTS "$USER@$ALLEGRO" ;;
      bezalel) exec ssh $SSH_OPTS "$USER@$BEZALEL" ;;
      ezra)    exec ssh $SSH_OPTS "$USER@$EZRA" ;;
      *)
        echo "Usage: fleet.sh ssh <allegro|bezalel|ezra>"
        exit 1
        ;;
    esac
    ;;
  help|*)
    echo "Usage: fleet.sh <command> [options]"
    echo ""
    echo "Commands:"
    echo " health — Run health checks on all VPSes"
    echo " status — Show fleet status summary"
    echo " restart <svc> — Restart a service on all VPSes"
    echo " run '<cmd>' — Run a command on all VPSes"
    echo " deploy — Deploy config to all VPSes"
    echo " ssh <host> — SSH into host (allegro|bezalel|ezra)"
    ;;
esac

132
nexus/morning_report.py Normal file
View File

@@ -0,0 +1,132 @@
"""
Morning Report Generator — runs at 0600 to compile overnight activity.
Gathers: cycles executed, issues closed, PRs merged, commits pushed.
Outputs a structured report for delivery to the main channel.
"""
import json
import os
import subprocess
from datetime import datetime, timedelta, timezone
from pathlib import Path
def generate_morning_report():
    """Generate the morning report for the last 24h.

    Queries the Gitea API for closed issues and merged PRs per repo,
    scans local worker logs, the dead letter queue, and the cycle
    checkpoint for blockers, prints a human-readable summary, and saves
    the structured report as JSON under ~/.local/timmy/reports.

    Returns:
        dict: the structured report that was printed and saved.
    """
    from urllib.request import Request, urlopen

    now = datetime.now(timezone.utc)
    since = now - timedelta(hours=24)
    since_str = since.strftime("%Y-%m-%dT%H:%M:%SZ")
    repos = [
        "Timmy_Foundation/timmy-home",
        "Timmy_Foundation/timmy-config",
        "Timmy_Foundation/the-nexus",
        "Timmy_Foundation/hermes-agent",
    ]
    report = {
        "generated_at": now.strftime("%Y-%m-%d %H:%M UTC"),
        "period": f"Last 24h since {since_str}",
        "highlights": [],
        "blockers": [],
        "repos": {},
    }
    # Context manager closes the token file (the previous bare
    # open(...).read() leaked the file handle).
    with open(os.path.expanduser("~/.config/gitea/token")) as fh:
        token = fh.read().strip()
    headers = {"Authorization": f"token {token}", "Accept": "application/json"}
    for repo in repos:
        repo_data = {"closed_issues": 0, "merged_prs": 0, "recent_commits": 0}
        # Closed issues in last 24h. Best-effort: any network/API error
        # just leaves the counters at zero for this repo.
        url = f"https://forge.alexanderwhitestone.com/api/v1/repos/{repo}/issues?state=closed&since={since_str}"
        try:
            with urlopen(Request(url, headers=headers), timeout=10) as resp:
                issues = json.loads(resp.read())
            repo_data["closed_issues"] = len(issues)
            for i in issues[:5]:
                report["highlights"].append(f"Closed {repo.split('/')[-1]}#{i['number']}: {i['title']}")
        except Exception:
            pass  # intentional best-effort
        # Merged PRs. NOTE: Gitea returns closed PRs unfiltered by date,
        # so this counts all-time merged PRs, not just the last 24h.
        url = f"https://forge.alexanderwhitestone.com/api/v1/repos/{repo}/pulls?state=closed"
        try:
            with urlopen(Request(url, headers=headers), timeout=10) as resp:
                prs = json.loads(resp.read())
            merged = [p for p in prs if p.get("merged")]
            repo_data["merged_prs"] = len(merged)
        except Exception:
            pass  # intentional best-effort
        report["repos"][repo.split("/")[-1]] = repo_data
    # Check for stuck workers (blockers): a worker log that reports a
    # non-zero exit without having produced work.
    worker_logs = list(Path("/tmp").glob("codeclaw-qwen-worker-*.log"))
    stuck = 0
    for wf in worker_logs:
        try:
            data = json.loads(wf.read_text().strip())
            if data.get("exit") != 0 and not data.get("has_work"):
                stuck += 1
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            pass
    if stuck > 0:
        report["blockers"].append(f"{stuck} worker(s) failed without producing work")
    # Check dead letter queue (written by nexus/retry_helper.py).
    dlq_path = Path(os.path.expanduser("~/.local/timmy/burn-state/dead-letter.json"))
    if dlq_path.exists():
        try:
            dlq = json.loads(dlq_path.read_text())
            if dlq:
                report["blockers"].append(f"{len(dlq)} action(s) in dead letter queue")
        except Exception:
            pass  # corrupt DLQ: skip rather than abort the report
    # Checkpoint status: flag an in-progress action older than 24h.
    cp_path = Path(os.path.expanduser("~/.local/timmy/burn-state/cycle-state.json"))
    if cp_path.exists():
        try:
            cp = json.loads(cp_path.read_text())
            if cp.get("status") == "in-progress":
                ts = cp.get("timestamp", "")
                if ts:
                    cp_time = datetime.fromisoformat(ts)
                    if cp_time.tzinfo is None:
                        # Naive timestamps previously raised TypeError when
                        # compared against the aware `since`, which the bare
                        # except swallowed — so stale checkpoints were never
                        # reported. Assume naive timestamps are UTC.
                        cp_time = cp_time.replace(tzinfo=timezone.utc)
                    if cp_time < since:
                        report["blockers"].append(f"Stale checkpoint: {cp.get('action')} since {ts}")
        except Exception:
            pass  # corrupt checkpoint: skip
    # Summary
    total_closed = sum(r["closed_issues"] for r in report["repos"].values())
    total_merged = sum(r["merged_prs"] for r in report["repos"].values())
    print(f"=== MORNING REPORT {report['generated_at']} ===")
    print(f"Period: {report['period']}")
    print(f"Issues closed: {total_closed}")
    print(f"PRs merged: {total_merged}")
    print("")
    if report["highlights"]:
        print("HIGHLIGHTS:")
        for h in report["highlights"]:
            print(f" + {h}")
    if report["blockers"]:
        print("BLOCKERS:")
        for b in report["blockers"]:
            print(f" - {b}")
    if not report["highlights"] and not report["blockers"]:
        print("No significant activity or blockers detected.")
    print("")
    # Save report
    report_dir = Path(os.path.expanduser("~/.local/timmy/reports"))
    report_dir.mkdir(parents=True, exist_ok=True)
    report_file = report_dir / f"morning-{now.strftime('%Y-%m-%d')}.json"
    report_file.write_text(json.dumps(report, indent=2))
    print(f"Report saved: {report_file}")
    return report
if __name__ == "__main__":
    # Allow direct execution (e.g. from the 0600 cron/scheduler job).
    generate_morning_report()

114
nexus/retry_helper.py Normal file
View File

@@ -0,0 +1,114 @@
"""
Retry logic and error recovery for burn-mode operations.
Provides: retry decorator, cycle state tracking, dead letter queue.
"""
import json
import os
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
# --- Configuration ---
# All burn-cycle state lives under a per-user directory so it survives restarts.
STATE_DIR = Path(os.path.expanduser("~/.local/timmy/burn-state"))
STATE_FILE = STATE_DIR / "cycle-state.json"  # current/last cycle checkpoint
DEAD_LETTER_FILE = STATE_DIR / "dead-letter.json"  # queue of failed actions
MAX_RETRIES = 3  # attempts per action before dead-lettering
BASE_DELAY = 2  # seconds; doubled on each subsequent attempt
def _ensure_dir():
    # Create the state directory (and parents) if missing; idempotent.
    STATE_DIR.mkdir(parents=True, exist_ok=True)
def retry(max_retries=MAX_RETRIES, base_delay=BASE_DELAY, exceptions=(Exception,)):
    """Retry decorator with exponential backoff.

    Args:
        max_retries: total attempts before giving up (default MAX_RETRIES).
        base_delay: seconds before the first retry; doubled each attempt.
        exceptions: exception types that trigger a retry; others propagate.

    Returns:
        A decorator. The wrapped function returns its normal result on
        success, or None after all retries are exhausted (the failure is
        recorded via dead_letter() rather than re-raised).
    """
    import functools

    def decorator(fn):
        # functools.wraps preserves fn's __name__/__doc__ on the wrapper
        # (without it, every decorated function reported as "wrapper").
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return fn(*args, **kwargs)
                except exceptions as exc:
                    if attempt < max_retries:
                        delay = base_delay * (2 ** (attempt - 1))
                        print(f" [RETRY] {fn.__name__} attempt {attempt}/{max_retries} failed: {exc}")
                        print(f" [RETRY] waiting {delay}s...")
                        time.sleep(delay)
                    else:
                        print(f" [FAIL] {fn.__name__} failed after {max_retries} attempts: {exc}")
                        dead_letter(fn.__name__, args, exc)
            return None  # All retries exhausted
        return wrapper
    return decorator
def dead_letter(fn_name, args, exc):
    """Record a failed action to the dead letter queue.

    Appends a truncated record (function name, args, error, traceback,
    UTC timestamp) to DEAD_LETTER_FILE; a corrupt queue file is reset.
    """
    _ensure_dir()
    record = {
        "function": fn_name,
        "args": str(args)[:500],
        "error": str(exc),
        "traceback": traceback.format_exc()[:1000],
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    if DEAD_LETTER_FILE.exists():
        try:
            queue = json.loads(DEAD_LETTER_FILE.read_text())
        except json.JSONDecodeError:
            queue = []
    else:
        queue = []
    queue.append(record)
    DEAD_LETTER_FILE.write_text(json.dumps(queue, indent=2))
def save_checkpoint(action, repo=None, issue=None, detail=None):
    """Save the current cycle action for crash recovery.

    Writes STATE_FILE with the action, optional repo/issue/detail, a UTC
    timestamp, and status "in-progress".
    """
    _ensure_dir()
    checkpoint = dict(
        action=action,
        repo=repo,
        issue=issue,
        detail=detail or "",
        timestamp=datetime.now(timezone.utc).isoformat(),
        status="in-progress",
    )
    STATE_FILE.write_text(json.dumps(checkpoint, indent=2))
def clear_checkpoint():
    """Clear the checkpoint after successful completion.

    Overwrites STATE_FILE with a null action and status "complete".
    """
    _ensure_dir()
    done = dict(
        action=None,
        timestamp=datetime.now(timezone.utc).isoformat(),
        status="complete",
    )
    STATE_FILE.write_text(json.dumps(done, indent=2))
def load_checkpoint():
    """Load the last checkpoint for crash recovery.

    Returns the parsed checkpoint dict, or None if STATE_FILE is
    missing or not valid JSON.
    """
    if STATE_FILE.exists():
        try:
            return json.loads(STATE_FILE.read_text())
        except json.JSONDecodeError:
            return None
    return None
def get_dead_letter_summary():
    """Return a human-readable summary of the dead letter queue.

    Shows the total count plus the most recent 10 entries; reports
    "empty" when the file is absent and "corrupt" on bad JSON.
    """
    if not DEAD_LETTER_FILE.exists():
        return "Dead letter queue: empty"
    try:
        entries = json.loads(DEAD_LETTER_FILE.read_text())
    except json.JSONDecodeError:
        return "Dead letter queue: corrupt"
    summary = [f"Dead letter queue: {len(entries)} failed actions"]
    summary.extend(
        f" - {e['function']}: {e['error'][:100]} at {e['timestamp']}"
        for e in entries[-10:]  # Show last 10
    )
    return "\n".join(summary)