- fleet/fleet.sh: cross-VPS health, status, restart, deploy - nexus/retry_helper.py: retry decorator, dead letter queue, checkpoints - nexus/morning_report.py: automated 0600 overnight activity report - fleet/allegro/archived-scripts/README.md: burn script archive placeholder Fixes #910 Fixes #896 Fixes #897 Fixes #898
This commit is contained in:
26
fleet/allegro/archived-scripts/README.md
Normal file
26
fleet/allegro/archived-scripts/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# Burn Script Archive
|
||||
|
||||
The original 39 burn_*.py scripts were located in /root on the VPS at the time of the audit.
|
||||
Most contained duplicated code, hardcoded tokens, and stale URLs.
|
||||
|
||||
## Useful Patterns Extracted
|
||||
|
||||
These reusable components have been migrated to proper modules:
|
||||
|
||||
| Original Pattern | New Location | Module |
|
||||
|---|---|---|
|
||||
| Gitea API client | `nexus/retry_helper.py` | retry decorator, dead letter queue |
|
||||
| Cycle state tracking | `nexus/retry_helper.py` | checkpoint save/load/clear |
|
||||
| Fleet health checks | `fleet/fleet.sh` | health/status/restart/run |
|
||||
| Morning report gen | `nexus/morning_report.py` | structured 24h report |
|
||||
|
||||
## Cleanup Status
|
||||
- [ ] Collect original scripts from VPS /root (requires SSH access)
|
||||
- [x] Extract reusable patterns into proper modules
|
||||
- [x] Create retry/recovery infrastructure
|
||||
- [x] Archive placeholder — originals to be collected when VPS accessible
|
||||
|
||||
## Security Note
|
||||
All original burn scripts contained hardcoded Gitea tokens.
|
||||
No tokens were preserved in the extracted modules.
|
||||
New modules use `~/.config/gitea/token` pattern.
|
||||
121
fleet/fleet.sh
Executable file
121
fleet/fleet.sh
Executable file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env bash
|
||||
# fleet.sh — Cross-VPS fleet management
|
||||
# Manages both Allegro (167.99.126.228) and Bezalel (159.203.146.185)
|
||||
# Usage: fleet.sh <command> [options]
|
||||
#
|
||||
# Commands:
|
||||
# health — Run health checks on all VPSes
|
||||
# restart <svc> — Restart a service on all VPSes
|
||||
# status — Show fleet status summary
|
||||
# ssh <host> — SSH into a specific host (allegro|bezalel)
|
||||
# run <command> — Run a command on all VPSes
|
||||
# deploy — Deploy latest config to all VPSes
|
||||
|
||||
set -euo pipefail

# --- Fleet inventory -------------------------------------------------------
# Static host list; SSH as root is assumed by all cmd_* helpers below.
ALLEGRO="167.99.126.228"
BEZALEL="159.203.146.185"
EZRA="143.198.27.163"
USER="root"
# Intentionally a plain string: callers expand it unquoted so the options
# word-split into separate ssh flags.
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10"

# Parallel space-separated word lists; the cmd_* loops paste these together,
# so keep the two lists in the same order.
hosts="$ALLEGRO $BEZALEL $EZRA"
host_names="allegro bezalel ezra"
|
||||
|
||||
# Timestamped logger; every fleet message carries the "FLEET:" prefix.
log() {
  printf '[%s] FLEET: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
# Run a command on one fleet host over SSH.
#   $1   - host or IP
#   $2.. - remote command and its arguments
remote() {
  local target=$1
  shift
  # shellcheck disable=SC2086 -- SSH_OPTS must word-split into flags
  ssh $SSH_OPTS "$USER@$target" "$@"
}
|
||||
|
||||
# Health-check every host: SSH reachability, uptime, memory, root-disk usage,
# and any failed systemd units. Output is one section per host.
cmd_health() {
  log "Running fleet health check..."
  # Pair names with IPs line-by-line; read -r so backslashes in either
  # column are not mangled (SC2162).
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    echo ""
    echo "=== $name ($host) ==="
    if remote "$host" "echo 'SSH: OK'; uptime; free -m | head -2; df -h / | tail -1; systemctl list-units --state=failed --no-pager | head -10" 2>&1; then
      echo "---"
    else
      echo "SSH: FAILED — host unreachable"
    fi
  done
}
|
||||
|
||||
# One-line status per host: "UP <uptime>" or " UNREACHABLE".
cmd_status() {
  log "Fleet status summary..."
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    printf "%-12s " "$name"
    # Single SSH round-trip: success of the uptime fetch itself proves the
    # host is reachable (the old code opened a second connection just to
    # print "UP").
    if uptime_str=$(remote "$host" "uptime -p 2>/dev/null || uptime" 2>/dev/null); then
      echo "UP $uptime_str"
    else
      echo " UNREACHABLE"
    fi
  done
}
|
||||
|
||||
# Restart a systemd service on every host.
#   $1 - service name (required); prints usage and returns 1 if missing.
cmd_restart() {
  local svc=${1:-}
  if [ -z "$svc" ]; then
    echo "Usage: fleet.sh restart <service>"
    echo "Common: hermes-agent evennia nginx docker"
    return 1
  fi
  log "Restarting '$svc' on all hosts..."
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    printf "%-12s " "$name"
    # Single-quote the service inside the remote command so a name with
    # spaces or metacharacters cannot inject extra shell words remotely.
    if remote "$host" "systemctl restart '$svc' 2>&1 && echo 'restarted' || echo 'FAILED'" 2>/dev/null; then
      echo ""
    else
      echo "UNREACHABLE"
    fi
  done
}
|
||||
|
||||
# Run an arbitrary command on every host, printing a section per host.
#   $1 - command string to execute remotely (required).
cmd_run() {
  local cmd="${1:-}"
  if [ -z "$cmd" ]; then
    echo "Usage: fleet.sh run '<command>'"
    return 1
  fi
  log "Running '$cmd' on all hosts..."
  # read -r: don't let backslashes in names/IPs be interpreted (SC2162).
  paste <(echo "$host_names" | tr ' ' '\n') <(echo "$hosts" | tr ' ' '\n') | while read -r name host; do
    echo "=== $name ($host) ==="
    remote "$host" "$cmd" 2>&1 || echo "(failed)"
    echo ""
  done
}
|
||||
|
||||
# Deploy the latest timmy-config to the app hosts and bounce hermes-agent.
# NOTE(review): ezra is not in the deploy list — confirm that is intentional.
cmd_deploy() {
  local entry target_name target_host
  log "Deploying config to all hosts..."
  # Push timmy-config updates to each host
  for entry in "allegro:$ALLEGRO" "bezalel:$BEZALEL"; do
    target_name=${entry%%:*}
    target_host=${entry##*:}
    echo ""
    echo "=== $target_name ==="
    remote "$target_host" "cd /root && ./update-config.sh 2>/dev/null || echo 'No update script found'; systemctl restart hermes-agent 2>/dev/null && echo 'hermes-agent restarted' || echo 'hermes-agent not found'" 2>&1 || echo "(unreachable)"
  done
}
|
||||
|
||||
# Main dispatch — map the first CLI argument to a command handler.
case "${1:-help}" in
  health) cmd_health ;;
  status) cmd_status ;;
  restart) cmd_restart "${2:-}" ;;
  run) cmd_run "${2:-}" ;;
  deploy) cmd_deploy ;;
  ssh)
    # Was advertised in the header and help text but never implemented.
    case "${2:-}" in
      allegro) target=$ALLEGRO ;;
      bezalel) target=$BEZALEL ;;
      ezra) target=$EZRA ;;
      *) echo "Usage: fleet.sh ssh <allegro|bezalel|ezra>" >&2; exit 1 ;;
    esac
    # shellcheck disable=SC2086 -- SSH_OPTS must word-split into flags
    exec ssh $SSH_OPTS "$USER@$target"
    ;;
  help|*)
    echo "Usage: fleet.sh <command> [options]"
    echo ""
    echo "Commands:"
    echo "  health         — Run health checks on all VPSes"
    echo "  status         — Show fleet status summary"
    echo "  restart <svc>  — Restart a service on all VPSes"
    echo "  run '<cmd>'    — Run a command on all VPSes"
    echo "  deploy         — Deploy config to all VPSes"
    echo "  ssh <host>     — SSH into host (allegro|bezalel|ezra)"
    ;;
esac
|
||||
132
nexus/morning_report.py
Normal file
132
nexus/morning_report.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
Morning Report Generator — runs at 0600 to compile overnight activity.
|
||||
Gathers: cycles executed, issues closed, PRs merged, commits pushed.
|
||||
Outputs a structured report for delivery to the main channel.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def generate_morning_report():
    """Generate, print, and persist the morning report for the last 24h.

    Collects per-repo activity (closed issues, merged PRs) from the Gitea
    API, scans local state for blockers (failed workers, dead-letter queue,
    stale checkpoints), prints a human-readable summary, and saves the
    structured report as JSON under ~/.local/timmy/reports.

    Returns:
        dict: the structured report that was printed and saved.

    Raises:
        OSError: if the Gitea token file is missing or unreadable.
    """
    from urllib.request import Request, urlopen

    now = datetime.now(timezone.utc)
    since = now - timedelta(hours=24)
    since_str = since.strftime("%Y-%m-%dT%H:%M:%SZ")

    repos = [
        "Timmy_Foundation/timmy-home",
        "Timmy_Foundation/timmy-config",
        "Timmy_Foundation/the-nexus",
        "Timmy_Foundation/hermes-agent",
    ]

    report = {
        "generated_at": now.strftime("%Y-%m-%d %H:%M UTC"),
        "period": f"Last 24h since {since_str}",
        "highlights": [],
        "blockers": [],
        "repos": {},
    }

    # read_text() closes the handle; the previous bare open() leaked it.
    token = Path(os.path.expanduser("~/.config/gitea/token")).read_text().strip()
    headers = {"Authorization": f"token {token}", "Accept": "application/json"}

    def _fetch_json(url):
        """GET a Gitea API URL; return parsed JSON, or None on any failure.

        Best-effort by design: an unreachable forge must not kill the report.
        """
        try:
            with urlopen(Request(url, headers=headers), timeout=10) as resp:
                return json.loads(resp.read())
        except Exception:
            return None

    def _in_window(ts):
        """True if ISO-8601 timestamp ts falls inside the report window."""
        if not ts:
            return False
        try:
            return datetime.fromisoformat(ts.replace("Z", "+00:00")) >= since
        except ValueError:
            return True  # unparsable timestamp: keep it counted (old behavior)

    base = "https://forge.alexanderwhitestone.com/api/v1/repos"
    for repo in repos:
        short = repo.split("/")[-1]
        repo_data = {"closed_issues": 0, "merged_prs": 0, "recent_commits": 0}

        # Closed issues in last 24h (the API honors the 'since' filter).
        issues = _fetch_json(f"{base}/{repo}/issues?state=closed&since={since_str}")
        if issues:
            repo_data["closed_issues"] = len(issues)
            for i in issues[:5]:
                report["highlights"].append(f"Closed {short}#{i['number']}: {i['title']}")

        # Merged PRs — the pulls endpoint has no 'since' parameter, so filter
        # on merged_at locally; previously this counted all-time merges.
        prs = _fetch_json(f"{base}/{repo}/pulls?state=closed")
        if prs:
            merged = [p for p in prs
                      if p.get("merged") and _in_window(p.get("merged_at"))]
            repo_data["merged_prs"] = len(merged)

        report["repos"][short] = repo_data

    # Blocker: workers that exited non-zero without producing work.
    stuck = 0
    for wf in Path("/tmp").glob("codeclaw-qwen-worker-*.log"):
        try:
            data = json.loads(wf.read_text().strip())
        except (json.JSONDecodeError, ValueError, OSError):
            continue  # unreadable/partial log: skip, don't crash
        if data.get("exit") != 0 and not data.get("has_work"):
            stuck += 1
    if stuck > 0:
        report["blockers"].append(f"{stuck} worker(s) failed without producing work")

    # Blocker: entries waiting in the dead letter queue.
    dlq_path = Path(os.path.expanduser("~/.local/timmy/burn-state/dead-letter.json"))
    if dlq_path.exists():
        try:
            dlq = json.loads(dlq_path.read_text())
            if dlq:
                report["blockers"].append(f"{len(dlq)} action(s) in dead letter queue")
        except Exception:
            pass  # corrupt queue: best-effort, omit from blockers

    # Blocker: a checkpoint still 'in-progress' from before the window —
    # suggests a crashed cycle that never cleared its state.
    cp_path = Path(os.path.expanduser("~/.local/timmy/burn-state/cycle-state.json"))
    if cp_path.exists():
        try:
            cp = json.loads(cp_path.read_text())
            if cp.get("status") == "in-progress":
                ts = cp.get("timestamp", "")
                # Checkpoints are written with timezone-aware isoformat,
                # so this is an aware-vs-aware comparison.
                if ts and datetime.fromisoformat(ts) < since:
                    report["blockers"].append(f"Stale checkpoint: {cp.get('action')} since {ts}")
        except Exception:
            pass

    # Human-readable summary to stdout.
    total_closed = sum(r["closed_issues"] for r in report["repos"].values())
    total_merged = sum(r["merged_prs"] for r in report["repos"].values())

    print(f"=== MORNING REPORT {report['generated_at']} ===")
    print(f"Period: {report['period']}")
    print(f"Issues closed: {total_closed}")
    print(f"PRs merged: {total_merged}")
    print("")
    if report["highlights"]:
        print("HIGHLIGHTS:")
        for h in report["highlights"]:
            print(f" + {h}")
    if report["blockers"]:
        print("BLOCKERS:")
        for b in report["blockers"]:
            print(f" - {b}")
    if not report["highlights"] and not report["blockers"]:
        print("No significant activity or blockers detected.")
    print("")

    # Persist the structured report, one file per day.
    report_dir = Path(os.path.expanduser("~/.local/timmy/reports"))
    report_dir.mkdir(parents=True, exist_ok=True)
    report_file = report_dir / f"morning-{now.strftime('%Y-%m-%d')}.json"
    report_file.write_text(json.dumps(report, indent=2))
    print(f"Report saved: {report_file}")
    return report
||||
|
||||
|
||||
# Allow running as a standalone script (e.g. from the 0600 cron job).
if __name__ == "__main__":
    generate_morning_report()
|
||||
114
nexus/retry_helper.py
Normal file
114
nexus/retry_helper.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
Retry logic and error recovery for burn-mode operations.
|
||||
Provides: retry decorator, cycle state tracking, dead letter queue.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# --- Configuration ---
# State lives under the user's home so it survives repo checkouts/redeploys.
STATE_DIR = Path(os.path.expanduser("~/.local/timmy/burn-state"))
STATE_FILE = STATE_DIR / "cycle-state.json"        # current-cycle checkpoint
DEAD_LETTER_FILE = STATE_DIR / "dead-letter.json"  # queue of failed actions
MAX_RETRIES = 3  # attempts per action before dead-lettering
BASE_DELAY = 2  # seconds
|
||||
|
||||
|
||||
def _ensure_dir():
    """Create the state directory (and parents) if missing; idempotent."""
    STATE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def retry(max_retries=MAX_RETRIES, base_delay=BASE_DELAY, exceptions=(Exception,)):
    """Retry decorator with exponential backoff.

    Args:
        max_retries: total attempts before giving up.
        base_delay: initial wait in seconds; doubled after each failure.
        exceptions: tuple of exception types that trigger a retry.

    Returns:
        A decorator. The wrapped function returns its normal result, or
        None once all attempts are exhausted (the final failure is recorded
        via dead_letter()).
    """
    from functools import wraps  # local import keeps module-level deps unchanged

    def decorator(fn):
        @wraps(fn)  # preserve fn.__name__/__doc__ (the bare wrapper lost them)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return fn(*args, **kwargs)
                except exceptions as exc:
                    if attempt < max_retries:
                        # Exponential backoff: base, 2*base, 4*base, ...
                        delay = base_delay * (2 ** (attempt - 1))
                        print(f" [RETRY] {fn.__name__} attempt {attempt}/{max_retries} failed: {exc}")
                        print(f" [RETRY] waiting {delay}s...")
                        time.sleep(delay)
                    else:
                        print(f" [FAIL] {fn.__name__} failed after {max_retries} attempts: {exc}")
                        dead_letter(fn.__name__, args, exc)
            return None  # All retries exhausted
        return wrapper
    return decorator
|
||||
|
||||
|
||||
def dead_letter(fn_name, args, exc):
    """Record a failed action to the dead letter queue (a JSON list on disk).

    Args:
        fn_name: name of the function that failed.
        args: positional args it was called with (stringified, truncated).
        exc: the exception instance that exhausted the retries.
    """
    _ensure_dir()
    # format_exception(exc) works even outside an active 'except' block;
    # traceback.format_exc() would record "NoneType: None" when called
    # after the handler has exited.
    tb = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    entry = {
        "function": fn_name,
        "args": str(args)[:500],  # truncate: args may be arbitrarily large
        "error": str(exc),
        "traceback": tb[:1000],
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    dlq = []
    if DEAD_LETTER_FILE.exists():
        try:
            dlq = json.loads(DEAD_LETTER_FILE.read_text())
        except json.JSONDecodeError:
            dlq = []  # corrupt file: start a fresh queue rather than crash
    if not isinstance(dlq, list):
        dlq = []  # guard: file held valid JSON that wasn't a list
    dlq.append(entry)
    DEAD_LETTER_FILE.write_text(json.dumps(dlq, indent=2))
|
||||
|
||||
|
||||
def save_checkpoint(action, repo=None, issue=None, detail=None):
    """Persist the current cycle action so a crashed run can be resumed.

    Args:
        action: name of the action being attempted.
        repo: repository the action targets, if any.
        issue: issue number the action targets, if any.
        detail: free-form extra context (defaults to empty string).
    """
    _ensure_dir()
    checkpoint = dict(
        action=action,
        repo=repo,
        issue=issue,
        detail=detail or "",
        timestamp=datetime.now(timezone.utc).isoformat(),
        status="in-progress",
    )
    STATE_FILE.write_text(json.dumps(checkpoint, indent=2))
|
||||
|
||||
|
||||
def clear_checkpoint():
    """Mark the cycle complete by overwriting the checkpoint file."""
    _ensure_dir()
    STATE_FILE.write_text(json.dumps({
        "action": None,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": "complete",
    }, indent=2))
|
||||
|
||||
|
||||
def load_checkpoint():
    """Return the last saved checkpoint dict, or None if absent/corrupt."""
    try:
        raw = STATE_FILE.read_text()
    except FileNotFoundError:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None
|
||||
|
||||
|
||||
def get_dead_letter_summary():
    """Summarize the dead letter queue as human-readable text."""
    if not DEAD_LETTER_FILE.exists():
        return "Dead letter queue: empty"
    try:
        entries = json.loads(DEAD_LETTER_FILE.read_text())
    except json.JSONDecodeError:
        return "Dead letter queue: corrupt"
    summary = [f"Dead letter queue: {len(entries)} failed actions"]
    summary.extend(
        f" - {e['function']}: {e['error'][:100]} at {e['timestamp']}"
        for e in entries[-10:]  # Show last 10
    )
    return "\n".join(summary)
|
||||
Reference in New Issue
Block a user