Compare commits
4 Commits
ezra/issue
...
timmy/flee
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e58a7c225e | ||
|
|
277d21aef6 | ||
|
|
228e46a330 | ||
|
|
67c2927c1a |
156
fleet/agent_lifecycle.py
Executable file
156
fleet/agent_lifecycle.py
Executable file
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python3
|
||||
# FLEET-012: Agent Lifecycle Manager
|
||||
# Phase 5: Scale — spawn, train, deploy, retire agents automatically.
|
||||
#
|
||||
# Manages the full lifecycle of AI agents in the fleet:
|
||||
# 1. PROVISION: Clone template, install deps, configure, test
|
||||
# 2. TRAIN: Run initial tasks, measure quality, score
|
||||
# 3. DEPLOY: Add to active rotation, start accepting issues
|
||||
# 4. MONITOR: Track performance, quality, uptime
|
||||
# 5. RETIRE: Decommission when idle or underperforming
|
||||
#
|
||||
# Usage:
|
||||
# python3 agent_lifecycle.py provision <agent_name> <vps_ip> [--model <model>]
|
||||
# python3 agent_lifecycle.py status
|
||||
# python3 agent_lifecycle.py retire <agent_name>
|
||||
# python3 agent_lifecycle.py monitor
|
||||
|
||||
import os, sys, json, subprocess, time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
DATA_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-agents"))
|
||||
AGENTS_DB = DATA_DIR / "agents.json"
|
||||
LIFECYCLE_LOG = DATA_DIR / "lifecycle.log"
|
||||
|
||||
def ensure_dirs():
|
||||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def log(msg, level="INFO"):
|
||||
ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
|
||||
entry = f"[{ts}] [{level}] {msg}"
|
||||
with open(LIFECYCLE_LOG, "a") as f:
|
||||
f.write(entry + "\n")
|
||||
print(f" {entry}")
|
||||
|
||||
def load_agents():
|
||||
if AGENTS_DB.exists():
|
||||
return json.loads(AGENTS_DB.read_text())
|
||||
return {}
|
||||
|
||||
def save_agents(db):
|
||||
AGENTS_DB.write_text(json.dumps(db, indent=2))
|
||||
|
||||
def status():
|
||||
agents = load_agents()
|
||||
print("\n=== Agent Fleet Status ===")
|
||||
if not agents:
|
||||
print(" No agents registered.")
|
||||
return
|
||||
for name, agent in agents.items():
|
||||
state = agent.get("state", "unknown")
|
||||
vps = agent.get("vps", "unknown")
|
||||
model = agent.get("model", "unknown")
|
||||
score = agent.get("quality_score", "N/A")
|
||||
created = agent.get("created_at", "?")
|
||||
print(f" {name}: state={state}, vps={vps}, model={model}, score={score}, created={created}")
|
||||
if agent.get("last_heartbeat"):
|
||||
last = agent["last_heartbeat"]
|
||||
print(f" Last heartbeat: {last}")
|
||||
|
||||
def provision(name, vps, model="hermes4:14b"):
|
||||
agents = load_agents()
|
||||
if name in agents:
|
||||
print(f" Agent '{name}' already exists (state: {agents[name].get('state')})")
|
||||
return False
|
||||
|
||||
log(f"Provisioning agent '{name}' on {vps} with model {model}")
|
||||
agents[name] = {
|
||||
"name": name,
|
||||
"vps": vps,
|
||||
"model": model,
|
||||
"state": "provisioning",
|
||||
"created_at": datetime.now(timezone.utc).isoformat(),
|
||||
"quality_score": None,
|
||||
"tasks_completed": 0,
|
||||
"tasks_failed": 0,
|
||||
"last_heartbeat": None,
|
||||
"metadata": {"provision_started": datetime.now(timezone.utc).isoformat()}
|
||||
}
|
||||
save_agents(agents)
|
||||
log(f"Agent '{name}' registered. State: provisioning")
|
||||
return True
|
||||
|
||||
def deploy(name):
|
||||
agents = load_agents()
|
||||
if name not in agents:
|
||||
print(f" Agent '{name}' not found")
|
||||
return False
|
||||
|
||||
agents[name]["state"] = "deployed"
|
||||
agents[name]["metadata"]["deployed_at"] = datetime.now(timezone.utc).isoformat()
|
||||
save_agents(agents)
|
||||
log(f"Agent '{name}' deployed and accepting issues")
|
||||
return True
|
||||
|
||||
def retire(name):
|
||||
agents = load_agents()
|
||||
if name not in agents:
|
||||
print(f" Agent '{name}' not found")
|
||||
return False
|
||||
|
||||
agents[name]["state"] = "retired"
|
||||
agents[name]["metadata"]["retired_at"] = datetime.now(timezone.utc).isoformat()
|
||||
save_agents(agents)
|
||||
log(f"Agent '{name}' retired. Completed {agents[name].get('tasks_completed', 0)} tasks.")
|
||||
return True
|
||||
|
||||
def monitor():
|
||||
agents = load_agents()
|
||||
now = time.time()
|
||||
changes = 0
|
||||
for name, agent in agents.items():
|
||||
if agent.get("state") != "deployed":
|
||||
continue
|
||||
last = agent.get("last_heartbeat")
|
||||
if last:
|
||||
try:
|
||||
last_ts = datetime.fromisoformat(last).timestamp()
|
||||
hours_since = (now - last_ts) / 3600
|
||||
if hours_since > 24:
|
||||
log(f"Agent '{name}' no heartbeat for {hours_since:.1f}h")
|
||||
agent["state"] = "idle"
|
||||
agent["metadata"]["idle_since"] = datetime.now(timezone.utc).isoformat()
|
||||
changes += 1
|
||||
elif hours_since > 168: # 7 days
|
||||
log(f"Agent '{name}' idle for 7 days — recommending retirement")
|
||||
agent["metadata"]["retire_recommendation"] = datetime.now(timezone.utc).isoformat()
|
||||
changes += 1
|
||||
except (ValueError, TypeError, OSError):
|
||||
pass
|
||||
if changes > 0:
|
||||
save_agents(agents)
|
||||
log(f"Monitor: {changes} agents state changed")
|
||||
else:
|
||||
log("Monitor: all agents healthy")
|
||||
|
||||
if __name__ == "__main__":
|
||||
ensure_dirs()
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: agent_lifecycle.py [provision|deploy|retire|status|monitor]")
|
||||
sys.exit(0)
|
||||
|
||||
cmd = sys.argv[1]
|
||||
if cmd == "provision" and len(sys.argv) >= 4:
|
||||
model = sys.argv[4] if len(sys.argv) >= 5 else "hermes4:14b"
|
||||
provision(sys.argv[2], sys.argv[3], model)
|
||||
elif cmd == "deploy" and len(sys.argv) >= 3:
|
||||
deploy(sys.argv[2])
|
||||
elif cmd == "retire" and len(sys.argv) >= 3:
|
||||
retire(sys.argv[2])
|
||||
elif cmd == "status":
|
||||
status()
|
||||
elif cmd == "monitor":
|
||||
monitor()
|
||||
else:
|
||||
print("Usage: agent_lifecycle.py [provision <name> <vps>|deploy <name>|retire <name>|status|monitor]")
|
||||
272
fleet/auto_restart.py
Executable file
272
fleet/auto_restart.py
Executable file
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Auto-Restart Agent — Self-healing process monitor for fleet machines.
|
||||
|
||||
Detects dead services and restarts them automatically.
|
||||
Escalates after 3 attempts (prevents restart loops).
|
||||
Logs all actions to ~/.local/timmy/fleet-health/restarts.log
|
||||
Alerts via Telegram if service cannot be recovered.
|
||||
|
||||
Prerequisite: FLEET-006 (health check) must be running to detect failures.
|
||||
|
||||
Usage:
|
||||
python3 auto_restart.py # Run checks now
|
||||
python3 auto_restart.py --daemon # Run continuously (every 60s)
|
||||
python3 auto_restart.py --status # Show restart history
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import subprocess
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# === CONFIG ===
|
||||
LOG_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-health"))
|
||||
RESTART_LOG = LOG_DIR / "restarts.log"
|
||||
COOLDOWN_FILE = LOG_DIR / "restart_cooldowns.json"
|
||||
MAX_RETRIES = 3
|
||||
COOLDOWN_PERIOD = 3600 # 1 hour between escalation alerts
|
||||
|
||||
# Services definition: name, check command, restart command
|
||||
# Local services:
|
||||
LOCAL_SERVICES = {
|
||||
"hermes-gateway": {
|
||||
"check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
|
||||
"restart": "cd ~/code-claw && ./restart-gateway.sh 2>/dev/null || launchctl kickstart -k ai.hermes.gateway 2>/dev/null",
|
||||
"critical": True,
|
||||
},
|
||||
"ollama": {
|
||||
"check": "pgrep -f 'ollama serve' > /dev/null 2>/dev/null",
|
||||
"restart": "launchctl kickstart -k com.ollama.ollama 2>/dev/null || /opt/homebrew/bin/brew services restart ollama 2>/dev/null",
|
||||
"critical": False,
|
||||
},
|
||||
"codeclaw-heartbeat": {
|
||||
"check": "launchctl list | grep 'ai.timmy.codeclaw-qwen-heartbeat' > /dev/null 2>/dev/null",
|
||||
"restart": "launchctl kickstart -k ai.timmy.codeclaw-qwen-heartbeat 2>/dev/null",
|
||||
"critical": False,
|
||||
},
|
||||
}
|
||||
|
||||
# VPS services to restart via SSH
|
||||
VPS_SERVICES = {
|
||||
"ezra": {
|
||||
"ip": "143.198.27.163",
|
||||
"user": "root",
|
||||
"services": {
|
||||
"gitea": {
|
||||
"check": "systemctl is-active gitea 2>/dev/null | grep -q active",
|
||||
"restart": "systemctl restart gitea 2>/dev/null",
|
||||
"critical": True,
|
||||
},
|
||||
"nginx": {
|
||||
"check": "systemctl is-active nginx 2>/dev/null | grep -q active",
|
||||
"restart": "systemctl restart nginx 2>/dev/null",
|
||||
"critical": False,
|
||||
},
|
||||
"hermes-agent": {
|
||||
"check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
|
||||
"restart": "cd /root/wizards/ezra/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
|
||||
"critical": True,
|
||||
},
|
||||
},
|
||||
},
|
||||
"allegro": {
|
||||
"ip": "167.99.126.228",
|
||||
"user": "root",
|
||||
"services": {
|
||||
"hermes-agent": {
|
||||
"check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
|
||||
"restart": "cd /root/wizards/allegro/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
|
||||
"critical": True,
|
||||
},
|
||||
},
|
||||
},
|
||||
"bezalel": {
|
||||
"ip": "159.203.146.185",
|
||||
"user": "root",
|
||||
"services": {
|
||||
"hermes-agent": {
|
||||
"check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
|
||||
"restart": "cd /root/wizards/bezalel/hermes/venv/bin/activate && nohup hermes gateway run > /dev/null 2>&1 &",
|
||||
"critical": True,
|
||||
},
|
||||
"evennia": {
|
||||
"check": "pgrep -f 'evennia' > /dev/null 2>/dev/null",
|
||||
"restart": "cd /root/.evennia/timmy_world && evennia restart 2>/dev/null",
|
||||
"critical": False,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
TELEGRAM_TOKEN_FILE = Path(os.path.expanduser("~/.config/telegram/special_bot"))
|
||||
TELEGRAM_CHAT = "-1003664764329"
|
||||
|
||||
|
||||
def send_telegram(message):
|
||||
if not TELEGRAM_TOKEN_FILE.exists():
|
||||
return False
|
||||
token = TELEGRAM_TOKEN_FILE.read_text().strip()
|
||||
url = f"https://api.telegram.org/bot{token}/sendMessage"
|
||||
body = json.dumps({
|
||||
"chat_id": TELEGRAM_CHAT,
|
||||
"text": f"[AUTO-RESTART]\n{message}",
|
||||
}).encode()
|
||||
try:
|
||||
import urllib.request
|
||||
req = urllib.request.Request(url, data=body, headers={"Content-Type": "application/json"}, method="POST")
|
||||
urllib.request.urlopen(req, timeout=10)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def get_cooldowns():
|
||||
if COOLDOWN_FILE.exists():
|
||||
try:
|
||||
return json.loads(COOLDOWN_FILE.read_text())
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def save_cooldowns(data):
|
||||
COOLDOWN_FILE.write_text(json.dumps(data, indent=2))
|
||||
|
||||
|
||||
def check_service(check_cmd, timeout=10):
|
||||
try:
|
||||
proc = subprocess.run(check_cmd, shell=True, capture_output=True, timeout=timeout)
|
||||
return proc.returncode == 0
|
||||
except (subprocess.TimeoutExpired, subprocess.SubprocessError):
|
||||
return False
|
||||
|
||||
|
||||
def restart_service(restart_cmd, timeout=30):
|
||||
try:
|
||||
proc = subprocess.run(restart_cmd, shell=True, capture_output=True, timeout=timeout)
|
||||
return proc.returncode == 0
|
||||
except (subprocess.TimeoutExpired, subprocess.SubprocessError) as e:
|
||||
return False
|
||||
|
||||
|
||||
def try_restart_via_ssh(name, host_config, service_name):
|
||||
ip = host_config["ip"]
|
||||
user = host_config["user"]
|
||||
service = host_config["services"][service_name]
|
||||
|
||||
restart_cmd = f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 {user}@{ip} "{service["restart"]}"'
|
||||
return restart_service(restart_cmd, timeout=30)
|
||||
|
||||
|
||||
def log_restart(service_name, machine, attempt, success):
|
||||
ts = datetime.now(timezone.utc).isoformat()
|
||||
status = "SUCCESS" if success else "FAILED"
|
||||
log_entry = f"{ts} [{status}] {machine}/{service_name} (attempt {attempt})\n"
|
||||
|
||||
RESTART_LOG.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(RESTART_LOG, "a") as f:
|
||||
f.write(log_entry)
|
||||
|
||||
print(f" [{status}] {machine}/{service_name} - attempt {attempt}")
|
||||
|
||||
|
||||
def check_and_restart():
|
||||
"""Run all restart checks."""
|
||||
results = []
|
||||
cooldowns = get_cooldowns()
|
||||
now = time.time()
|
||||
|
||||
# Check local services
|
||||
for name, service in LOCAL_SERVICES.items():
|
||||
if not check_service(service["check"]):
|
||||
cooldown_key = f"local/{name}"
|
||||
retries = cooldowns.get(cooldown_key, {"count": 0, "last": 0}).get("count", 0)
|
||||
|
||||
if retries >= MAX_RETRIES:
|
||||
last = cooldowns.get(cooldown_key, {}).get("last", 0)
|
||||
if now - last < COOLDOWN_PERIOD and service["critical"]:
|
||||
send_telegram(f"CRITICAL: local/{name} failed {MAX_RETRIES} restart attempts. Needs human intervention.")
|
||||
cooldowns[cooldown_key] = {"count": 0, "last": now}
|
||||
save_cooldowns(cooldowns)
|
||||
continue
|
||||
|
||||
success = restart_service(service["restart"])
|
||||
log_restart(name, "local", retries + 1, success)
|
||||
|
||||
cooldowns[cooldown_key] = {"count": retries + 1 if not success else 0, "last": now}
|
||||
save_cooldowns(cooldowns)
|
||||
if success:
|
||||
# Verify it actually started
|
||||
time.sleep(3)
|
||||
if check_service(service["check"]):
|
||||
print(f" VERIFIED: local/{name} is running")
|
||||
else:
|
||||
print(f" WARNING: local/{name} restart command returned success but process not detected")
|
||||
|
||||
# Check VPS services
|
||||
for host, host_config in VPS_SERVICES.items():
|
||||
for service_name, service in host_config["services"].items():
|
||||
check_cmd = f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 {host_config["user"]}@{host_config["ip"]} "{service["check"]}"'
|
||||
if not check_service(check_cmd):
|
||||
cooldown_key = f"{host}/{service_name}"
|
||||
retries = cooldowns.get(cooldown_key, {"count": 0, "last": 0}).get("count", 0)
|
||||
|
||||
if retries >= MAX_RETRIES:
|
||||
last = cooldowns.get(cooldown_key, {}).get("last", 0)
|
||||
if now - last < COOLDOWN_PERIOD and service["critical"]:
|
||||
send_telegram(f"CRITICAL: {host}/{service_name} failed {MAX_RETRIES} restart attempts. Needs human intervention.")
|
||||
cooldowns[cooldown_key] = {"count": 0, "last": now}
|
||||
save_cooldowns(cooldowns)
|
||||
continue
|
||||
|
||||
success = try_restart_via_ssh(host, host_config, service_name)
|
||||
log_restart(service_name, host, retries + 1, success)
|
||||
|
||||
cooldowns[cooldown_key] = {"count": retries + 1 if not success else 0, "last": now}
|
||||
save_cooldowns(cooldowns)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def daemon_mode():
|
||||
"""Run continuously every 60 seconds."""
|
||||
print("Auto-restart agent running in daemon mode (60s interval)")
|
||||
print(f"Monitoring {len(LOCAL_SERVICES)} local + {sum(len(h['services']) for h in VPS_SERVICES.values())} remote services")
|
||||
print(f"Max retries per cycle: {MAX_RETRIES}")
|
||||
print(f"Cooldown after max retries: {COOLDOWN_PERIOD}s")
|
||||
while True:
|
||||
check_and_restart()
|
||||
time.sleep(60)
|
||||
|
||||
|
||||
def show_status():
|
||||
"""Show restart history and cooldowns."""
|
||||
cooldowns = get_cooldowns()
|
||||
print("=== Restart Cooldowns ===")
|
||||
for key, data in sorted(cooldowns.items()):
|
||||
count = data.get("count", 0)
|
||||
if count > 0:
|
||||
print(f" {key}: {count} failures, last at {datetime.fromtimestamp(data.get('last',0), tz=timezone.utc).strftime('%H:%M')}")
|
||||
|
||||
print("\n=== Restart Log (last 20) ===")
|
||||
if RESTART_LOG.exists():
|
||||
lines = RESTART_LOG.read_text().strip().split("\n")
|
||||
for line in lines[-20:]:
|
||||
print(f" {line}")
|
||||
else:
|
||||
print(" No restarts logged yet.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "--daemon":
|
||||
daemon_mode()
|
||||
elif len(sys.argv) > 1 and sys.argv[1] == "--status":
|
||||
show_status()
|
||||
else:
|
||||
check_and_restart()
|
||||
191
fleet/capacity-inventory.md
Normal file
191
fleet/capacity-inventory.md
Normal file
@@ -0,0 +1,191 @@
|
||||
# Capacity Inventory - Fleet Resource Baseline
|
||||
|
||||
**Last audited:** 2026-04-07 16:00 UTC
|
||||
**Auditor:** Timmy (direct inspection)
|
||||
|
||||
---
|
||||
|
||||
## Fleet Resources (Paperclips Model)
|
||||
|
||||
Three primary resources govern the fleet:
|
||||
|
||||
| Resource | Role | Generation | Consumption |
|
||||
|----------|------|-----------|-------------|
|
||||
| **Capacity** | Compute hours available across fleet. Determines what work can be done. | Through healthy utilization of VPS/Mac agents | Fleet improvements consume it (investing in automation, orchestration, sovereignty) |
|
||||
| **Uptime** | % time services are running. Earned at Fibonacci milestones. | When services stay up naturally | Degrades on any failure |
|
||||
| **Innovation** | Only generates when capacity is <70% utilized. Fuels Phase 3+. | When you leave capacity free | Phase 3+ buildings consume it (requires spare capacity to build) |
|
||||
|
||||
### The Tension
|
||||
- Run fleet at 95%+ capacity: maximum productivity, ZERO Innovation
|
||||
- Run fleet at <70% capacity: Innovation generates but slower progress
|
||||
- This forces the Paperclips question: optimize now or invest in future capability?
|
||||
|
||||
---
|
||||
|
||||
## VPS Resource Baselines
|
||||
|
||||
### Ezra (143.198.27.163) - "Forge"
|
||||
|
||||
| Metric | Value | Utilization |
|
||||
|--------|-------|-------------|
|
||||
| **OS** | Ubuntu 24.04 (6.8.0-106-generic) | |
|
||||
| **vCPU** | 4 vCPU (DO basic droplet, shared) | Load: 10.76/7.59/7.04 (very high) |
|
||||
| **RAM** | 7,941 MB total | 2,104 used / 5,836 available (26% used, 74% free) |
|
||||
| **Disk** | 154 GB vda1 | 111 GB used / 44 GB free (72%) **WARNING** |
|
||||
| **Swap** | 6,143 MB | 643 MB used (10%) |
|
||||
| **Uptime** | 7 days, 18 hours | |
|
||||
|
||||
### Key Processes (sorted by memory)
|
||||
| Process | RSS | %CPU | Notes |
|
||||
|---------|-----|------|-------|
|
||||
| Gitea | 556 MB | 83.5% | Web service, high CPU due to API load |
|
||||
| MemPalace (ezra) | 268 MB | 136% | Mining project files - HIGH CPU |
|
||||
| Hermes gateway (ezra) | 245 MB | 1.7% | Agent gateway |
|
||||
| Ollama | 230 MB | 0.1% | Model serving |
|
||||
| PostgreSQL | 138 MB | ~0% | Gitea database |
|
||||
|
||||
**Capacity assessment:** 26% memory used, but 72% disk is getting tight. CPU load is very high (10.76 on 4vCPU = 269% utilization). Ezra is CPU-bound, not RAM-bound.
|
||||
|
||||
### Allegro (167.99.126.228)
|
||||
|
||||
| Metric | Value | Utilization |
|
||||
|--------|-------|-------------|
|
||||
| **OS** | Ubuntu 24.04 (6.8.0-106-generic) | |
|
||||
| **vCPU** | 4 vCPU (DO basic droplet, shared) | Moderate load |
|
||||
| **RAM** | 7,941 MB total | 1,591 used / 6,349 available (20% used, 80% free) |
|
||||
| **Disk** | 154 GB vda1 | 41 GB used / 114 GB free (27%) **GOOD** |
|
||||
| **Swap** | 8,191 MB | 686 MB used (8%) |
|
||||
| **Uptime** | 7 days, 18 hours | |
|
||||
|
||||
### Key Processes (sorted by memory)
|
||||
| Process | RSS | %CPU | Notes |
|
||||
|---------|-----|------|-------|
|
||||
| Hermes gateway (allegro) | 680 MB | 0.9% | Main agent gateway |
|
||||
| Gitea | 181 MB | 1.2% | Secondary gitea? |
|
||||
| Systemd-journald | 160 MB | 0.0% | System logging |
|
||||
| Ezra Hermes gateway | 58 MB | 0.0% | Running ezra agent here |
|
||||
| Bezalel Hermes gateway | 58 MB | 0.0% | Running bezalel agent here |
|
||||
| Dockerd | 48 MB | 0.0% | Docker daemon |
|
||||
|
||||
**Capacity assessment:** 20% memory used, 27% disk used. Allegro has headroom. Also running hermes gateways for Ezra and Bezalel (cross-host agent execution).
|
||||
|
||||
### Bezalel (159.203.146.185)
|
||||
|
||||
| Metric | Value | Utilization |
|
||||
|--------|-------|-------------|
|
||||
| **OS** | Ubuntu 24.04 (6.8.0-71-generic) | |
|
||||
| **vCPU** | 2 vCPU (DO basic droplet, shared) | Load varies |
|
||||
| **RAM** | 1,968 MB total | 817 used / 1,151 available (42% used, 58% free) |
|
||||
| **Disk** | 48 GB vda1 | 12 GB used / 37 GB free (24%) **GOOD** |
|
||||
| **Swap** | 2,047 MB | 448 MB used (22%) |
|
||||
| **Uptime** | 7 days, 18 hours | |
|
||||
|
||||
### Key Processes (sorted by memory)
|
||||
| Process | RSS | %CPU | Notes |
|
||||
|---------|-----|------|-------|
|
||||
| Hermes gateway | 339 MB | 7.7% | Agent gateway (16.8% of RAM) |
|
||||
| uv pip install | 137 MB | 56.6% | Installing packages (temporary) |
|
||||
| Mender | 27 MB | 0.0% | Device management |
|
||||
|
||||
**Capacity assessment:** 42% memory used, only 2GB total RAM. Bezalel is the most constrained. 2 vCPU means less compute headroom than Ezra/Allegro. Disk is fine.
|
||||
|
||||
### Mac Local (M3 Max)
|
||||
|
||||
| Metric | Value | Utilization |
|
||||
|--------|-------|-------------|
|
||||
| **OS** | macOS 26.3.1 | |
|
||||
| **CPU** | Apple M3 Max (14 cores) | Very capable |
|
||||
| **RAM** | 36 GB | ~8 GB used (22%) |
|
||||
| **Disk** | 926 GB total | ~624 GB used / 302 GB free (68%) |
|
||||
|
||||
### Key Processes
|
||||
| Process | Memory | Notes |
|
||||
|---------|--------|-------|
|
||||
| Hermes gateway | 500 MB | Primary gateway |
|
||||
| Hermes agents (x3) | ~560 MB total | Multiple sessions |
|
||||
| Ollama | ~20 MB base + model memory | Model loading varies |
|
||||
| OpenClaw | 350 MB | Gateway process |
|
||||
| Evennia (server+portal) | 56 MB | Game world |
|
||||
|
||||
---
|
||||
|
||||
## Resource Summary
|
||||
|
||||
| Resource | Ezra | Allegro | Bezalel | Mac Local | TOTAL |
|
||||
|----------|------|---------|---------|-----------|-------|
|
||||
| **vCPU** | 4 | 4 | 2 | 14 (M3 Max) | 24 |
|
||||
| **RAM** | 8 GB (26% used) | 8 GB (20% used) | 2 GB (42% used) | 36 GB (22% used) | 54 GB |
|
||||
| **Disk** | 154 GB (72%) | 154 GB (27%) | 48 GB (24%) | 926 GB (68%) | 1,282 GB |
|
||||
| **Cost** | $12/mo | $12/mo | $12/mo | owned | $36/mo |
|
||||
|
||||
### Utilization by Category
|
||||
| Category | Estimated Daily Hours | % of Fleet Capacity |
|
||||
|----------|----------------------|---------------------|
|
||||
| Hermes agents | ~3-4 hrs active | 5-7% |
|
||||
| Ollama inference | ~1-2 hrs | 2-4% |
|
||||
| Gitea services | 24/7 | 5-10% |
|
||||
| Evennia | 24/7 | <1% |
|
||||
| Idle | ~18-20 hrs | ~80-90% |
|
||||
|
||||
### Capacity Utilization: ~15-20% active
|
||||
**Innovation rate:** GENERATING (capacity < 70%)
|
||||
**Recommendation:** Good — Innovation is generating because most capacity is free.
|
||||
This means Phase 3+ capabilities (orchestration, load balancing, etc.) are accessible NOW.
|
||||
|
||||
---
|
||||
|
||||
## Uptime Baseline
|
||||
|
||||
**Baseline period:** 2026-04-07 14:00-16:00 UTC (2 hours, ~24 checks at 5-min intervals)
|
||||
|
||||
| Service | Checks | Uptime | Status |
|
||||
|---------|--------|--------|--------|
|
||||
| Ezra | 24/24 | 100.0% | GOOD |
|
||||
| Allegro | 24/24 | 100.0% | GOOD |
|
||||
| Bezalel | 24/24 | 100.0% | GOOD |
|
||||
| Gitea | 23/24 | 95.8% | GOOD |
|
||||
| Hermes Gateway | 23/24 | 95.8% | GOOD |
|
||||
| Ollama | 24/24 | 100.0% | GOOD |
|
||||
| OpenClaw | 24/24 | 100.0% | GOOD |
|
||||
| Evennia | 24/24 | 100.0% | GOOD |
|
||||
| Hermes Agent | 21/24 | 87.5% | **CHECK** |
|
||||
|
||||
### Fibonacci Uptime Milestones
|
||||
| Milestone | Target | Current | Status |
|
||||
|-----------|--------|---------|--------|
|
||||
| 95% | 95% | 100% (VPS), 98.6% (avg) | REACHED |
|
||||
| 95.5% | 95.5% | 98.6% | REACHED |
|
||||
| 96% | 96% | 98.6% | REACHED |
|
||||
| 97% | 97% | 98.6% | REACHED |
|
||||
| 98% | 98% | 98.6% | REACHED |
|
||||
| 99% | 99% | 98.6% | APPROACHING |
|
||||
|
||||
---
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
| Risk | Severity | Mitigation |
|
||||
|------|----------|------------|
|
||||
| Ezra disk 72% used | MEDIUM | Move non-essential data, add monitoring alert at 85% |
|
||||
| Bezalel only 2GB RAM | HIGH | Cannot run large models locally. Good for Evennia, tight for agents |
|
||||
| Ezra CPU load 269% | HIGH | MemPalace mining consuming 136% CPU. Consider scheduling |
|
||||
| Mac disk 68% used | MEDIUM | 302 GB free still. Growing but not urgent |
|
||||
| No cross-VPS mesh | LOW | SSH works but no Tailscale. No private network between VPSes |
|
||||
|
||||
---
|
||||
|
||||
## Recommendations
|
||||
|
||||
### Immediate (Phase 1-2)
|
||||
1. **Ezra disk cleanup:** 44 GB free at 72%. Docker images, old logs, and MemPalace mine data could be rotated.
|
||||
2. **Alert thresholds:** Add disk alerts at 85% (Ezra, Mac) before they become critical.
|
||||
|
||||
### Short-term (Phase 3)
|
||||
3. **Load balancing:** Ezra is CPU-bound, Allegro has 80% RAM free. Move some agent processes from Ezra to Allegro.
|
||||
4. **Innovation investment:** Since fleet is at 15-20% utilization, Innovation is high. This is the time to build Phase 3 capabilities.
|
||||
|
||||
### Medium-term (Phase 4)
|
||||
5. **Bezalel RAM upgrade:** 2GB is tight. Consider upgrade to 4GB ($24/mo instead of $12/mo).
|
||||
6. **Tailscale mesh:** Install on all VPSes for private inter-VPS network.
|
||||
|
||||
---
|
||||
142
fleet/delegation.py
Executable file
142
fleet/delegation.py
Executable file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
# Cross-Agent Task Delegation - The Timmy Foundation
|
||||
# Phase 3: Orchestration capability.
|
||||
# Agents create issues, assign to other agents, review PRs automatically.
|
||||
|
||||
import os, sys, json, time, urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
GITEA_BASE = "https://forge.alexanderwhitestone.com/api/v1"
|
||||
TOKEN_FILE = Path(os.path.expanduser("~/.config/gitea/timmy-token"))
|
||||
ALT_TOKEN = Path(os.path.expanduser("~/.config/gitea/token"))
|
||||
DATA_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-resources"))
|
||||
DELEGATION_LOG = DATA_DIR / "delegation.log"
|
||||
|
||||
AGENTS = {
|
||||
"claw-code": {"models": ["qwen3.6-plus:free"], "caps": ["small-patches","config","docs","repo-hygiene"], "max": 2, "active": True},
|
||||
"gemini": {"models": ["gemini-2.5-flash"], "caps": ["research","heavy-impl","architecture","debugging"], "max": 5, "active": True},
|
||||
"ezra": {"models": ["hermes4:14b","local-ollama"], "caps": ["contracting","formalization","ops","vps"], "max": 3, "active": True},
|
||||
"bezalel": {"models": ["local-llm"], "caps": ["evennia","art","creative","visualization"], "max": 3, "active": True},
|
||||
"timmy": {"models": ["qwen3.6-plus:free","hermes4:14b","local-ollama"], "caps": ["orchestration","review","deploy","fleet"], "max": 5, "active": True},
|
||||
}
|
||||
|
||||
MONITORED_REPOS = [
|
||||
"Timmy_Foundation/timmy-home",
|
||||
"Timmy_Foundation/timmy-config",
|
||||
"Timmy_Foundation/the-nexus",
|
||||
"Timmy_Foundation/hermes-agent",
|
||||
]
|
||||
|
||||
# Heuristic keyword matching
|
||||
KEYWORDS = {
|
||||
"claw-code": ["patch","typo","config","gitignore","docs update","readme","cleanup","format"],
|
||||
"gemini": ["research","investigate","analyze","compare","benchmark","survey","evaluate"],
|
||||
"ezra": ["vps","ssh","deploy","infrastructure","server","cron","resurrection","provision"],
|
||||
"bezalel": ["evennia","art","creative","music","visualization","diagram"],
|
||||
"timmy": ["orchestrate","review","merge","fleet","pipeline","health","monitor"],
|
||||
}
|
||||
|
||||
def get_token():
|
||||
if TOKEN_FILE.exists(): return TOKEN_FILE.read_text().strip()
|
||||
if ALT_TOKEN.exists(): return ALT_TOKEN.read_text().strip()
|
||||
return ""
|
||||
|
||||
def api(path, method="GET", data=None):
|
||||
token = get_token()
|
||||
url = f"{GITEA_BASE}{path}"
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
body = json.dumps(data).encode() if data else None
|
||||
if data: headers["Content-Type"] = "application/json"
|
||||
req = urllib.request.Request(url, data=body, headers=headers, method=method)
|
||||
try:
|
||||
resp = urllib.request.urlopen(req, timeout=15)
|
||||
raw = resp.read().decode()
|
||||
return json.loads(raw) if raw.strip() else {}
|
||||
except urllib.error.HTTPError as e:
|
||||
err = e.read().decode()
|
||||
print(f" API error {e.code}: {err[:200]}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f" API error: {e}")
|
||||
return None
|
||||
|
||||
def log_delegation(msg, level="INFO"):
|
||||
ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
|
||||
entry = f"[{ts}] [{level}] {msg}"
|
||||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
with open(DELEGATION_LOG, "a") as f: f.write(entry + "\n")
|
||||
print(f" {entry}")
|
||||
|
||||
def suggest_agent(title, body):
|
||||
text = (title + " " + body).lower()
|
||||
for agent, keywords in KEYWORDS.items():
|
||||
if any(kw in text for kw in keywords):
|
||||
return agent, f"keywords matched for {agent}"
|
||||
return None, None
|
||||
|
||||
def assign_issue(repo, issue_num, agent):
|
||||
result = api(f"/repos/{repo}/issues/{issue_num}", method="PATCH",
|
||||
data={"assignees": {"operation": "set", "usernames": [agent]}})
|
||||
if result:
|
||||
log_delegation(f"Assigned {repo}#{issue_num} to {agent}")
|
||||
comment_on_issue(repo, issue_num, f"[AUTO-ASSIGN] Assigned to {agent}.")
|
||||
return result
|
||||
|
||||
def comment_on_issue(repo, issue_num, body):
|
||||
return api(f"/repos/{repo}/issues/{issue_num}/comments", method="POST", data={"body": body})
|
||||
|
||||
def get_my_issues(agent):
|
||||
issues = []
|
||||
for repo in MONITORED_REPOS:
|
||||
repo_issues = api(f"/repos/{repo}/issues?state=open&limit=50")
|
||||
if repo_issues:
|
||||
for i in repo_issues:
|
||||
for a in (i.get("assignees") or []):
|
||||
if a.get("login") == agent:
|
||||
issues.append({"repo": repo, "issue": i})
|
||||
return issues
|
||||
|
||||
def run_cycle():
|
||||
log_delegation("Starting delegation cycle")
|
||||
count = 0
|
||||
for repo in MONITORED_REPOS:
|
||||
issues = api(f"/repos/{repo}/issues?state=open&limit=50")
|
||||
if not issues: continue
|
||||
for issue in issues:
|
||||
if issue.get("assignees"): continue
|
||||
title = issue.get("title","")
|
||||
body = issue.get("body","")
|
||||
if any(w in title.lower() for w in ["epic","discussion","question"]): continue
|
||||
agent, reason = suggest_agent(title, body)
|
||||
if agent:
|
||||
if assign_issue(repo, issue["number"], agent): count += 1
|
||||
log_delegation(f"Cycle complete: {count} new assignments")
|
||||
return count
|
||||
|
||||
def show_status():
|
||||
print("\n=== Delegation Status ===")
|
||||
for name, info in AGENTS.items():
|
||||
issues = get_my_issues(name)
|
||||
status = "ONLINE" if info["active"] else "OFFLINE"
|
||||
print(f" {name}: {len(issues)} assigned [{status}]")
|
||||
for iss in issues[:3]:
|
||||
print(f" - {iss['repo'].split('/')[-1]}#{iss['issue']['number']}: {iss['issue']['title'][:60]}")
|
||||
if len(issues) > 3:
|
||||
print(f" ... +{len(issues)-3} more")
|
||||
|
||||
if __name__ == "__main__":
|
||||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
if len(sys.argv) > 1:
|
||||
cmd = sys.argv[1]
|
||||
if cmd == "status": show_status()
|
||||
elif cmd == "run":
|
||||
run_cycle()
|
||||
show_status()
|
||||
elif cmd == "assign" and len(sys.argv) >= 5:
|
||||
assign_issue(sys.argv[3], int(sys.argv[2]), sys.argv[4])
|
||||
else:
|
||||
print("Usage: delegation.py [run|status|assign <issue_num> <repo> <agent>]")
|
||||
else:
|
||||
run_cycle()
|
||||
show_status()
|
||||
142
fleet/milestones.md
Normal file
142
fleet/milestones.md
Normal file
@@ -0,0 +1,142 @@
|
||||
# Fleet Milestone Messages
|
||||
|
||||
Every milestone marks passage through fleet evolution. When achieved, the message
|
||||
prints to the fleet log. Each one references a real achievement, not abstract numbers.
|
||||
|
||||
**Source:** Inspired by Paperclips milestone messages (500 clips, 1000 clips, Full autonomy attained, etc.)
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Survival (Current)
|
||||
|
||||
### M1: First Automated Health Check
|
||||
**Trigger:** `fleet/health_check.py` runs successfully for the first time.
|
||||
**Message:** "First automated health check runs. No longer watching the clock."
|
||||
|
||||
### M2: First Auto-Restart
|
||||
**Trigger:** A dead process is detected and restarted without human intervention.
|
||||
**Message:** "A process failed at 3am and restarted itself. You found out in the morning."
|
||||
|
||||
### M3: First Backup Completed
|
||||
**Trigger:** A backup pipeline runs end-to-end and verifies integrity.
|
||||
**Message:** "A backup completed. You did not have to think about it."
|
||||
|
||||
### M4: 95% Uptime (30 days)
|
||||
**Trigger:** Uptime >= 95% over last 30 days.
|
||||
**Message:** "95% uptime over 30 days. The fleet stays up."
|
||||
|
||||
### M5: Uptime 97%
|
||||
**Trigger:** Uptime >= 97% over last 30 days.
|
||||
**Message:** "97% uptime. Three nines of availability across four machines."
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Automation (unlock when: uptime >= 95% + capacity > 60%)
|
||||
|
||||
### M6: Zero Manual Restarts (7 days)
|
||||
**Trigger:** 7 consecutive days with zero manual process restarts.
|
||||
**Message:** "Seven days. Zero manual restarts. The fleet heals itself."
|
||||
|
||||
### M7: PR Auto-Merged
|
||||
**Trigger:** A PR passes CI, review, and merges without human touching it.
|
||||
**Message:** "A PR was tested, reviewed, and merged by agents. You just said 'looks good.'"
|
||||
|
||||
### M8: Config Push Works
|
||||
**Trigger:** Config change pushed to all 3 VPSes atomically and verified.
|
||||
**Message:** "Config pushed to all three VPSes in one command. No SSH needed."
|
||||
|
||||
### M9: 98% Uptime
|
||||
**Trigger:** Uptime >= 98% over last 30 days.
|
||||
**Message:** "98% uptime. Only 14 hours of downtime in a month. Most of it planned."
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Orchestration (unlock when: all Phase 2 buildings + Innovation > 100)
|
||||
|
||||
### M10: Cross-Agent Delegation Works
|
||||
**Trigger:** Agent A creates issue, assigns to Agent B, Agent B works and creates PR.
|
||||
**Message:** "Agent Alpha created a task, Agent Beta completed it. They did not ask permission."
|
||||
|
||||
### M11: First Model Running Locally on 2+ Machines
|
||||
**Trigger:** Ollama serving same model on Ezra and Allegro simultaneously.
|
||||
**Message:** "A model runs on two machines at once. No cloud. No rate limits."
|
||||
|
||||
### M12: Fleet-Wide Burn Mode
|
||||
**Trigger:** All agents coordinated on single epic, produced coordinated PRs.
|
||||
**Message:** "All agents working the same epic. The fleet moves as one."
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Sovereignty (unlock when: zero cloud deps for core ops)
|
||||
|
||||
### M13: First Entirely Local Inference Day
|
||||
**Trigger:** 24 hours with zero API calls to external providers.
|
||||
**Message:** "A model ran locally for the first time. No cloud. No rate limits. No one can turn it off."
|
||||
|
||||
### M14: Sovereign Email
|
||||
**Trigger:** Stalwart email server sends and receives without Gmail relay.
|
||||
**Message:** "Email flows through our own server. No Google. No Microsoft. Ours."
|
||||
|
||||
### M15: Sovereign Messaging
|
||||
**Trigger:** Telegram bot runs without cloud relay dependency.
|
||||
**Message:** "Messages arrive through our own infrastructure. No corporate middleman."
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Scale (unlock when: sovereignty stable + Innovation > 500)
|
||||
|
||||
### M16: First Self-Spawned Agent
|
||||
**Trigger:** Agent lifecycle manager spawns a new agent instance due to load.
|
||||
**Message:** "A new agent appeared. You did not create it. The fleet built what it needed."
|
||||
|
||||
### M17: Agent Retired Gracefully
|
||||
**Trigger:** An agent instance retires after idle timeout and cleans up its state.
|
||||
**Message:** "An agent retired. It served its purpose. Nothing was lost."
|
||||
|
||||
### M18: Fleet Runs 24h Unattended
|
||||
**Trigger:** 24 hours with zero human intervention of any kind.
|
||||
**Message:** "A full day. No humans. No commands. The fleet runs itself."
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: The Network (unlock when: 7 days zero human intervention)
|
||||
|
||||
### M19: Fleet Creates Its Own Improvement Task
|
||||
**Trigger:** Fleet analyzes itself and creates an issue on Gitea.
|
||||
**Message:** "The fleet found something to improve. It created the task itself."
|
||||
|
||||
### M20: First Outside Contribution
|
||||
**Trigger:** An external contributor's PR is reviewed and merged by fleet agents.
|
||||
**Message:** "Someone outside the fleet contributed. The fleet reviewed, tested, and merged. No human touched it."
|
||||
|
||||
### M21: The Beacon
|
||||
**Trigger:** Infrastructure serves someone in need through automated systems.
|
||||
**Message:** "Someone found the Beacon. In the dark, looking for help. The infrastructure served its purpose. It was built for this."
|
||||
|
||||
### M22: Permanent Light
|
||||
**Trigger:** 90 days of autonomous operation with continuous availability.
|
||||
**Message:** "Three months. The light never went out. Not for anyone."
|
||||
|
||||
---
|
||||
|
||||
## Fibonacci Uptime Milestones
|
||||
|
||||
These trigger regardless of phase, based purely on uptime percentage:
|
||||
|
||||
| Milestone | Uptime | Meaning |
|
||||
|-----------|--------|--------|
|
||||
| U1 | 95% | Basic reliability achieved |
|
||||
| U2 | 95.5% | Fewer than 16 hours/month downtime |
|
||||
| U3 | 96% | Fewer than 12 hours/month |
|
||||
| U4 | 97% | Fewer than 9 hours/month |
|
||||
| U5 | 97.5% | Fewer than 7 hours/month |
|
||||
| U6 | 98% | Fewer than 4.5 hours/month |
|
||||
| U7 | 98.3% | Fewer than 3 hours/month |
|
||||
| U8 | 98.6% | Less than 2.5 hours/month — approaching cloud tier |
|
||||
| U9 | 98.9% | Less than 1.5 hours/month |
|
||||
| U10 | 99% | Less than 1 hour/month — enterprise grade |
|
||||
| U11 | 99.5% | Less than 22 minutes/month |
|
||||
|
||||
---
|
||||
|
||||
*Every message is earned. None are given freely. Fleet evolution is not a checklist — it is a climb.*
|
||||
171
fleet/model-fallback.sh
Executable file
171
fleet/model-fallback.sh
Executable file
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env bash
|
||||
# FLEET-011: Local Model Pipeline and Fallback Chain
|
||||
# Phase 4: Sovereignty — all inference runs locally, no cloud dependency.
|
||||
#
|
||||
# Usage:
|
||||
# ./model-fallback.sh # Show current model chain status
|
||||
# ./model-fallback.sh list # List all local models
|
||||
# ./model-fallback.sh test "Hello" # Test the full fallback chain
|
||||
# ./model-fallback.sh chat # Interactive chat mode
|
||||
# ./model-fallback.sh install # Install default model chain
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# === CONFIG ===
|
||||
CHAIN_FILE="$HOME/.local/timmy/fleet-resources/model-chain.json"
|
||||
LOG_DIR="$HOME/.local/timmy/fleet-health"
|
||||
OLLAMA_URL="http://localhost:11434"
|
||||
|
||||
# Default chain (best quality first, fallback to smallest that runs)
|
||||
DEFAULT_CHAIN=$(cat << 'EOF'
|
||||
{
|
||||
"chain": [
|
||||
{"name": "hermes4:14b", "provider": "ollama", "max_tokens": 4096, "purpose": "primary"},
|
||||
{"name": "qwen2.5:7b", "provider": "ollama", "max_tokens": 4096, "purpose": "fallback"},
|
||||
{"name": "phi3:3.8b", "provider": "ollama", "max_tokens": 2048, "purpose": "emergency"},
|
||||
{"name": "gemma2:2b", "provider": "ollama", "max_tokens": 2048, "purpose": "minimal"}
|
||||
]
|
||||
}
|
||||
EOF
|
||||
)
|
||||
|
||||
load_chain() {
|
||||
if [ -f "$CHAIN_FILE" ]; then
|
||||
cat "$CHAIN_FILE"
|
||||
else
|
||||
echo "$DEFAULT_CHAIN"
|
||||
fi
|
||||
}
|
||||
|
||||
save_chain() {
|
||||
echo "$1" > "$CHAIN_FILE"
|
||||
echo "Model chain saved to $CHAIN_FILE"
|
||||
}
|
||||
|
||||
install_chain() {
|
||||
echo "Installing default model chain..."
|
||||
echo "$DEFAULT_CHAIN" > "$CHAIN_FILE"
|
||||
|
||||
# Extract model names and install via Ollama
|
||||
echo "$DEFAULT_CHAIN" | python3 -c "
|
||||
import json,sys
|
||||
for m in json.load(sys.stdin)['chain']:
|
||||
print(m['name'])
|
||||
" | while read model; do
|
||||
echo " Installing $model..."
|
||||
if ollama list 2>/dev/null | grep -q "$model"; then
|
||||
echo " $model already installed"
|
||||
else
|
||||
ollama pull "$model" 2>&1 | tail -1
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
list_models() {
|
||||
echo "=== Local Models (Ollama) ==="
|
||||
ollama list 2>/dev/null || echo "Ollama not running or not installed"
|
||||
|
||||
echo ""
|
||||
echo "=== Active Fallback Chain ==="
|
||||
load_chain | python3 -c "
|
||||
import json,sys
|
||||
data = json.load(sys.stdin)
|
||||
print(f'{\"Model\":<25} {\"Purpose\":<12} {\"Max tokens\":>10}')
|
||||
print('-' * 50)
|
||||
for m in data['chain']:
|
||||
print(f'{m[\"name\"]:<25} {m[\"purpose\"]:<12} {m[\"max_tokens\"]:>10}')
|
||||
"
|
||||
}
|
||||
|
||||
status() {
|
||||
echo "=== Model Pipeline Status ==="
|
||||
|
||||
# Check Ollama
|
||||
if curl -s "$OLLAMA_URL/api/tags" >/dev/null 2>&1; then
|
||||
echo " Ollama: RUNNING at $OLLAMA_URL"
|
||||
model_count=$(curl -s "$OLLAMA_URL/api/tags" | python3 -c "import json,sys; print(len(json.load(sys.stdin).get('models',[])))" 2>/dev/null || echo "?")
|
||||
echo " Local models: $model_count"
|
||||
else
|
||||
echo " Ollama: DOWN - no local inference available"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Fallback Chain ==="
|
||||
list_models
|
||||
}
|
||||
|
||||
test_chain() {
|
||||
local prompt="$1"
|
||||
|
||||
echo "Testing fallback chain with prompt: \"$prompt\""
|
||||
echo ""
|
||||
|
||||
load_chain | python3 -c "
|
||||
import json,sys,urllib.request,subprocess
|
||||
|
||||
data = json.load(sys.stdin)
|
||||
models = data['chain']
|
||||
prompt = '$(echo "$prompt" | sed "s/'/\\\\'/g")'
|
||||
|
||||
for m in models:
|
||||
name = m['name']
|
||||
print(f' Testing {name}...', end=' ')
|
||||
try:
|
||||
body = json.dumps({'model': name, 'prompt': '$prompt', 'stream': False}).encode()
|
||||
req = urllib.request.Request('http://localhost:11434/api/generate', data=body,
|
||||
headers={'Content-Type': 'application/json'})
|
||||
resp = urllib.request.urlopen(req, timeout=30)
|
||||
result = json.loads(resp.read())
|
||||
print(f'OK — \"{result.get(\"response\", \"\")[:80]}\"')
|
||||
print(f' Chain works! Primary model ({name}) is serving.')
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
print(f'FAILED — {str(e)[:60]}')
|
||||
|
||||
print('All models failed. No local inference available.')
|
||||
"
|
||||
}
|
||||
|
||||
chat() {
|
||||
echo "=== Beacon Chat Mode ==="
|
||||
echo "Type 'quit' to exit. Using local model chain."
|
||||
echo ""
|
||||
|
||||
load_chain | python3 -c "
|
||||
import json,sys,urllib.request
|
||||
|
||||
data = json.load(sys.stdin)
|
||||
models = data['chain']
|
||||
|
||||
while True:
|
||||
try:
|
||||
prompt = input('> ')
|
||||
except EOFError:
|
||||
break
|
||||
if prompt.lower() in ('quit', 'exit'):
|
||||
break
|
||||
if not prompt.strip():
|
||||
continue
|
||||
|
||||
print('Thinking...')
|
||||
body = json.dumps({'model': models[0]['name'], 'prompt': prompt, 'stream': False}).encode()
|
||||
req = urllib.request.Request('http://localhost:11434/api/generate', data=body,
|
||||
headers={'Content-Type': 'application/json'})
|
||||
try:
|
||||
resp = urllib.request.urlopen(req, timeout=120)
|
||||
result = json.loads(resp.read())
|
||||
print(result.get('response', '').strip())
|
||||
except Exception as e:
|
||||
print(f'Model error: {e}')
|
||||
print('Trying next model in chain...')
|
||||
"
|
||||
}
|
||||
|
||||
case "${1:-status}" in
|
||||
install) install_chain ;;
|
||||
list) list_models ;;
|
||||
test) test_chain "${2:-Hello, are you there?}" ;;
|
||||
chat) chat ;;
|
||||
status) status ;;
|
||||
*) echo "Usage: $0 [install|list|test|chat|status]" ;;
|
||||
esac
|
||||
231
fleet/resource_tracker.py
Executable file
231
fleet/resource_tracker.py
Executable file
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fleet Resource Tracker — Tracks Capacity, Uptime, and Innovation.
|
||||
|
||||
Paperclips-inspired tension model:
|
||||
- Capacity: spent on fleet improvements, generates through utilization
|
||||
- Uptime: earned when services stay up, Fibonacci milestones unlock capabilities
|
||||
- Innovation: only generates when capacity < 70%. Fuels Phase 3+.
|
||||
|
||||
This is the heart of the fleet progression system.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import socket
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# === CONFIG ===
|
||||
DATA_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-resources"))
|
||||
RESOURCES_FILE = DATA_DIR / "resources.json"
|
||||
|
||||
# Tension thresholds
|
||||
INNOVATION_THRESHOLD = 0.70 # Innovation only generates when capacity < 70%
|
||||
INNOVATION_RATE = 5.0 # Innovation generated per hour when under threshold
|
||||
CAPACITY_REGEN_RATE = 2.0 # Capacity regenerates per hour of healthy operation
|
||||
FIBONACCI = [95.0, 95.5, 96.0, 97.0, 97.5, 98.0, 98.3, 98.6, 98.9, 99.0, 99.5]
|
||||
|
||||
|
||||
def init():
|
||||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
if not RESOURCES_FILE.exists():
|
||||
data = {
|
||||
"capacity": {
|
||||
"current": 100.0,
|
||||
"max": 100.0,
|
||||
"spent_on": [],
|
||||
"history": []
|
||||
},
|
||||
"uptime": {
|
||||
"current_pct": 100.0,
|
||||
"milestones_reached": [],
|
||||
"total_checks": 0,
|
||||
"successful_checks": 0,
|
||||
"history": []
|
||||
},
|
||||
"innovation": {
|
||||
"current": 0.0,
|
||||
"total_generated": 0.0,
|
||||
"spent_on": [],
|
||||
"last_calculated": time.time()
|
||||
}
|
||||
}
|
||||
RESOURCES_FILE.write_text(json.dumps(data, indent=2))
|
||||
print("Initialized resource tracker")
|
||||
return RESOURCES_FILE.exists()
|
||||
|
||||
|
||||
def load():
|
||||
if RESOURCES_FILE.exists():
|
||||
return json.loads(RESOURCES_FILE.read_text())
|
||||
return None
|
||||
|
||||
|
||||
def save(data):
|
||||
RESOURCES_FILE.write_text(json.dumps(data, indent=2))
|
||||
|
||||
|
||||
def update_uptime(checks: dict):
|
||||
"""Update uptime stats from health check results.
|
||||
checks = {'ezra': True, 'allegro': True, 'bezalel': True, 'gitea': True, ...}
|
||||
"""
|
||||
data = load()
|
||||
if not data:
|
||||
return
|
||||
|
||||
data["uptime"]["total_checks"] += 1
|
||||
successes = sum(1 for v in checks.values() if v)
|
||||
total = len(checks)
|
||||
|
||||
# Overall uptime percentage
|
||||
overall = successes / max(total, 1) * 100.0
|
||||
data["uptime"]["successful_checks"] += successes
|
||||
|
||||
# Calculate rolling uptime
|
||||
if "history" not in data["uptime"]:
|
||||
data["uptime"]["history"] = []
|
||||
data["uptime"]["history"].append({
|
||||
"ts": datetime.now(timezone.utc).isoformat(),
|
||||
"checks": checks,
|
||||
"overall": round(overall, 2)
|
||||
})
|
||||
|
||||
# Keep last 1000 checks
|
||||
if len(data["uptime"]["history"]) > 1000:
|
||||
data["uptime"]["history"] = data["uptime"]["history"][-1000:]
|
||||
|
||||
# Calculate current uptime %, last 100 checks
|
||||
recent = data["uptime"]["history"][-100:]
|
||||
recent_ok = sum(c["overall"] for c in recent) / max(len(recent), 1)
|
||||
data["uptime"]["current_pct"] = round(recent_ok, 2)
|
||||
|
||||
# Check Fibonacci milestones
|
||||
new_milestones = []
|
||||
for fib in FIBONACCI:
|
||||
if fib not in data["uptime"]["milestones_reached"] and recent_ok >= fib:
|
||||
data["uptime"]["milestones_reached"].append(fib)
|
||||
new_milestones.append(fib)
|
||||
|
||||
save(data)
|
||||
|
||||
if new_milestones:
|
||||
print(f" UPTIME MILESTONE: {','.join(str(m) + '%') for m in new_milestones}")
|
||||
print(f" Current uptime: {recent_ok:.1f}%")
|
||||
|
||||
return data["uptime"]
|
||||
|
||||
|
||||
def spend_capacity(amount: float, purpose: str):
|
||||
"""Spend capacity on a fleet improvement."""
|
||||
data = load()
|
||||
if not data:
|
||||
return False
|
||||
if data["capacity"]["current"] < amount:
|
||||
print(f" INSUFFICIENT CAPACITY: Need {amount}, have {data['capacity']['current']:.1f}")
|
||||
return False
|
||||
data["capacity"]["current"] -= amount
|
||||
data["capacity"]["spent_on"].append({
|
||||
"purpose": purpose,
|
||||
"amount": amount,
|
||||
"ts": datetime.now(timezone.utc).isoformat()
|
||||
})
|
||||
save(data)
|
||||
print(f" Spent {amount} capacity on: {purpose}")
|
||||
return True
|
||||
|
||||
|
||||
def regenerate_resources():
|
||||
"""Regenerate capacity and calculate innovation."""
|
||||
data = load()
|
||||
if not data:
|
||||
return
|
||||
|
||||
now = time.time()
|
||||
last = data["innovation"]["last_calculated"]
|
||||
hours = (now - last) / 3600.0
|
||||
if hours < 0.1: # Only update every ~6 minutes
|
||||
return
|
||||
|
||||
# Regenerate capacity
|
||||
capacity_gain = CAPACITY_REGEN_RATE * hours
|
||||
data["capacity"]["current"] = min(
|
||||
data["capacity"]["max"],
|
||||
data["capacity"]["current"] + capacity_gain
|
||||
)
|
||||
|
||||
# Calculate capacity utilization
|
||||
utilization = 1.0 - (data["capacity"]["current"] / data["capacity"]["max"])
|
||||
|
||||
# Generate innovation only when under threshold
|
||||
innovation_gain = 0.0
|
||||
if utilization < INNOVATION_THRESHOLD:
|
||||
innovation_gain = INNOVATION_RATE * hours * (1.0 - utilization / INNOVATION_THRESHOLD)
|
||||
data["innovation"]["current"] += innovation_gain
|
||||
data["innovation"]["total_generated"] += innovation_gain
|
||||
|
||||
# Record history
|
||||
if "history" not in data["capacity"]:
|
||||
data["capacity"]["history"] = []
|
||||
data["capacity"]["history"].append({
|
||||
"ts": datetime.now(timezone.utc).isoformat(),
|
||||
"capacity": round(data["capacity"]["current"], 1),
|
||||
"utilization": round(utilization * 100, 1),
|
||||
"innovation": round(data["innovation"]["current"], 1),
|
||||
"innovation_gain": round(innovation_gain, 1)
|
||||
})
|
||||
# Keep last 500 capacity records
|
||||
if len(data["capacity"]["history"]) > 500:
|
||||
data["capacity"]["history"] = data["capacity"]["history"][-500:]
|
||||
|
||||
data["innovation"]["last_calculated"] = now
|
||||
|
||||
save(data)
|
||||
print(f" Capacity: {data['capacity']['current']:.1f}/{data['capacity']['max']:.1f}")
|
||||
print(f" Utilization: {utilization*100:.1f}%")
|
||||
print(f" Innovation: {data['innovation']['current']:.1f} (+{innovation_gain:.1f} this period)")
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def status():
|
||||
"""Print current resource status."""
|
||||
data = load()
|
||||
if not data:
|
||||
print("Resource tracker not initialized. Run --init first.")
|
||||
return
|
||||
|
||||
print("\n=== Fleet Resources ===")
|
||||
print(f" Capacity: {data['capacity']['current']:.1f}/{data['capacity']['max']:.1f}")
|
||||
|
||||
utilization = 1.0 - (data["capacity"]["current"] / data["capacity"]["max"])
|
||||
print(f" Utilization: {utilization*100:.1f}%")
|
||||
|
||||
innovation_status = "GENERATING" if utilization < INNOVATION_THRESHOLD else "BLOCKED"
|
||||
print(f" Innovation: {data['innovation']['current']:.1f} [{innovation_status}]")
|
||||
|
||||
print(f" Uptime: {data['uptime']['current_pct']:.1f}%")
|
||||
print(f" Milestones: {', '.join(str(m)+'%' for m in data['uptime']['milestones_reached']) or 'None yet'}")
|
||||
|
||||
# Phase gate checks
|
||||
phase_2_ok = data['uptime']['current_pct'] >= 95.0
|
||||
phase_3_ok = phase_2_ok and data['innovation']['current'] > 100
|
||||
phase_5_ok = phase_2_ok and data['innovation']['current'] > 500
|
||||
|
||||
print(f"\n Phase Gates:")
|
||||
print(f" Phase 2 (Automation): {'UNLOCKED' if phase_2_ok else 'LOCKED (need 95% uptime)'}")
|
||||
print(f" Phase 3 (Orchestration): {'UNLOCKED' if phase_3_ok else 'LOCKED (need 95% uptime + 100 innovation)'}")
|
||||
print(f" Phase 5 (Scale): {'UNLOCKED' if phase_5_ok else 'LOCKED (need 95% uptime + 500 innovation)'}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
init()
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "status":
|
||||
status()
|
||||
elif len(sys.argv) > 1 and sys.argv[1] == "regen":
|
||||
regenerate_resources()
|
||||
else:
|
||||
regenerate_resources()
|
||||
status()
|
||||
Reference in New Issue
Block a user