Files
timmy-home/scripts/fleet_phase_status.py
Alexander Whitestone b90a15baca
Some checks failed
Smoke Test / smoke (pull_request) Failing after 11s
feat: codify phase-1 survival state (#548)
2026-04-15 00:24:25 -04:00

225 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""Render the current fleet survival phase as a durable report."""
from __future__ import annotations
import argparse
import json
from copy import deepcopy
from pathlib import Path
from typing import Any
PHASE_NAME = "[PHASE-1] Survival - Keep the Lights On"
NEXT_PHASE_NAME = "[PHASE-2] Automation - Self-Healing Infrastructure"
TARGET_UPTIME_PERCENT = 95.0
TARGET_UPTIME_DAYS = 30
TARGET_CAPACITY_PERCENT = 60.0
DEFAULT_BUILDINGS = [
"VPS hosts: Ezra, Allegro, Bezalel",
"Agents: Timmy harness, Code Claw heartbeat, Gemini AI Studio worker",
"Gitea forge",
"Evennia worlds",
]
DEFAULT_MANUAL_CLICKS = [
"Restart agents and services by hand when a node goes dark.",
"SSH into machines to verify health, disk, and memory.",
"Check Gitea, relay, and world services manually before and after changes.",
"Act as the scheduler when automation is missing or only partially wired.",
]
REPO_SIGNAL_FILES = {
"scripts/fleet_health_probe.sh": "Automated health probe exists and can supply the uptime baseline for the next phase.",
"scripts/fleet_milestones.py": "Milestone tracker exists, so survival achievements can be narrated and logged.",
"scripts/auto_restart_agent.sh": "Auto-restart tooling already exists as phase-2 groundwork.",
"scripts/backup_pipeline.sh": "Backup pipeline scaffold exists for post-survival automation work.",
"infrastructure/timmy-bridge/reports/generate_report.py": "Bridge reporting exists and can summarize heartbeat-driven uptime.",
}
DEFAULT_SNAPSHOT = {
"fleet_operational": True,
"resources": {
"uptime_percent": 0.0,
"days_at_or_above_95_percent": 0,
"capacity_utilization_percent": 0.0,
},
"current_buildings": DEFAULT_BUILDINGS,
"manual_clicks": DEFAULT_MANUAL_CLICKS,
"notes": [
"The fleet is alive, but the human is still the control loop.",
"Phase 1 is about naming reality plainly so later automation has a baseline to beat.",
],
}
def default_snapshot() -> dict[str, Any]:
return deepcopy(DEFAULT_SNAPSHOT)
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
result = deepcopy(base)
for key, value in override.items():
if isinstance(value, dict) and isinstance(result.get(key), dict):
result[key] = _deep_merge(result[key], value)
else:
result[key] = value
return result
def load_snapshot(snapshot_path: Path | None = None) -> dict[str, Any]:
snapshot = default_snapshot()
if snapshot_path is None:
return snapshot
override = json.loads(snapshot_path.read_text(encoding="utf-8"))
return _deep_merge(snapshot, override)
def collect_repo_signals(repo_root: Path) -> list[str]:
signals: list[str] = []
for rel_path, description in REPO_SIGNAL_FILES.items():
if (repo_root / rel_path).exists():
signals.append(f"`{rel_path}` — {description}")
return signals
def compute_phase_status(snapshot: dict[str, Any], repo_root: Path | None = None) -> dict[str, Any]:
repo_root = repo_root or Path(__file__).resolve().parents[1]
resources = snapshot.get("resources", {})
uptime_percent = float(resources.get("uptime_percent", 0.0))
uptime_days = int(resources.get("days_at_or_above_95_percent", 0))
capacity_percent = float(resources.get("capacity_utilization_percent", 0.0))
fleet_operational = bool(snapshot.get("fleet_operational", False))
missing: list[str] = []
if not fleet_operational:
missing.append("Fleet operational flag is false.")
if uptime_percent < TARGET_UPTIME_PERCENT:
missing.append(f"Uptime {uptime_percent:.1f}% / {TARGET_UPTIME_PERCENT:.1f}%")
if uptime_days < TARGET_UPTIME_DAYS:
missing.append(f"Days at or above 95% uptime: {uptime_days}/{TARGET_UPTIME_DAYS}")
if capacity_percent <= TARGET_CAPACITY_PERCENT:
missing.append(f"Capacity utilization {capacity_percent:.1f}% / >{TARGET_CAPACITY_PERCENT:.1f}%")
return {
"title": PHASE_NAME,
"current_phase": "PHASE-1 Survival",
"fleet_operational": fleet_operational,
"resources": {
"uptime_percent": uptime_percent,
"days_at_or_above_95_percent": uptime_days,
"capacity_utilization_percent": capacity_percent,
},
"current_buildings": list(snapshot.get("current_buildings", DEFAULT_BUILDINGS)),
"manual_clicks": list(snapshot.get("manual_clicks", DEFAULT_MANUAL_CLICKS)),
"notes": list(snapshot.get("notes", [])),
"repo_signals": collect_repo_signals(repo_root),
"next_phase": NEXT_PHASE_NAME,
"next_phase_ready": fleet_operational and not missing,
"missing_requirements": missing,
}
def render_markdown(status: dict[str, Any]) -> str:
resources = status["resources"]
missing = status["missing_requirements"]
ready_line = "READY" if status["next_phase_ready"] else "NOT READY"
lines = [
f"# {status['title']}",
"",
"Phase 1 is the manual-clicker stage of the fleet. The machines exist. The services exist. The human is still the automation loop.",
"",
"## Phase Definition",
"",
"- Current state: fleet exists, agents run, everything important still depends on human vigilance.",
"- Resources tracked here: Capacity, Uptime.",
f"- Next phase: {status['next_phase']}",
"",
"## Current Buildings",
"",
]
lines.extend(f"- {item}" for item in status["current_buildings"])
lines.extend([
"",
"## Current Resource Snapshot",
"",
f"- Fleet operational: {'yes' if status['fleet_operational'] else 'no'}",
f"- Uptime baseline: {resources['uptime_percent']:.1f}%",
f"- Days at or above 95% uptime: {resources['days_at_or_above_95_percent']}",
f"- Capacity utilization: {resources['capacity_utilization_percent']:.1f}%",
"",
"## Next Phase Trigger",
"",
f"To unlock {status['next_phase']}, the fleet must hold both of these conditions at once:",
f"- Uptime >= {TARGET_UPTIME_PERCENT:.0f}% for {TARGET_UPTIME_DAYS} consecutive days",
f"- Capacity utilization > {TARGET_CAPACITY_PERCENT:.0f}%",
f"- Current trigger state: {ready_line}",
"",
"## Missing Requirements",
"",
])
if missing:
lines.extend(f"- {item}" for item in missing)
else:
lines.append("- None. Phase 2 can unlock now.")
lines.extend([
"",
"## Manual Clicker Interpretation",
"",
"Paperclips analogy: Phase 1 = Manual clicker. You ARE the automation.",
"Every restart, every SSH, every check is a manual click.",
"",
"## Manual Clicks Still Required",
"",
])
lines.extend(f"- {item}" for item in status["manual_clicks"])
lines.extend([
"",
"## Repo Signals Already Present",
"",
])
if status["repo_signals"]:
lines.extend(f"- {item}" for item in status["repo_signals"])
else:
lines.append("- No survival-adjacent repo signals detected.")
if status["notes"]:
lines.extend(["", "## Notes", ""])
lines.extend(f"- {item}" for item in status["notes"])
return "\n".join(lines).rstrip() + "\n"
def main() -> None:
parser = argparse.ArgumentParser(description="Render the fleet phase-1 survival report")
parser.add_argument("--snapshot", help="Optional JSON snapshot overriding the default phase-1 baseline")
parser.add_argument("--output", help="Write markdown report to this path")
parser.add_argument("--json", action="store_true", help="Print computed status as JSON instead of markdown")
args = parser.parse_args()
snapshot = load_snapshot(Path(args.snapshot).expanduser() if args.snapshot else None)
repo_root = Path(__file__).resolve().parents[1]
status = compute_phase_status(snapshot, repo_root=repo_root)
if args.json:
rendered = json.dumps(status, indent=2)
else:
rendered = render_markdown(status)
if args.output:
output_path = Path(args.output).expanduser()
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(rendered, encoding="utf-8")
print(f"Phase status written to {output_path}")
else:
print(rendered)
if __name__ == "__main__":
main()