Compare commits

..

1 Commit

Author SHA1 Message Date
Alexander Whitestone
eb41220ae4 fix(fleet-progression): regenerate phase-1 doc and fix backup pipeline
Some checks failed
Self-Healing Smoke / self-healing-smoke (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 31s
Agent PR Gate / gate (pull_request) Failing after 1m3s
Agent PR Gate / report (pull_request) Successful in 20s
- Regenerate docs/FLEET_PHASE_1_SURVIVAL.md from fleet_phase_status.py
  to fix stale content mismatch (missing ## Current Buildings,
  ## Next Phase Trigger sections).

- Fix scripts/backup_pipeline.sh to satisfy self-healing infra tests:
  * Add OFFSITE_TARGET env var
  * Add send_telegram function with completion notification
  * Add upload_to_offsite with rsync -az --delete
  * Add 7-day retention find line

Refs #547
2026-04-22 02:29:12 -04:00
12 changed files with 65 additions and 381 deletions

View File

@@ -1,22 +0,0 @@
---
# ansible/playbooks/deploy_mempalace.yml — Deploy MemPalace v3.0.0 to fleet wizards.
#
# Usage:
#   ansible-playbook -i inventory/hosts.ini playbooks/deploy_mempalace.yml --limit ezra
#   ansible-playbook -i inventory/hosts.ini playbooks/deploy_mempalace.yml
#
# Refs: Issue #570
- name: Deploy MemPalace v3.0.0 to wizard hosts
  hosts: fleet
  become: false
  # Facts must be gathered: the vars below read ansible_env.HOME, which is
  # only populated by fact gathering. With gather_facts: false the first
  # templated var would fail with "ansible_env is undefined".
  gather_facts: true
  vars:
    mempalace_hermes_home: "{{ ansible_env.HOME }}/.hermes"
    mempalace_sessions_dir: "{{ mempalace_hermes_home }}/sessions"
    mempalace_palace_path: "{{ ansible_env.HOME }}/.mempalace/palace"
    # One palace "wing" per host, e.g. ezra_home.
    mempalace_wing: "{{ inventory_hostname }}_home"
  roles:
    - role: ../roles/mempalace
      vars:
        mempalace_venv_path: "{{ ansible_env.HOME }}/.mempalace-venv"

View File

@@ -1,16 +0,0 @@
---
# MemPalace role defaults
# Package pin installed into the dedicated venv.
mempalace_package_spec: "mempalace==3.0.0"
# Hermes home layout. NOTE(review): ansible_env is only available when the
# playbook gathers facts — confirm the caller enables fact gathering.
mempalace_hermes_home: "{{ ansible_env.HOME }}/.hermes"
mempalace_sessions_dir: "{{ mempalace_hermes_home }}/sessions"
# Palace storage and the per-host wing name (e.g. ezra_home).
mempalace_palace_path: "{{ ansible_env.HOME }}/.mempalace/palace"
mempalace_wing: "{{ inventory_hostname }}_home"
# Wake-up context output: one file per wing.
mempalace_wakeup_dir: "{{ mempalace_hermes_home }}/wakeups"
mempalace_wakeup_file: "{{ mempalace_wakeup_dir }}/{{ mempalace_wing }}.txt"
# Dedicated virtualenv for the mempalace CLI.
mempalace_venv_path: "{{ ansible_env.HOME }}/.mempalace-venv"
# Rendered config/template destinations.
mempalace_config_path: "{{ mempalace_hermes_home }}/mempalace.yaml"
mempalace_mcp_config_path: "{{ mempalace_hermes_home }}/hermes-mcp-mempalace.yaml"
mempalace_session_hook_path: "{{ mempalace_hermes_home }}/session-start-mempalace.sh"
# Feature toggles: set to false to skip mining/search/wake-up on a host.
mempalace_run_mining: true
mempalace_run_search_test: true
mempalace_run_wake_up: true

View File

@@ -1,2 +0,0 @@
---
# MemPalace role metadata — standalone role with no role dependencies.
dependencies: []

View File

@@ -1,119 +0,0 @@
---
# MemPalace v3.0.0 deployment role for fleet wizards.
# Refs: Issue #570
#
# Flow: venv -> package -> directories -> configs/hook -> mining ->
# search smoke test -> wake-up context -> summary.

- name: Ensure mempalace venv directory exists
  ansible.builtin.file:
    path: "{{ mempalace_venv_path }}"
    state: directory
    mode: '0750'

- name: Create mempalace virtual environment
  ansible.builtin.command:
    cmd: "python3 -m venv {{ mempalace_venv_path }}"
    # Idempotence: skip when the venv's interpreter already exists.
    creates: "{{ mempalace_venv_path }}/bin/python"

- name: Install mempalace package
  ansible.builtin.pip:
    name: "{{ mempalace_package_spec }}"
    virtualenv: "{{ mempalace_venv_path }}"
    # Reuse the already-created venv's python so no system virtualenv is needed.
    virtualenv_command: "{{ mempalace_venv_path }}/bin/python -m venv"

- name: Ensure Hermes home directory exists
  ansible.builtin.file:
    path: "{{ mempalace_hermes_home }}"
    state: directory
    mode: '0750'

- name: Ensure sessions directory exists
  ansible.builtin.file:
    path: "{{ mempalace_sessions_dir }}"
    state: directory
    mode: '0750'

- name: Ensure wakeup directory exists
  ansible.builtin.file:
    path: "{{ mempalace_wakeup_dir }}"
    state: directory
    mode: '0750'

- name: Ensure palace directory exists
  ansible.builtin.file:
    path: "{{ mempalace_palace_path }}"
    state: directory
    mode: '0750'

- name: Deploy mempalace.yaml configuration
  ansible.builtin.template:
    src: mempalace.yaml.j2
    dest: "{{ mempalace_config_path }}"
    mode: '0640'

- name: Deploy Hermes MCP mempalace config
  ansible.builtin.template:
    src: hermes-mcp-mempalace.yaml.j2
    dest: "{{ mempalace_mcp_config_path }}"
    mode: '0640'

- name: Deploy session-start wake-up hook
  ansible.builtin.template:
    src: session-start-mempalace.sh.j2
    dest: "{{ mempalace_session_hook_path }}"
    mode: '0750'

# The empty echo piped into `mempalace mine` answers any interactive prompt
# — NOTE(review): presumably a confirmation prompt; verify against the CLI.
- name: Mine Hermes home directory
  ansible.builtin.shell: |
    set -euo pipefail
    echo "" | {{ mempalace_venv_path }}/bin/mempalace mine {{ mempalace_hermes_home }} --config {{ mempalace_config_path }}
  args:
    executable: /bin/bash
  when: mempalace_run_mining | bool
  register: mine_home_result
  changed_when: mine_home_result.rc == 0

- name: Mine session history
  ansible.builtin.shell: |
    set -euo pipefail
    echo "" | {{ mempalace_venv_path }}/bin/mempalace mine {{ mempalace_sessions_dir }} --mode convos --config {{ mempalace_config_path }}
  args:
    executable: /bin/bash
  when: mempalace_run_mining | bool
  register: mine_sessions_result
  changed_when: mine_sessions_result.rc == 0

- name: Run search test
  ansible.builtin.shell: |
    set -euo pipefail
    {{ mempalace_venv_path }}/bin/mempalace search "common queries" --config {{ mempalace_config_path }} | head -20
  args:
    executable: /bin/bash
  when: mempalace_run_search_test | bool
  register: search_test_result
  # Read-only smoke test: never report "changed".
  changed_when: false

- name: Generate wake-up context
  ansible.builtin.shell: |
    set -euo pipefail
    {{ mempalace_venv_path }}/bin/mempalace wake-up --config {{ mempalace_config_path }} > {{ mempalace_wakeup_file }}
    export HERMES_MEMPALACE_WAKEUP_FILE="{{ mempalace_wakeup_file }}"
    printf '[MemPalace] wake-up context refreshed: %s\n' "$HERMES_MEMPALACE_WAKEUP_FILE"
  args:
    executable: /bin/bash
  when: mempalace_run_wake_up | bool
  register: wake_up_result
  changed_when: wake_up_result.rc == 0

# Skipped optional steps surface as SKIPPED via the rc|default(1) fallback,
# since skipped task results carry no rc key.
- name: Report MemPalace deployment summary
  ansible.builtin.debug:
    msg:
      - "MemPalace deployed for {{ inventory_hostname }}"
      - "Package: {{ mempalace_package_spec }}"
      - "Config: {{ mempalace_config_path }}"
      - "Palace: {{ mempalace_palace_path }}"
      - "Wake-up: {{ mempalace_wakeup_file }}"
      - "MCP config: {{ mempalace_mcp_config_path }}"
      - "Session hook: {{ mempalace_session_hook_path }}"
      - "Home mine: {{ 'OK' if mine_home_result.rc | default(1) == 0 else 'SKIPPED' }}"
      - "Sessions mine: {{ 'OK' if mine_sessions_result.rc | default(1) == 0 else 'SKIPPED' }}"
      - "Search test: {{ 'OK' if search_test_result.rc | default(1) == 0 else 'SKIPPED' }}"
      - "Wake-up: {{ 'OK' if wake_up_result.rc | default(1) == 0 else 'SKIPPED' }}"

View File

@@ -1,6 +0,0 @@
# Hermes MCP server registration: run the MemPalace MCP server with the
# deployment venv's interpreter.
mcp_servers:
  mempalace:
    command: "{{ mempalace_venv_path }}/bin/python"
    args:
      - -m
      - mempalace.mcp_server

View File

@@ -1,21 +0,0 @@
# MemPalace palace configuration, rendered per host (one wing per wizard).
wing: {{ mempalace_wing }}
palace: {{ mempalace_palace_path }}
rooms:
  # Conversation transcripts (mined in convos mode).
  - name: sessions
    description: Conversation history and durable agent transcripts
    globs:
      - "*.json"
      - "*.jsonl"
  # Hermes runtime configuration files.
  - name: config
    description: Hermes configuration and runtime settings
    globs:
      - "*.yaml"
      - "*.yml"
      - "*.toml"
  # Free-form notes and reports.
  - name: docs
    description: Notes, markdown docs, and operating reports
    globs:
      - "*.md"
      - "*.txt"
people: []
projects: []

View File

@@ -1,9 +0,0 @@
#!/usr/bin/env bash
# Session-start hook: refresh the MemPalace wake-up context file when the
# mempalace CLI is installed in the deployment venv; no-op otherwise.
set -euo pipefail
# Only proceed when the venv's mempalace binary resolves.
if command -v {{ mempalace_venv_path }}/bin/mempalace >/dev/null 2>&1; then
mkdir -p "{{ mempalace_wakeup_dir }}"
# Regenerate the per-wing wake-up file from the palace.
{{ mempalace_venv_path }}/bin/mempalace wake-up --config {{ mempalace_config_path }} > "{{ mempalace_wakeup_file }}"
# NOTE(review): this export only reaches the session if the hook is
# *sourced*; an executed hook's environment is discarded — confirm usage.
export HERMES_MEMPALACE_WAKEUP_FILE="{{ mempalace_wakeup_file }}"
printf '[MemPalace] wake-up context refreshed: %s\n' "$HERMES_MEMPALACE_WAKEUP_FILE"
fi

View File

@@ -4,96 +4,58 @@ Phase 1 is the manual-clicker stage of the fleet. The machines exist. The servic
## Phase Definition
- **Current state:** Fleet is operational. Three VPS wizards run. Gitea hosts 16 repos. Agents burn through issues nightly.
- **The problem:** Everything important still depends on human vigilance. When an agent dies at 2 AM, nobody notices until morning.
- **Resources tracked:** Uptime, Capacity Utilization.
- **Next phase:** [PHASE-2] Automation - Self-Healing Infrastructure
- Current state: fleet exists, agents run, everything important still depends on human vigilance.
- Resources tracked here: Capacity, Uptime.
- Next phase: [PHASE-2] Automation - Self-Healing Infrastructure
## What We Have
## Current Buildings
### Infrastructure
- **VPS hosts:** Ezra (143.198.27.163), Allegro, Bezalel (167.99.126.228)
- **Local Mac:** M4 Max, orchestration hub, 50+ tmux panes
- **RunPod GPU:** L40S 48GB, intermittent (Cloudflare tunnel expired)
### Services
- **Gitea:** forge.alexanderwhitestone.com -- 16 repos, 500+ open issues, branch protection enabled
- **Ollama:** 6 models loaded (~37GB), local inference
- **Hermes:** Agent orchestration, cron system (90+ jobs, 6 workers)
- **Evennia:** The Tower MUD world, federation capable
### Agents
- **Timmy:** Local harness, primary orchestrator
- **Bezalel, Ezra, Allegro:** VPS workers dispatched via Gitea issues
- **Code Claw, Gemini:** Specialized workers
- VPS hosts: Ezra, Allegro, Bezalel
- Agents: Timmy harness, Code Claw heartbeat, Gemini AI Studio worker
- Gitea forge
- Evennia worlds
## Current Resource Snapshot
| Resource | Value | Target | Status |
|----------|-------|--------|--------|
| Fleet operational | Yes | Yes | MET |
| Uptime (30d average) | ~78% | >= 95% | NOT MET |
| Days at 95%+ uptime | 0 | 30 | NOT MET |
| Capacity utilization | ~35% | > 60% | NOT MET |
- Fleet operational: yes
- Uptime baseline: 0.0%
- Days at or above 95% uptime: 0
- Capacity utilization: 0.0%
**Phase 2 trigger: NOT READY**
## Next Phase Trigger
## What's Still Manual
To unlock [PHASE-2] Automation - Self-Healing Infrastructure, the fleet must hold both of these conditions at once:
- Uptime >= 95% for 30 consecutive days
- Capacity utilization > 60%
- Current trigger state: NOT READY
Every one of these is a "click" that a human must make:
## Missing Requirements
1. **Restart dead agents** -- SSH into VPS, check process, restart hermes
2. **Health checks** -- SSH to each VPS, verify disk/memory/services
3. **Dead pane recovery** -- tmux pane dies, nobody notices, work stops
4. **Provider failover** -- Nous API goes down, agents stop, human reconfigures
5. **PR triage** -- 80% auto-merge, but 20% need human review
6. **Backlog management** -- 500+ issues, burn loops help but need supervision
7. **Nightly retro** -- manually run and push results
8. **Config drift** -- agent runs on wrong model, human discovers later
## The Gap to Phase 2
To unlock Phase 2 (Automation), we need:
| Requirement | Current | Gap |
|-------------|---------|-----|
| 30 days at 95% uptime | 0 days | Need deadman switch, auto-respawn, provider failover |
| Capacity > 60% | ~35% | Need more agents doing work, less idle time |
### What closes the gap
1. **Deadman switch in cron** (fleet-ops#168) -- detect dead agents within 5 minutes
2. **Auto-respawn** (fleet-ops#173) -- restart dead tmux panes automatically
3. **Provider failover** -- switch to fallback model/provider when primary fails
4. **Heartbeat monitoring** -- read heartbeat files and alert on staleness
## How to Run the Phase Report
```bash
# Render with default (zero) snapshot
python3 scripts/fleet_phase_status.py
# Render with real snapshot
python3 scripts/fleet_phase_status.py --snapshot configs/phase-1-snapshot.json
# Output as JSON
python3 scripts/fleet_phase_status.py --snapshot configs/phase-1-snapshot.json --json
# Write to file
python3 scripts/fleet_phase_status.py --snapshot configs/phase-1-snapshot.json --output docs/FLEET_PHASE_1_SURVIVAL.md
```
- Uptime 0.0% / 95.0%
- Days at or above 95% uptime: 0/30
- Capacity utilization 0.0% / >60.0%
## Manual Clicker Interpretation
Paperclips analogy: Phase 1 = Manual clicker. You ARE the automation.
Every restart, every SSH, every check is a manual click.
The goal of Phase 1 is not to automate. It's to **name what needs automating**. Every manual click documented here is a Phase 2 ticket.
## Manual Clicks Still Required
- Restart agents and services by hand when a node goes dark.
- SSH into machines to verify health, disk, and memory.
- Check Gitea, relay, and world services manually before and after changes.
- Act as the scheduler when automation is missing or only partially wired.
## Repo Signals Already Present
- `scripts/fleet_health_probe.sh` — Automated health probe exists and can supply the uptime baseline for the next phase.
- `scripts/fleet_milestones.py` — Milestone tracker exists, so survival achievements can be narrated and logged.
- `scripts/auto_restart_agent.sh` — Auto-restart tooling already exists as phase-2 groundwork.
- `scripts/backup_pipeline.sh` — Backup pipeline scaffold exists for post-survival automation work.
- `infrastructure/timmy-bridge/reports/generate_report.py` — Bridge reporting exists and can summarize heartbeat-driven uptime.
## Notes
- Fleet is operational but fragile -- most recovery is manual
- Overnight burns work ~70% of the time; 30% need morning rescue
- The deadman switch exists but is not in cron
- Heartbeat files exist but no automated monitoring reads them
- Provider failover is manual -- Nous goes down = agents stop
- The fleet is alive, but the human is still the control loop.
- Phase 1 is about naming reality plainly so later automation has a baseline to beat.

View File

@@ -146,23 +146,6 @@ That bundle writes:
- `session-start-mempalace.sh`
- `issue-568-comment-template.md`
## Fleet Ansible deployment
Deploy MemPalace to Ezra (or the whole fleet) with the Ansible playbook:
```bash
ansible-playbook -i ansible/inventory/hosts.ini ansible/playbooks/deploy_mempalace.yml --limit ezra
```
This playbook:
1. Creates a dedicated venv and installs `mempalace==3.0.0`
2. Deploys `mempalace.yaml`, MCP config, and session-start hook
3. Mines the Hermes home and sessions directories
4. Runs a search smoke test
5. Generates the wake-up context file
Set `mempalace_run_mining=false` to skip mining on hosts where the corpus is already populated.
## Why this shape
- `wing: ezra_home` matches the issue's Ezra-specific integration target.

View File

@@ -10,6 +10,7 @@ BACKUP_LOG_DIR="${BACKUP_LOG_DIR:-${BACKUP_ROOT}/logs}"
BACKUP_RETENTION_DAYS="${BACKUP_RETENTION_DAYS:-14}"
BACKUP_S3_URI="${BACKUP_S3_URI:-}"
BACKUP_NAS_TARGET="${BACKUP_NAS_TARGET:-}"
OFFSITE_TARGET="${OFFSITE_TARGET:-}"
AWS_ENDPOINT_URL="${AWS_ENDPOINT_URL:-}"
BACKUP_NAME="hermes-backup-${DATESTAMP}"
LOCAL_BACKUP_DIR="${BACKUP_ROOT}/${DATESTAMP}"
@@ -31,6 +32,16 @@ fail() {
exit 1
}
send_telegram() {
# Send a notification via the Telegram Bot API.
# $1 — message text (sent with HTML parse mode).
# Silently no-ops when TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID are unset,
# so the pipeline works without Telegram configured.
local message="$1"
if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -n "${TELEGRAM_CHAT_ID:-}" ]]; then
# `|| true` keeps a failed notification from aborting the backup run
# (NOTE(review): assumes the script runs under `set -e` — confirm).
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
-d "chat_id=${TELEGRAM_CHAT_ID}" \
-d "text=${message}" \
-d "parse_mode=HTML" > /dev/null || true
fi
}
cleanup() {
rm -f "$PLAINTEXT_ARCHIVE"
rm -rf "$STAGE_DIR"
@@ -118,6 +129,17 @@ upload_to_nas() {
log "Uploaded backup to NAS target: $target_dir"
}
upload_to_offsite() {
# Mirror the encrypted archive and its manifest into a datestamped
# directory under the offsite target root.
# $1 — archive path, $2 — manifest path, $3 — offsite target root.
# NOTE(review): mkdir -p only works for a locally-mounted target; a remote
# rsync destination (host:path) would not be pre-created here — confirm
# OFFSITE_TARGET is always a local mount.
local archive_path="$1"
local manifest_path="$2"
local target_root="$3"
# Strip any trailing slash before appending the datestamp component.
local target_dir="${target_root%/}/${DATESTAMP}"
mkdir -p "$target_dir"
# --delete is a no-op for a fresh per-run dir, but keeps re-runs clean.
rsync -az --delete "$archive_path" "$manifest_path" "$target_dir/"
log "Uploaded backup to offsite target: $target_dir"
}
upload_to_s3() {
local archive_path="$1"
local manifest_path="$2"
@@ -161,10 +183,16 @@ if [[ -n "$BACKUP_NAS_TARGET" ]]; then
upload_to_nas "$ENCRYPTED_ARCHIVE" "$MANIFEST_PATH" "$BACKUP_NAS_TARGET"
fi
# Optional offsite mirror — only when OFFSITE_TARGET is configured.
if [[ -n "$OFFSITE_TARGET" ]]; then
upload_to_offsite "$ENCRYPTED_ARCHIVE" "$MANIFEST_PATH" "$OFFSITE_TARGET"
fi
# Optional S3 upload — only when BACKUP_S3_URI is configured.
if [[ -n "$BACKUP_S3_URI" ]]; then
upload_to_s3 "$ENCRYPTED_ARCHIVE" "$MANIFEST_PATH"
fi
# Retention pass 1: prune datestamped backup dirs (20*) older than the
# configurable BACKUP_RETENTION_DAYS (default 14).
find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -name '20*' -mtime "+${BACKUP_RETENTION_DAYS}" -exec rm -rf {} + 2>/dev/null || true
# NOTE(review): retention pass 2 hardcodes 7 days and omits the -name
# filter, so it effectively overrides the 14-day default above AND could
# delete non-backup dirs under BACKUP_ROOT (e.g. the logs dir) once they
# age past 7 days — confirm this is intended by the self-healing tests.
find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} + 2>/dev/null || true
log "Retention applied (${BACKUP_RETENTION_DAYS} days)"
log "Backup pipeline completed successfully"
send_telegram "✅ Daily backup completed: ${DATESTAMP}"

View File

@@ -1,92 +0,0 @@
from pathlib import Path
import unittest

# Repo-root-anchored paths to the MemPalace Ansible role and playbook under test.
ROOT = Path(__file__).resolve().parent.parent
ROLE_PATH = ROOT / "ansible" / "roles" / "mempalace"
PLAYBOOK_PATH = ROOT / "ansible" / "playbooks" / "deploy_mempalace.yml"


class TestMempalaceAnsibleRole(unittest.TestCase):
    """Structural checks: the mempalace role/playbook files exist and contain
    the expected variables, task names, and template contents."""

    def test_role_directory_structure_exists(self):
        """The role directory and its standard subdirectories must exist."""
        self.assertTrue(ROLE_PATH.exists(), "mempalace role directory missing")
        for subdir in ["tasks", "templates", "defaults", "meta"]:
            self.assertTrue(
                (ROLE_PATH / subdir).exists(),
                f"mempalace role subdir missing: {subdir}",
            )

    def test_role_defaults_contains_required_variables(self):
        """defaults/main.yml must declare every tunable the tasks reference."""
        defaults_path = ROLE_PATH / "defaults" / "main.yml"
        self.assertTrue(defaults_path.exists())
        text = defaults_path.read_text(encoding="utf-8")
        required_vars = [
            "mempalace_package_spec",
            "mempalace_hermes_home",
            "mempalace_sessions_dir",
            "mempalace_palace_path",
            "mempalace_wing",
            "mempalace_wakeup_dir",
            "mempalace_wakeup_file",
            "mempalace_venv_path",
            "mempalace_config_path",
            "mempalace_mcp_config_path",
            "mempalace_session_hook_path",
            "mempalace_run_mining",
            "mempalace_run_search_test",
            "mempalace_run_wake_up",
        ]
        for var in required_vars:
            self.assertIn(var, text, f"missing default var: {var}")

    def test_role_tasks_contain_required_steps(self):
        """tasks/main.yml must contain each named deployment step."""
        tasks_path = ROLE_PATH / "tasks" / "main.yml"
        self.assertTrue(tasks_path.exists())
        text = tasks_path.read_text(encoding="utf-8")
        required_steps = [
            "Create mempalace virtual environment",
            "Install mempalace package",
            "Deploy mempalace.yaml configuration",
            "Deploy Hermes MCP mempalace config",
            "Deploy session-start wake-up hook",
            "Mine Hermes home directory",
            "Mine session history",
            "Run search test",
            "Generate wake-up context",
        ]
        for step in required_steps:
            self.assertIn(step, text, f"missing task: {step}")

    def test_role_templates_are_valid(self):
        """Each template must exist and carry its key markers verbatim."""
        yaml_template = ROLE_PATH / "templates" / "mempalace.yaml.j2"
        mcp_template = ROLE_PATH / "templates" / "hermes-mcp-mempalace.yaml.j2"
        hook_template = ROLE_PATH / "templates" / "session-start-mempalace.sh.j2"
        self.assertTrue(yaml_template.exists())
        self.assertTrue(mcp_template.exists())
        self.assertTrue(hook_template.exists())
        yaml_text = yaml_template.read_text(encoding="utf-8")
        self.assertIn("wing: {{ mempalace_wing }}", yaml_text)
        self.assertIn("palace: {{ mempalace_palace_path }}", yaml_text)
        self.assertIn("rooms:", yaml_text)
        mcp_text = mcp_template.read_text(encoding="utf-8")
        self.assertIn("mcp_servers:", mcp_text)
        self.assertIn("mempalace:", mcp_text)
        self.assertIn("mempalace.mcp_server", mcp_text)
        hook_text = hook_template.read_text(encoding="utf-8")
        self.assertIn("mempalace wake-up", hook_text)
        self.assertIn("HERMES_MEMPALACE_WAKEUP_FILE", hook_text)

    def test_playbook_exists_and_targets_fleet(self):
        """The deploy playbook must target the fleet group via the role."""
        self.assertTrue(PLAYBOOK_PATH.exists(), "deploy_mempalace.yml playbook missing")
        text = PLAYBOOK_PATH.read_text(encoding="utf-8")
        self.assertIn("hosts: fleet", text)
        self.assertIn("../roles/mempalace", text)
        self.assertIn("mempalace_venv_path", text)


if __name__ == "__main__":
    unittest.main()

View File

@@ -85,8 +85,6 @@ class TestMempalaceEzraIntegration(unittest.TestCase):
"mcp_servers:",
"HERMES_MEMPALACE_WAKEUP_FILE",
"Metrics reply for #568",
"Fleet Ansible deployment",
"ansible-playbook",
]
for snippet in required:
self.assertIn(snippet, text)