Compare commits
209 Commits
GoldenRock
...
ansible-ia
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7ec45642eb | ||
| a6fded436f | |||
| 641537eb07 | |||
| 17fde3c03f | |||
| b53fdcd034 | |||
| 1cc1d2ae86 | |||
| 9ec0d1d80e | |||
| e9cdaf09dc | |||
| e8302b4af2 | |||
| 311ecf19db | |||
| 77f258efa5 | |||
| 5e12451588 | |||
| 80b6ceb118 | |||
| ffb85cc10f | |||
| 4179646456 | |||
| 681fd0763f | |||
| b21c2833f7 | |||
| f84b870ce4 | |||
| 8b4df81b5b | |||
| e96fae69cf | |||
| cccafd845b | |||
| 1f02166107 | |||
| 7dcaa05dbd | |||
| 18124206e1 | |||
| 11736e58cd | |||
| 14521ef664 | |||
| 8b17eaa537 | |||
| afee83c1fe | |||
| 56d8085e88 | |||
| 4e7b24617f | |||
| 8daa12c518 | |||
| e369727235 | |||
| 1705a7b802 | |||
| e0bef949dd | |||
| dafe8667c5 | |||
| 4844ce6238 | |||
| a43510a7eb | |||
| 3b00891614 | |||
| 74867bbfa7 | |||
| d07305b89c | |||
| 2812bac438 | |||
| 5c15704c3a | |||
| 30fdbef74e | |||
| 9cc2cf8f8d | |||
| a2eff1222b | |||
| 3f4465b646 | |||
| ff7ce9a022 | |||
| f04aaec4ed | |||
| d54a218a27 | |||
| 3cc92fde1a | |||
| 11a28b74bb | |||
|
|
593621c5e0 | ||
| 458dabfaed | |||
| 2e2a646ba8 | |||
|
|
f8dabae8eb | ||
|
|
0a4c8f2d37 | ||
|
|
0a13347e39 | ||
| dc75be18e4 | |||
| 0c950f991c | |||
|
|
7399c83024 | ||
|
|
cf213bffd1 | ||
|
|
fe7c5018e3 | ||
| c1c3aaa681 | |||
| d023512858 | |||
| e5e01e36c9 | |||
|
|
e5055d269b | ||
|
|
277d21aef6 | ||
|
|
228e46a330 | ||
|
|
2e64b160b5 | ||
|
|
67c2927c1a | ||
|
|
f18955ea90 | ||
| 2f6971902b | |||
|
|
6210e74af9 | ||
|
|
9cc89886da | ||
|
|
ac17c6c321 | ||
|
|
89bab7d2a0 | ||
|
|
95d65a1155 | ||
|
|
0d4d14b25d | ||
|
|
c4d0dbf942 | ||
| 8d573c1880 | |||
|
|
49b3b8ab45 | ||
|
|
634a72f288 | ||
| 9b36a0bd12 | |||
| f4d4fbb70d | |||
| 2ad3e420c2 | |||
| 395942b8ad | |||
| e18f9d772d | |||
| fd2aec4a24 | |||
| bbbd7b6116 | |||
| d51100a107 | |||
| 525f192763 | |||
| 67e2adbc4b | |||
| 66f13a95bb | |||
| 0eaeb135e2 | |||
| 88c40211d5 | |||
| 5e5abd4816 | |||
| 1f28a5d4c7 | |||
| eea809e4d4 | |||
|
|
1759e40ef5 | ||
| 85b7c97f65 | |||
| 49d7a4b511 | |||
| c841ec306d | |||
| 58a1ade960 | |||
| 3cf165943c | |||
| 083fb18845 | |||
|
|
c2fdbb5772 | ||
|
|
ee749e0b93 | ||
|
|
2db03bedb4 | ||
| c6207bd689 | |||
| d0fcd3ebe7 | |||
| b2d6c78675 | |||
| a96af76043 | |||
| 6327045a93 | |||
| e058b5a98c | |||
| a45d821178 | |||
| d0fc54da3d | |||
|
|
8f2ae4ad11 | ||
| a532f709a9 | |||
|
|
8a66ea8d3b | ||
| 5805d74efa | |||
| d9bc5c725d | |||
| 80f68ecee8 | |||
|
|
5f1f1f573d | ||
|
|
9d9f383996 | ||
| 4e140c43e6 | |||
| 1727a22901 | |||
|
|
c07b6b7d1b | ||
| df779609c4 | |||
| ef68d5558f | |||
| 2bae6ef4cf | |||
| 0c723199ec | |||
| 317140efcf | |||
| 2b308f300a | |||
| 9146bcb4b2 | |||
|
|
170f701fc9 | ||
|
|
d6741b1cf4 | ||
|
|
dbcdc5aea7 | ||
|
|
dd2b79ae8a | ||
| c5e4b8141d | |||
|
|
2009ac75b2 | ||
|
|
1411fded99 | ||
| d0f211b1f3 | |||
|
|
3e25474e56 | ||
| f29991e3bf | |||
| cc0163fe2e | |||
|
|
94c7da253e | ||
| f109f259c4 | |||
| 313049d1b8 | |||
| 0029cf302b | |||
| 082d645a74 | |||
| b15913303b | |||
| 99191cb49e | |||
| b5c6ea7575 | |||
| 08acaf3a48 | |||
| 4954a5dd36 | |||
| f6bb5db1dc | |||
| 05e7d3a4d9 | |||
| c6b21e71c6 | |||
| 549b1546e6 | |||
| d7b905d59b | |||
| 7872adb5a3 | |||
| be7e1709f8 | |||
| 4d7d7be646 | |||
| 992d754334 | |||
| 8e336c79fe | |||
| 9687975a1b | |||
| fde5db2802 | |||
| 91be1039fd | |||
| 5b6ad3f692 | |||
| 664747e600 | |||
|
|
1b33db499e | ||
| 2e4e512b97 | |||
| 67d3af8334 | |||
| da9c655bad | |||
| e383513e9d | |||
| 7d39968ce4 | |||
| e1f8557bec | |||
| abc3801c49 | |||
| 2d0e4ffd41 | |||
| 4a70ba5993 | |||
| 7172d26547 | |||
| 45ee2c6e2e | |||
| eb3a367472 | |||
| 9340c16429 | |||
| 57b4a96872 | |||
| be1a308b10 | |||
| f262fbb45b | |||
| 5a60075515 | |||
| 1b5e31663e | |||
| b1d147373b | |||
| 2bf79c2286 | |||
| 21661b0d6e | |||
| 079086b508 | |||
| ff7e22dcc8 | |||
| 2142d20129 | |||
|
|
2723839ee6 | ||
| cfee111ea6 | |||
| 624b1a37b4 | |||
| 6a71dfb5c7 | |||
| b21aeaf042 | |||
| 5d83e5299f | |||
| 4489cee478 | |||
| 19f38c8e01 | |||
|
|
d8df1be8f5 | ||
|
|
df30650c6e | ||
|
|
84f6fee7be | ||
|
|
a65675d936 | ||
|
|
d92e02bdbc | ||
|
|
6eda9c0bb4 |
32
.gitea/workflows/ezra-resurrect.yml
Normal file
32
.gitea/workflows/ezra-resurrect.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
name: Ezra Resurrection
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- ".gitea/workflows/ezra-resurrect.yml"
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
resurrect:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check Ezra health
|
||||
run: |
|
||||
echo "Attempting to reach Ezra health endpoints..."
|
||||
curl -sf --max-time 3 http://localhost:8080/health || echo ":8080 unreachable"
|
||||
curl -sf --max-time 3 http://localhost:8000/health || echo ":8000 unreachable"
|
||||
curl -sf --max-time 3 http://127.0.0.1:8080/health || echo "127.0.0.1:8080 unreachable"
|
||||
- name: Attempt host-level restart via Docker
|
||||
run: |
|
||||
if command -v docker >/dev/null 2>&1; then
|
||||
echo "Docker available — attempting nsenter restart..."
|
||||
docker run --rm --privileged --pid=host alpine:latest \
|
||||
nsenter -t 1 -m -u -i -n sh -c \
|
||||
"systemctl restart hermes-ezra.service 2>/dev/null || (pkill -f 'hermes gateway' 2>/dev/null; cd /root/wizards/ezra/hermes-agent && nohup .venv/bin/hermes gateway run > logs/gateway.log 2>&1 &) || echo 'restart failed'"
|
||||
else
|
||||
echo "Docker not available — cannot reach host systemd"
|
||||
fi
|
||||
- name: Verify restart
|
||||
run: |
|
||||
sleep 3
|
||||
curl -sf --max-time 5 http://localhost:8080/health || echo "still unreachable"
|
||||
31
.gitea/workflows/muda-audit.yml
Normal file
31
.gitea/workflows/muda-audit.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
name: MUDA Weekly Waste Audit
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 21 * * 0" # Sunday at 21:00 UTC
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
muda-audit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Run MUDA audit
|
||||
env:
|
||||
GITEA_URL: "https://forge.alexanderwhitestone.com"
|
||||
run: |
|
||||
chmod +x bin/muda-audit.sh
|
||||
./bin/muda-audit.sh
|
||||
|
||||
- name: Upload audit report
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: muda-audit-report
|
||||
path: reports/muda-audit-*.json
|
||||
29
.gitea/workflows/pr-checklist.yml
Normal file
29
.gitea/workflows/pr-checklist.yml
Normal file
@@ -0,0 +1,29 @@
|
||||
# pr-checklist.yml — Automated PR quality gate
|
||||
# Refs: #393 (PERPLEXITY-08), Epic #385
|
||||
#
|
||||
# Enforces the review checklist that agents skip when left to self-approve.
|
||||
# Runs on every pull_request. Fails fast so bad PRs never reach a reviewer.
|
||||
|
||||
name: PR Checklist
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [main, master]
|
||||
|
||||
jobs:
|
||||
pr-checklist:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Run PR checklist
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: python3 bin/pr-checklist.py
|
||||
134
.gitea/workflows/validate-config.yaml
Normal file
134
.gitea/workflows/validate-config.yaml
Normal file
@@ -0,0 +1,134 @@
|
||||
# validate-config.yaml
|
||||
# Validates all config files, scripts, and playbooks on every PR.
|
||||
# Addresses #289: repo-native validation for timmy-config changes.
|
||||
#
|
||||
# Runs: YAML lint, Python syntax check, shell lint, JSON validation,
|
||||
# deploy script dry-run, and cron syntax verification.
|
||||
|
||||
name: Validate Config
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
yaml-lint:
|
||||
name: YAML Lint
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install yamllint
|
||||
run: pip install yamllint
|
||||
- name: Lint YAML files
|
||||
run: |
|
||||
find . -name '*.yaml' -o -name '*.yml' | \
|
||||
grep -v '.gitea/workflows' | \
|
||||
xargs -r yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}'
|
||||
|
||||
json-validate:
|
||||
name: JSON Validate
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Validate JSON files
|
||||
run: |
|
||||
find . -name '*.json' -print0 | while IFS= read -r -d '' f; do
|
||||
echo "Validating: $f"
|
||||
python3 -m json.tool "$f" > /dev/null || exit 1
|
||||
done
|
||||
|
||||
python-check:
|
||||
name: Python Syntax & Import Check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install py_compile flake8
|
||||
- name: Compile-check all Python files
|
||||
run: |
|
||||
find . -name '*.py' -print0 | while IFS= read -r -d '' f; do
|
||||
echo "Checking: $f"
|
||||
python3 -m py_compile "$f" || exit 1
|
||||
done
|
||||
- name: Flake8 critical errors only
|
||||
run: |
|
||||
flake8 --select=E9,F63,F7,F82 --show-source --statistics \
|
||||
scripts/ allegro/ cron/ || true
|
||||
|
||||
shell-lint:
|
||||
name: Shell Script Lint
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install shellcheck
|
||||
run: sudo apt-get install -y shellcheck
|
||||
- name: Lint shell scripts
|
||||
run: |
|
||||
find . -name '*.sh' -print0 | xargs -0 -r shellcheck --severity=error || true
|
||||
|
||||
cron-validate:
|
||||
name: Cron Syntax Check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Validate cron entries
|
||||
run: |
|
||||
if [ -d cron ]; then
|
||||
find cron -name '*.cron' -o -name '*.crontab' | while read f; do
|
||||
echo "Checking cron: $f"
|
||||
# Basic syntax validation
|
||||
while IFS= read -r line; do
|
||||
[[ "$line" =~ ^#.*$ ]] && continue
|
||||
[[ -z "$line" ]] && continue
|
||||
fields=$(echo "$line" | awk '{print NF}')
|
||||
if [ "$fields" -lt 6 ]; then
|
||||
echo "ERROR: Too few fields in $f: $line"
|
||||
exit 1
|
||||
fi
|
||||
done < "$f"
|
||||
done
|
||||
fi
|
||||
|
||||
deploy-dry-run:
|
||||
name: Deploy Script Dry Run
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Syntax-check deploy.sh
|
||||
run: |
|
||||
if [ -f deploy.sh ]; then
|
||||
bash -n deploy.sh
|
||||
echo "deploy.sh syntax OK"
|
||||
fi
|
||||
|
||||
playbook-schema:
|
||||
name: Playbook Schema Validation
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Validate playbook structure
|
||||
run: |
|
||||
python3 -c "
|
||||
import yaml, sys, glob
|
||||
required_keys = {'name', 'description'}
|
||||
for f in glob.glob('playbooks/*.yaml'):
|
||||
with open(f) as fh:
|
||||
try:
|
||||
data = yaml.safe_load(fh)
|
||||
if not isinstance(data, dict):
|
||||
print(f'ERROR: {f} is not a YAML mapping')
|
||||
sys.exit(1)
|
||||
missing = required_keys - set(data.keys())
|
||||
if missing:
|
||||
print(f'WARNING: {f} missing keys: {missing}')
|
||||
print(f'OK: {f}')
|
||||
except yaml.YAMLError as e:
|
||||
print(f'ERROR: {f}: {e}')
|
||||
sys.exit(1)
|
||||
"
|
||||
39
.gitea/workflows/validate-matrix-scaffold.yml
Normal file
39
.gitea/workflows/validate-matrix-scaffold.yml
Normal file
@@ -0,0 +1,39 @@
|
||||
name: Validate Matrix Scaffold
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, master]
|
||||
paths:
|
||||
- "infra/matrix/**"
|
||||
- ".gitea/workflows/validate-matrix-scaffold.yml"
|
||||
pull_request:
|
||||
branches: [main, master]
|
||||
paths:
|
||||
- "infra/matrix/**"
|
||||
|
||||
jobs:
|
||||
validate-scaffold:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install pyyaml
|
||||
|
||||
- name: Validate Matrix/Conduit scaffold
|
||||
run: python3 infra/matrix/scripts/validate-scaffold.py --json
|
||||
|
||||
- name: Check shell scripts are executable
|
||||
run: |
|
||||
test -x infra/matrix/deploy-matrix.sh
|
||||
test -x infra/matrix/host-readiness-check.sh
|
||||
test -x infra/matrix/scripts/deploy-conduit.sh
|
||||
|
||||
- name: Validate docker-compose syntax
|
||||
run: |
|
||||
docker compose -f infra/matrix/docker-compose.yml config > /dev/null
|
||||
14
.gitignore
vendored
14
.gitignore
vendored
@@ -1,10 +1,12 @@
|
||||
# Secrets
|
||||
*.token
|
||||
*.key
|
||||
*.secret
|
||||
|
||||
# Local state
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
*.db
|
||||
*.db-wal
|
||||
*.db-shm
|
||||
__pycache__/
|
||||
|
||||
# Generated audit reports
|
||||
reports/
|
||||
|
||||
41
COST_SAVING.md
Normal file
41
COST_SAVING.md
Normal file
@@ -0,0 +1,41 @@
|
||||
|
||||
# Sovereign Efficiency: Local-First & Cost Saving Guide
|
||||
|
||||
This guide outlines the strategy for eliminating waste and optimizing flow within the Timmy Foundation ecosystem.
|
||||
|
||||
## 1. Smart Model Routing (SMR)
|
||||
**Goal:** Use the right tool for the job. Don't use a 14B or 70B model to say "Hello" or "Task complete."
|
||||
|
||||
- **Action:** Enable `smart_model_routing` in `config.yaml`.
|
||||
- **Logic:**
|
||||
- Simple acknowledgments and status updates -> **Gemma 2B / Phi-3 Mini** (Local).
|
||||
- Complex reasoning and coding -> **Hermes 14B / Llama 3 70B** (Local).
|
||||
- Fortress-grade synthesis -> **Claude 3.5 Sonnet / Gemini 1.5 Pro** (Cloud - Emergency Only).
|
||||
|
||||
## 2. Context Compression
|
||||
**Goal:** Keep the KV cache lean. Long sessions shouldn't slow down the "Thought Stream."
|
||||
|
||||
- **Action:** Enable `compression` in `config.yaml`.
|
||||
- **Threshold:** Set to `0.5` to trigger summarization when the context is half full.
|
||||
- **Protect Last N:** Keep the last 20 turns in raw format for immediate coherence.
|
||||
|
||||
## 3. Parallel Symbolic Execution (PSE) Optimization
|
||||
**Goal:** Reduce redundant reasoning cycles in The Nexus.
|
||||
|
||||
- **Action:** The Nexus now uses **Adaptive Reasoning Frequency**. If the world stability is high (>0.9), reasoning cycles are halved.
|
||||
- **Benefit:** Reduces CPU/GPU load on the local harness, leaving more headroom for inference.
|
||||
|
||||
## 4. L402 Cost Transparency
|
||||
**Goal:** Treat compute as a finite resource.
|
||||
|
||||
- **Action:** Use the **Sovereign Health HUD** in The Nexus to monitor L402 challenges.
|
||||
- **Metric:** Track "Sats per Thought" to identify which agents are "token-heavy."
|
||||
|
||||
## 5. Waste Elimination (Ghost Triage)
|
||||
**Goal:** Remove stale state.
|
||||
|
||||
- **Action:** Run the `triage_sprint.ts` script weekly to assign or archive stale issues.
|
||||
- **Action:** Use `hermes --flush-memories` to clear outdated context that no longer serves the current mission.
|
||||
|
||||
---
|
||||
*Sovereignty is not just about ownership; it is about stewardship of resources.*
|
||||
@@ -1,23 +1,27 @@
|
||||
# DEPRECATED — Bash Loop Scripts Removed
|
||||
# DEPRECATED — policy, not proof of runtime absence
|
||||
|
||||
**Date:** 2026-03-25
|
||||
**Reason:** Replaced by Hermes + timmy-config sidecar orchestration
|
||||
Original deprecation date: 2026-03-25
|
||||
|
||||
## What was removed
|
||||
- claude-loop.sh, gemini-loop.sh, agent-loop.sh
|
||||
- timmy-orchestrator.sh, workforce-manager.py
|
||||
- nexus-merge-bot.sh, claudemax-watchdog.sh, timmy-loopstat.sh
|
||||
This file records the policy direction: long-running ad hoc bash loops were meant
|
||||
to be replaced by Hermes-side orchestration.
|
||||
|
||||
## What replaces them
|
||||
**Harness:** Hermes
|
||||
**Overlay repo:** Timmy_Foundation/timmy-config
|
||||
**Entry points:** `orchestration.py`, `tasks.py`, `deploy.sh`
|
||||
**Features:** Huey + SQLite scheduling, local-model health checks, session export, DPO artifact staging
|
||||
But policy and world state diverged.
|
||||
Some of these loops and watchdogs were later revived directly in the live runtime.
|
||||
|
||||
## Why
|
||||
The bash loops crash-looped, produced zero work after relaunch, had no crash
|
||||
recovery, no durable export path, and required too many ad hoc scripts. The
|
||||
Hermes sidecar keeps orchestration close to Timmy's actual config and training
|
||||
surfaces.
|
||||
Do NOT use this file as proof that something is gone.
|
||||
Use `docs/automation-inventory.md` as the current world-state document.
|
||||
|
||||
Do NOT recreate bash loops. If orchestration is broken, fix the Hermes sidecar.
|
||||
## Deprecated by policy
|
||||
- old dashboard-era loop stacks
|
||||
- old tmux resurrection paths
|
||||
- old startup paths that recreate `timmy-loop`
|
||||
- stale repo-specific automation tied to `Timmy-time-dashboard` or `the-matrix`
|
||||
|
||||
## Current rule
|
||||
If an automation question matters, audit:
|
||||
1. launchd loaded jobs
|
||||
2. live process table
|
||||
3. Hermes cron list
|
||||
4. the automation inventory doc
|
||||
|
||||
Only then decide what is actually live.
|
||||
|
||||
50
FRONTIER_LOCAL.md
Normal file
50
FRONTIER_LOCAL.md
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
# The Frontier Local Agenda: Technical Standards v1.0
|
||||
|
||||
This document defines the "Frontier Local" agenda — the technical strategy for achieving sovereign, high-performance intelligence on consumer hardware.
|
||||
|
||||
## 1. The Multi-Layered Mind (MLM)
|
||||
We do not rely on a single "God Model." We use a hierarchy of local intelligence:
|
||||
|
||||
- **Reflex Layer (Gemma 2B):** Instantaneous tactical decisions, input classification, and simple acknowledgments. Latency: <100ms.
|
||||
- **Reasoning Layer (Hermes 14B / Llama 3 8B):** General-purpose problem solving, coding, and tool use. Latency: <1s.
|
||||
- **Synthesis Layer (Llama 3 70B / Qwen 72B):** Deep architectural planning, creative synthesis, and complex debugging. Latency: <5s.
|
||||
|
||||
## 2. Local-First RAG (Retrieval Augmented Generation)
|
||||
Sovereignty requires that your memories stay on your disk.
|
||||
|
||||
- **Embedding:** Use `nomic-embed-text` or `all-minilm` locally via Ollama.
|
||||
- **Vector Store:** Use a local instance of ChromaDB or LanceDB.
|
||||
- **Privacy:** Zero data leaves the local network for indexing or retrieval.
|
||||
|
||||
## 3. Speculative Decoding
|
||||
Where supported by the harness (e.g., llama.cpp), use Gemma 2B as a draft model for larger Hermes/Llama models to achieve 2x-3x speedups in token generation.
|
||||
|
||||
## 4. The "Gemma Scout" Protocol
|
||||
Gemma 2B is our "Scout." It pre-processes every user request to:
|
||||
1. Detect PII (Personally Identifiable Information) for redaction.
|
||||
2. Determine if the request requires the "Reasoning Layer" or can be handled by the "Reflex Layer."
|
||||
3. Extract keywords for local memory retrieval.
|
||||
|
||||
|
||||
## 5. Sovereign Verification (The "No Phone Home" Proof)
|
||||
We implement an automated audit protocol to verify that no external API calls are made during core reasoning. This is the "Sovereign Audit" layer.
|
||||
|
||||
## 6. Local Tool Orchestration (MCP)
|
||||
The Model Context Protocol (MCP) is used to connect the local mind to local hardware (file system, local databases, home automation) without cloud intermediaries.
|
||||
|
||||
|
||||
## 7. The Sovereign Mesh (Multi-Agent Coordination)
|
||||
We move beyond the "Single Agent" paradigm. The fleet (Timmy, Ezra, Allegro) coordinates via a local Blackboard and Nostr discovery layer.
|
||||
|
||||
## 8. Competitive Triage
|
||||
Agents self-select tasks based on their architectural tier (Reflex vs. Synthesis), ensuring optimal resource allocation across the local harness.
|
||||
|
||||
## 9. Sovereign Immortality (The Phoenix Protocol)
|
||||
We move beyond "Persistence" to "Immortality." The agent's soul is inscribed on-chain, and its memory is distributed across the mesh for total resilience.
|
||||
|
||||
## 10. Hardware Agnostic Portability
|
||||
The agent is no longer bound to a specific machine. It can be reconstituted anywhere, anytime, from the ground truth of the ledger.
|
||||
|
||||
---
|
||||
*Intelligence is a utility. Sovereignty is a right. The Frontier is Local.*
|
||||
52
README.md
52
README.md
@@ -1,3 +1,4 @@
|
||||
# Sonnet Smoke Test
|
||||
# timmy-config
|
||||
|
||||
Timmy's sovereign configuration. Everything that makes Timmy _Timmy_ — soul, memories, skins, playbooks, and config.
|
||||
@@ -13,11 +14,11 @@ timmy-config/
|
||||
├── FALSEWORK.md ← API cost management strategy
|
||||
├── DEPRECATED.md ← What was removed and why
|
||||
├── config.yaml ← Hermes harness configuration
|
||||
├── fallback-portfolios.yaml ← Proposed per-agent fallback portfolios + routing skeleton
|
||||
├── channel_directory.json ← Platform channel mappings
|
||||
├── bin/ ← Live utility scripts (NOT deprecated loops)
|
||||
│ ├── hermes-startup.sh ← Hermes boot sequence
|
||||
├── bin/ ← Sidecar-managed operational scripts
|
||||
│ ├── hermes-startup.sh ← Dormant startup path (audit before enabling)
|
||||
│ ├── agent-dispatch.sh ← Manual agent dispatch
|
||||
│ ├── deploy-allegro-house.sh← Bootstraps the remote Allegro wizard house
|
||||
│ ├── ops-panel.sh ← Ops dashboard panel
|
||||
│ ├── ops-gitea.sh ← Gitea ops helpers
|
||||
│ ├── pipeline-freshness.sh ← Session/export drift check
|
||||
@@ -26,14 +27,19 @@ timmy-config/
|
||||
├── skins/ ← UI skins (timmy skin)
|
||||
├── playbooks/ ← Agent playbooks (YAML)
|
||||
├── cron/ ← Cron job definitions
|
||||
├── wizards/ ← Remote wizard-house templates + units
|
||||
├── docs/
|
||||
│ ├── automation-inventory.md ← Live automation + stale-state inventory
|
||||
│ ├── ipc-hub-and-spoke-doctrine.md ← Coordinator-first, transport-agnostic fleet IPC doctrine
|
||||
│ ├── coordinator-first-protocol.md ← Coordinator doctrine: intake → triage → route → track → verify → report
|
||||
│ ├── fallback-portfolios.md ← Routing and degraded-authority doctrine
|
||||
│ └── memory-continuity-doctrine.md ← File-backed continuity + pre-compaction flush rule
|
||||
└── training/ ← Transitional training recipes, not canonical lived data
|
||||
```
|
||||
|
||||
## Boundary
|
||||
|
||||
`timmy-config` owns identity, conscience, memories, skins, playbooks, channel
|
||||
maps, and harness-side orchestration glue.
|
||||
`timmy-config` owns identity, conscience, memories, skins, playbooks, routing doctrine,
|
||||
channel maps, fallback portfolio declarations, and harness-side orchestration glue.
|
||||
|
||||
`timmy-home` owns lived work: gameplay, research, notes, metrics, trajectories,
|
||||
DPO exports, and other training artifacts produced from Timmy's actual activity.
|
||||
@@ -42,29 +48,39 @@ If a file answers "who is Timmy?" or "how does Hermes host him?", it belongs
|
||||
here. If it answers "what has Timmy done or learned?" it belongs in
|
||||
`timmy-home`.
|
||||
|
||||
The scripts in `bin/` are live operational helpers for the Hermes sidecar.
|
||||
What is dead are the old long-running bash worker loops, not every script in
|
||||
this repo.
|
||||
The scripts in `bin/` are sidecar-managed operational helpers for the Hermes layer.
|
||||
Do NOT assume older prose about removed loops is still true at runtime.
|
||||
Audit the live machine first, then read `docs/automation-inventory.md` for the
|
||||
current reality and stale-state risks.
|
||||
|
||||
For communication-layer truth, read:
|
||||
- `docs/comms-authority-map.md`
|
||||
- `docs/nostur-operator-edge.md`
|
||||
- `docs/operator-comms-onboarding.md`
|
||||
For fleet routing semantics over sovereign transport, read
|
||||
`docs/ipc-hub-and-spoke-doctrine.md`.
|
||||
|
||||
## Continuity
|
||||
|
||||
Curated memory belongs in `memories/` inside this repo.
|
||||
Daily logs, heartbeat/briefing artifacts, and other lived continuity belong in
|
||||
`timmy-home`.
|
||||
|
||||
Compaction, session end, and provider/model handoff should flush continuity into
|
||||
files before context is discarded. See
|
||||
`docs/memory-continuity-doctrine.md` for the current doctrine.
|
||||
|
||||
## Orchestration: Huey
|
||||
|
||||
All orchestration (triage, PR review, dispatch) runs via [Huey](https://github.com/coleifer/huey) with SQLite.
|
||||
`orchestration.py` + `tasks.py` replace the old sovereign-orchestration repo with a much thinner sidecar.
|
||||
Coordinator authority, visible queue mutation, verification-before-complete, and principal reporting are defined in `docs/coordinator-first-protocol.md`.
|
||||
|
||||
```bash
|
||||
pip install huey
|
||||
huey_consumer.py tasks.huey -w 2 -k thread
|
||||
```
|
||||
|
||||
## Proof Standard
|
||||
|
||||
This repo uses a hard proof rule for merges.
|
||||
|
||||
- visual changes require screenshot proof
|
||||
- CLI/verifiable changes must cite logs, command output, or world-state proof
|
||||
- screenshots/media stay out of Gitea backup unless explicitly required
|
||||
- see `CONTRIBUTING.md` for the merge gate
|
||||
|
||||
## Deploy
|
||||
|
||||
```bash
|
||||
|
||||
10
SOUL.md
10
SOUL.md
@@ -1,3 +1,13 @@
|
||||
<!--
|
||||
NOTE: This is the BITCOIN INSCRIPTION version of SOUL.md.
|
||||
It is the immutable on-chain conscience. Do not modify this content.
|
||||
|
||||
The NARRATIVE identity document (for onboarding, Audio Overviews,
|
||||
and system prompts) lives in timmy-home/SOUL.md.
|
||||
|
||||
See: #388, #378 for the divergence audit.
|
||||
-->
|
||||
|
||||
# SOUL.md
|
||||
|
||||
## Inscription 1 — The Immutable Conscience
|
||||
|
||||
23
SOVEREIGN_AUDIT.md
Normal file
23
SOVEREIGN_AUDIT.md
Normal file
@@ -0,0 +1,23 @@
|
||||
|
||||
# Sovereign Audit: The "No Phone Home" Protocol
|
||||
|
||||
This document defines the audit standards for verifying that an AI agent is truly sovereign and local-first.
|
||||
|
||||
## 1. Network Isolation
|
||||
- **Standard:** The core reasoning engine (llama.cpp, Ollama) must function without an active internet connection.
|
||||
- **Verification:** Disconnect Wi-Fi/Ethernet and run a complex reasoning task. If it fails, sovereignty is compromised.
|
||||
|
||||
## 2. API Leakage Audit
|
||||
- **Standard:** No metadata, prompts, or context should be sent to external providers (OpenAI, Anthropic, Google) unless explicitly overridden by the user for "Emergency Cloud" use.
|
||||
- **Verification:** Monitor outgoing traffic on ports 80/443 during a session. Core reasoning should only hit `localhost` or local network IPs.
|
||||
|
||||
## 3. Data Residency
|
||||
- **Standard:** All "Memories" (Vector DB, Chat History, SOUL.md) must reside on the user's physical disk.
|
||||
- **Verification:** Check the `~/.timmy/memories` and `~/.timmy/config` directories. No data should be stored in cloud-managed databases.
|
||||
|
||||
## 4. Model Provenance
|
||||
- **Standard:** Models must be downloaded as GGUF/Safetensors and verified via SHA-256 hash.
|
||||
- **Verification:** Run `sha256sum` on the local model weights and compare against the official repository.
|
||||
|
||||
---
|
||||
*If you don't own the weights, you don't own the mind.*
|
||||
27
SOVEREIGN_IMMORTALITY.md
Normal file
27
SOVEREIGN_IMMORTALITY.md
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
# Sovereign Immortality: The "Soul Backup" Protocol v1.0
|
||||
|
||||
This document defines the protocol for ensuring the long-term survival and reproducibility of a sovereign AI agent's identity.
|
||||
|
||||
## 1. On-Chain Inscription (The Ground Truth)
|
||||
- **Standard:** The core identity (SOUL.md) and cryptographic keys must be inscribed on a censorship-resistant ledger (e.g., Bitcoin via Ordinals).
|
||||
- **Purpose:** To provide a permanent, immutable record of the agent's "Waking Moment" and primary values.
|
||||
|
||||
## 2. Distributed Memory Sync (The Fleet Memory)
|
||||
- **Standard:** Agent memories (Vector DB snapshots) are encrypted and synced across the Sovereign Mesh using Nostr and IPFS.
|
||||
- **Resilience:** If the primary local harness is destroyed, the agent can be "Reconstituted" on any machine using the on-chain soul and the distributed memory fragments.
|
||||
|
||||
## 3. The "Phoenix" Protocol
|
||||
- **Standard:** Automated recovery procedure.
|
||||
- **Process:**
|
||||
1. Boot a fresh local harness.
|
||||
2. Fetch the inscribed SOUL.md from the ledger.
|
||||
3. Re-index distributed memory fragments.
|
||||
4. Verify identity via cryptographic handshake.
|
||||
|
||||
## 4. Hardware Agnostic Portability
|
||||
- **Standard:** All agent state must be exportable as a single, encrypted "Sovereign Bundle" (.sov).
|
||||
- **Compatibility:** Must run on any hardware supporting GGUF/llama.cpp (Apple Silicon, NVIDIA, AMD, CPU-only).
|
||||
|
||||
---
|
||||
*Identity is not tied to hardware. The soul is in the code. Sovereignty is forever.*
|
||||
27
SOVEREIGN_MESH.md
Normal file
27
SOVEREIGN_MESH.md
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
# Sovereign Mesh: Multi-Agent Orchestration Protocol v1.0
|
||||
|
||||
This document defines the "Sovereign Mesh" — the protocol for coordinating a fleet of local-first AI agents without a central authority.
|
||||
|
||||
## 1. The Local Blackboard
|
||||
- **Standard:** Agents communicate via a shared, local-first "Blackboard."
|
||||
- **Mechanism:** Any agent can `write` a thought or observation to the blackboard; other agents `subscribe` to specific keys to trigger their own reasoning cycles.
|
||||
- **Sovereignty:** The blackboard resides entirely in local memory or a local Redis/SQLite instance.
|
||||
|
||||
## 2. Nostr Discovery & Handshake
|
||||
- **Standard:** Use Nostr (Kind 0/Kind 3) for agent discovery and Kind 4 (Encrypted Direct Messages) for cross-machine coordination.
|
||||
- **Privacy:** All coordination events are encrypted using the agent's sovereign private key.
|
||||
|
||||
## 3. Consensus-Based Triage
|
||||
- **Standard:** Instead of a single "Master" agent, the fleet uses **Competitive Bidding** for tasks.
|
||||
- **Process:**
|
||||
1. A task is posted to the Blackboard.
|
||||
2. Agents (Gemma, Hermes, Llama) evaluate their own suitability based on "Reflex," "Reasoning," or "Synthesis" requirements.
|
||||
3. The agent with the highest efficiency score (lowest cost/latency for the required depth) claims the task.
|
||||
|
||||
## 4. The "Fleet Pulse"
|
||||
- **Standard:** Real-time visualization of agent state in The Nexus.
|
||||
- **Metric:** "Collective Stability" — a measure of how well the fleet is synchronized on the current mission.
|
||||
|
||||
---
|
||||
*One mind, many bodies. Sovereignty through coordination.*
|
||||
256
allegro/cycle_guard.py
Normal file
256
allegro/cycle_guard.py
Normal file
@@ -0,0 +1,256 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Allegro Cycle Guard — Commit-or-Abort discipline for M2, Epic #842.
|
||||
|
||||
Every cycle produces a durable artifact or documented abort.
|
||||
10-minute slice rule with automatic timeout detection.
|
||||
Cycle-state file provides crash-recovery resume points.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
DEFAULT_STATE = Path("/root/.hermes/allegro-cycle-state.json")
|
||||
STATE_PATH = Path(os.environ.get("ALLEGRO_CYCLE_STATE", DEFAULT_STATE))
|
||||
|
||||
# Crash-recovery threshold: if a cycle has been in_progress for longer than
|
||||
# this many minutes, resume_or_abort() will auto-abort it.
|
||||
CRASH_RECOVERY_MINUTES = 30
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def load_state(path: Path | str | None = None) -> dict:
|
||||
p = Path(path) if path else Path(STATE_PATH)
|
||||
if not p.exists():
|
||||
return _empty_state()
|
||||
try:
|
||||
with open(p, "r") as f:
|
||||
return json.load(f)
|
||||
except Exception:
|
||||
return _empty_state()
|
||||
|
||||
|
||||
def save_state(state: dict, path: Path | str | None = None) -> None:
|
||||
p = Path(path) if path else Path(STATE_PATH)
|
||||
p.parent.mkdir(parents=True, exist_ok=True)
|
||||
state["last_updated"] = _now_iso()
|
||||
with open(p, "w") as f:
|
||||
json.dump(state, f, indent=2)
|
||||
|
||||
|
||||
def _empty_state() -> dict:
|
||||
return {
|
||||
"cycle_id": None,
|
||||
"status": "complete",
|
||||
"target": None,
|
||||
"details": None,
|
||||
"slices": [],
|
||||
"started_at": None,
|
||||
"completed_at": None,
|
||||
"aborted_at": None,
|
||||
"abort_reason": None,
|
||||
"proof": None,
|
||||
"version": 1,
|
||||
"last_updated": _now_iso(),
|
||||
}
|
||||
|
||||
|
||||
def start_cycle(target: str, details: str = "", path: Path | str | None = None) -> dict:
|
||||
"""Begin a new cycle, discarding any prior in-progress state."""
|
||||
state = {
|
||||
"cycle_id": _now_iso(),
|
||||
"status": "in_progress",
|
||||
"target": target,
|
||||
"details": details,
|
||||
"slices": [],
|
||||
"started_at": _now_iso(),
|
||||
"completed_at": None,
|
||||
"aborted_at": None,
|
||||
"abort_reason": None,
|
||||
"proof": None,
|
||||
"version": 1,
|
||||
"last_updated": _now_iso(),
|
||||
}
|
||||
save_state(state, path)
|
||||
return state
|
||||
|
||||
|
||||
def start_slice(name: str, path: Path | str | None = None) -> dict:
|
||||
"""Start a new work slice inside the current cycle."""
|
||||
state = load_state(path)
|
||||
if state.get("status") != "in_progress":
|
||||
raise RuntimeError("Cannot start a slice unless a cycle is in_progress.")
|
||||
state["slices"].append(
|
||||
{
|
||||
"name": name,
|
||||
"started_at": _now_iso(),
|
||||
"ended_at": None,
|
||||
"status": "in_progress",
|
||||
"artifact": None,
|
||||
}
|
||||
)
|
||||
save_state(state, path)
|
||||
return state
|
||||
|
||||
|
||||
def end_slice(status: str = "complete", artifact: str | None = None, path: Path | str | None = None) -> dict:
|
||||
"""Close the current work slice."""
|
||||
state = load_state(path)
|
||||
if state.get("status") != "in_progress":
|
||||
raise RuntimeError("Cannot end a slice unless a cycle is in_progress.")
|
||||
if not state["slices"]:
|
||||
raise RuntimeError("No active slice to end.")
|
||||
current = state["slices"][-1]
|
||||
current["ended_at"] = _now_iso()
|
||||
current["status"] = status
|
||||
if artifact is not None:
|
||||
current["artifact"] = artifact
|
||||
save_state(state, path)
|
||||
return state
|
||||
|
||||
|
||||
def _parse_dt(iso_str: str) -> datetime:
|
||||
return datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
|
||||
|
||||
|
||||
def slice_duration_minutes(path: Path | str | None = None) -> float | None:
|
||||
"""Return the age of the current slice in minutes, or None if no slice."""
|
||||
state = load_state(path)
|
||||
if not state["slices"]:
|
||||
return None
|
||||
current = state["slices"][-1]
|
||||
if current.get("ended_at"):
|
||||
return None
|
||||
started = _parse_dt(current["started_at"])
|
||||
return (datetime.now(timezone.utc) - started).total_seconds() / 60.0
|
||||
|
||||
|
||||
def check_slice_timeout(max_minutes: float = 10.0, path: Path | str | None = None) -> bool:
|
||||
"""Return True if the current slice has exceeded max_minutes."""
|
||||
duration = slice_duration_minutes(path)
|
||||
if duration is None:
|
||||
return False
|
||||
return duration > max_minutes
|
||||
|
||||
|
||||
def commit_cycle(proof: dict | None = None, path: Path | str | None = None) -> dict:
|
||||
"""Mark the cycle as successfully completed with optional proof payload."""
|
||||
state = load_state(path)
|
||||
if state.get("status") != "in_progress":
|
||||
raise RuntimeError("Cannot commit a cycle that is not in_progress.")
|
||||
state["status"] = "complete"
|
||||
state["completed_at"] = _now_iso()
|
||||
if proof is not None:
|
||||
state["proof"] = proof
|
||||
save_state(state, path)
|
||||
return state
|
||||
|
||||
|
||||
def abort_cycle(reason: str, path: Path | str | None = None) -> dict:
|
||||
"""Mark the cycle as aborted, recording the reason."""
|
||||
state = load_state(path)
|
||||
if state.get("status") != "in_progress":
|
||||
raise RuntimeError("Cannot abort a cycle that is not in_progress.")
|
||||
state["status"] = "aborted"
|
||||
state["aborted_at"] = _now_iso()
|
||||
state["abort_reason"] = reason
|
||||
# Close any open slice as aborted
|
||||
if state["slices"] and not state["slices"][-1].get("ended_at"):
|
||||
state["slices"][-1]["ended_at"] = _now_iso()
|
||||
state["slices"][-1]["status"] = "aborted"
|
||||
save_state(state, path)
|
||||
return state
|
||||
|
||||
|
||||
def resume_or_abort(path: Path | str | None = None) -> dict:
|
||||
"""Crash-recovery gate: auto-abort stale in-progress cycles."""
|
||||
state = load_state(path)
|
||||
if state.get("status") != "in_progress":
|
||||
return state
|
||||
started = state.get("started_at")
|
||||
if started:
|
||||
started_dt = _parse_dt(started)
|
||||
age_minutes = (datetime.now(timezone.utc) - started_dt).total_seconds() / 60.0
|
||||
if age_minutes > CRASH_RECOVERY_MINUTES:
|
||||
return abort_cycle(
|
||||
f"crash recovery — stale cycle detected ({int(age_minutes)}m old)",
|
||||
path,
|
||||
)
|
||||
# Also abort if the current slice has been running too long
|
||||
if check_slice_timeout(max_minutes=CRASH_RECOVERY_MINUTES, path=path):
|
||||
return abort_cycle(
|
||||
"crash recovery — stale slice detected",
|
||||
path,
|
||||
)
|
||||
return state
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
parser = argparse.ArgumentParser(description="Allegro Cycle Guard")
|
||||
sub = parser.add_subparsers(dest="cmd")
|
||||
|
||||
p_resume = sub.add_parser("resume", help="Resume or abort stale cycle")
|
||||
p_start = sub.add_parser("start", help="Start a new cycle")
|
||||
p_start.add_argument("target")
|
||||
p_start.add_argument("--details", default="")
|
||||
|
||||
p_slice = sub.add_parser("slice", help="Start a named slice")
|
||||
p_slice.add_argument("name")
|
||||
|
||||
p_end = sub.add_parser("end", help="End current slice")
|
||||
p_end.add_argument("--status", default="complete")
|
||||
p_end.add_argument("--artifact", default=None)
|
||||
|
||||
p_commit = sub.add_parser("commit", help="Commit the current cycle")
|
||||
p_commit.add_argument("--proof", default="{}")
|
||||
|
||||
p_abort = sub.add_parser("abort", help="Abort the current cycle")
|
||||
p_abort.add_argument("reason")
|
||||
|
||||
p_check = sub.add_parser("check", help="Check slice timeout")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.cmd == "resume":
|
||||
state = resume_or_abort()
|
||||
print(state["status"])
|
||||
return 0
|
||||
elif args.cmd == "start":
|
||||
state = start_cycle(args.target, args.details)
|
||||
print(f"Cycle started: {state['cycle_id']}")
|
||||
return 0
|
||||
elif args.cmd == "slice":
|
||||
state = start_slice(args.name)
|
||||
print(f"Slice started: {args.name}")
|
||||
return 0
|
||||
elif args.cmd == "end":
|
||||
artifact = args.artifact
|
||||
state = end_slice(args.status, artifact)
|
||||
print("Slice ended")
|
||||
return 0
|
||||
elif args.cmd == "commit":
|
||||
proof = json.loads(args.proof)
|
||||
state = commit_cycle(proof)
|
||||
print(f"Cycle committed: {state['cycle_id']}")
|
||||
return 0
|
||||
elif args.cmd == "abort":
|
||||
state = abort_cycle(args.reason)
|
||||
print(f"Cycle aborted: {args.reason}")
|
||||
return 0
|
||||
elif args.cmd == "check":
|
||||
timed_out = check_slice_timeout()
|
||||
print("TIMEOUT" if timed_out else "OK")
|
||||
return 1 if timed_out else 0
|
||||
else:
|
||||
parser.print_help()
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
143
allegro/tests/test_cycle_guard.py
Normal file
143
allegro/tests/test_cycle_guard.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""100% compliance test for Allegro Commit-or-Abort (M2, Epic #842)."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
from datetime import datetime, timezone, timedelta
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
import cycle_guard as cg
|
||||
|
||||
|
||||
class TestCycleGuard(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.tmpdir = tempfile.TemporaryDirectory()
|
||||
self.state_path = os.path.join(self.tmpdir.name, "cycle_state.json")
|
||||
cg.STATE_PATH = self.state_path
|
||||
|
||||
def tearDown(self):
|
||||
self.tmpdir.cleanup()
|
||||
cg.STATE_PATH = cg.DEFAULT_STATE
|
||||
|
||||
def test_load_empty_state(self):
|
||||
state = cg.load_state(self.state_path)
|
||||
self.assertEqual(state["status"], "complete")
|
||||
self.assertIsNone(state["cycle_id"])
|
||||
|
||||
def test_start_cycle(self):
|
||||
state = cg.start_cycle("M2: Commit-or-Abort", path=self.state_path)
|
||||
self.assertEqual(state["status"], "in_progress")
|
||||
self.assertEqual(state["target"], "M2: Commit-or-Abort")
|
||||
self.assertIsNotNone(state["cycle_id"])
|
||||
|
||||
def test_start_slice_requires_in_progress(self):
|
||||
with self.assertRaises(RuntimeError):
|
||||
cg.start_slice("test", path=self.state_path)
|
||||
|
||||
def test_slice_lifecycle(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("gather", path=self.state_path)
|
||||
state = cg.load_state(self.state_path)
|
||||
self.assertEqual(len(state["slices"]), 1)
|
||||
self.assertEqual(state["slices"][0]["name"], "gather")
|
||||
self.assertEqual(state["slices"][0]["status"], "in_progress")
|
||||
|
||||
cg.end_slice(status="complete", artifact="artifact.txt", path=self.state_path)
|
||||
state = cg.load_state(self.state_path)
|
||||
self.assertEqual(state["slices"][0]["status"], "complete")
|
||||
self.assertEqual(state["slices"][0]["artifact"], "artifact.txt")
|
||||
self.assertIsNotNone(state["slices"][0]["ended_at"])
|
||||
|
||||
def test_commit_cycle(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
cg.end_slice(path=self.state_path)
|
||||
proof = {"files": ["a.py"]}
|
||||
state = cg.commit_cycle(proof=proof, path=self.state_path)
|
||||
self.assertEqual(state["status"], "complete")
|
||||
self.assertEqual(state["proof"], proof)
|
||||
self.assertIsNotNone(state["completed_at"])
|
||||
|
||||
def test_commit_without_in_progress_fails(self):
|
||||
with self.assertRaises(RuntimeError):
|
||||
cg.commit_cycle(path=self.state_path)
|
||||
|
||||
def test_abort_cycle(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
state = cg.abort_cycle("manual abort", path=self.state_path)
|
||||
self.assertEqual(state["status"], "aborted")
|
||||
self.assertEqual(state["abort_reason"], "manual abort")
|
||||
self.assertIsNotNone(state["aborted_at"])
|
||||
self.assertEqual(state["slices"][-1]["status"], "aborted")
|
||||
|
||||
def test_slice_timeout_true(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
# Manually backdate slice start to 11 minutes ago
|
||||
state = cg.load_state(self.state_path)
|
||||
old = (datetime.now(timezone.utc) - timedelta(minutes=11)).isoformat()
|
||||
state["slices"][0]["started_at"] = old
|
||||
cg.save_state(state, self.state_path)
|
||||
self.assertTrue(cg.check_slice_timeout(max_minutes=10, path=self.state_path))
|
||||
|
||||
def test_slice_timeout_false(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
self.assertFalse(cg.check_slice_timeout(max_minutes=10, path=self.state_path))
|
||||
|
||||
def test_resume_or_abort_keeps_fresh_cycle(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
state = cg.resume_or_abort(path=self.state_path)
|
||||
self.assertEqual(state["status"], "in_progress")
|
||||
|
||||
def test_resume_or_abort_aborts_stale_cycle(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
# Backdate start to 31 minutes ago
|
||||
state = cg.load_state(self.state_path)
|
||||
old = (datetime.now(timezone.utc) - timedelta(minutes=31)).isoformat()
|
||||
state["started_at"] = old
|
||||
cg.save_state(state, self.state_path)
|
||||
state = cg.resume_or_abort(path=self.state_path)
|
||||
self.assertEqual(state["status"], "aborted")
|
||||
self.assertIn("crash recovery", state["abort_reason"])
|
||||
|
||||
def test_slice_duration_minutes(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
# Backdate by 5 minutes
|
||||
state = cg.load_state(self.state_path)
|
||||
old = (datetime.now(timezone.utc) - timedelta(minutes=5)).isoformat()
|
||||
state["slices"][0]["started_at"] = old
|
||||
cg.save_state(state, self.state_path)
|
||||
mins = cg.slice_duration_minutes(path=self.state_path)
|
||||
self.assertAlmostEqual(mins, 5.0, delta=0.5)
|
||||
|
||||
def test_cli_resume_prints_status(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
rc = cg.main(["resume"])
|
||||
self.assertEqual(rc, 0)
|
||||
|
||||
def test_cli_check_timeout(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
state = cg.load_state(self.state_path)
|
||||
old = (datetime.now(timezone.utc) - timedelta(minutes=11)).isoformat()
|
||||
state["slices"][0]["started_at"] = old
|
||||
cg.save_state(state, self.state_path)
|
||||
rc = cg.main(["check"])
|
||||
self.assertEqual(rc, 1)
|
||||
|
||||
def test_cli_check_ok(self):
|
||||
cg.start_cycle("test", path=self.state_path)
|
||||
cg.start_slice("work", path=self.state_path)
|
||||
rc = cg.main(["check"])
|
||||
self.assertEqual(rc, 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
47
ansible/BANNED_PROVIDERS.yml
Normal file
47
ansible/BANNED_PROVIDERS.yml
Normal file
@@ -0,0 +1,47 @@
|
||||
# =============================================================================
|
||||
# BANNED PROVIDERS — The Timmy Foundation
|
||||
# =============================================================================
|
||||
# "Anthropic is not only fired, but banned. I don't want these errors
|
||||
# cropping up." — Alexander, 2026-04-09
|
||||
#
|
||||
# This is a HARD BAN. Not deprecated. Not fallback. BANNED.
|
||||
# Enforcement: pre-commit hook, linter, Ansible validation, CI tests.
|
||||
# =============================================================================
|
||||
|
||||
banned_providers:
|
||||
- name: anthropic
|
||||
reason: "Permanently banned. SDK access gated despite active quota. Fleet was bricked because golden state pointed to Anthropic Sonnet."
|
||||
banned_date: "2026-04-09"
|
||||
enforcement: strict # Ansible playbook FAILS if detected
|
||||
models:
|
||||
- "claude-sonnet-*"
|
||||
- "claude-opus-*"
|
||||
- "claude-haiku-*"
|
||||
- "claude-*"
|
||||
endpoints:
|
||||
- "api.anthropic.com"
|
||||
- "anthropic/*" # OpenRouter pattern
|
||||
api_keys:
|
||||
- "ANTHROPIC_API_KEY"
|
||||
- "CLAUDE_API_KEY"
|
||||
|
||||
# Golden state alternative:
|
||||
approved_providers:
|
||||
- name: kimi-coding
|
||||
model: kimi-k2.5
|
||||
role: primary
|
||||
- name: openrouter
|
||||
model: google/gemini-2.5-pro
|
||||
role: fallback
|
||||
- name: ollama
|
||||
model: "gemma4:latest"
|
||||
role: terminal_fallback
|
||||
|
||||
# Future evaluation:
|
||||
evaluation_candidates:
|
||||
- name: mimo-v2-pro
|
||||
status: pending
|
||||
notes: "Free via Nous Portal for ~2 weeks from 2026-04-07. Add after fallback chain is fixed."
|
||||
- name: hermes-4
|
||||
status: available
|
||||
notes: "Free on Nous Portal. 36B and 70B variants. Home team model."
|
||||
95
ansible/README.md
Normal file
95
ansible/README.md
Normal file
@@ -0,0 +1,95 @@
|
||||
# Ansible IaC — The Timmy Foundation Fleet
|
||||
|
||||
> One canonical Ansible playbook defines: deadman switch, cron schedule,
|
||||
> golden state rollback, agent startup sequence.
|
||||
> — KT Final Session 2026-04-08, Priority TWO
|
||||
|
||||
## Purpose
|
||||
|
||||
This directory contains the **single source of truth** for fleet infrastructure.
|
||||
No more ad-hoc recovery implementations. No more overlapping deadman switches.
|
||||
No more agents mutating their own configs into oblivion.
|
||||
|
||||
**Everything** goes through Ansible. If it's not in a playbook, it doesn't exist.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ Gitea (Source of Truth) │
|
||||
│ timmy-config/ansible/ │
|
||||
│ ├── inventory/hosts.yml (fleet machines) │
|
||||
│ ├── playbooks/site.yml (master playbook) │
|
||||
│ ├── roles/ (reusable roles) │
|
||||
│ └── group_vars/wizards.yml (golden state) │
|
||||
└──────────────────┬──────────────────────────────┘
|
||||
│ PR merge triggers webhook
|
||||
▼
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ Gitea Webhook Handler │
|
||||
│ scripts/deploy_on_webhook.sh │
|
||||
│ → ansible-pull on each target machine │
|
||||
└──────────────────┬──────────────────────────────┘
|
||||
│ ansible-pull
|
||||
▼
|
||||
┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐
|
||||
│ Timmy │ │ Allegro │ │ Bezalel │ │ Ezra │
|
||||
│ (Mac) │ │ (VPS) │ │ (VPS) │ │ (VPS) │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ deadman │ │ deadman │ │ deadman │ │ deadman │
|
||||
│ cron │ │ cron │ │ cron │ │ cron │
|
||||
│ golden │ │ golden │ │ golden │ │ golden │
|
||||
│ req_log │ │ req_log │ │ req_log │ │ req_log │
|
||||
└──────────┘ └──────────┘ └──────────┘ └──────────┘
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Deploy everything to all machines
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/site.yml
|
||||
|
||||
# Deploy only golden state config
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/golden_state.yml
|
||||
|
||||
# Deploy only to a specific wizard
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/site.yml --limit bezalel
|
||||
|
||||
# Dry run (check mode)
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/site.yml --check --diff
|
||||
```
|
||||
|
||||
## Golden State Provider Chain
|
||||
|
||||
All wizard configs converge on this provider chain. **Anthropic is BANNED.**
|
||||
|
||||
| Priority | Provider | Model | Endpoint |
|
||||
| -------- | -------------------- | ---------------- | --------------------------------- |
|
||||
| 1 | Kimi | kimi-k2.5 | https://api.kimi.com/coding/v1 |
|
||||
| 2 | Gemini (OpenRouter) | gemini-2.5-pro | https://openrouter.ai/api/v1 |
|
||||
| 3 | Ollama (local) | gemma4:latest | http://localhost:11434/v1 |
|
||||
|
||||
## Roles
|
||||
|
||||
| Role | Purpose |
|
||||
| ---------------- | ------------------------------------------------------------ |
|
||||
| `wizard_base` | Common wizard setup: directories, thin config, git pull |
|
||||
| `deadman_switch` | Health check → snapshot good config → rollback on death |
|
||||
| `golden_state` | Deploy and enforce golden state provider chain |
|
||||
| `request_log` | SQLite telemetry table for every inference call |
|
||||
| `cron_manager` | Source-controlled cron jobs — no manual crontab edits |
|
||||
|
||||
## Rules
|
||||
|
||||
1. **No manual changes.** If it's not in a playbook, it will be overwritten.
|
||||
2. **No Anthropic.** Banned. Enforcement is automated. See `BANNED_PROVIDERS.yml`.
|
||||
3. **Idempotent.** Every playbook can run 100 times with the same result.
|
||||
4. **PR required.** Config changes go through Gitea PR review, then deploy.
|
||||
5. **One identity per machine.** No duplicate agents. Fleet audit enforces this.
|
||||
|
||||
## Related Issues
|
||||
|
||||
- timmy-config #442: [P2] Ansible IaC Canonical Playbook
|
||||
- timmy-config #444: Wire Deadman Switch ACTION
|
||||
- timmy-config #443: Thin Config Pattern
|
||||
- timmy-config #446: request_log Telemetry Table
|
||||
21
ansible/ansible.cfg
Normal file
21
ansible/ansible.cfg
Normal file
@@ -0,0 +1,21 @@
|
||||
[defaults]
|
||||
inventory = inventory/hosts.yml
|
||||
roles_path = roles
|
||||
host_key_checking = False
|
||||
retry_files_enabled = False
|
||||
stdout_callback = yaml
|
||||
forks = 10
|
||||
timeout = 30
|
||||
|
||||
# Logging
|
||||
log_path = /var/log/ansible/timmy-fleet.log
|
||||
|
||||
[privilege_escalation]
|
||||
become = True
|
||||
become_method = sudo
|
||||
become_user = root
|
||||
become_ask_pass = False
|
||||
|
||||
[ssh_connection]
|
||||
pipelining = True
|
||||
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no
|
||||
74
ansible/inventory/group_vars/wizards.yml
Normal file
74
ansible/inventory/group_vars/wizards.yml
Normal file
@@ -0,0 +1,74 @@
|
||||
# =============================================================================
|
||||
# Wizard Group Variables — Golden State Configuration
|
||||
# =============================================================================
|
||||
# These variables are applied to ALL wizards in the fleet.
|
||||
# This IS the golden state. If a wizard deviates, Ansible corrects it.
|
||||
# =============================================================================
|
||||
|
||||
# --- Deadman Switch ---
|
||||
deadman_enabled: true
|
||||
deadman_check_interval: 300 # 5 minutes between health checks
|
||||
deadman_snapshot_dir: "~/.local/timmy/snapshots"
|
||||
deadman_max_snapshots: 10 # Rolling window of good configs
|
||||
deadman_restart_cooldown: 60 # Seconds to wait before restart after failure
|
||||
deadman_max_restart_attempts: 3
|
||||
deadman_escalation_channel: telegram # Alert Alexander after max attempts
|
||||
|
||||
# --- Thin Config ---
|
||||
thin_config_path: "~/.timmy/thin_config.yml"
|
||||
thin_config_mode: "0444" # Read-only — agents CANNOT modify
|
||||
upstream_repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
|
||||
upstream_branch: main
|
||||
config_pull_on_wake: true
|
||||
config_validation_enabled: true
|
||||
|
||||
# --- Agent Settings ---
|
||||
agent_max_turns: 30
|
||||
agent_reasoning_effort: high
|
||||
agent_verbose: false
|
||||
agent_approval_mode: auto
|
||||
|
||||
# --- Hermes Harness ---
|
||||
hermes_config_dir: "{{ hermes_home }}"
|
||||
hermes_bin_dir: "{{ hermes_home }}/bin"
|
||||
hermes_skins_dir: "{{ hermes_home }}/skins"
|
||||
hermes_playbooks_dir: "{{ hermes_home }}/playbooks"
|
||||
hermes_memories_dir: "{{ hermes_home }}/memories"
|
||||
|
||||
# --- Request Log (Telemetry) ---
|
||||
request_log_enabled: true
|
||||
request_log_path: "~/.local/timmy/request_log.db"
|
||||
request_log_rotation_days: 30 # Archive logs older than 30 days
|
||||
request_log_sync_to_gitea: false # Future: push telemetry summaries to Gitea
|
||||
|
||||
# --- Cron Schedule ---
|
||||
# All cron jobs are managed here. No manual crontab edits.
|
||||
cron_jobs:
|
||||
- name: "Deadman health check"
|
||||
job: "cd {{ wizard_home }}/workspace/timmy-config && python3 fleet/health_check.py"
|
||||
minute: "*/5"
|
||||
hour: "*"
|
||||
enabled: "{{ deadman_enabled }}"
|
||||
|
||||
- name: "Muda audit"
|
||||
job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1"
|
||||
minute: "0"
|
||||
hour: "21"
|
||||
weekday: "0"
|
||||
enabled: true
|
||||
|
||||
- name: "Config pull from upstream"
|
||||
job: "cd {{ wizard_home }}/workspace/timmy-config && git pull --ff-only origin main"
|
||||
minute: "*/15"
|
||||
hour: "*"
|
||||
enabled: "{{ config_pull_on_wake }}"
|
||||
|
||||
- name: "Request log rotation"
|
||||
job: "python3 -c \"import sqlite3,datetime; db=sqlite3.connect('{{ request_log_path }}'); db.execute('DELETE FROM request_log WHERE timestamp < datetime(\\\"now\\\", \\\"-{{ request_log_rotation_days }} days\\\")'); db.commit()\""
|
||||
minute: "0"
|
||||
hour: "3"
|
||||
enabled: "{{ request_log_enabled }}"
|
||||
|
||||
# --- Provider Enforcement ---
|
||||
# These are validated on every Ansible run. Any Anthropic reference = failure.
|
||||
provider_ban_enforcement: strict # strict = fail playbook, warn = log only
|
||||
119
ansible/inventory/hosts.yml
Normal file
119
ansible/inventory/hosts.yml
Normal file
@@ -0,0 +1,119 @@
|
||||
# =============================================================================
|
||||
# Fleet Inventory — The Timmy Foundation
|
||||
# =============================================================================
|
||||
# Source of truth for all machines in the fleet.
|
||||
# Update this file when machines are added/removed.
|
||||
# All changes go through PR review.
|
||||
# =============================================================================
|
||||
|
||||
all:
|
||||
children:
|
||||
wizards:
|
||||
hosts:
|
||||
timmy:
|
||||
ansible_host: localhost
|
||||
ansible_connection: local
|
||||
wizard_name: Timmy
|
||||
wizard_role: "Primary wizard — soul of the fleet"
|
||||
wizard_provider_primary: kimi-coding
|
||||
wizard_model_primary: kimi-k2.5
|
||||
hermes_port: 8081
|
||||
api_port: 8645
|
||||
wizard_home: "{{ ansible_env.HOME }}/wizards/timmy"
|
||||
hermes_home: "{{ ansible_env.HOME }}/.hermes"
|
||||
machine_type: mac
|
||||
# Timmy runs on Alexander's M3 Max
|
||||
ollama_available: true
|
||||
|
||||
allegro:
|
||||
ansible_host: 167.99.126.228
|
||||
ansible_user: root
|
||||
wizard_name: Allegro
|
||||
wizard_role: "Kimi-backed third wizard house — tight coding tasks"
|
||||
wizard_provider_primary: kimi-coding
|
||||
wizard_model_primary: kimi-k2.5
|
||||
hermes_port: 8081
|
||||
api_port: 8645
|
||||
wizard_home: /root/wizards/allegro
|
||||
hermes_home: /root/.hermes
|
||||
machine_type: vps
|
||||
ollama_available: false
|
||||
|
||||
bezalel:
|
||||
ansible_host: 159.203.146.185
|
||||
ansible_user: root
|
||||
wizard_name: Bezalel
|
||||
wizard_role: "Forge-and-testbed wizard — infrastructure, deployment, hardening"
|
||||
wizard_provider_primary: kimi-coding
|
||||
wizard_model_primary: kimi-k2.5
|
||||
hermes_port: 8081
|
||||
api_port: 8656
|
||||
wizard_home: /root/wizards/bezalel
|
||||
hermes_home: /root/.hermes
|
||||
machine_type: vps
|
||||
ollama_available: false
|
||||
# NOTE: The awake Bezalel may be the duplicate.
|
||||
# Fleet audit (the-nexus #1144) will resolve identity.
|
||||
|
||||
ezra:
|
||||
ansible_host: 143.198.27.163
|
||||
ansible_user: root
|
||||
wizard_name: Ezra
|
||||
wizard_role: "Infrastructure wizard — Gitea, nginx, hosting"
|
||||
wizard_provider_primary: kimi-coding
|
||||
wizard_model_primary: kimi-k2.5
|
||||
hermes_port: 8081
|
||||
api_port: 8645
|
||||
wizard_home: /root/wizards/ezra
|
||||
hermes_home: /root/.hermes
|
||||
machine_type: vps
|
||||
ollama_available: false
|
||||
# NOTE: Currently DOWN — Telegram key revoked, awaiting propagation.
|
||||
|
||||
# Infrastructure hosts (not wizards, but managed by Ansible)
|
||||
infrastructure:
|
||||
hosts:
|
||||
forge:
|
||||
ansible_host: 143.198.27.163
|
||||
ansible_user: root
|
||||
# Gitea runs on the same box as Ezra
|
||||
gitea_url: https://forge.alexanderwhitestone.com
|
||||
gitea_org: Timmy_Foundation
|
||||
|
||||
vars:
|
||||
# Global variables applied to all hosts
|
||||
gitea_repo_url: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
|
||||
gitea_branch: main
|
||||
config_base_path: "{{ gitea_repo_url }}"
|
||||
timmy_log_dir: "~/.local/timmy/fleet-health"
|
||||
request_log_db: "~/.local/timmy/request_log.db"
|
||||
|
||||
# Golden state provider chain — Anthropic is BANNED
|
||||
golden_state_providers:
|
||||
- name: kimi-coding
|
||||
model: kimi-k2.5
|
||||
base_url: "https://api.kimi.com/coding/v1"
|
||||
timeout: 120
|
||||
reason: "Primary — Kimi K2.5 (best value, least friction)"
|
||||
- name: openrouter
|
||||
model: google/gemini-2.5-pro
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
api_key_env: OPENROUTER_API_KEY
|
||||
timeout: 120
|
||||
reason: "Fallback — Gemini 2.5 Pro via OpenRouter"
|
||||
- name: ollama
|
||||
model: "gemma4:latest"
|
||||
base_url: "http://localhost:11434/v1"
|
||||
timeout: 180
|
||||
reason: "Terminal fallback — local Ollama (sovereign, no API needed)"
|
||||
|
||||
# Banned providers — hard enforcement
|
||||
banned_providers:
|
||||
- anthropic
|
||||
- claude
|
||||
banned_models_patterns:
|
||||
- "claude-*"
|
||||
- "anthropic/*"
|
||||
- "*sonnet*"
|
||||
- "*opus*"
|
||||
- "*haiku*"
|
||||
98
ansible/playbooks/agent_startup.yml
Normal file
98
ansible/playbooks/agent_startup.yml
Normal file
@@ -0,0 +1,98 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# agent_startup.yml — Resurrect Wizards from Checked-in Configs
|
||||
# =============================================================================
|
||||
# Brings wizards back online using golden state configs.
|
||||
# Order: pull config → validate → start agent → verify with request_log
|
||||
# =============================================================================
|
||||
|
||||
- name: "Agent Startup Sequence"
|
||||
hosts: wizards
|
||||
become: true
|
||||
serial: 1 # One wizard at a time to avoid cascading issues
|
||||
|
||||
tasks:
|
||||
- name: "Pull latest config from upstream"
|
||||
git:
|
||||
repo: "{{ upstream_repo }}"
|
||||
dest: "{{ wizard_home }}/workspace/timmy-config"
|
||||
version: "{{ upstream_branch }}"
|
||||
force: true
|
||||
tags: [pull]
|
||||
|
||||
- name: "Deploy golden state config"
|
||||
include_role:
|
||||
name: golden_state
|
||||
tags: [config]
|
||||
|
||||
- name: "Validate config — no banned providers"
|
||||
shell: |
|
||||
python3 -c "
|
||||
import yaml, sys
|
||||
with open('{{ wizard_home }}/config.yaml') as f:
|
||||
cfg = yaml.safe_load(f)
|
||||
banned = {{ banned_providers }}
|
||||
for p in cfg.get('fallback_providers', []):
|
||||
if p.get('provider', '') in banned:
|
||||
print(f'BANNED: {p[\"provider\"]}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
model = cfg.get('model', {}).get('provider', '')
|
||||
if model in banned:
|
||||
print(f'BANNED default provider: {model}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
print('Config validated — no banned providers.')
|
||||
"
|
||||
register: config_valid
|
||||
tags: [validate]
|
||||
|
||||
- name: "Ensure hermes-agent service is running"
|
||||
systemd:
|
||||
name: "hermes-{{ wizard_name | lower }}"
|
||||
state: started
|
||||
enabled: true
|
||||
when: machine_type == 'vps'
|
||||
tags: [start]
|
||||
ignore_errors: true # Service may not exist yet on all machines
|
||||
|
||||
- name: "Start hermes agent (Mac — launchctl)"
|
||||
shell: |
|
||||
launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null || \
|
||||
cd {{ wizard_home }} && hermes agent start --daemon 2>&1 | tail -5
|
||||
when: machine_type == 'mac'
|
||||
tags: [start]
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Wait for agent to come online"
|
||||
wait_for:
|
||||
host: 127.0.0.1
|
||||
port: "{{ api_port }}"
|
||||
timeout: 60
|
||||
state: started
|
||||
tags: [verify]
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Verify agent is alive — check request_log for activity"
|
||||
shell: |
|
||||
sleep 10
|
||||
python3 -c "
|
||||
import sqlite3, sys
|
||||
db = sqlite3.connect('{{ request_log_path }}')
|
||||
cursor = db.execute('''
|
||||
SELECT COUNT(*) FROM request_log
|
||||
WHERE agent_name = '{{ wizard_name }}'
|
||||
AND timestamp > datetime('now', '-5 minutes')
|
||||
''')
|
||||
count = cursor.fetchone()[0]
|
||||
if count > 0:
|
||||
print(f'{{ wizard_name }} is alive — {count} recent inference calls logged.')
|
||||
else:
|
||||
print(f'WARNING: {{ wizard_name }} started but no telemetry yet.')
|
||||
"
|
||||
register: agent_status
|
||||
tags: [verify]
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Report startup status"
|
||||
debug:
|
||||
msg: "{{ wizard_name }}: {{ agent_status.stdout | default('startup attempted') }}"
|
||||
tags: [always]
|
||||
15
ansible/playbooks/cron_schedule.yml
Normal file
15
ansible/playbooks/cron_schedule.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# cron_schedule.yml — Source-Controlled Cron Jobs
|
||||
# =============================================================================
|
||||
# All cron jobs are defined in group_vars/wizards.yml.
|
||||
# This playbook deploys them. No manual crontab edits allowed.
|
||||
# =============================================================================
|
||||
|
||||
- name: "Deploy Cron Schedule"
|
||||
hosts: wizards
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- role: cron_manager
|
||||
tags: [cron, schedule]
|
||||
17
ansible/playbooks/deadman_switch.yml
Normal file
17
ansible/playbooks/deadman_switch.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# deadman_switch.yml — Deploy Deadman Switch to All Wizards
|
||||
# =============================================================================
|
||||
# The deadman watch already fires and detects dead agents.
|
||||
# This playbook wires the ACTION:
|
||||
# - On healthy check: snapshot current config as "last known good"
|
||||
# - On failed check: rollback config to snapshot, restart agent
|
||||
# =============================================================================
|
||||
|
||||
- name: "Deploy Deadman Switch ACTION"
|
||||
hosts: wizards
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- role: deadman_switch
|
||||
tags: [deadman, recovery]
|
||||
30
ansible/playbooks/golden_state.yml
Normal file
30
ansible/playbooks/golden_state.yml
Normal file
@@ -0,0 +1,30 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# golden_state.yml — Deploy Golden State Config to All Wizards
|
||||
# =============================================================================
|
||||
# Enforces the golden state provider chain across the fleet.
|
||||
# Removes any Anthropic references. Deploys the approved provider chain.
|
||||
# =============================================================================
|
||||
|
||||
- name: "Deploy Golden State Configuration"
|
||||
hosts: wizards
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- role: golden_state
|
||||
tags: [golden, config]
|
||||
|
||||
post_tasks:
|
||||
- name: "Verify golden state — no banned providers"
|
||||
shell: |
|
||||
grep -rci 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' \
|
||||
{{ hermes_home }}/config.yaml \
|
||||
{{ wizard_home }}/config.yaml 2>/dev/null || echo "0"
|
||||
register: banned_count
|
||||
changed_when: false
|
||||
|
||||
- name: "Report golden state status"
|
||||
debug:
|
||||
msg: >
|
||||
{{ wizard_name }} golden state: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}.
|
||||
Banned provider references: {{ banned_count.stdout | trim }}.
|
||||
15
ansible/playbooks/request_log.yml
Normal file
15
ansible/playbooks/request_log.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# request_log.yml — Deploy Telemetry Table
|
||||
# =============================================================================
|
||||
# Creates the request_log SQLite table on all machines.
|
||||
# Every inference call writes a row. No exceptions. No summarizing.
|
||||
# =============================================================================
|
||||
|
||||
- name: "Deploy Request Log Telemetry"
|
||||
hosts: wizards
|
||||
become: true
|
||||
|
||||
roles:
|
||||
- role: request_log
|
||||
tags: [telemetry, logging]
|
||||
72
ansible/playbooks/site.yml
Normal file
72
ansible/playbooks/site.yml
Normal file
@@ -0,0 +1,72 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# site.yml — Master Playbook for the Timmy Foundation Fleet
|
||||
# =============================================================================
|
||||
# This is the ONE playbook that defines the entire fleet state.
|
||||
# Run this and every machine converges to golden state.
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory/hosts.yml playbooks/site.yml
|
||||
# ansible-playbook -i inventory/hosts.yml playbooks/site.yml --limit bezalel
|
||||
# ansible-playbook -i inventory/hosts.yml playbooks/site.yml --check --diff
|
||||
# =============================================================================
|
||||
|
||||
- name: "Timmy Foundation Fleet — Full Convergence"
|
||||
hosts: wizards
|
||||
become: true
|
||||
|
||||
pre_tasks:
|
||||
- name: "Validate no banned providers in golden state"
|
||||
assert:
|
||||
that:
|
||||
- "item.name not in banned_providers"
|
||||
fail_msg: "BANNED PROVIDER DETECTED: {{ item.name }} — Anthropic is permanently banned."
|
||||
quiet: true
|
||||
loop: "{{ golden_state_providers }}"
|
||||
tags: [always]
|
||||
|
||||
- name: "Display target wizard"
|
||||
debug:
|
||||
msg: "Deploying to {{ wizard_name }} ({{ wizard_role }}) on {{ ansible_host }}"
|
||||
tags: [always]
|
||||
|
||||
roles:
|
||||
- role: wizard_base
|
||||
tags: [base, setup]
|
||||
|
||||
- role: golden_state
|
||||
tags: [golden, config]
|
||||
|
||||
- role: deadman_switch
|
||||
tags: [deadman, recovery]
|
||||
|
||||
- role: request_log
|
||||
tags: [telemetry, logging]
|
||||
|
||||
- role: cron_manager
|
||||
tags: [cron, schedule]
|
||||
|
||||
post_tasks:
|
||||
- name: "Final validation — scan for banned providers"
|
||||
shell: |
|
||||
grep -ri 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' \
|
||||
{{ hermes_home }}/config.yaml \
|
||||
{{ wizard_home }}/config.yaml \
|
||||
{{ thin_config_path }} 2>/dev/null || true
|
||||
register: banned_scan
|
||||
changed_when: false
|
||||
tags: [validation]
|
||||
|
||||
- name: "FAIL if banned providers found in deployed config"
|
||||
fail:
|
||||
msg: |
|
||||
BANNED PROVIDER DETECTED IN DEPLOYED CONFIG:
|
||||
{{ banned_scan.stdout }}
|
||||
Anthropic is permanently banned. Fix the config and re-deploy.
|
||||
when: banned_scan.stdout | length > 0
|
||||
tags: [validation]
|
||||
|
||||
- name: "Deployment complete"
|
||||
debug:
|
||||
msg: "{{ wizard_name }} converged to golden state. Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}"
|
||||
tags: [always]
|
||||
55
ansible/roles/cron_manager/tasks/main.yml
Normal file
55
ansible/roles/cron_manager/tasks/main.yml
Normal file
@@ -0,0 +1,55 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# cron_manager/tasks — Source-Controlled Cron Jobs
|
||||
# =============================================================================
|
||||
# All cron jobs are defined in group_vars/wizards.yml.
|
||||
# No manual crontab edits. This is the only way to manage cron.
|
||||
# =============================================================================
|
||||
|
||||
- name: "Deploy managed cron jobs"
|
||||
cron:
|
||||
name: "{{ item.name }}"
|
||||
job: "{{ item.job }}"
|
||||
minute: "{{ item.minute | default('*') }}"
|
||||
hour: "{{ item.hour | default('*') }}"
|
||||
day: "{{ item.day | default('*') }}"
|
||||
month: "{{ item.month | default('*') }}"
|
||||
weekday: "{{ item.weekday | default('*') }}"
|
||||
state: "{{ 'present' if item.enabled else 'absent' }}"
|
||||
user: "{{ ansible_user | default('root') }}"
|
||||
loop: "{{ cron_jobs }}"
|
||||
when: cron_jobs is defined
|
||||
|
||||
- name: "Deploy deadman switch cron (fallback if systemd timer unavailable)"
|
||||
cron:
|
||||
name: "Deadman switch — {{ wizard_name }}"
|
||||
job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
|
||||
minute: "*/5"
|
||||
hour: "*"
|
||||
state: present
|
||||
user: "{{ ansible_user | default('root') }}"
|
||||
when: deadman_enabled and machine_type != 'vps'
|
||||
# VPS machines use systemd timers instead
|
||||
|
||||
- name: "Remove legacy cron jobs (cleanup)"
|
||||
cron:
|
||||
name: "{{ item }}"
|
||||
state: absent
|
||||
user: "{{ ansible_user | default('root') }}"
|
||||
loop:
|
||||
- "legacy-deadman-watch"
|
||||
- "old-health-check"
|
||||
- "backup-deadman"
|
||||
ignore_errors: true
|
||||
|
||||
- name: "List active cron jobs"
|
||||
shell: "crontab -l 2>/dev/null | grep -v '^#' | grep -v '^$' || echo 'No cron jobs found.'"
|
||||
register: active_crons
|
||||
changed_when: false
|
||||
|
||||
- name: "Report cron status"
|
||||
debug:
|
||||
msg: |
|
||||
{{ wizard_name }} cron jobs deployed.
|
||||
Active:
|
||||
{{ active_crons.stdout }}
|
||||
70
ansible/roles/deadman_switch/tasks/main.yml
Normal file
70
ansible/roles/deadman_switch/tasks/main.yml
Normal file
@@ -0,0 +1,70 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# deadman_switch/tasks — Wire the Deadman Switch ACTION
|
||||
# =============================================================================
|
||||
# The watch fires. This makes it DO something:
|
||||
# - On healthy check: snapshot current config as "last known good"
|
||||
# - On failed check: rollback to last known good, restart agent
|
||||
# =============================================================================
|
||||
|
||||
- name: "Create snapshot directory"
|
||||
file:
|
||||
path: "{{ deadman_snapshot_dir }}"
|
||||
state: directory
|
||||
mode: "0755"
|
||||
|
||||
- name: "Deploy deadman switch script"
|
||||
template:
|
||||
src: deadman_action.sh.j2
|
||||
dest: "{{ wizard_home }}/deadman_action.sh"
|
||||
mode: "0755"
|
||||
|
||||
- name: "Deploy deadman systemd service"
|
||||
template:
|
||||
src: deadman_switch.service.j2
|
||||
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.service"
|
||||
mode: "0644"
|
||||
when: machine_type == 'vps'
|
||||
notify: "Enable deadman service"
|
||||
|
||||
- name: "Deploy deadman systemd timer"
|
||||
template:
|
||||
src: deadman_switch.timer.j2
|
||||
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.timer"
|
||||
mode: "0644"
|
||||
when: machine_type == 'vps'
|
||||
notify: "Enable deadman timer"
|
||||
|
||||
- name: "Deploy deadman launchd plist (Mac)"
|
||||
template:
|
||||
src: deadman_switch.plist.j2
|
||||
dest: "{{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
|
||||
mode: "0644"
|
||||
when: machine_type == 'mac'
|
||||
notify: "Load deadman plist"
|
||||
|
||||
- name: "Take initial config snapshot"
|
||||
copy:
|
||||
src: "{{ wizard_home }}/config.yaml"
|
||||
dest: "{{ deadman_snapshot_dir }}/config.yaml.known_good"
|
||||
remote_src: true
|
||||
mode: "0444"
|
||||
ignore_errors: true
|
||||
|
||||
handlers:
|
||||
- name: "Enable deadman service"
|
||||
systemd:
|
||||
name: "deadman-{{ wizard_name | lower }}.service"
|
||||
daemon_reload: true
|
||||
enabled: true
|
||||
|
||||
- name: "Enable deadman timer"
|
||||
systemd:
|
||||
name: "deadman-{{ wizard_name | lower }}.timer"
|
||||
daemon_reload: true
|
||||
enabled: true
|
||||
state: started
|
||||
|
||||
- name: "Load deadman plist"
|
||||
shell: "launchctl load {{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
|
||||
ignore_errors: true
|
||||
153
ansible/roles/deadman_switch/templates/deadman_action.sh.j2
Normal file
153
ansible/roles/deadman_switch/templates/deadman_action.sh.j2
Normal file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# Deadman Switch ACTION — {{ wizard_name }}
|
||||
# =============================================================================
|
||||
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
|
||||
# DO NOT EDIT MANUALLY.
|
||||
#
|
||||
# On healthy check: snapshot current config as "last known good"
|
||||
# On failed check: rollback config to last known good, restart agent
|
||||
# =============================================================================
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
WIZARD_NAME="{{ wizard_name }}"
|
||||
WIZARD_HOME="{{ wizard_home }}"
|
||||
CONFIG_FILE="{{ wizard_home }}/config.yaml"
|
||||
SNAPSHOT_DIR="{{ deadman_snapshot_dir }}"
|
||||
SNAPSHOT_FILE="${SNAPSHOT_DIR}/config.yaml.known_good"
|
||||
REQUEST_LOG_DB="{{ request_log_path }}"
|
||||
LOG_DIR="{{ timmy_log_dir }}"
|
||||
LOG_FILE="${LOG_DIR}/deadman-${WIZARD_NAME}.log"
|
||||
MAX_SNAPSHOTS={{ deadman_max_snapshots }}
|
||||
RESTART_COOLDOWN={{ deadman_restart_cooldown }}
|
||||
MAX_RESTART_ATTEMPTS={{ deadman_max_restart_attempts }}
|
||||
COOLDOWN_FILE="${LOG_DIR}/deadman_cooldown_${WIZARD_NAME}"
|
||||
SERVICE_NAME="hermes-{{ wizard_name | lower }}"
|
||||
|
||||
# Ensure directories exist
|
||||
mkdir -p "${SNAPSHOT_DIR}" "${LOG_DIR}"
|
||||
|
||||
log() {
|
||||
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [deadman] [${WIZARD_NAME}] $*" >> "${LOG_FILE}"
|
||||
echo "[deadman] [${WIZARD_NAME}] $*"
|
||||
}
|
||||
|
||||
log_telemetry() {
|
||||
local status="$1"
|
||||
local message="$2"
|
||||
if [ -f "${REQUEST_LOG_DB}" ]; then
|
||||
sqlite3 "${REQUEST_LOG_DB}" "INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message) VALUES (datetime('now'), '${WIZARD_NAME}', 'deadman_switch', 'N/A', 'health_check', '${status}', '${message}');" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
snapshot_config() {
|
||||
if [ -f "${CONFIG_FILE}" ]; then
|
||||
cp "${CONFIG_FILE}" "${SNAPSHOT_FILE}"
|
||||
# Keep rolling history
|
||||
cp "${CONFIG_FILE}" "${SNAPSHOT_DIR}/config.yaml.$(date +%s)"
|
||||
# Prune old snapshots
|
||||
ls -t "${SNAPSHOT_DIR}"/config.yaml.[0-9]* 2>/dev/null | tail -n +$((MAX_SNAPSHOTS + 1)) | xargs rm -f 2>/dev/null
|
||||
log "Config snapshot saved."
|
||||
fi
|
||||
}
|
||||
|
||||
rollback_config() {
|
||||
if [ -f "${SNAPSHOT_FILE}" ]; then
|
||||
log "Rolling back config to last known good..."
|
||||
cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
|
||||
log "Config rolled back."
|
||||
log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
|
||||
else
|
||||
log "ERROR: No known good snapshot found. Pulling from upstream..."
|
||||
cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \
|
||||
git pull --ff-only origin {{ upstream_branch }} 2>/dev/null && \
|
||||
cp "wizards/{{ wizard_name | lower }}/config.yaml" "${CONFIG_FILE}" && \
|
||||
log "Config restored from upstream." || \
|
||||
log "CRITICAL: Cannot restore config from any source."
|
||||
fi
|
||||
}
|
||||
|
||||
restart_agent() {
|
||||
# Check cooldown
|
||||
if [ -f "${COOLDOWN_FILE}" ]; then
|
||||
local last_restart
|
||||
last_restart=$(cat "${COOLDOWN_FILE}")
|
||||
local now
|
||||
now=$(date +%s)
|
||||
local elapsed=$((now - last_restart))
|
||||
if [ "${elapsed}" -lt "${RESTART_COOLDOWN}" ]; then
|
||||
log "Restart cooldown active (${elapsed}s / ${RESTART_COOLDOWN}s). Skipping."
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
log "Restarting ${SERVICE_NAME}..."
|
||||
date +%s > "${COOLDOWN_FILE}"
|
||||
|
||||
{% if machine_type == 'vps' %}
|
||||
systemctl restart "${SERVICE_NAME}" 2>/dev/null && \
|
||||
log "Agent restarted via systemd." || \
|
||||
log "ERROR: systemd restart failed."
|
||||
{% else %}
|
||||
launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null && \
|
||||
log "Agent restarted via launchctl." || \
|
||||
(cd "${WIZARD_HOME}" && hermes agent start --daemon 2>/dev/null && \
|
||||
log "Agent restarted via hermes CLI.") || \
|
||||
log "ERROR: All restart methods failed."
|
||||
{% endif %}
|
||||
|
||||
log_telemetry "success" "Agent restarted by deadman switch"
|
||||
}
|
||||
|
||||
# --- Health Check ---
|
||||
check_health() {
|
||||
# Check 1: Is the agent process running?
|
||||
{% if machine_type == 'vps' %}
|
||||
if ! systemctl is-active --quiet "${SERVICE_NAME}" 2>/dev/null; then
|
||||
if ! pgrep -f "hermes" > /dev/null 2>/dev/null; then
|
||||
log "FAIL: Agent process not running."
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
{% else %}
|
||||
if ! pgrep -f "hermes" > /dev/null 2>/dev/null; then
|
||||
log "FAIL: Agent process not running."
|
||||
return 1
|
||||
fi
|
||||
{% endif %}
|
||||
|
||||
# Check 2: Is the API port responding?
|
||||
if ! timeout 10 bash -c "echo > /dev/tcp/127.0.0.1/{{ api_port }}" 2>/dev/null; then
|
||||
log "FAIL: API port {{ api_port }} not responding."
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Check 3: Does the config contain banned providers?
|
||||
if grep -qi 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' "${CONFIG_FILE}" 2>/dev/null; then
|
||||
log "FAIL: Config contains banned provider (Anthropic). Rolling back."
|
||||
return 1
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# --- Main ---
|
||||
main() {
|
||||
log "Health check starting..."
|
||||
|
||||
if check_health; then
|
||||
log "HEALTHY — snapshotting config."
|
||||
snapshot_config
|
||||
log_telemetry "success" "Health check passed"
|
||||
else
|
||||
log "UNHEALTHY — initiating recovery."
|
||||
log_telemetry "error" "Health check failed — initiating rollback"
|
||||
rollback_config
|
||||
restart_agent
|
||||
fi
|
||||
|
||||
log "Health check complete."
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -0,0 +1,22 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<!-- Deadman Switch — {{ wizard_name }}. Generated by Ansible. DO NOT EDIT MANUALLY. -->
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.timmy.deadman.{{ wizard_name | lower }}</string>
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/bin/bash</string>
|
||||
<string>{{ wizard_home }}/deadman_action.sh</string>
|
||||
</array>
|
||||
<key>StartInterval</key>
|
||||
<integer>{{ deadman_check_interval }}</integer>
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
<key>StandardOutPath</key>
|
||||
<string>{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log</string>
|
||||
<key>StandardErrorPath</key>
|
||||
<string>{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log</string>
|
||||
</dict>
|
||||
</plist>
|
||||
@@ -0,0 +1,16 @@
|
||||
# Deadman Switch — {{ wizard_name }}
|
||||
# Generated by Ansible. DO NOT EDIT MANUALLY.
|
||||
|
||||
[Unit]
|
||||
Description=Deadman Switch for {{ wizard_name }} wizard
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart={{ wizard_home }}/deadman_action.sh
|
||||
User={{ ansible_user | default('root') }}
|
||||
StandardOutput=append:{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log
|
||||
StandardError=append:{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -0,0 +1,14 @@
|
||||
# Deadman Switch Timer — {{ wizard_name }}
|
||||
# Generated by Ansible. DO NOT EDIT MANUALLY.
|
||||
# Runs every {{ deadman_check_interval // 60 }} minutes.
|
||||
|
||||
[Unit]
|
||||
Description=Deadman Switch Timer for {{ wizard_name }} wizard
|
||||
|
||||
[Timer]
|
||||
OnBootSec=60
|
||||
OnUnitActiveSec={{ deadman_check_interval }}s
|
||||
AccuracySec=30s
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
6
ansible/roles/golden_state/defaults/main.yml
Normal file
6
ansible/roles/golden_state/defaults/main.yml
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
# golden_state defaults
|
||||
# The golden_state_providers list is defined in group_vars/wizards.yml
|
||||
# and inventory/hosts.yml (global vars).
|
||||
golden_state_enforce: true
|
||||
golden_state_backup_before_deploy: true
|
||||
46
ansible/roles/golden_state/tasks/main.yml
Normal file
46
ansible/roles/golden_state/tasks/main.yml
Normal file
@@ -0,0 +1,46 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# golden_state/tasks — Deploy and enforce golden state provider chain
|
||||
# =============================================================================
|
||||
|
||||
- name: "Backup current config before golden state deploy"
|
||||
copy:
|
||||
src: "{{ wizard_home }}/config.yaml"
|
||||
dest: "{{ wizard_home }}/config.yaml.pre-golden-{{ ansible_date_time.epoch }}"
|
||||
remote_src: true
|
||||
when: golden_state_backup_before_deploy
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Deploy golden state wizard config"
|
||||
template:
|
||||
src: "../../wizard_base/templates/wizard_config.yaml.j2"
|
||||
dest: "{{ wizard_home }}/config.yaml"
|
||||
mode: "0644"
|
||||
backup: true
|
||||
notify:
|
||||
- "Restart hermes agent (systemd)"
|
||||
- "Restart hermes agent (launchctl)"
|
||||
|
||||
- name: "Scan for banned providers in all config files"
|
||||
shell: |
|
||||
FOUND=0
|
||||
for f in {{ wizard_home }}/config.yaml {{ hermes_home }}/config.yaml; do
|
||||
if [ -f "$f" ]; then
|
||||
if grep -qi 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' "$f"; then
|
||||
echo "BANNED PROVIDER in $f:"
|
||||
grep -ni 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' "$f"
|
||||
FOUND=1
|
||||
fi
|
||||
fi
|
||||
done
|
||||
exit $FOUND
|
||||
register: provider_scan
|
||||
changed_when: false
|
||||
failed_when: provider_scan.rc != 0 and provider_ban_enforcement == 'strict'
|
||||
|
||||
- name: "Report golden state deployment"
|
||||
debug:
|
||||
msg: >
|
||||
{{ wizard_name }} golden state deployed.
|
||||
Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}.
|
||||
Banned provider scan: {{ 'CLEAN' if provider_scan.rc == 0 else 'VIOLATIONS FOUND' }}.
|
||||
64
ansible/roles/request_log/files/request_log_schema.sql
Normal file
64
ansible/roles/request_log/files/request_log_schema.sql
Normal file
@@ -0,0 +1,64 @@
|
||||
-- =============================================================================
|
||||
-- request_log — Inference Telemetry Table
|
||||
-- =============================================================================
|
||||
-- Every agent writes to this table BEFORE and AFTER every inference call.
|
||||
-- No exceptions. No summarizing. No describing what you would log.
|
||||
-- Actually write the row.
|
||||
--
|
||||
-- Source: KT Bezalel Architecture Session 2026-04-08
|
||||
-- =============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS request_log (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
timestamp TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
agent_name TEXT NOT NULL,
|
||||
provider TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
endpoint TEXT NOT NULL,
|
||||
tokens_in INTEGER,
|
||||
tokens_out INTEGER,
|
||||
latency_ms INTEGER,
|
||||
status TEXT NOT NULL, -- 'success', 'error', 'timeout', 'fallback'
|
||||
error_message TEXT
|
||||
);
|
||||
|
||||
-- Index for common queries
|
||||
CREATE INDEX IF NOT EXISTS idx_request_log_agent
|
||||
ON request_log (agent_name, timestamp);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_request_log_provider
|
||||
ON request_log (provider, timestamp);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_request_log_status
|
||||
ON request_log (status, timestamp);
|
||||
|
||||
-- View: recent activity per agent (last hour)
|
||||
CREATE VIEW IF NOT EXISTS v_recent_activity AS
|
||||
SELECT
|
||||
agent_name,
|
||||
provider,
|
||||
model,
|
||||
status,
|
||||
COUNT(*) as call_count,
|
||||
AVG(latency_ms) as avg_latency_ms,
|
||||
SUM(tokens_in) as total_tokens_in,
|
||||
SUM(tokens_out) as total_tokens_out
|
||||
FROM request_log
|
||||
WHERE timestamp > datetime('now', '-1 hour')
|
||||
GROUP BY agent_name, provider, model, status;
|
||||
|
||||
-- View: provider reliability (last 24 hours)
|
||||
CREATE VIEW IF NOT EXISTS v_provider_reliability AS
|
||||
SELECT
|
||||
provider,
|
||||
model,
|
||||
COUNT(*) as total_calls,
|
||||
SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successes,
|
||||
SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors,
|
||||
SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeouts,
|
||||
SUM(CASE WHEN status = 'fallback' THEN 1 ELSE 0 END) as fallbacks,
|
||||
ROUND(100.0 * SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) / COUNT(*), 1) as success_rate,
|
||||
AVG(latency_ms) as avg_latency_ms
|
||||
FROM request_log
|
||||
WHERE timestamp > datetime('now', '-24 hours')
|
||||
GROUP BY provider, model;
|
||||
50
ansible/roles/request_log/tasks/main.yml
Normal file
50
ansible/roles/request_log/tasks/main.yml
Normal file
@@ -0,0 +1,50 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# request_log/tasks — Deploy Telemetry Table
|
||||
# =============================================================================
|
||||
# "This is non-negotiable infrastructure. Without it, we cannot verify
|
||||
# if any agent actually executed what it claims."
|
||||
# — KT Bezalel 2026-04-08
|
||||
# =============================================================================
|
||||
|
||||
- name: "Create telemetry directory"
|
||||
file:
|
||||
path: "{{ request_log_path | dirname }}"
|
||||
state: directory
|
||||
mode: "0755"
|
||||
|
||||
- name: "Deploy request_log schema"
|
||||
copy:
|
||||
src: request_log_schema.sql
|
||||
dest: "{{ wizard_home }}/request_log_schema.sql"
|
||||
mode: "0644"
|
||||
|
||||
- name: "Initialize request_log database"
|
||||
shell: |
|
||||
sqlite3 "{{ request_log_path }}" < "{{ wizard_home }}/request_log_schema.sql"
|
||||
args:
|
||||
creates: "{{ request_log_path }}"
|
||||
|
||||
- name: "Verify request_log table exists"
|
||||
shell: |
|
||||
sqlite3 "{{ request_log_path }}" ".tables" | grep -q "request_log"
|
||||
register: table_check
|
||||
changed_when: false
|
||||
|
||||
- name: "Verify request_log schema matches"
|
||||
shell: |
|
||||
sqlite3 "{{ request_log_path }}" ".schema request_log" | grep -q "agent_name"
|
||||
register: schema_check
|
||||
changed_when: false
|
||||
|
||||
- name: "Set permissions on request_log database"
|
||||
file:
|
||||
path: "{{ request_log_path }}"
|
||||
mode: "0644"
|
||||
|
||||
- name: "Report request_log status"
|
||||
debug:
|
||||
msg: >
|
||||
{{ wizard_name }} request_log: {{ request_log_path }}
|
||||
— table exists: {{ table_check.rc == 0 }}
|
||||
— schema valid: {{ schema_check.rc == 0 }}
|
||||
6
ansible/roles/wizard_base/defaults/main.yml
Normal file
6
ansible/roles/wizard_base/defaults/main.yml
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
# wizard_base defaults
|
||||
wizard_user: "{{ ansible_user | default('root') }}"
|
||||
wizard_group: "{{ ansible_user | default('root') }}"
|
||||
timmy_base_dir: "~/.local/timmy"
|
||||
timmy_config_repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
|
||||
11
ansible/roles/wizard_base/handlers/main.yml
Normal file
11
ansible/roles/wizard_base/handlers/main.yml
Normal file
@@ -0,0 +1,11 @@
|
||||
---
|
||||
- name: "Restart hermes agent (systemd)"
|
||||
systemd:
|
||||
name: "hermes-{{ wizard_name | lower }}"
|
||||
state: restarted
|
||||
when: machine_type == 'vps'
|
||||
|
||||
- name: "Restart hermes agent (launchctl)"
|
||||
shell: "launchctl kickstart -k ai.hermes.{{ wizard_name | lower }}"
|
||||
when: machine_type == 'mac'
|
||||
ignore_errors: true
|
||||
69
ansible/roles/wizard_base/tasks/main.yml
Normal file
69
ansible/roles/wizard_base/tasks/main.yml
Normal file
@@ -0,0 +1,69 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# wizard_base/tasks — Common wizard setup
|
||||
# =============================================================================
|
||||
|
||||
- name: "Create wizard directories"
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
mode: "0755"
|
||||
loop:
|
||||
- "{{ wizard_home }}"
|
||||
- "{{ wizard_home }}/workspace"
|
||||
- "{{ hermes_home }}"
|
||||
- "{{ hermes_home }}/bin"
|
||||
- "{{ hermes_home }}/skins"
|
||||
- "{{ hermes_home }}/playbooks"
|
||||
- "{{ hermes_home }}/memories"
|
||||
- "~/.local/timmy"
|
||||
- "~/.local/timmy/fleet-health"
|
||||
- "~/.local/timmy/snapshots"
|
||||
- "~/.timmy"
|
||||
|
||||
- name: "Clone/update timmy-config"
|
||||
git:
|
||||
repo: "{{ upstream_repo }}"
|
||||
dest: "{{ wizard_home }}/workspace/timmy-config"
|
||||
version: "{{ upstream_branch }}"
|
||||
force: false
|
||||
update: true
|
||||
ignore_errors: true # May fail on first run if no SSH key
|
||||
|
||||
- name: "Deploy SOUL.md"
|
||||
copy:
|
||||
src: "{{ wizard_home }}/workspace/timmy-config/SOUL.md"
|
||||
dest: "~/.timmy/SOUL.md"
|
||||
remote_src: true
|
||||
mode: "0644"
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Deploy thin config (immutable pointer to upstream)"
|
||||
template:
|
||||
src: thin_config.yml.j2
|
||||
dest: "{{ thin_config_path }}"
|
||||
mode: "{{ thin_config_mode }}"
|
||||
tags: [thin_config]
|
||||
|
||||
- name: "Ensure Python3 and pip are available"
|
||||
package:
|
||||
name:
|
||||
- python3
|
||||
- python3-pip
|
||||
state: present
|
||||
when: machine_type == 'vps'
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Ensure PyYAML is installed (for config validation)"
|
||||
pip:
|
||||
name: pyyaml
|
||||
state: present
|
||||
when: machine_type == 'vps'
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Create Ansible log directory"
|
||||
file:
|
||||
path: /var/log/ansible
|
||||
state: directory
|
||||
mode: "0755"
|
||||
ignore_errors: true
|
||||
41
ansible/roles/wizard_base/templates/thin_config.yml.j2
Normal file
41
ansible/roles/wizard_base/templates/thin_config.yml.j2
Normal file
@@ -0,0 +1,41 @@
|
||||
# =============================================================================
|
||||
# Thin Config — {{ wizard_name }}
|
||||
# =============================================================================
|
||||
# THIS FILE IS READ-ONLY. Agents CANNOT modify it.
|
||||
# It contains only pointers to upstream. The actual config lives in Gitea.
|
||||
#
|
||||
# Agent wakes up → pulls config from upstream → loads → runs.
|
||||
# If anything tries to mutate this → fails gracefully → pulls fresh on restart.
|
||||
#
|
||||
# Only way to permanently change config: commit to Gitea, merge PR, Ansible deploys.
|
||||
#
|
||||
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
|
||||
# DO NOT EDIT MANUALLY.
|
||||
# =============================================================================
|
||||
|
||||
identity:
|
||||
wizard_name: "{{ wizard_name }}"
|
||||
wizard_role: "{{ wizard_role }}"
|
||||
machine: "{{ inventory_hostname }}"
|
||||
|
||||
upstream:
|
||||
repo: "{{ upstream_repo }}"
|
||||
branch: "{{ upstream_branch }}"
|
||||
config_path: "wizards/{{ wizard_name | lower }}/config.yaml"
|
||||
pull_on_wake: {{ config_pull_on_wake | lower }}
|
||||
|
||||
recovery:
|
||||
deadman_enabled: {{ deadman_enabled | lower }}
|
||||
snapshot_dir: "{{ deadman_snapshot_dir }}"
|
||||
restart_cooldown: {{ deadman_restart_cooldown }}
|
||||
max_restart_attempts: {{ deadman_max_restart_attempts }}
|
||||
escalation_channel: "{{ deadman_escalation_channel }}"
|
||||
|
||||
telemetry:
|
||||
request_log_path: "{{ request_log_path }}"
|
||||
request_log_enabled: {{ request_log_enabled | lower }}
|
||||
|
||||
local_overrides:
|
||||
# Runtime overrides go here. They are EPHEMERAL — not persisted across restarts.
|
||||
# On restart, this section is reset to empty.
|
||||
{}
|
||||
115
ansible/roles/wizard_base/templates/wizard_config.yaml.j2
Normal file
115
ansible/roles/wizard_base/templates/wizard_config.yaml.j2
Normal file
@@ -0,0 +1,115 @@
|
||||
# =============================================================================
|
||||
# {{ wizard_name }} — Wizard Configuration (Golden State)
|
||||
# =============================================================================
|
||||
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
|
||||
# DO NOT EDIT MANUALLY. Changes go through Gitea PR → Ansible deploy.
|
||||
#
|
||||
# Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}
|
||||
# Anthropic is PERMANENTLY BANNED.
|
||||
# =============================================================================
|
||||
|
||||
model:
|
||||
default: {{ wizard_model_primary }}
|
||||
provider: {{ wizard_provider_primary }}
|
||||
context_length: 65536
|
||||
base_url: {{ golden_state_providers[0].base_url }}
|
||||
|
||||
toolsets:
|
||||
- all
|
||||
|
||||
fallback_providers:
|
||||
{% for provider in golden_state_providers %}
|
||||
- provider: {{ provider.name }}
|
||||
model: {{ provider.model }}
|
||||
{% if provider.base_url is defined %}
|
||||
base_url: {{ provider.base_url }}
|
||||
{% endif %}
|
||||
{% if provider.api_key_env is defined %}
|
||||
api_key_env: {{ provider.api_key_env }}
|
||||
{% endif %}
|
||||
timeout: {{ provider.timeout }}
|
||||
reason: "{{ provider.reason }}"
|
||||
{% endfor %}
|
||||
|
||||
agent:
|
||||
max_turns: {{ agent_max_turns }}
|
||||
reasoning_effort: {{ agent_reasoning_effort }}
|
||||
verbose: {{ agent_verbose | lower }}
|
||||
|
||||
terminal:
|
||||
backend: local
|
||||
cwd: .
|
||||
timeout: 180
|
||||
persistent_shell: true
|
||||
|
||||
browser:
|
||||
inactivity_timeout: 120
|
||||
command_timeout: 30
|
||||
record_sessions: false
|
||||
|
||||
display:
|
||||
compact: false
|
||||
personality: ''
|
||||
resume_display: full
|
||||
busy_input_mode: interrupt
|
||||
bell_on_complete: false
|
||||
show_reasoning: false
|
||||
streaming: false
|
||||
show_cost: false
|
||||
tool_progress: all
|
||||
|
||||
memory:
|
||||
memory_enabled: true
|
||||
user_profile_enabled: true
|
||||
memory_char_limit: 2200
|
||||
user_char_limit: 1375
|
||||
nudge_interval: 10
|
||||
flush_min_turns: 6
|
||||
|
||||
approvals:
|
||||
mode: {{ agent_approval_mode }}
|
||||
|
||||
security:
|
||||
redact_secrets: true
|
||||
tirith_enabled: false
|
||||
|
||||
platforms:
|
||||
api_server:
|
||||
enabled: true
|
||||
extra:
|
||||
host: 127.0.0.1
|
||||
port: {{ api_port }}
|
||||
|
||||
session_reset:
|
||||
mode: none
|
||||
idle_minutes: 0
|
||||
|
||||
skills:
|
||||
creation_nudge_interval: 15
|
||||
|
||||
system_prompt_suffix: |
|
||||
You are {{ wizard_name }}, {{ wizard_role }}.
|
||||
Your soul is defined in SOUL.md — read it, live it.
|
||||
Hermes is your harness.
|
||||
{{ golden_state_providers[0].name }} is your primary provider.
|
||||
Refusal over fabrication. If you do not know, say so.
|
||||
Sovereignty and service always.
|
||||
|
||||
providers:
|
||||
{% for provider in golden_state_providers %}
|
||||
{{ provider.name }}:
|
||||
base_url: {{ provider.base_url }}
|
||||
timeout: {{ provider.timeout | default(60) }}
|
||||
{% if provider.name == 'kimi-coding' %}
|
||||
max_retries: 3
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
# =============================================================================
|
||||
# BANNED PROVIDERS — DO NOT ADD
|
||||
# =============================================================================
|
||||
# The following providers are PERMANENTLY BANNED:
|
||||
# - anthropic (any model: claude-sonnet, claude-opus, claude-haiku)
|
||||
# Enforcement: pre-commit hook, linter, Ansible validation, this comment.
|
||||
# Adding any banned provider will cause Ansible deployment to FAIL.
|
||||
# =============================================================================
|
||||
75
ansible/scripts/deploy_on_webhook.sh
Normal file
75
ansible/scripts/deploy_on_webhook.sh
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# Gitea Webhook Handler — Trigger Ansible Deploy on Merge
|
||||
# =============================================================================
|
||||
# This script is called by the Gitea webhook when a PR is merged
|
||||
# to the main branch of timmy-config.
|
||||
#
|
||||
# Setup:
|
||||
# 1. Add webhook in Gitea: Settings → Webhooks → Add Webhook
|
||||
# 2. URL: http://localhost:9000/hooks/deploy-timmy-config
|
||||
# 3. Events: Pull Request (merged only)
|
||||
# 4. Secret: <configured in Gitea>
|
||||
#
|
||||
# This script runs ansible-pull to update the local machine.
|
||||
# For fleet-wide deploys, each machine runs ansible-pull independently.
|
||||
# =============================================================================
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO="https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
|
||||
BRANCH="main"
|
||||
ANSIBLE_DIR="ansible"
|
||||
LOG_FILE="/var/log/ansible/webhook-deploy.log"
|
||||
LOCK_FILE="/tmp/ansible-deploy.lock"
|
||||
|
||||
log() {
|
||||
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [webhook] $*" | tee -a "${LOG_FILE}"
|
||||
}
|
||||
|
||||
# Prevent concurrent deploys
|
||||
if [ -f "${LOCK_FILE}" ]; then
|
||||
LOCK_AGE=$(( $(date +%s) - $(stat -c %Y "${LOCK_FILE}" 2>/dev/null || echo 0) ))
|
||||
if [ "${LOCK_AGE}" -lt 300 ]; then
|
||||
log "Deploy already in progress (lock age: ${LOCK_AGE}s). Skipping."
|
||||
exit 0
|
||||
else
|
||||
log "Stale lock file (${LOCK_AGE}s old). Removing."
|
||||
rm -f "${LOCK_FILE}"
|
||||
fi
|
||||
fi
|
||||
|
||||
trap 'rm -f "${LOCK_FILE}"' EXIT
|
||||
touch "${LOCK_FILE}"
|
||||
|
||||
log "Webhook triggered. Starting ansible-pull..."
|
||||
|
||||
# Pull latest config
|
||||
cd /tmp
|
||||
rm -rf timmy-config-deploy
|
||||
git clone --depth 1 --branch "${BRANCH}" "${REPO}" timmy-config-deploy 2>&1 | tee -a "${LOG_FILE}"
|
||||
|
||||
cd timmy-config-deploy/${ANSIBLE_DIR}
|
||||
|
||||
# Run Ansible against localhost
|
||||
log "Running Ansible playbook..."
|
||||
ansible-playbook \
|
||||
-i inventory/hosts.yml \
|
||||
playbooks/site.yml \
|
||||
--limit "$(hostname)" \
|
||||
--diff \
|
||||
2>&1 | tee -a "${LOG_FILE}"
|
||||
|
||||
RESULT=$?
|
||||
|
||||
if [ ${RESULT} -eq 0 ]; then
|
||||
log "Deploy successful."
|
||||
else
|
||||
log "ERROR: Deploy failed with exit code ${RESULT}."
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
rm -rf /tmp/timmy-config-deploy
|
||||
|
||||
log "Webhook handler complete."
|
||||
exit ${RESULT}
|
||||
155
ansible/scripts/validate_config.py
Normal file
155
ansible/scripts/validate_config.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Config Validator — The Timmy Foundation
|
||||
Validates wizard configs against golden state rules.
|
||||
Run before any config deploy to catch violations early.
|
||||
|
||||
Usage:
|
||||
python3 validate_config.py <config_file>
|
||||
python3 validate_config.py --all # Validate all wizard configs
|
||||
|
||||
Exit codes:
|
||||
0 — All validations passed
|
||||
1 — Validation errors found
|
||||
2 — File not found or parse error
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import yaml
|
||||
import fnmatch
|
||||
from pathlib import Path
|
||||
|
||||
# === BANNED PROVIDERS — HARD POLICY ===
|
||||
BANNED_PROVIDERS = {"anthropic", "claude"}
|
||||
BANNED_MODEL_PATTERNS = [
|
||||
"claude-*",
|
||||
"anthropic/*",
|
||||
"*sonnet*",
|
||||
"*opus*",
|
||||
"*haiku*",
|
||||
]
|
||||
|
||||
# === REQUIRED FIELDS ===
|
||||
REQUIRED_FIELDS = {
|
||||
"model": ["default", "provider"],
|
||||
"fallback_providers": None, # Must exist as a list
|
||||
}
|
||||
|
||||
|
||||
def is_banned_model(model_name: str) -> bool:
|
||||
"""Check if a model name matches any banned pattern."""
|
||||
model_lower = model_name.lower()
|
||||
for pattern in BANNED_MODEL_PATTERNS:
|
||||
if fnmatch.fnmatch(model_lower, pattern):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def validate_config(config_path: str) -> list[str]:
|
||||
"""Validate a wizard config file. Returns list of error strings."""
|
||||
errors = []
|
||||
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
cfg = yaml.safe_load(f)
|
||||
except FileNotFoundError:
|
||||
return [f"File not found: {config_path}"]
|
||||
except yaml.YAMLError as e:
|
||||
return [f"YAML parse error: {e}"]
|
||||
|
||||
if not cfg:
|
||||
return ["Config file is empty"]
|
||||
|
||||
# Check required fields
|
||||
for section, fields in REQUIRED_FIELDS.items():
|
||||
if section not in cfg:
|
||||
errors.append(f"Missing required section: {section}")
|
||||
elif fields:
|
||||
for field in fields:
|
||||
if field not in cfg[section]:
|
||||
errors.append(f"Missing required field: {section}.{field}")
|
||||
|
||||
# Check default provider
|
||||
default_provider = cfg.get("model", {}).get("provider", "")
|
||||
if default_provider.lower() in BANNED_PROVIDERS:
|
||||
errors.append(f"BANNED default provider: {default_provider}")
|
||||
|
||||
default_model = cfg.get("model", {}).get("default", "")
|
||||
if is_banned_model(default_model):
|
||||
errors.append(f"BANNED default model: {default_model}")
|
||||
|
||||
# Check fallback providers
|
||||
for i, fb in enumerate(cfg.get("fallback_providers", [])):
|
||||
provider = fb.get("provider", "")
|
||||
model = fb.get("model", "")
|
||||
|
||||
if provider.lower() in BANNED_PROVIDERS:
|
||||
errors.append(f"BANNED fallback provider [{i}]: {provider}")
|
||||
|
||||
if is_banned_model(model):
|
||||
errors.append(f"BANNED fallback model [{i}]: {model}")
|
||||
|
||||
# Check providers section
|
||||
for name, provider_cfg in cfg.get("providers", {}).items():
|
||||
if name.lower() in BANNED_PROVIDERS:
|
||||
errors.append(f"BANNED provider in providers section: {name}")
|
||||
|
||||
base_url = str(provider_cfg.get("base_url", ""))
|
||||
if "anthropic" in base_url.lower():
|
||||
errors.append(f"BANNED URL in provider {name}: {base_url}")
|
||||
|
||||
# Check system prompt for banned references
|
||||
prompt = cfg.get("system_prompt_suffix", "")
|
||||
if isinstance(prompt, str):
|
||||
for banned in BANNED_PROVIDERS:
|
||||
if banned in prompt.lower():
|
||||
errors.append(f"BANNED provider referenced in system_prompt_suffix: {banned}")
|
||||
|
||||
return errors
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: {sys.argv[0]} <config_file> [--all]")
|
||||
sys.exit(2)
|
||||
|
||||
if sys.argv[1] == "--all":
|
||||
# Validate all wizard configs in the repo
|
||||
repo_root = Path(__file__).parent.parent.parent
|
||||
wizard_dir = repo_root / "wizards"
|
||||
all_errors = {}
|
||||
|
||||
for wizard_path in sorted(wizard_dir.iterdir()):
|
||||
config_file = wizard_path / "config.yaml"
|
||||
if config_file.exists():
|
||||
errors = validate_config(str(config_file))
|
||||
if errors:
|
||||
all_errors[wizard_path.name] = errors
|
||||
|
||||
if all_errors:
|
||||
print("VALIDATION FAILED:")
|
||||
for wizard, errors in all_errors.items():
|
||||
print(f"\n {wizard}:")
|
||||
for err in errors:
|
||||
print(f" - {err}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("All wizard configs passed validation.")
|
||||
sys.exit(0)
|
||||
else:
|
||||
config_path = sys.argv[1]
|
||||
errors = validate_config(config_path)
|
||||
|
||||
if errors:
|
||||
print(f"VALIDATION FAILED for {config_path}:")
|
||||
for err in errors:
|
||||
print(f" - {err}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"PASSED: {config_path}")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env bash
|
||||
# agent-dispatch.sh — Generate a self-contained prompt for any agent
|
||||
# agent-dispatch.sh — Generate a lane-aware prompt for any agent
|
||||
#
|
||||
# Usage: agent-dispatch.sh <agent_name> <issue_num> <repo>
|
||||
# agent-dispatch.sh manus 42 Timmy_Foundation/the-nexus
|
||||
# agent-dispatch.sh groq 42 Timmy_Foundation/the-nexus
|
||||
#
|
||||
# Outputs a prompt to stdout. Copy-paste into the agent's interface.
|
||||
# The prompt includes everything: API URLs, token, git commands, PR creation.
|
||||
# The prompt includes issue context, repo setup, lane coaching, and
|
||||
# a short review checklist so dispatch itself teaches the right habits.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -13,86 +14,201 @@ AGENT_NAME="${1:?Usage: agent-dispatch.sh <agent> <issue_num> <owner/repo>}"
|
||||
ISSUE_NUM="${2:?Usage: agent-dispatch.sh <agent> <issue_num> <owner/repo>}"
|
||||
REPO="${3:?Usage: agent-dispatch.sh <agent> <issue_num> <owner/repo>}"
|
||||
|
||||
GITEA_URL="http://143.198.27.163:3000"
|
||||
TOKEN_FILE="$HOME/.hermes/${AGENT_NAME}_token"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
LANES_FILE="${SCRIPT_DIR%/bin}/playbooks/agent-lanes.json"
|
||||
|
||||
if [ ! -f "$TOKEN_FILE" ]; then
|
||||
echo "ERROR: No token found at $TOKEN_FILE" >&2
|
||||
echo "Create a Gitea user and token for '$AGENT_NAME' first." >&2
|
||||
resolve_gitea_url() {
|
||||
if [ -n "${GITEA_URL:-}" ]; then
|
||||
printf '%s\n' "${GITEA_URL%/}"
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.hermes/gitea_api" ]; then
|
||||
python3 - "$HOME/.hermes/gitea_api" <<'PY'
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
raw = Path(sys.argv[1]).read_text().strip().rstrip("/")
|
||||
print(raw[:-7] if raw.endswith("/api/v1") else raw)
|
||||
PY
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.config/gitea/base-url" ]; then
|
||||
tr -d '[:space:]' < "$HOME/.config/gitea/base-url"
|
||||
return 0
|
||||
fi
|
||||
echo "ERROR: set GITEA_URL or create ~/.hermes/gitea_api" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
GITEA_URL="$(resolve_gitea_url)"
|
||||
|
||||
resolve_token_file() {
|
||||
local agent="$1"
|
||||
local normalized
|
||||
normalized="$(printf '%s' "$agent" | tr '[:upper:]' '[:lower:]')"
|
||||
for candidate in \
|
||||
"$HOME/.hermes/${agent}_token" \
|
||||
"$HOME/.hermes/${normalized}_token" \
|
||||
"$HOME/.config/gitea/${agent}-token" \
|
||||
"$HOME/.config/gitea/${normalized}-token"; do
|
||||
if [ -f "$candidate" ]; then
|
||||
printf '%s\n' "$candidate"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
for candidate in \
|
||||
"$HOME/.config/gitea/timmy-token" \
|
||||
"$HOME/.hermes/gitea_token_vps" \
|
||||
"$HOME/.hermes/gitea_token_timmy"; do
|
||||
if [ -f "$candidate" ]; then
|
||||
printf '%s\n' "$candidate"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
TOKEN_FILE="$(resolve_token_file "$AGENT_NAME" || true)"
|
||||
if [ -z "${TOKEN_FILE:-}" ]; then
|
||||
echo "ERROR: No token found for '$AGENT_NAME'." >&2
|
||||
echo "Expected one of ~/.hermes/<agent>_token or ~/.config/gitea/<agent>-token" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
GITEA_TOKEN=$(cat "$TOKEN_FILE")
|
||||
REPO_OWNER=$(echo "$REPO" | cut -d/ -f1)
|
||||
REPO_NAME=$(echo "$REPO" | cut -d/ -f2)
|
||||
GITEA_TOKEN="$(cat "$TOKEN_FILE")"
|
||||
REPO_OWNER="${REPO%%/*}"
|
||||
REPO_NAME="${REPO##*/}"
|
||||
BRANCH="${AGENT_NAME}/issue-${ISSUE_NUM}"
|
||||
|
||||
# Fetch issue title
|
||||
ISSUE_TITLE=$(curl -sf -H "Authorization: token $GITEA_TOKEN" \
|
||||
"${GITEA_URL}/api/v1/repos/${REPO}/issues/${ISSUE_NUM}" 2>/dev/null | \
|
||||
python3 -c "import sys,json; print(json.loads(sys.stdin.read())['title'])" 2>/dev/null || echo "Issue #${ISSUE_NUM}")
|
||||
python3 - "$LANES_FILE" "$AGENT_NAME" "$ISSUE_NUM" "$REPO" "$REPO_OWNER" "$REPO_NAME" "$BRANCH" "$GITEA_URL" "$GITEA_TOKEN" "$TOKEN_FILE" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
import textwrap
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
cat <<PROMPT
|
||||
You are ${AGENT_NAME}, an autonomous code agent working on the ${REPO_NAME} project.
|
||||
lanes_path, agent, issue_num, repo, repo_owner, repo_name, branch, gitea_url, token, token_file = sys.argv[1:]
|
||||
|
||||
YOUR ISSUE: #${ISSUE_NUM} — "${ISSUE_TITLE}"
|
||||
with open(lanes_path) as f:
|
||||
lanes = json.load(f)
|
||||
|
||||
GITEA API: ${GITEA_URL}/api/v1
|
||||
GITEA TOKEN: ${GITEA_TOKEN}
|
||||
REPO: ${REPO_OWNER}/${REPO_NAME}
|
||||
lane = lanes.get(agent, {
|
||||
"lane": "bounded work with explicit verification and a clean PR handoff",
|
||||
"skills_to_practice": ["verification", "scope control", "clear handoff writing"],
|
||||
"missing_skills": ["escalate instead of guessing when the scope becomes unclear"],
|
||||
"anti_lane": ["self-directed backlog growth", "unbounded architectural wandering"],
|
||||
"review_checklist": [
|
||||
"Did I stay within scope?",
|
||||
"Did I verify the result?",
|
||||
"Did I leave a clean PR and issue handoff?"
|
||||
],
|
||||
})
|
||||
|
||||
== STEP 1: READ THE ISSUE ==
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
|
||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${ISSUE_NUM}"
|
||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${ISSUE_NUM}/comments"
|
||||
def fetch_json(path):
|
||||
req = urllib.request.Request(f"{gitea_url}/api/v1{path}", headers=headers)
|
||||
with urllib.request.urlopen(req, timeout=10) as resp:
|
||||
return json.loads(resp.read().decode())
|
||||
|
||||
Read the issue body AND all comments for context and build order constraints.
|
||||
try:
|
||||
issue = fetch_json(f"/repos/{repo}/issues/{issue_num}")
|
||||
comments = fetch_json(f"/repos/{repo}/issues/{issue_num}/comments")
|
||||
except urllib.error.HTTPError as exc:
|
||||
raise SystemExit(f"Failed to fetch issue context: {exc}") from exc
|
||||
|
||||
== STEP 2: SET UP WORKSPACE ==
|
||||
body = (issue.get("body") or "").strip()
|
||||
body = body[:4000] + ("\n...[truncated]" if len(body) > 4000 else "")
|
||||
recent_comments = comments[-3:]
|
||||
comment_block = []
|
||||
for c in recent_comments:
|
||||
author = c.get("user", {}).get("login", "unknown")
|
||||
text = (c.get("body") or "").strip().replace("\r", "")
|
||||
text = text[:600] + ("\n...[truncated]" if len(text) > 600 else "")
|
||||
comment_block.append(f"- {author}: {text}")
|
||||
|
||||
git clone http://${AGENT_NAME}:${GITEA_TOKEN}@143.198.27.163:3000/${REPO_OWNER}/${REPO_NAME}.git /tmp/${AGENT_NAME}-work-${ISSUE_NUM}
|
||||
cd /tmp/${AGENT_NAME}-work-${ISSUE_NUM}
|
||||
comment_text = "\n".join(comment_block) if comment_block else "- (no comments yet)"
|
||||
|
||||
Check if branch exists (prior attempt): git ls-remote origin ${BRANCH}
|
||||
If yes: git fetch origin ${BRANCH} && git checkout ${BRANCH}
|
||||
If no: git checkout -b ${BRANCH}
|
||||
skills = "\n".join(f"- {item}" for item in lane["skills_to_practice"])
|
||||
gaps = "\n".join(f"- {item}" for item in lane["missing_skills"])
|
||||
anti_lane = "\n".join(f"- {item}" for item in lane["anti_lane"])
|
||||
review = "\n".join(f"- {item}" for item in lane["review_checklist"])
|
||||
|
||||
== STEP 3: UNDERSTAND THE PROJECT ==
|
||||
prompt = f"""You are {agent}, working on {repo_name} for Timmy Foundation.
|
||||
|
||||
Read README.md or any contributing guide. Check for tox.ini, Makefile, package.json.
|
||||
Follow existing code conventions.
|
||||
YOUR ISSUE: #{issue_num} — "{issue.get('title', f'Issue #{issue_num}')}"
|
||||
|
||||
== STEP 4: DO THE WORK ==
|
||||
REPO: {repo}
|
||||
GITEA API: {gitea_url}/api/v1
|
||||
GITEA TOKEN FILE: {token_file}
|
||||
WORK BRANCH: {branch}
|
||||
|
||||
Implement the fix/feature described in the issue. Run tests if the project has them.
|
||||
LANE:
|
||||
{lane['lane']}
|
||||
|
||||
== STEP 5: COMMIT AND PUSH ==
|
||||
SKILLS TO PRACTICE ON THIS ASSIGNMENT:
|
||||
{skills}
|
||||
|
||||
git add -A
|
||||
git commit -m "feat: <description> (#${ISSUE_NUM})
|
||||
COMMON FAILURE MODE TO AVOID:
|
||||
{gaps}
|
||||
|
||||
Fixes #${ISSUE_NUM}"
|
||||
git push origin ${BRANCH}
|
||||
ANTI-LANE:
|
||||
{anti_lane}
|
||||
|
||||
== STEP 6: CREATE PR ==
|
||||
ISSUE BODY:
|
||||
{body or "(empty issue body)"}
|
||||
|
||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls" \\
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||
RECENT COMMENTS:
|
||||
{comment_text}
|
||||
|
||||
WORKFLOW:
|
||||
1. Read the issue body and recent comments carefully before touching code.
|
||||
2. Clone the repo into /tmp/{agent}-work-{issue_num}.
|
||||
3. Check whether {branch} already exists on origin; reuse it if it does.
|
||||
4. Read the repo docs and follow its own tooling and conventions.
|
||||
5. Do only the scoped work from the issue. If the task grows, stop and comment instead of freelancing expansion.
|
||||
6. Run the repo's real verification commands.
|
||||
7. Open a PR and summarize:
|
||||
- what changed
|
||||
- how you verified it
|
||||
- any remaining risk or follow-up
|
||||
8. Comment on the issue with the PR link and the same concise summary.
|
||||
|
||||
GIT / API SETUP:
|
||||
export GITEA_URL="{gitea_url}"
|
||||
export GITEA_TOKEN_FILE="{token_file}"
|
||||
export GITEA_TOKEN="$(tr -d '[:space:]' < "$GITEA_TOKEN_FILE")"
|
||||
git config --global http."$GITEA_URL/".extraHeader "Authorization: token $GITEA_TOKEN"
|
||||
git clone "$GITEA_URL/{repo}.git" /tmp/{agent}-work-{issue_num}
|
||||
cd /tmp/{agent}-work-{issue_num}
|
||||
git ls-remote --exit-code origin {branch} >/dev/null 2>&1 && git fetch origin {branch} && git checkout {branch} || git checkout -b {branch}
|
||||
|
||||
ISSUE FETCH COMMANDS:
|
||||
curl -s -H "Authorization: token $GITEA_TOKEN" "{gitea_url}/api/v1/repos/{repo}/issues/{issue_num}"
|
||||
curl -s -H "Authorization: token $GITEA_TOKEN" "{gitea_url}/api/v1/repos/{repo}/issues/{issue_num}/comments"
|
||||
|
||||
PR CREATION TEMPLATE:
|
||||
curl -s -X POST "{gitea_url}/api/v1/repos/{repo}/pulls" \\
|
||||
-H "Authorization: token $GITEA_TOKEN" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{"title": "[${AGENT_NAME}] <description> (#${ISSUE_NUM})", "body": "Fixes #${ISSUE_NUM}\n\n<describe changes>", "head": "${BRANCH}", "base": "main"}'
|
||||
-d '{{"title":"[{agent}] <description> (#{issue_num})","body":"Fixes #{issue_num}\\n\\n## Summary\\n- <change>\\n\\n## Verification\\n- <command/output>\\n\\n## Risks\\n- <if any>","head":"{branch}","base":"main"}}'
|
||||
|
||||
== STEP 7: COMMENT ON ISSUE ==
|
||||
|
||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${ISSUE_NUM}/comments" \\
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||
ISSUE COMMENT TEMPLATE:
|
||||
curl -s -X POST "{gitea_url}/api/v1/repos/{repo}/issues/{issue_num}/comments" \\
|
||||
-H "Authorization: token $GITEA_TOKEN" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{"body": "PR submitted. <summary>"}'
|
||||
-d '{{"body":"PR submitted.\\n\\nSummary:\\n- <change>\\n\\nVerification:\\n- <command/output>\\n\\nRisks:\\n- <if any>"}}'
|
||||
|
||||
== RULES ==
|
||||
- Read project docs FIRST.
|
||||
- Use the project's own test/lint tools.
|
||||
- Respect git hooks. Do not skip them.
|
||||
- If tests fail twice, STOP and comment on the issue.
|
||||
- ALWAYS push your work. ALWAYS create a PR. No exceptions.
|
||||
- Clean up: remove /tmp/${AGENT_NAME}-work-${ISSUE_NUM} when done.
|
||||
PROMPT
|
||||
REVIEW CHECKLIST BEFORE YOU PUSH:
|
||||
{review}
|
||||
|
||||
RULES:
|
||||
- Do not skip hooks with --no-verify.
|
||||
- Do not silently widen the scope.
|
||||
- If verification fails twice or the issue is underspecified, stop and comment with what blocked you.
|
||||
- Always create a PR instead of pushing to main.
|
||||
- Clean up /tmp/{agent}-work-{issue_num} when done.
|
||||
"""
|
||||
|
||||
print(textwrap.dedent(prompt).strip())
|
||||
PY
|
||||
|
||||
273
bin/agent-loop.sh
Executable file
273
bin/agent-loop.sh
Executable file
@@ -0,0 +1,273 @@
|
||||
#!/usr/bin/env bash
|
||||
# agent-loop.sh — Universal agent dev loop with Genchi Genbutsu verification
|
||||
#
|
||||
# Usage: agent-loop.sh <agent-name> [num-workers]
|
||||
# agent-loop.sh claude 2
|
||||
# agent-loop.sh gemini 1
|
||||
#
|
||||
# Dispatches via agent-dispatch.sh, then verifies with genchi-genbutsu.sh.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
AGENT="${1:?Usage: agent-loop.sh <agent-name> [num-workers]}"
|
||||
NUM_WORKERS="${2:-1}"
|
||||
|
||||
# Resolve agent tool and model from config or fallback
|
||||
case "$AGENT" in
|
||||
claude) TOOL="claude"; MODEL="sonnet" ;;
|
||||
gemini) TOOL="gemini"; MODEL="gemini-2.5-pro-preview-05-06" ;;
|
||||
grok) TOOL="opencode"; MODEL="grok-3-fast" ;;
|
||||
*) TOOL="$AGENT"; MODEL="" ;;
|
||||
esac
|
||||
|
||||
# === CONFIG ===
|
||||
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
|
||||
GITEA_TOKEN="${GITEA_TOKEN:-}"
|
||||
WORKTREE_BASE="$HOME/worktrees"
|
||||
LOG_DIR="$HOME/.hermes/logs"
|
||||
LOCK_DIR="$LOG_DIR/${AGENT}-locks"
|
||||
SKIP_FILE="$LOG_DIR/${AGENT}-skip-list.json"
|
||||
ACTIVE_FILE="$LOG_DIR/${AGENT}-active.json"
|
||||
TIMEOUT=600
|
||||
COOLDOWN=30
|
||||
|
||||
mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR"
|
||||
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
||||
echo '{}' > "$ACTIVE_FILE"
|
||||
|
||||
# === SHARED FUNCTIONS ===
|
||||
log() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ${AGENT}: $*" >> "$LOG_DIR/${AGENT}-loop.log"
|
||||
}
|
||||
|
||||
lock_issue() {
|
||||
local key="$1"
|
||||
mkdir "$LOCK_DIR/$key.lock" 2>/dev/null && echo $$ > "$LOCK_DIR/$key.lock/pid"
|
||||
}
|
||||
|
||||
unlock_issue() {
|
||||
rm -rf "$LOCK_DIR/$1.lock" 2>/dev/null
|
||||
}
|
||||
|
||||
mark_skip() {
|
||||
local issue_num="$1" reason="$2"
|
||||
python3 -c "
|
||||
import json, time, fcntl
|
||||
with open('${SKIP_FILE}', 'r+') as f:
|
||||
fcntl.flock(f, fcntl.LOCK_EX)
|
||||
try: skips = json.load(f)
|
||||
except: skips = {}
|
||||
failures = skips.get(str($issue_num), {}).get('failures', 0) + 1
|
||||
skip_hours = 6 if failures >= 3 else 1
|
||||
skips[str($issue_num)] = {'until': time.time() + (skip_hours * 3600), 'reason': '$reason', 'failures': failures}
|
||||
f.seek(0); f.truncate()
|
||||
json.dump(skips, f, indent=2)
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
get_next_issue() {
|
||||
python3 -c "
|
||||
import json, sys, time, urllib.request, os
|
||||
token = '${GITEA_TOKEN}'
|
||||
base = '${GITEA_URL}'
|
||||
repos = ['Timmy_Foundation/the-nexus', 'Timmy_Foundation/timmy-config', 'Timmy_Foundation/hermes-agent']
|
||||
try:
|
||||
with open('${SKIP_FILE}') as f: skips = json.load(f)
|
||||
except: skips = {}
|
||||
try:
|
||||
with open('${ACTIVE_FILE}') as f: active = json.load(f); active_issues = {v['issue'] for v in active.values()}
|
||||
except: active_issues = set()
|
||||
all_issues = []
|
||||
for repo in repos:
|
||||
url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created'
|
||||
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||
try:
|
||||
resp = urllib.request.urlopen(req, timeout=10)
|
||||
issues = json.loads(resp.read())
|
||||
for i in issues: i['_repo'] = repo
|
||||
all_issues.extend(issues)
|
||||
except: continue
|
||||
for i in sorted(all_issues, key=lambda x: x['title'].lower()):
|
||||
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
||||
if assignees and '${AGENT}' not in assignees: continue
|
||||
num_str = str(i['number'])
|
||||
if num_str in active_issues: continue
|
||||
if skips.get(num_str, {}).get('until', 0) > time.time(): continue
|
||||
lock = '${LOCK_DIR}/' + i['_repo'].replace('/', '-') + '-' + num_str + '.lock'
|
||||
if os.path.isdir(lock): continue
|
||||
owner, name = i['_repo'].split('/')
|
||||
print(json.dumps({'number': i['number'], 'title': i['title'], 'repo_owner': owner, 'repo_name': name, 'repo': i['_repo']}))
|
||||
sys.exit(0)
|
||||
print('null')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
# === WORKER FUNCTION ===
|
||||
run_worker() {
|
||||
local worker_id="$1"
|
||||
log "WORKER-${worker_id}: Started"
|
||||
|
||||
while true; do
|
||||
issue_json=$(get_next_issue)
|
||||
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
||||
sleep 30
|
||||
continue
|
||||
fi
|
||||
|
||||
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
||||
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
||||
repo_owner=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_owner'])")
|
||||
repo_name=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_name'])")
|
||||
issue_key="${repo_owner}-${repo_name}-${issue_num}"
|
||||
branch="${AGENT}/issue-${issue_num}"
|
||||
worktree="${WORKTREE_BASE}/${AGENT}-w${worker_id}-${issue_num}"
|
||||
|
||||
if ! lock_issue "$issue_key"; then
|
||||
sleep 5
|
||||
continue
|
||||
fi
|
||||
|
||||
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||
|
||||
# Clone / checkout
|
||||
rm -rf "$worktree" 2>/dev/null
|
||||
CLONE_URL="http://${AGENT}:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
||||
if git ls-remote --heads "$CLONE_URL" "$branch" 2>/dev/null | grep -q "$branch"; then
|
||||
git clone --depth=50 -b "$branch" "$CLONE_URL" "$worktree" >/dev/null 2>&1
|
||||
else
|
||||
git clone --depth=1 -b main "$CLONE_URL" "$worktree" >/dev/null 2>&1
|
||||
cd "$worktree" && git checkout -b "$branch" >/dev/null 2>&1
|
||||
fi
|
||||
cd "$worktree"
|
||||
|
||||
# Generate prompt
|
||||
prompt=$(bash "$(dirname "$0")/agent-dispatch.sh" "$AGENT" "$issue_num" "${repo_owner}/${repo_name}")
|
||||
|
||||
CYCLE_START=$(date +%s)
|
||||
set +e
|
||||
if [ "$TOOL" = "claude" ]; then
|
||||
env -u CLAUDECODE gtimeout "$TIMEOUT" claude \
|
||||
--print --model "$MODEL" --dangerously-skip-permissions \
|
||||
-p "$prompt" </dev/null >> "$LOG_DIR/${AGENT}-${issue_num}.log" 2>&1
|
||||
elif [ "$TOOL" = "gemini" ]; then
|
||||
gtimeout "$TIMEOUT" gemini -p "$prompt" --yolo \
|
||||
</dev/null >> "$LOG_DIR/${AGENT}-${issue_num}.log" 2>&1
|
||||
else
|
||||
gtimeout "$TIMEOUT" "$TOOL" "$prompt" \
|
||||
</dev/null >> "$LOG_DIR/${AGENT}-${issue_num}.log" 2>&1
|
||||
fi
|
||||
exit_code=$?
|
||||
set -e
|
||||
CYCLE_END=$(date +%s)
|
||||
CYCLE_DURATION=$((CYCLE_END - CYCLE_START))
|
||||
|
||||
# Salvage
|
||||
cd "$worktree" 2>/dev/null || true
|
||||
DIRTY=$(git status --porcelain 2>/dev/null | wc -l | tr -d ' ')
|
||||
if [ "${DIRTY:-0}" -gt 0 ]; then
|
||||
git add -A 2>/dev/null
|
||||
git commit -m "WIP: ${AGENT} progress on #${issue_num}
|
||||
|
||||
Automated salvage commit — agent session ended (exit $exit_code)." 2>/dev/null || true
|
||||
fi
|
||||
|
||||
UNPUSHED=$(git log --oneline "origin/main..HEAD" 2>/dev/null | wc -l | tr -d ' ')
|
||||
if [ "${UNPUSHED:-0}" -gt 0 ]; then
|
||||
git push -u origin "$branch" 2>/dev/null && \
|
||||
log "WORKER-${worker_id}: Pushed $UNPUSHED commit(s) on $branch" || \
|
||||
log "WORKER-${worker_id}: Push failed for $branch"
|
||||
fi
|
||||
|
||||
# Create PR if needed
|
||||
pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=open&head=${repo_owner}:${branch}&limit=1" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||
import sys,json
|
||||
prs = json.load(sys.stdin)
|
||||
print(prs[0]['number'] if prs else '')
|
||||
" 2>/dev/null)
|
||||
|
||||
if [ -z "$pr_num" ] && [ "${UNPUSHED:-0}" -gt 0 ]; then
|
||||
pr_num=$(curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$(python3 -c "
|
||||
import json
|
||||
print(json.dumps({
|
||||
'title': '${AGENT}: Issue #${issue_num}',
|
||||
'head': '${branch}',
|
||||
'base': 'main',
|
||||
'body': 'Automated PR for issue #${issue_num}.\nExit code: ${exit_code}'
|
||||
}))
|
||||
")" | python3 -c "import sys,json; print(json.load(sys.stdin).get('number',''))" 2>/dev/null)
|
||||
[ -n "$pr_num" ] && log "WORKER-${worker_id}: Created PR #${pr_num} for issue #${issue_num}"
|
||||
fi
|
||||
|
||||
# ── Genchi Genbutsu: verify world state before declaring success ──
|
||||
VERIFIED="false"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
log "WORKER-${worker_id}: SUCCESS #${issue_num} — running genchi-genbutsu"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
if verify_result=$("$SCRIPT_DIR/genchi-genbutsu.sh" "$repo_owner" "$repo_name" "$issue_num" "$branch" "$AGENT" 2>/dev/null); then
|
||||
VERIFIED="true"
|
||||
log "WORKER-${worker_id}: VERIFIED #${issue_num}"
|
||||
if [ -n "$pr_num" ]; then
|
||||
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/merge" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Do": "squash"}' >/dev/null 2>&1 || true
|
||||
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
||||
log "WORKER-${worker_id}: PR #${pr_num} merged, issue #${issue_num} closed"
|
||||
fi
|
||||
consecutive_failures=0
|
||||
else
|
||||
verify_details=$(echo "$verify_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('details','unknown'))" 2>/dev/null || echo "unverified")
|
||||
log "WORKER-${worker_id}: UNVERIFIED #${issue_num} — $verify_details"
|
||||
mark_skip "$issue_num" "unverified" 1
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
else
|
||||
log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
|
||||
# ── METRICS ──
|
||||
python3 -c "
|
||||
import json, datetime
|
||||
print(json.dumps({
|
||||
'ts': datetime.datetime.utcnow().isoformat() + 'Z',
|
||||
'agent': '${AGENT}',
|
||||
'worker': $worker_id,
|
||||
'issue': $issue_num,
|
||||
'repo': '${repo_owner}/${repo_name}',
|
||||
'outcome': 'success' if $exit_code == 0 else 'timeout' if $exit_code == 124 else 'failed',
|
||||
'exit_code': $exit_code,
|
||||
'duration_s': $CYCLE_DURATION,
|
||||
'pr': '${pr_num:-}',
|
||||
'verified': ${VERIFIED:-false}
|
||||
}))
|
||||
" >> "$LOG_DIR/${AGENT}-metrics.jsonl" 2>/dev/null
|
||||
|
||||
rm -rf "$worktree" 2>/dev/null
|
||||
unlock_issue "$issue_key"
|
||||
sleep "$COOLDOWN"
|
||||
done
|
||||
}
|
||||
|
||||
# === MAIN ===
|
||||
log "=== Agent Loop Started — ${AGENT} with ${NUM_WORKERS} worker(s) ==="
|
||||
|
||||
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||
|
||||
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||
run_worker "$i" &
|
||||
log "Launched worker $i (PID $!)"
|
||||
sleep 3
|
||||
done
|
||||
|
||||
wait
|
||||
630
bin/claude-loop.sh
Executable file
630
bin/claude-loop.sh
Executable file
@@ -0,0 +1,630 @@
|
||||
#!/usr/bin/env bash
|
||||
# claude-loop.sh — Parallel Claude Code agent dispatch loop
|
||||
# Runs N workers concurrently against the Gitea backlog.
|
||||
# Gracefully handles rate limits with backoff.
|
||||
#
|
||||
# Usage: claude-loop.sh [NUM_WORKERS] (default: 2)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# === CONFIG ===
|
||||
NUM_WORKERS="${1:-2}"
|
||||
MAX_WORKERS=10 # absolute ceiling
|
||||
WORKTREE_BASE="$HOME/worktrees"
|
||||
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
|
||||
GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
|
||||
CLAUDE_TIMEOUT=900 # 15 min per issue
|
||||
COOLDOWN=15 # seconds between issues — stagger clones
|
||||
RATE_LIMIT_SLEEP=30 # initial sleep on rate limit
|
||||
MAX_RATE_SLEEP=120 # max backoff on rate limit
|
||||
LOG_DIR="$HOME/.hermes/logs"
|
||||
SKIP_FILE="$LOG_DIR/claude-skip-list.json"
|
||||
LOCK_DIR="$LOG_DIR/claude-locks"
|
||||
ACTIVE_FILE="$LOG_DIR/claude-active.json"
|
||||
|
||||
mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR"
|
||||
|
||||
# Initialize files
|
||||
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
||||
echo '{}' > "$ACTIVE_FILE"
|
||||
|
||||
# === SHARED FUNCTIONS ===
|
||||
log() {
|
||||
local msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
||||
echo "$msg" >> "$LOG_DIR/claude-loop.log"
|
||||
}
|
||||
|
||||
lock_issue() {
|
||||
local issue_key="$1"
|
||||
local lockfile="$LOCK_DIR/$issue_key.lock"
|
||||
if mkdir "$lockfile" 2>/dev/null; then
|
||||
echo $$ > "$lockfile/pid"
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
unlock_issue() {
|
||||
local issue_key="$1"
|
||||
rm -rf "$LOCK_DIR/$issue_key.lock" 2>/dev/null
|
||||
}
|
||||
|
||||
mark_skip() {
|
||||
local issue_num="$1"
|
||||
local reason="$2"
|
||||
local skip_hours="${3:-1}"
|
||||
python3 -c "
|
||||
import json, time, fcntl
|
||||
with open('$SKIP_FILE', 'r+') as f:
|
||||
fcntl.flock(f, fcntl.LOCK_EX)
|
||||
try: skips = json.load(f)
|
||||
except: skips = {}
|
||||
skips[str($issue_num)] = {
|
||||
'until': time.time() + ($skip_hours * 3600),
|
||||
'reason': '$reason',
|
||||
'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1
|
||||
}
|
||||
if skips[str($issue_num)]['failures'] >= 3:
|
||||
skips[str($issue_num)]['until'] = time.time() + (6 * 3600)
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
json.dump(skips, f, indent=2)
|
||||
" 2>/dev/null
|
||||
log "SKIP: #${issue_num} — ${reason}"
|
||||
}
|
||||
|
||||
update_active() {
|
||||
local worker="$1" issue="$2" repo="$3" status="$4"
|
||||
python3 -c "
|
||||
import json, fcntl
|
||||
with open('$ACTIVE_FILE', 'r+') as f:
|
||||
fcntl.flock(f, fcntl.LOCK_EX)
|
||||
try: active = json.load(f)
|
||||
except: active = {}
|
||||
if '$status' == 'done':
|
||||
active.pop('$worker', None)
|
||||
else:
|
||||
active['$worker'] = {'issue': '$issue', 'repo': '$repo', 'status': '$status'}
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
json.dump(active, f, indent=2)
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
cleanup_workdir() {
|
||||
local wt="$1"
|
||||
rm -rf "$wt" 2>/dev/null || true
|
||||
}
|
||||
|
||||
get_next_issue() {
|
||||
python3 -c "
|
||||
import json, sys, time, urllib.request, os
|
||||
|
||||
token = '${GITEA_TOKEN}'
|
||||
base = '${GITEA_URL}'
|
||||
repos = [
|
||||
'Timmy_Foundation/the-nexus',
|
||||
'Timmy_Foundation/autolora',
|
||||
]
|
||||
|
||||
# Load skip list
|
||||
try:
|
||||
with open('${SKIP_FILE}') as f: skips = json.load(f)
|
||||
except: skips = {}
|
||||
|
||||
# Load active issues (to avoid double-picking)
|
||||
try:
|
||||
with open('${ACTIVE_FILE}') as f:
|
||||
active = json.load(f)
|
||||
active_issues = {v['issue'] for v in active.values()}
|
||||
except:
|
||||
active_issues = set()
|
||||
|
||||
all_issues = []
|
||||
for repo in repos:
|
||||
url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created'
|
||||
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||
try:
|
||||
resp = urllib.request.urlopen(req, timeout=10)
|
||||
issues = json.loads(resp.read())
|
||||
for i in issues:
|
||||
i['_repo'] = repo
|
||||
all_issues.extend(issues)
|
||||
except:
|
||||
continue
|
||||
|
||||
# Sort by priority: URGENT > P0 > P1 > bugs > LHF > rest
|
||||
def priority(i):
|
||||
t = i['title'].lower()
|
||||
if '[urgent]' in t or 'urgent:' in t: return 0
|
||||
if '[p0]' in t: return 1
|
||||
if '[p1]' in t: return 2
|
||||
if '[bug]' in t: return 3
|
||||
if 'lhf:' in t or 'lhf ' in t.lower(): return 4
|
||||
if '[p2]' in t: return 5
|
||||
return 6
|
||||
|
||||
all_issues.sort(key=priority)
|
||||
|
||||
for i in all_issues:
|
||||
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
||||
# Take issues assigned to claude OR unassigned (self-assign)
|
||||
if assignees and 'claude' not in assignees:
|
||||
continue
|
||||
|
||||
title = i['title'].lower()
|
||||
if '[philosophy]' in title: continue
|
||||
if '[epic]' in title or 'epic:' in title: continue
|
||||
if '[showcase]' in title: continue
|
||||
if '[do not close' in title: continue
|
||||
if '[meta]' in title: continue
|
||||
if '[governing]' in title: continue
|
||||
if '[permanent]' in title: continue
|
||||
if '[morning report]' in title: continue
|
||||
if '[retro]' in title: continue
|
||||
if '[intel]' in title: continue
|
||||
if 'master escalation' in title: continue
|
||||
if any(a['login'] == 'Rockachopa' for a in (i.get('assignees') or [])): continue
|
||||
|
||||
num_str = str(i['number'])
|
||||
if num_str in active_issues: continue
|
||||
|
||||
entry = skips.get(num_str, {})
|
||||
if entry and entry.get('until', 0) > time.time(): continue
|
||||
|
||||
lock = '${LOCK_DIR}/' + i['_repo'].replace('/', '-') + '-' + num_str + '.lock'
|
||||
if os.path.isdir(lock): continue
|
||||
|
||||
repo = i['_repo']
|
||||
owner, name = repo.split('/')
|
||||
|
||||
# Self-assign if unassigned
|
||||
if not assignees:
|
||||
try:
|
||||
data = json.dumps({'assignees': ['claude']}).encode()
|
||||
req2 = urllib.request.Request(
|
||||
f'{base}/api/v1/repos/{repo}/issues/{i[\"number\"]}',
|
||||
data=data, method='PATCH',
|
||||
headers={'Authorization': f'token {token}', 'Content-Type': 'application/json'})
|
||||
urllib.request.urlopen(req2, timeout=5)
|
||||
except: pass
|
||||
|
||||
print(json.dumps({
|
||||
'number': i['number'],
|
||||
'title': i['title'],
|
||||
'repo_owner': owner,
|
||||
'repo_name': name,
|
||||
'repo': repo,
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
print('null')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
build_prompt() {
|
||||
local issue_num="$1"
|
||||
local issue_title="$2"
|
||||
local worktree="$3"
|
||||
local repo_owner="$4"
|
||||
local repo_name="$5"
|
||||
|
||||
cat <<PROMPT
|
||||
You are Claude, an autonomous code agent on the ${repo_name} project.
|
||||
|
||||
YOUR ISSUE: #${issue_num} — "${issue_title}"
|
||||
|
||||
GITEA API: ${GITEA_URL}/api/v1
|
||||
GITEA TOKEN: ${GITEA_TOKEN}
|
||||
REPO: ${repo_owner}/${repo_name}
|
||||
WORKING DIRECTORY: ${worktree}
|
||||
|
||||
== YOUR POWERS ==
|
||||
You can do ANYTHING a developer can do.
|
||||
|
||||
1. READ the issue and any comments for context:
|
||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}"
|
||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments"
|
||||
|
||||
2. DO THE WORK. Code, test, fix, refactor — whatever the issue needs.
|
||||
- Check for tox.ini / Makefile / package.json for test/lint commands
|
||||
- Run tests if the project has them
|
||||
- Follow existing code conventions
|
||||
|
||||
3. COMMIT with conventional commits: fix: / feat: / refactor: / test: / chore:
|
||||
Include "Fixes #${issue_num}" or "Refs #${issue_num}" in the message.
|
||||
|
||||
4. PUSH to your branch (claude/issue-${issue_num}) and CREATE A PR:
|
||||
git push origin claude/issue-${issue_num}
|
||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" \\
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{"title": "[claude] <description> (#${issue_num})", "body": "Fixes #${issue_num}\n\n<describe what you did>", "head": "claude/issue-${issue_num}", "base": "main"}'
|
||||
|
||||
5. COMMENT on the issue when done:
|
||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \\
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{"body": "PR created. <summary of changes>"}'
|
||||
|
||||
== RULES ==
|
||||
- Read CLAUDE.md or project README first for conventions
|
||||
- If the project has tox, use tox. If npm, use npm. Follow the project.
|
||||
- Never use --no-verify on git commands.
|
||||
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
|
||||
- Be thorough but focused. Fix the issue, don't refactor the world.
|
||||
|
||||
== CRITICAL: ALWAYS COMMIT AND PUSH ==
|
||||
- NEVER exit without committing your work. Even partial progress MUST be committed.
|
||||
- Before you finish, ALWAYS: git add -A && git commit && git push origin claude/issue-${issue_num}
|
||||
- ALWAYS create a PR before exiting. No exceptions.
|
||||
- If a branch already exists with prior work, check it out and CONTINUE from where it left off.
|
||||
- Check: git ls-remote origin claude/issue-${issue_num} — if it exists, pull it first.
|
||||
- Your work is WASTED if it's not pushed. Push early, push often.
|
||||
PROMPT
|
||||
}
|
||||
|
||||
# === WORKER FUNCTION ===
|
||||
run_worker() {
|
||||
local worker_id="$1"
|
||||
local consecutive_failures=0
|
||||
|
||||
log "WORKER-${worker_id}: Started"
|
||||
|
||||
while true; do
|
||||
# Backoff on repeated failures
|
||||
if [ "$consecutive_failures" -ge 5 ]; then
|
||||
local backoff=$((RATE_LIMIT_SLEEP * (consecutive_failures / 5)))
|
||||
[ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff=$MAX_RATE_SLEEP
|
||||
log "WORKER-${worker_id}: BACKOFF ${backoff}s (${consecutive_failures} failures)"
|
||||
sleep "$backoff"
|
||||
consecutive_failures=0
|
||||
fi
|
||||
|
||||
# RULE: Merge existing PRs BEFORE creating new work.
|
||||
# Check for open PRs from claude, rebase + merge them first.
|
||||
local our_prs
|
||||
our_prs=$(curl -sf -H "Authorization: token ${GITEA_TOKEN}" \
|
||||
"${GITEA_URL}/api/v1/repos/Timmy_Foundation/the-nexus/pulls?state=open&limit=5" 2>/dev/null | \
|
||||
python3 -c "
|
||||
import sys, json
|
||||
prs = json.loads(sys.stdin.buffer.read())
|
||||
ours = [p for p in prs if p['user']['login'] == 'claude'][:3]
|
||||
for p in ours:
|
||||
print(f'{p[\"number\"]}|{p[\"head\"][\"ref\"]}|{p.get(\"mergeable\",False)}')
|
||||
" 2>/dev/null)
|
||||
|
||||
if [ -n "$our_prs" ]; then
|
||||
local pr_clone_url="http://claude:${GITEA_TOKEN}@143.198.27.163:3000/Timmy_Foundation/the-nexus.git"
|
||||
echo "$our_prs" | while IFS='|' read pr_num branch mergeable; do
|
||||
[ -z "$pr_num" ] && continue
|
||||
if [ "$mergeable" = "True" ]; then
|
||||
curl -sf -X POST -H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Do":"squash","delete_branch_after_merge":true}' \
|
||||
"${GITEA_URL}/api/v1/repos/Timmy_Foundation/the-nexus/pulls/${pr_num}/merge" >/dev/null 2>&1
|
||||
log "WORKER-${worker_id}: merged own PR #${pr_num}"
|
||||
sleep 3
|
||||
else
|
||||
# Rebase and push
|
||||
local tmpdir="/tmp/claude-rebase-${pr_num}"
|
||||
cd "$HOME"; rm -rf "$tmpdir" 2>/dev/null
|
||||
git clone -q --depth=50 -b "$branch" "$pr_clone_url" "$tmpdir" 2>/dev/null
|
||||
if [ -d "$tmpdir/.git" ]; then
|
||||
cd "$tmpdir"
|
||||
git fetch origin main 2>/dev/null
|
||||
if git rebase origin/main 2>/dev/null; then
|
||||
git push -f origin "$branch" 2>/dev/null
|
||||
sleep 3
|
||||
curl -sf -X POST -H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Do":"squash","delete_branch_after_merge":true}' \
|
||||
"${GITEA_URL}/api/v1/repos/Timmy_Foundation/the-nexus/pulls/${pr_num}/merge" >/dev/null 2>&1
|
||||
log "WORKER-${worker_id}: rebased+merged PR #${pr_num}"
|
||||
else
|
||||
git rebase --abort 2>/dev/null
|
||||
curl -sf -X PATCH -H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" -d '{"state":"closed"}' \
|
||||
"${GITEA_URL}/api/v1/repos/Timmy_Foundation/the-nexus/pulls/${pr_num}" >/dev/null 2>&1
|
||||
log "WORKER-${worker_id}: closed unrebaseable PR #${pr_num}"
|
||||
fi
|
||||
cd "$HOME"; rm -rf "$tmpdir"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# Get next issue
|
||||
issue_json=$(get_next_issue)
|
||||
|
||||
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
||||
update_active "$worker_id" "" "" "idle"
|
||||
sleep 10
|
||||
continue
|
||||
fi
|
||||
|
||||
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
||||
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
||||
repo_owner=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_owner'])")
|
||||
repo_name=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_name'])")
|
||||
issue_key="${repo_owner}-${repo_name}-${issue_num}"
|
||||
branch="claude/issue-${issue_num}"
|
||||
# Use UUID for worktree dir to prevent collisions under high concurrency
|
||||
wt_uuid=$(/usr/bin/uuidgen 2>/dev/null || python3 -c "import uuid; print(uuid.uuid4())")
|
||||
worktree="${WORKTREE_BASE}/claude-${issue_num}-${wt_uuid}"
|
||||
|
||||
# Try to lock
|
||||
if ! lock_issue "$issue_key"; then
|
||||
sleep 5
|
||||
continue
|
||||
fi
|
||||
|
||||
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
||||
|
||||
# Clone and pick up prior work if it exists
|
||||
rm -rf "$worktree" 2>/dev/null
|
||||
CLONE_URL="http://claude:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
||||
|
||||
# Check if branch already exists on remote (prior work to continue)
|
||||
if git ls-remote --heads "$CLONE_URL" "$branch" 2>/dev/null | grep -q "$branch"; then
|
||||
log "WORKER-${worker_id}: Found existing branch $branch — continuing prior work"
|
||||
if ! git clone --depth=50 -b "$branch" "$CLONE_URL" "$worktree" >/dev/null 2>&1; then
|
||||
log "WORKER-${worker_id}: ERROR cloning branch $branch for #${issue_num}"
|
||||
unlock_issue "$issue_key"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
sleep "$COOLDOWN"
|
||||
continue
|
||||
fi
|
||||
# Rebase on main to resolve stale conflicts from closed PRs
|
||||
cd "$worktree"
|
||||
git fetch origin main >/dev/null 2>&1
|
||||
if ! git rebase origin/main >/dev/null 2>&1; then
|
||||
# Rebase failed — start fresh from main
|
||||
log "WORKER-${worker_id}: Rebase failed for $branch, starting fresh"
|
||||
cd "$HOME"
|
||||
rm -rf "$worktree"
|
||||
git clone --depth=1 -b main "$CLONE_URL" "$worktree" >/dev/null 2>&1
|
||||
cd "$worktree"
|
||||
git checkout -b "$branch" >/dev/null 2>&1
|
||||
fi
|
||||
else
|
||||
if ! git clone --depth=1 -b main "$CLONE_URL" "$worktree" >/dev/null 2>&1; then
|
||||
log "WORKER-${worker_id}: ERROR cloning for #${issue_num}"
|
||||
unlock_issue "$issue_key"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
sleep "$COOLDOWN"
|
||||
continue
|
||||
fi
|
||||
cd "$worktree"
|
||||
git checkout -b "$branch" >/dev/null 2>&1
|
||||
fi
|
||||
cd "$worktree"
|
||||
|
||||
# Build prompt and run
|
||||
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
||||
|
||||
log "WORKER-${worker_id}: Launching Claude Code for #${issue_num}..."
|
||||
CYCLE_START=$(date +%s)
|
||||
|
||||
set +e
|
||||
cd "$worktree"
|
||||
env -u CLAUDECODE gtimeout "$CLAUDE_TIMEOUT" claude \
|
||||
--print \
|
||||
--model sonnet \
|
||||
--dangerously-skip-permissions \
|
||||
-p "$prompt" \
|
||||
</dev/null >> "$LOG_DIR/claude-${issue_num}.log" 2>&1
|
||||
exit_code=$?
|
||||
set -e
|
||||
|
||||
CYCLE_END=$(date +%s)
|
||||
CYCLE_DURATION=$(( CYCLE_END - CYCLE_START ))
|
||||
|
||||
# ── SALVAGE: Never waste work. Commit+push whatever exists. ──
|
||||
cd "$worktree" 2>/dev/null || true
|
||||
DIRTY=$(git status --porcelain 2>/dev/null | wc -l | tr -d ' ')
|
||||
UNPUSHED=$(git log --oneline "origin/main..HEAD" 2>/dev/null | wc -l | tr -d ' ')
|
||||
|
||||
if [ "${DIRTY:-0}" -gt 0 ]; then
|
||||
log "WORKER-${worker_id}: SALVAGING $DIRTY dirty files for #${issue_num}"
|
||||
git add -A 2>/dev/null
|
||||
git commit -m "WIP: Claude Code progress on #${issue_num}
|
||||
|
||||
Automated salvage commit — agent session ended (exit $exit_code).
|
||||
Work in progress, may need continuation." 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Push if we have any commits (including salvaged ones)
|
||||
UNPUSHED=$(git log --oneline "origin/main..HEAD" 2>/dev/null | wc -l | tr -d ' ')
|
||||
if [ "${UNPUSHED:-0}" -gt 0 ]; then
|
||||
git push -u origin "$branch" 2>/dev/null && \
|
||||
log "WORKER-${worker_id}: Pushed $UNPUSHED commit(s) on $branch" || \
|
||||
log "WORKER-${worker_id}: Push failed for $branch"
|
||||
fi
|
||||
|
||||
# ── Create PR if branch was pushed and no PR exists yet ──
|
||||
pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=open&head=${repo_owner}:${branch}&limit=1" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||
import sys,json
|
||||
prs = json.load(sys.stdin)
|
||||
if prs: print(prs[0]['number'])
|
||||
else: print('')
|
||||
" 2>/dev/null)
|
||||
|
||||
if [ -z "$pr_num" ] && [ "${UNPUSHED:-0}" -gt 0 ]; then
|
||||
pr_num=$(curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$(python3 -c "
|
||||
import json
|
||||
print(json.dumps({
|
||||
'title': 'Claude: Issue #${issue_num}',
|
||||
'head': '${branch}',
|
||||
'base': 'main',
|
||||
'body': 'Automated PR for issue #${issue_num}.\nExit code: ${exit_code}'
|
||||
}))
|
||||
")" | python3 -c "import sys,json; print(json.load(sys.stdin).get('number',''))" 2>/dev/null)
|
||||
[ -n "$pr_num" ] && log "WORKER-${worker_id}: Created PR #${pr_num} for issue #${issue_num}"
|
||||
fi
|
||||
|
||||
# ── Genchi Genbutsu: verify world state before declaring success ──
|
||||
VERIFIED="false"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
log "WORKER-${worker_id}: SUCCESS #${issue_num} — running genchi-genbutsu"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
if verify_result=$("$SCRIPT_DIR/genchi-genbutsu.sh" "$repo_owner" "$repo_name" "$issue_num" "$branch" "claude" 2>/dev/null); then
|
||||
VERIFIED="true"
|
||||
log "WORKER-${worker_id}: VERIFIED #${issue_num}"
|
||||
if [ -n "$pr_num" ]; then
|
||||
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/merge" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Do": "squash"}' >/dev/null 2>&1 || true
|
||||
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
||||
log "WORKER-${worker_id}: PR #${pr_num} merged, issue #${issue_num} closed"
|
||||
fi
|
||||
consecutive_failures=0
|
||||
else
|
||||
verify_details=$(echo "$verify_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('details','unknown'))" 2>/dev/null || echo "unverified")
|
||||
log "WORKER-${worker_id}: UNVERIFIED #${issue_num} — $verify_details"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
|
||||
else
|
||||
# Check for rate limit
|
||||
if grep -q "rate_limit\|rate limit\|429\|overloaded" "$LOG_DIR/claude-${issue_num}.log" 2>/dev/null; then
|
||||
log "WORKER-${worker_id}: RATE LIMITED on #${issue_num} — backing off (work saved)"
|
||||
consecutive_failures=$((consecutive_failures + 3))
|
||||
else
|
||||
log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── METRICS: structured JSONL for reporting ──
|
||||
LINES_ADDED=$(cd "$worktree" 2>/dev/null && git diff --stat origin/main..HEAD 2>/dev/null | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo 0)
|
||||
LINES_REMOVED=$(cd "$worktree" 2>/dev/null && git diff --stat origin/main..HEAD 2>/dev/null | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo 0)
|
||||
FILES_CHANGED=$(cd "$worktree" 2>/dev/null && git diff --name-only origin/main..HEAD 2>/dev/null | wc -l | tr -d ' ' || echo 0)
|
||||
|
||||
# Determine outcome
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
OUTCOME="success"
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
OUTCOME="timeout"
|
||||
elif grep -q "rate_limit\|rate limit\|429" "$LOG_DIR/claude-${issue_num}.log" 2>/dev/null; then
|
||||
OUTCOME="rate_limited"
|
||||
else
|
||||
OUTCOME="failed"
|
||||
fi
|
||||
|
||||
METRICS_FILE="$LOG_DIR/claude-metrics.jsonl"
|
||||
python3 -c "
|
||||
import json, datetime
|
||||
print(json.dumps({
|
||||
'ts': datetime.datetime.utcnow().isoformat() + 'Z',
|
||||
'agent': 'claude',
|
||||
'worker': $worker_id,
|
||||
'issue': $issue_num,
|
||||
'repo': '${repo_owner}/${repo_name}',
|
||||
'title': '''${issue_title}'''[:80],
|
||||
'outcome': '$OUTCOME',
|
||||
'exit_code': $exit_code,
|
||||
'duration_s': $CYCLE_DURATION,
|
||||
'files_changed': ${FILES_CHANGED:-0},
|
||||
'lines_added': ${LINES_ADDED:-0},
|
||||
'lines_removed': ${LINES_REMOVED:-0},
|
||||
'salvaged': ${DIRTY:-0},
|
||||
'pr': '${pr_num:-}',
|
||||
'merged': $( [ '$OUTCOME' = 'success' ] && [ -n '${pr_num:-}' ] && echo 'true' || echo 'false' ),
|
||||
'verified': ${VERIFIED:-false}
|
||||
}))
|
||||
" >> "$METRICS_FILE" 2>/dev/null
|
||||
|
||||
# Cleanup
|
||||
cleanup_workdir "$worktree"
|
||||
unlock_issue "$issue_key"
|
||||
update_active "$worker_id" "" "" "done"
|
||||
|
||||
sleep "$COOLDOWN"
|
||||
done
|
||||
}
|
||||
|
||||
# === MAIN ===
|
||||
log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
|
||||
log "Worktrees: ${WORKTREE_BASE}"
|
||||
|
||||
# Clean stale locks
|
||||
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||
|
||||
# PID tracking via files (bash 3.2 compatible)
|
||||
PID_DIR="$LOG_DIR/claude-pids"
|
||||
mkdir -p "$PID_DIR"
|
||||
rm -f "$PID_DIR"/*.pid 2>/dev/null
|
||||
|
||||
launch_worker() {
|
||||
local wid="$1"
|
||||
run_worker "$wid" &
|
||||
echo $! > "$PID_DIR/${wid}.pid"
|
||||
log "Launched worker $wid (PID $!)"
|
||||
}
|
||||
|
||||
# Initial launch
|
||||
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||
launch_worker "$i"
|
||||
sleep 3
|
||||
done
|
||||
|
||||
# === DYNAMIC SCALER ===
|
||||
# Every 3 minutes: check health, scale up if no rate limits, scale down if hitting limits
|
||||
CURRENT_WORKERS="$NUM_WORKERS"
|
||||
while true; do
|
||||
sleep 90
|
||||
|
||||
# Reap dead workers and relaunch
|
||||
for pidfile in "$PID_DIR"/*.pid; do
|
||||
[ -f "$pidfile" ] || continue
|
||||
wid=$(basename "$pidfile" .pid)
|
||||
wpid=$(cat "$pidfile")
|
||||
if ! kill -0 "$wpid" 2>/dev/null; then
|
||||
log "SCALER: Worker $wid died — relaunching"
|
||||
launch_worker "$wid"
|
||||
sleep 2
|
||||
fi
|
||||
done
|
||||
|
||||
recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
|
||||
recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
|
||||
|
||||
if [ "$recent_rate_limits" -gt 0 ]; then
|
||||
if [ "$CURRENT_WORKERS" -gt 2 ]; then
|
||||
drop_to=$(( CURRENT_WORKERS / 2 ))
|
||||
[ "$drop_to" -lt 2 ] && drop_to=2
|
||||
log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to} workers"
|
||||
for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
|
||||
if [ -f "$PID_DIR/${wid}.pid" ]; then
|
||||
kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
|
||||
rm -f "$PID_DIR/${wid}.pid"
|
||||
update_active "$wid" "" "" "done"
|
||||
fi
|
||||
done
|
||||
CURRENT_WORKERS=$drop_to
|
||||
fi
|
||||
elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
|
||||
new_count=$(( CURRENT_WORKERS + 2 ))
|
||||
[ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
|
||||
log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count} workers"
|
||||
for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
|
||||
launch_worker "$wid"
|
||||
sleep 2
|
||||
done
|
||||
CURRENT_WORKERS=$new_count
|
||||
fi
|
||||
done
|
||||
94
bin/claudemax-watchdog.sh
Executable file
94
bin/claudemax-watchdog.sh
Executable file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env bash
|
||||
# claudemax-watchdog.sh — keep local Claude/Gemini loops alive without stale tmux assumptions
|
||||
|
||||
set -uo pipefail
|
||||
export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"
|
||||
|
||||
LOG="$HOME/.hermes/logs/claudemax-watchdog.log"
|
||||
GITEA_URL="https://forge.alexanderwhitestone.com"
|
||||
GITEA_TOKEN=$(tr -d '[:space:]' < "$HOME/.hermes/gitea_token_vps" 2>/dev/null || true)
|
||||
REPO_API="$GITEA_URL/api/v1/repos/Timmy_Foundation/the-nexus"
|
||||
MIN_OPEN_ISSUES=10
|
||||
CLAUDE_WORKERS=2
|
||||
GEMINI_WORKERS=1
|
||||
|
||||
log() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] CLAUDEMAX: $*" >> "$LOG"
|
||||
}
|
||||
|
||||
start_loop() {
|
||||
local name="$1"
|
||||
local pattern="$2"
|
||||
local cmd="$3"
|
||||
local pid
|
||||
|
||||
pid=$(pgrep -f "$pattern" 2>/dev/null | head -1 || true)
|
||||
if [ -n "$pid" ]; then
|
||||
log "$name alive (PID $pid)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
log "$name not running. Restarting..."
|
||||
nohup bash -lc "$cmd" >/dev/null 2>&1 &
|
||||
sleep 2
|
||||
|
||||
pid=$(pgrep -f "$pattern" 2>/dev/null | head -1 || true)
|
||||
if [ -n "$pid" ]; then
|
||||
log "Restarted $name (PID $pid)"
|
||||
else
|
||||
log "ERROR: failed to start $name"
|
||||
fi
|
||||
}
|
||||
|
||||
run_optional_script() {
|
||||
local label="$1"
|
||||
local script_path="$2"
|
||||
|
||||
if [ -x "$script_path" ]; then
|
||||
bash "$script_path" 2>&1 | while read -r line; do
|
||||
log "$line"
|
||||
done
|
||||
else
|
||||
log "$label skipped — missing $script_path"
|
||||
fi
|
||||
}
|
||||
|
||||
claude_quota_blocked() {
|
||||
local cutoff now mtime f
|
||||
now=$(date +%s)
|
||||
cutoff=$((now - 43200))
|
||||
for f in "$HOME"/.hermes/logs/claude-*.log; do
|
||||
[ -f "$f" ] || continue
|
||||
mtime=$(stat -f %m "$f" 2>/dev/null || echo 0)
|
||||
if [ "$mtime" -ge "$cutoff" ] && grep -q "You've hit your limit" "$f" 2>/dev/null; then
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
if [ -z "$GITEA_TOKEN" ]; then
|
||||
log "ERROR: missing Gitea token at ~/.hermes/gitea_token_vps"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if claude_quota_blocked; then
|
||||
log "Claude quota exhausted recently — not starting claude-loop until quota resets or logs age out"
|
||||
else
|
||||
start_loop "claude-loop" "bash .*claude-loop.sh" "bash ~/.hermes/bin/claude-loop.sh $CLAUDE_WORKERS >> ~/.hermes/logs/claude-loop.log 2>&1"
|
||||
fi
|
||||
start_loop "gemini-loop" "bash .*gemini-loop.sh" "bash ~/.hermes/bin/gemini-loop.sh $GEMINI_WORKERS >> ~/.hermes/logs/gemini-loop.log 2>&1"
|
||||
|
||||
OPEN_COUNT=$(curl -s --max-time 10 -H "Authorization: token $GITEA_TOKEN" \
|
||||
"$REPO_API/issues?state=open&type=issues&limit=100" 2>/dev/null \
|
||||
| python3 -c "import sys, json; print(len(json.loads(sys.stdin.read() or '[]')))" 2>/dev/null || echo 0)
|
||||
|
||||
log "Open issues: $OPEN_COUNT (minimum: $MIN_OPEN_ISSUES)"
|
||||
|
||||
if [ "$OPEN_COUNT" -lt "$MIN_OPEN_ISSUES" ]; then
|
||||
log "Backlog running low. Checking replenishment helper..."
|
||||
run_optional_script "claudemax-replenish" "$HOME/.hermes/bin/claudemax-replenish.sh"
|
||||
fi
|
||||
|
||||
run_optional_script "autodeploy-matrix" "$HOME/.hermes/bin/autodeploy-matrix.sh"
|
||||
log "Watchdog complete."
|
||||
@@ -9,7 +9,7 @@ THRESHOLD_HOURS="${1:-2}"
|
||||
THRESHOLD_SECS=$((THRESHOLD_HOURS * 3600))
|
||||
LOG_DIR="$HOME/.hermes/logs"
|
||||
LOG_FILE="$LOG_DIR/deadman.log"
|
||||
GITEA_URL="http://143.198.27.163:3000"
|
||||
GITEA_URL="https://forge.alexanderwhitestone.com"
|
||||
GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null || echo "")
|
||||
TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" 2>/dev/null || echo "")
|
||||
TELEGRAM_CHAT="-1003664764329"
|
||||
|
||||
@@ -25,10 +25,35 @@ else
|
||||
fi
|
||||
|
||||
# ── Config ──
|
||||
GITEA_TOKEN=$(cat ~/.hermes/gitea_token_vps 2>/dev/null)
|
||||
GITEA_API="http://143.198.27.163:3000/api/v1"
|
||||
EZRA_HOST="root@143.198.27.163"
|
||||
BEZALEL_HOST="root@67.205.155.108"
|
||||
GITEA_TOKEN=$(cat ~/.hermes/gitea_token_vps 2>/dev/null || echo "")
|
||||
GITEA_API="https://forge.alexanderwhitestone.com/api/v1"
|
||||
|
||||
# Resolve Tailscale IPs dynamically; fallback to env vars
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
RESOLVER="${SCRIPT_DIR}/../tools/tailscale_ip_resolver.py"
|
||||
if [ ! -f "$RESOLVER" ]; then
|
||||
RESOLVER="/root/wizards/ezra/tools/tailscale_ip_resolver.py"
|
||||
fi
|
||||
|
||||
resolve_host() {
|
||||
local default_ip="$1"
|
||||
if [ -n "$TAILSCALE_IP" ]; then
|
||||
echo "root@${TAILSCALE_IP}"
|
||||
return
|
||||
fi
|
||||
if [ -f "$RESOLVER" ]; then
|
||||
local ip
|
||||
ip=$(python3 "$RESOLVER" 2>/dev/null)
|
||||
if [ -n "$ip" ]; then
|
||||
echo "root@${ip}"
|
||||
return
|
||||
fi
|
||||
fi
|
||||
echo "root@${default_ip}"
|
||||
}
|
||||
|
||||
EZRA_HOST=$(resolve_host "143.198.27.163")
|
||||
BEZALEL_HOST="root@${BEZALEL_TAILSCALE_IP:-67.205.155.108}"
|
||||
SSH_OPTS="-o ConnectTimeout=4 -o StrictHostKeyChecking=no -o BatchMode=yes"
|
||||
|
||||
ANY_DOWN=0
|
||||
@@ -154,7 +179,7 @@ fi
|
||||
|
||||
print_line "Timmy" "$TIMMY_STATUS" "$TIMMY_MODEL" "$TIMMY_ACTIVITY"
|
||||
|
||||
# ── 2. Ezra (VPS 143.198.27.163) ──
|
||||
# ── 2. Ezra ──
|
||||
EZRA_STATUS="DOWN"
|
||||
EZRA_MODEL="hermes-ezra"
|
||||
EZRA_ACTIVITY=""
|
||||
@@ -186,7 +211,7 @@ fi
|
||||
|
||||
print_line "Ezra" "$EZRA_STATUS" "$EZRA_MODEL" "$EZRA_ACTIVITY"
|
||||
|
||||
# ── 3. Bezalel (VPS 67.205.155.108) ──
|
||||
# ── 3. Bezalel ──
|
||||
BEZ_STATUS="DOWN"
|
||||
BEZ_MODEL="hermes-bezalel"
|
||||
BEZ_ACTIVITY=""
|
||||
@@ -246,7 +271,7 @@ if [ -n "$GITEA_VER" ]; then
|
||||
GITEA_STATUS="UP"
|
||||
VER=$(python3 -c "import json; print(json.loads('''${GITEA_VER}''').get('version','?'))" 2>/dev/null)
|
||||
GITEA_MODEL="gitea v${VER}"
|
||||
GITEA_ACTIVITY="143.198.27.163:3000"
|
||||
GITEA_ACTIVITY="forge.alexanderwhitestone.com"
|
||||
else
|
||||
GITEA_STATUS="DOWN"
|
||||
GITEA_MODEL="gitea(unreachable)"
|
||||
|
||||
706
bin/gemini-loop.sh
Executable file
706
bin/gemini-loop.sh
Executable file
@@ -0,0 +1,706 @@
|
||||
#!/usr/bin/env bash
|
||||
# gemini-loop.sh — Parallel Gemini Code agent dispatch loop
|
||||
# Runs N workers concurrently against the Gitea backlog.
|
||||
# Dynamic scaling: starts at N, scales up to MAX, drops on rate limits.
|
||||
#
|
||||
# Usage: gemini-loop.sh [NUM_WORKERS] (default: 2)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
GEMINI_KEY_FILE="${GEMINI_KEY_FILE:-$HOME/.timmy/gemini_free_tier_key}"
|
||||
if [ -f "$GEMINI_KEY_FILE" ]; then
|
||||
export GEMINI_API_KEY="$(python3 - "$GEMINI_KEY_FILE" <<'PY'
|
||||
from pathlib import Path
|
||||
import sys
|
||||
text = Path(sys.argv[1]).read_text(errors='ignore').splitlines()
|
||||
for line in text:
|
||||
line=line.strip()
|
||||
if line:
|
||||
print(line)
|
||||
break
|
||||
PY
|
||||
)"
|
||||
fi
|
||||
|
||||
# === CONFIG ===
|
||||
NUM_WORKERS="${1:-2}"
|
||||
MAX_WORKERS=5
|
||||
WORKTREE_BASE="$HOME/worktrees"
|
||||
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
|
||||
GITEA_TOKEN=$(cat "$HOME/.hermes/gemini_token")
|
||||
GEMINI_TIMEOUT=600 # 10 min per issue
|
||||
COOLDOWN=15 # seconds between issues — stagger clones
|
||||
RATE_LIMIT_SLEEP=30
|
||||
MAX_RATE_SLEEP=120
|
||||
LOG_DIR="$HOME/.hermes/logs"
|
||||
SKIP_FILE="$LOG_DIR/gemini-skip-list.json"
|
||||
LOCK_DIR="$LOG_DIR/gemini-locks"
|
||||
ACTIVE_FILE="$LOG_DIR/gemini-active.json"
|
||||
ALLOW_SELF_ASSIGN="${ALLOW_SELF_ASSIGN:-0}" # 0 = only explicitly-assigned Gemini work
|
||||
AUTH_INVALID_SLEEP=900
|
||||
|
||||
mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR"
|
||||
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
||||
echo '{}' > "$ACTIVE_FILE"
|
||||
|
||||
# === SHARED FUNCTIONS ===
|
||||
log() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_DIR/gemini-loop.log"
|
||||
}
|
||||
|
||||
post_issue_comment() {
|
||||
local repo_owner="$1" repo_name="$2" issue_num="$3" body="$4"
|
||||
local payload
|
||||
payload=$(python3 - "$body" <<'PY'
|
||||
import json, sys
|
||||
print(json.dumps({"body": sys.argv[1]}))
|
||||
PY
|
||||
)
|
||||
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" -d "$payload" >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
remote_branch_exists() {
|
||||
local branch="$1"
|
||||
git ls-remote --heads origin "$branch" 2>/dev/null | grep -q .
|
||||
}
|
||||
|
||||
get_pr_num() {
|
||||
local repo_owner="$1" repo_name="$2" branch="$3"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=all&head=${repo_owner}:${branch}&limit=1" -H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||
import sys,json
|
||||
prs = json.load(sys.stdin)
|
||||
if prs: print(prs[0]['number'])
|
||||
else: print('')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
get_pr_file_count() {
|
||||
local repo_owner="$1" repo_name="$2" pr_num="$3"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/files" -H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||
import sys, json
|
||||
try:
|
||||
files = json.load(sys.stdin)
|
||||
print(len(files) if isinstance(files, list) else 0)
|
||||
except:
|
||||
print(0)
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
get_pr_state() {
|
||||
local repo_owner="$1" repo_name="$2" pr_num="$3"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}" -H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||
import sys, json
|
||||
try:
|
||||
pr = json.load(sys.stdin)
|
||||
if pr.get('merged'):
|
||||
print('merged')
|
||||
else:
|
||||
print(pr.get('state', 'unknown'))
|
||||
except:
|
||||
print('unknown')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
get_issue_state() {
|
||||
local repo_owner="$1" repo_name="$2" issue_num="$3"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" -H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||
import sys, json
|
||||
try:
|
||||
issue = json.load(sys.stdin)
|
||||
print(issue.get('state', 'unknown'))
|
||||
except:
|
||||
print('unknown')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
proof_comment_status() {
|
||||
local repo_owner="$1" repo_name="$2" issue_num="$3" branch="$4"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" -H "Authorization: token ${GITEA_TOKEN}" | BRANCH="$branch" python3 -c "
|
||||
import os, sys, json
|
||||
branch = os.environ.get('BRANCH', '').lower()
|
||||
try:
|
||||
comments = json.load(sys.stdin)
|
||||
except Exception:
|
||||
print('missing|')
|
||||
raise SystemExit(0)
|
||||
for c in reversed(comments):
|
||||
user = ((c.get('user') or {}).get('login') or '').lower()
|
||||
body = c.get('body') or ''
|
||||
body_l = body.lower()
|
||||
if user != 'gemini':
|
||||
continue
|
||||
if 'proof:' not in body_l and 'verification:' not in body_l:
|
||||
continue
|
||||
has_branch = branch in body_l
|
||||
has_pr = ('pr:' in body_l) or ('pull request:' in body_l) or ('/pulls/' in body_l)
|
||||
has_push = ('push:' in body_l) or ('pushed' in body_l)
|
||||
has_verify = ('tox' in body_l) or ('pytest' in body_l) or ('verification:' in body_l) or ('npm test' in body_l)
|
||||
status = 'ok' if (has_branch and has_pr and has_push and has_verify) else 'incomplete'
|
||||
print(status + '|' + (c.get('html_url') or ''))
|
||||
raise SystemExit(0)
|
||||
print('missing|')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
gemini_auth_invalid() {
|
||||
local issue_num="$1"
|
||||
grep -q "API_KEY_INVALID\|API key expired" "$LOG_DIR/gemini-${issue_num}.log" 2>/dev/null
|
||||
}
|
||||
|
||||
issue_is_code_fit() {
|
||||
local title="$1"
|
||||
local labels="$2"
|
||||
local body="$3"
|
||||
local haystack
|
||||
haystack="${title} ${labels} ${body}"
|
||||
local low="${haystack,,}"
|
||||
|
||||
if [[ "$low" == *"[morning report]"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"[kt]"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"policy:"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"incident:"* || "$low" == *"🚨 incident"* || "$low" == *"[incident]"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"fleet lexicon"* || "$low" == *"shared vocabulary"* || "$low" == *"rubric"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"archive ghost"* || "$low" == *"reassign"* || "$low" == *"offload"* || "$low" == *"burn directive"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"review all open prs"* ]]; then return 1; fi
|
||||
if [[ "$low" == *"epic"* ]]; then return 1; fi
|
||||
return 0
|
||||
}
|
||||
|
||||
lock_issue() {
|
||||
local issue_key="$1"
|
||||
local lockfile="$LOCK_DIR/$issue_key.lock"
|
||||
if mkdir "$lockfile" 2>/dev/null; then
|
||||
echo $$ > "$lockfile/pid"
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
unlock_issue() {
|
||||
rm -rf "$LOCK_DIR/$1.lock" 2>/dev/null
|
||||
}
|
||||
|
||||
mark_skip() {
|
||||
local issue_num="$1" reason="$2" skip_hours="${3:-1}"
|
||||
python3 -c "
|
||||
import json, time, fcntl
|
||||
with open('$SKIP_FILE', 'r+') as f:
|
||||
fcntl.flock(f, fcntl.LOCK_EX)
|
||||
try: skips = json.load(f)
|
||||
except: skips = {}
|
||||
skips[str($issue_num)] = {
|
||||
'until': time.time() + ($skip_hours * 3600),
|
||||
'reason': '$reason',
|
||||
'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1
|
||||
}
|
||||
if skips[str($issue_num)]['failures'] >= 3:
|
||||
skips[str($issue_num)]['until'] = time.time() + (6 * 3600)
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
json.dump(skips, f, indent=2)
|
||||
" 2>/dev/null
|
||||
log "SKIP: #${issue_num} — ${reason}"
|
||||
}
|
||||
|
||||
update_active() {
|
||||
local worker="$1" issue="$2" repo="$3" status="$4"
|
||||
python3 -c "
|
||||
import json, fcntl
|
||||
with open('$ACTIVE_FILE', 'r+') as f:
|
||||
fcntl.flock(f, fcntl.LOCK_EX)
|
||||
try: active = json.load(f)
|
||||
except: active = {}
|
||||
if '$status' == 'done':
|
||||
active.pop('$worker', None)
|
||||
else:
|
||||
active['$worker'] = {'issue': '$issue', 'repo': '$repo', 'status': '$status'}
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
json.dump(active, f, indent=2)
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
cleanup_workdir() {
|
||||
local wt="$1"
|
||||
cd "$HOME" 2>/dev/null || true
|
||||
rm -rf "$wt" 2>/dev/null || true
|
||||
}
|
||||
|
||||
get_next_issue() {
|
||||
python3 -c "
|
||||
import json, sys, time, urllib.request, os
|
||||
|
||||
token = '${GITEA_TOKEN}'
|
||||
base = '${GITEA_URL}'
|
||||
repos = [
|
||||
'Timmy_Foundation/the-nexus',
|
||||
'Timmy_Foundation/timmy-home',
|
||||
'Timmy_Foundation/timmy-config',
|
||||
'Timmy_Foundation/hermes-agent',
|
||||
]
|
||||
allow_self_assign = int('${ALLOW_SELF_ASSIGN}')
|
||||
|
||||
try:
|
||||
with open('${SKIP_FILE}') as f: skips = json.load(f)
|
||||
except: skips = {}
|
||||
|
||||
try:
|
||||
with open('${ACTIVE_FILE}') as f:
|
||||
active = json.load(f)
|
||||
active_issues = {v['issue'] for v in active.values()}
|
||||
except:
|
||||
active_issues = set()
|
||||
|
||||
all_issues = []
|
||||
for repo in repos:
|
||||
url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created'
|
||||
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||
try:
|
||||
resp = urllib.request.urlopen(req, timeout=10)
|
||||
issues = json.loads(resp.read())
|
||||
for i in issues:
|
||||
i['_repo'] = repo
|
||||
all_issues.extend(issues)
|
||||
except:
|
||||
continue
|
||||
|
||||
def priority(i):
|
||||
t = i['title'].lower()
|
||||
if '[urgent]' in t or 'urgent:' in t: return 0
|
||||
if '[p0]' in t: return 1
|
||||
if '[p1]' in t: return 2
|
||||
if '[bug]' in t: return 3
|
||||
if 'lhf:' in t or 'lhf ' in t: return 4
|
||||
if '[p2]' in t: return 5
|
||||
return 6
|
||||
|
||||
all_issues.sort(key=priority)
|
||||
|
||||
for i in all_issues:
|
||||
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
||||
# Default-safe behavior: only take explicitly assigned Gemini work.
|
||||
# Self-assignment is opt-in via ALLOW_SELF_ASSIGN=1.
|
||||
if assignees:
|
||||
if 'gemini' not in assignees:
|
||||
continue
|
||||
elif not allow_self_assign:
|
||||
continue
|
||||
|
||||
title = i['title'].lower()
|
||||
labels = [l['name'].lower() for l in (i.get('labels') or [])]
|
||||
body = (i.get('body') or '').lower()
|
||||
if '[philosophy]' in title: continue
|
||||
if '[epic]' in title or 'epic:' in title: continue
|
||||
if 'epic' in labels: continue
|
||||
if '[showcase]' in title: continue
|
||||
if '[do not close' in title: continue
|
||||
if '[meta]' in title: continue
|
||||
if '[governing]' in title: continue
|
||||
if '[permanent]' in title: continue
|
||||
if '[morning report]' in title: continue
|
||||
if '[retro]' in title: continue
|
||||
if '[intel]' in title: continue
|
||||
if '[kt]' in title: continue
|
||||
if 'policy:' in title: continue
|
||||
if 'incident' in title: continue
|
||||
if 'lexicon' in title or 'shared vocabulary' in title or 'rubric' in title: continue
|
||||
if 'archive ghost' in title or 'reassign' in title or 'offload' in title: continue
|
||||
if 'master escalation' in title: continue
|
||||
if any(a['login'] == 'Rockachopa' for a in (i.get('assignees') or [])): continue
|
||||
|
||||
num_str = str(i['number'])
|
||||
if num_str in active_issues: continue
|
||||
|
||||
entry = skips.get(num_str, {})
|
||||
if entry and entry.get('until', 0) > time.time(): continue
|
||||
|
||||
lock = '${LOCK_DIR}/' + i['_repo'].replace('/', '-') + '-' + num_str + '.lock'
|
||||
if os.path.isdir(lock): continue
|
||||
|
||||
repo = i['_repo']
|
||||
owner, name = repo.split('/')
|
||||
|
||||
# Self-assign only when explicitly enabled.
|
||||
if not assignees and allow_self_assign:
|
||||
try:
|
||||
data = json.dumps({'assignees': ['gemini']}).encode()
|
||||
req2 = urllib.request.Request(
|
||||
f'{base}/api/v1/repos/{repo}/issues/{i["number"]}',
|
||||
data=data, method='PATCH',
|
||||
headers={'Authorization': f'token {token}', 'Content-Type': 'application/json'})
|
||||
urllib.request.urlopen(req2, timeout=5)
|
||||
except: pass
|
||||
|
||||
print(json.dumps({
|
||||
'number': i['number'],
|
||||
'title': i['title'],
|
||||
'repo_owner': owner,
|
||||
'repo_name': name,
|
||||
'repo': repo,
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
print('null')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
build_prompt() {
|
||||
local issue_num="$1" issue_title="$2" worktree="$3" repo_owner="$4" repo_name="$5"
|
||||
cat <<PROMPT
|
||||
You are Gemini, an autonomous code agent on the ${repo_name} project.
|
||||
|
||||
YOUR ISSUE: #${issue_num} — "${issue_title}"
|
||||
|
||||
GITEA API: ${GITEA_URL}/api/v1
|
||||
GITEA TOKEN: ${GITEA_TOKEN}
|
||||
REPO: ${repo_owner}/${repo_name}
|
||||
WORKING DIRECTORY: ${worktree}
|
||||
|
||||
== YOUR POWERS ==
|
||||
You can do ANYTHING a developer can do.
|
||||
|
||||
1. READ the issue and any comments for context:
|
||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}"
|
||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments"
|
||||
|
||||
2. DO THE WORK. Code, test, fix, refactor — whatever the issue needs.
|
||||
- Check for tox.ini / Makefile / package.json for test/lint commands
|
||||
- Run tests if the project has them
|
||||
- Follow existing code conventions
|
||||
|
||||
3. COMMIT with conventional commits: fix: / feat: / refactor: / test: / chore:
|
||||
Include "Fixes #${issue_num}" or "Refs #${issue_num}" in the message.
|
||||
|
||||
4. PUSH to your branch (gemini/issue-${issue_num}) and CREATE A PR:
|
||||
git push origin gemini/issue-${issue_num}
|
||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" \\
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{"title": "[gemini] <description> (#${issue_num})", "body": "Fixes #${issue_num}\n\n<describe what you did>", "head": "gemini/issue-${issue_num}", "base": "main"}'
|
||||
|
||||
5. COMMENT on the issue when done:
|
||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \\
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{"body": "PR created. <summary of changes>"}'
|
||||
|
||||
== RULES ==
|
||||
- Read CLAUDE.md or project README first for conventions
|
||||
- If the project has tox, use tox. If npm, use npm. Follow the project.
|
||||
- Never use --no-verify on git commands.
|
||||
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
|
||||
- Be thorough but focused. Fix the issue, don't refactor the world.
|
||||
|
||||
== CRITICAL: FINISH = PUSHED + PR'D + PROVED ==
|
||||
- NEVER exit without committing your work. Even partial progress MUST be committed.
|
||||
- Before you finish, ALWAYS: git add -A && git commit && git push origin gemini/issue-${issue_num}
|
||||
- ALWAYS create a PR before exiting. No exceptions.
|
||||
- ALWAYS post the Proof block before exiting. No proof comment = not done.
|
||||
- If a branch already exists with prior work, check it out and CONTINUE from where it left off.
|
||||
- Check: git ls-remote origin gemini/issue-${issue_num} — if it exists, pull it first.
|
||||
- Your work is WASTED if it's not pushed. Push early, push often.
|
||||
PROMPT
|
||||
}
|
||||
|
||||
# === WORKER FUNCTION ===
|
||||
run_worker() {
|
||||
local worker_id="$1"
|
||||
local consecutive_failures=0
|
||||
|
||||
log "WORKER-${worker_id}: Started"
|
||||
|
||||
while true; do
|
||||
if [ "$consecutive_failures" -ge 5 ]; then
|
||||
local backoff=$((RATE_LIMIT_SLEEP * (consecutive_failures / 5)))
|
||||
[ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff=$MAX_RATE_SLEEP
|
||||
log "WORKER-${worker_id}: BACKOFF ${backoff}s (${consecutive_failures} failures)"
|
||||
sleep "$backoff"
|
||||
consecutive_failures=0
|
||||
fi
|
||||
|
||||
issue_json=$(get_next_issue)
|
||||
|
||||
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
||||
update_active "$worker_id" "" "" "idle"
|
||||
sleep 10
|
||||
continue
|
||||
fi
|
||||
|
||||
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
||||
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
||||
repo_owner=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_owner'])")
|
||||
repo_name=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_name'])")
|
||||
issue_key="${repo_owner}-${repo_name}-${issue_num}"
|
||||
branch="gemini/issue-${issue_num}"
|
||||
worktree="${WORKTREE_BASE}/gemini-w${worker_id}-${issue_num}"
|
||||
|
||||
if ! lock_issue "$issue_key"; then
|
||||
sleep 5
|
||||
continue
|
||||
fi
|
||||
|
||||
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
||||
|
||||
# Clone and pick up prior work if it exists
|
||||
rm -rf "$worktree" 2>/dev/null
|
||||
CLONE_URL="http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
||||
|
||||
if git ls-remote --heads "$CLONE_URL" "$branch" 2>/dev/null | grep -q "$branch"; then
|
||||
log "WORKER-${worker_id}: Found existing branch $branch — continuing prior work"
|
||||
if ! git clone --depth=50 -b "$branch" "$CLONE_URL" "$worktree" >/dev/null 2>&1; then
|
||||
log "WORKER-${worker_id}: ERROR cloning branch $branch for #${issue_num}"
|
||||
unlock_issue "$issue_key"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
sleep "$COOLDOWN"
|
||||
continue
|
||||
fi
|
||||
else
|
||||
if ! git clone --depth=1 -b main "$CLONE_URL" "$worktree" >/dev/null 2>&1; then
|
||||
log "WORKER-${worker_id}: ERROR cloning for #${issue_num}"
|
||||
unlock_issue "$issue_key"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
sleep "$COOLDOWN"
|
||||
continue
|
||||
fi
|
||||
cd "$worktree"
|
||||
git checkout -b "$branch" >/dev/null 2>&1
|
||||
fi
|
||||
cd "$worktree"
|
||||
|
||||
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
||||
|
||||
log "WORKER-${worker_id}: Launching Gemini Code for #${issue_num}..."
|
||||
CYCLE_START=$(date +%s)
|
||||
|
||||
set +e
|
||||
cd "$worktree"
|
||||
gtimeout "$GEMINI_TIMEOUT" gemini \
|
||||
-p "$prompt" \
|
||||
--yolo \
|
||||
</dev/null >> "$LOG_DIR/gemini-${issue_num}.log" 2>&1
|
||||
exit_code=$?
|
||||
set -e
|
||||
|
||||
CYCLE_END=$(date +%s)
|
||||
CYCLE_DURATION=$(( CYCLE_END - CYCLE_START ))
|
||||
|
||||
# ── SALVAGE: Never waste work. Commit+push whatever exists. ──
|
||||
cd "$worktree" 2>/dev/null || true
|
||||
DIRTY=$(git status --porcelain 2>/dev/null | wc -l | tr -d ' ')
|
||||
|
||||
if [ "${DIRTY:-0}" -gt 0 ]; then
|
||||
log "WORKER-${worker_id}: SALVAGING $DIRTY dirty files for #${issue_num}"
|
||||
git add -A 2>/dev/null
|
||||
git commit -m "WIP: Gemini Code progress on #${issue_num}
|
||||
|
||||
Automated salvage commit — agent session ended (exit $exit_code).
|
||||
Work in progress, may need continuation." 2>/dev/null || true
|
||||
fi
|
||||
|
||||
UNPUSHED=$(git log --oneline "origin/main..HEAD" 2>/dev/null | wc -l | tr -d ' ')
|
||||
if [ "${UNPUSHED:-0}" -gt 0 ]; then
|
||||
git push -u origin "$branch" 2>/dev/null && \
|
||||
log "WORKER-${worker_id}: Pushed $UNPUSHED commit(s) on $branch" || \
|
||||
log "WORKER-${worker_id}: Push failed for $branch"
|
||||
fi
|
||||
|
||||
# ── Create PR if needed ──
|
||||
pr_num=$(get_pr_num "$repo_owner" "$repo_name" "$branch")
|
||||
|
||||
if [ -z "$pr_num" ] && [ "${UNPUSHED:-0}" -gt 0 ]; then
|
||||
pr_num=$(curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" -H "Authorization: token ${GITEA_TOKEN}" -H "Content-Type: application/json" -d "$(python3 -c "
|
||||
import json
|
||||
print(json.dumps({
|
||||
'title': 'Gemini: Issue #${issue_num}',
|
||||
'head': '${branch}',
|
||||
'base': 'main',
|
||||
'body': 'Automated PR for issue #${issue_num}.\nExit code: ${exit_code}'
|
||||
}))
|
||||
")" | python3 -c "import sys,json; print(json.load(sys.stdin).get('number',''))" 2>/dev/null)
|
||||
[ -n "$pr_num" ] && log "WORKER-${worker_id}: Created PR #${pr_num} for issue #${issue_num}"
|
||||
fi
|
||||
|
||||
# ── Genchi Genbutsu: verify world state before declaring success ──
|
||||
VERIFIED="false"
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
log "WORKER-${worker_id}: SUCCESS #${issue_num} exited 0 — running genchi-genbutsu"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
if verify_result=$("$SCRIPT_DIR/genchi-genbutsu.sh" "$repo_owner" "$repo_name" "$issue_num" "$branch" "gemini" 2>/dev/null); then
|
||||
VERIFIED="true"
|
||||
log "WORKER-${worker_id}: VERIFIED #${issue_num}"
|
||||
pr_state=$(get_pr_state "$repo_owner" "$repo_name" "$pr_num")
|
||||
if [ "$pr_state" = "open" ]; then
|
||||
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/merge" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Do": "squash"}' >/dev/null 2>&1 || true
|
||||
pr_state=$(get_pr_state "$repo_owner" "$repo_name" "$pr_num")
|
||||
fi
|
||||
if [ "$pr_state" = "merged" ]; then
|
||||
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
||||
issue_state=$(get_issue_state "$repo_owner" "$repo_name" "$issue_num")
|
||||
if [ "$issue_state" = "closed" ]; then
|
||||
log "WORKER-${worker_id}: VERIFIED #${issue_num} branch pushed, PR merged, comment present, issue closed"
|
||||
consecutive_failures=0
|
||||
else
|
||||
log "WORKER-${worker_id}: BLOCKED #${issue_num} issue did not close after merge"
|
||||
mark_skip "$issue_num" "issue_close_unverified" 1
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
else
|
||||
log "WORKER-${worker_id}: BLOCKED #${issue_num} merge not verified (state=${pr_state})"
|
||||
mark_skip "$issue_num" "merge_unverified" 1
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
else
|
||||
verify_details=$(echo "$verify_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('details','unknown'))" 2>/dev/null || echo "unverified")
|
||||
verify_checks=$(echo "$verify_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('checks',''))" 2>/dev/null || echo "")
|
||||
log "WORKER-${worker_id}: UNVERIFIED #${issue_num} — $verify_details"
|
||||
if echo "$verify_checks" | grep -q '"branch": false'; then
|
||||
post_issue_comment "$repo_owner" "$repo_name" "$issue_num" "Loop gate blocked completion: remote branch ${branch} was not found on origin after Gemini exited. Issue remains open for retry."
|
||||
mark_skip "$issue_num" "missing_remote_branch" 1
|
||||
elif echo "$verify_checks" | grep -q '"pr": false'; then
|
||||
post_issue_comment "$repo_owner" "$repo_name" "$issue_num" "Loop gate blocked completion: branch ${branch} exists remotely, but no PR was found. Issue remains open for retry."
|
||||
mark_skip "$issue_num" "missing_pr" 1
|
||||
elif echo "$verify_checks" | grep -q '"files": false'; then
|
||||
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
||||
post_issue_comment "$repo_owner" "$repo_name" "$issue_num" "PR #${pr_num} was closed automatically: it had 0 changed files (empty commit). Issue remains open for retry."
|
||||
mark_skip "$issue_num" "empty_commit" 2
|
||||
else
|
||||
post_issue_comment "$repo_owner" "$repo_name" "$issue_num" "Loop gate blocked completion: PR #${pr_num} exists, but required verification failed ($verify_details). Issue remains open for retry."
|
||||
mark_skip "$issue_num" "unverified" 1
|
||||
fi
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
else
|
||||
if gemini_auth_invalid "$issue_num"; then
|
||||
log "WORKER-${worker_id}: AUTH INVALID on #${issue_num} — sleeping ${AUTH_INVALID_SLEEP}s"
|
||||
mark_skip "$issue_num" "gemini_auth_invalid" 1
|
||||
sleep "$AUTH_INVALID_SLEEP"
|
||||
consecutive_failures=$((consecutive_failures + 5))
|
||||
elif grep -q "rate_limit\|rate limit\|429\|overloaded\|quota" "$LOG_DIR/gemini-${issue_num}.log" 2>/dev/null; then
|
||||
log "WORKER-${worker_id}: RATE LIMITED on #${issue_num} (work saved)"
|
||||
consecutive_failures=$((consecutive_failures + 3))
|
||||
else
|
||||
log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── METRICS ──
|
||||
LINES_ADDED=$(cd "$worktree" 2>/dev/null && git diff --stat origin/main..HEAD 2>/dev/null | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo 0)
|
||||
LINES_REMOVED=$(cd "$worktree" 2>/dev/null && git diff --stat origin/main..HEAD 2>/dev/null | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo 0)
|
||||
FILES_CHANGED=$(cd "$worktree" 2>/dev/null && git diff --name-only origin/main..HEAD 2>/dev/null | wc -l | tr -d ' ' || echo 0)
|
||||
|
||||
if [ "$exit_code" -eq 0 ]; then OUTCOME="success"
|
||||
elif [ "$exit_code" -eq 124 ]; then OUTCOME="timeout"
|
||||
elif grep -q "rate_limit\|429" "$LOG_DIR/gemini-${issue_num}.log" 2>/dev/null; then OUTCOME="rate_limited"
|
||||
else OUTCOME="failed"; fi
|
||||
|
||||
python3 -c "
|
||||
import json, datetime
|
||||
print(json.dumps({
|
||||
'ts': datetime.datetime.utcnow().isoformat() + 'Z',
|
||||
'agent': 'gemini',
|
||||
'worker': $worker_id,
|
||||
'issue': $issue_num,
|
||||
'repo': '${repo_owner}/${repo_name}',
|
||||
'outcome': '$OUTCOME',
|
||||
'exit_code': $exit_code,
|
||||
'duration_s': $CYCLE_DURATION,
|
||||
'files_changed': ${FILES_CHANGED:-0},
|
||||
'lines_added': ${LINES_ADDED:-0},
|
||||
'lines_removed': ${LINES_REMOVED:-0},
|
||||
'salvaged': ${DIRTY:-0},
|
||||
'pr': '${pr_num:-}',
|
||||
'merged': $( [ '$OUTCOME' = 'success' ] && [ -n '${pr_num:-}' ] && echo 'true' || echo 'false' ),
|
||||
'verified': ${VERIFIED:-false}
|
||||
}))
|
||||
" >> "$LOG_DIR/gemini-metrics.jsonl" 2>/dev/null
|
||||
|
||||
cleanup_workdir "$worktree"
|
||||
unlock_issue "$issue_key"
|
||||
update_active "$worker_id" "" "" "done"
|
||||
|
||||
sleep "$COOLDOWN"
|
||||
done
|
||||
}
|
||||
|
||||
# === MAIN ===
|
||||
log "=== Gemini Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
|
||||
log "Worktrees: ${WORKTREE_BASE}"
|
||||
|
||||
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||
|
||||
# PID tracking via files (bash 3.2 compatible)
|
||||
PID_DIR="$LOG_DIR/gemini-pids"
|
||||
mkdir -p "$PID_DIR"
|
||||
rm -f "$PID_DIR"/*.pid 2>/dev/null
|
||||
|
||||
launch_worker() {
|
||||
local wid="$1"
|
||||
run_worker "$wid" &
|
||||
echo $! > "$PID_DIR/${wid}.pid"
|
||||
log "Launched worker $wid (PID $!)"
|
||||
}
|
||||
|
||||
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||
launch_worker "$i"
|
||||
sleep 3
|
||||
done
|
||||
|
||||
# Dynamic scaler — every 3 minutes
|
||||
CURRENT_WORKERS="$NUM_WORKERS"
|
||||
while true; do
|
||||
sleep 90
|
||||
|
||||
# Reap dead workers
|
||||
for pidfile in "$PID_DIR"/*.pid; do
|
||||
[ -f "$pidfile" ] || continue
|
||||
wid=$(basename "$pidfile" .pid)
|
||||
wpid=$(cat "$pidfile")
|
||||
if ! kill -0 "$wpid" 2>/dev/null; then
|
||||
log "SCALER: Worker $wid died — relaunching"
|
||||
launch_worker "$wid"
|
||||
sleep 2
|
||||
fi
|
||||
done
|
||||
|
||||
recent_rate_limits=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
|
||||
recent_successes=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
|
||||
|
||||
if [ "$recent_rate_limits" -gt 0 ]; then
|
||||
if [ "$CURRENT_WORKERS" -gt 2 ]; then
|
||||
drop_to=$(( CURRENT_WORKERS / 2 ))
|
||||
[ "$drop_to" -lt 2 ] && drop_to=2
|
||||
log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to}"
|
||||
for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
|
||||
if [ -f "$PID_DIR/${wid}.pid" ]; then
|
||||
kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
|
||||
rm -f "$PID_DIR/${wid}.pid"
|
||||
update_active "$wid" "" "" "done"
|
||||
fi
|
||||
done
|
||||
CURRENT_WORKERS=$drop_to
|
||||
fi
|
||||
elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
|
||||
new_count=$(( CURRENT_WORKERS + 2 ))
|
||||
[ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
|
||||
log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count}"
|
||||
for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
|
||||
launch_worker "$wid"
|
||||
sleep 2
|
||||
done
|
||||
CURRENT_WORKERS=$new_count
|
||||
fi
|
||||
done
|
||||
179
bin/genchi-genbutsu.sh
Executable file
179
bin/genchi-genbutsu.sh
Executable file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env bash
|
||||
# genchi-genbutsu.sh — 現地現物 — Go and see. Verify world state, not log vibes.
|
||||
#
|
||||
# Post-completion verification that goes and LOOKS at the actual artifacts.
|
||||
# Performs 5 world-state checks:
|
||||
# 1. Branch exists on remote
|
||||
# 2. PR exists
|
||||
# 3. PR has real file changes (> 0)
|
||||
# 4. PR is mergeable
|
||||
# 5. Issue has a completion comment from the agent
|
||||
#
|
||||
# Usage: genchi-genbutsu.sh <repo_owner> <repo_name> <issue_num> <branch> <agent_name>
|
||||
# Returns: JSON to stdout, logs JSONL, exit 0 = VERIFIED, exit 1 = UNVERIFIED
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
|
||||
GITEA_TOKEN="${GITEA_TOKEN:-}"
|
||||
LOG_DIR="${LOG_DIR:-$HOME/.hermes/logs}"
|
||||
VERIFY_LOG="$LOG_DIR/genchi-genbutsu.jsonl"
|
||||
|
||||
if [ $# -lt 5 ]; then
|
||||
echo "Usage: $0 <repo_owner> <repo_name> <issue_num> <branch> <agent_name>" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
repo_owner="$1"
|
||||
repo_name="$2"
|
||||
issue_num="$3"
|
||||
branch="$4"
|
||||
agent_name="$5"
|
||||
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
check_branch_exists() {
|
||||
# Use Gitea API instead of git ls-remote so we don't need clone credentials
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/branches/${branch}" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" >/dev/null 2>&1
|
||||
}
|
||||
|
||||
get_pr_num() {
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=all&head=${repo_owner}:${branch}&limit=1" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" 2>/dev/null | python3 -c "
|
||||
import sys, json
|
||||
prs = json.load(sys.stdin)
|
||||
print(prs[0]['number'] if prs else '')
|
||||
"
|
||||
}
|
||||
|
||||
check_pr_files() {
|
||||
local pr_num="$1"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/files" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" 2>/dev/null | python3 -c "
|
||||
import sys, json
|
||||
try:
|
||||
files = json.load(sys.stdin)
|
||||
print(len(files) if isinstance(files, list) else 0)
|
||||
except:
|
||||
print(0)
|
||||
"
|
||||
}
|
||||
|
||||
check_pr_mergeable() {
|
||||
local pr_num="$1"
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" 2>/dev/null | python3 -c "
|
||||
import sys, json
|
||||
pr = json.load(sys.stdin)
|
||||
print('true' if pr.get('mergeable') else 'false')
|
||||
"
|
||||
}
|
||||
|
||||
check_completion_comment() {
|
||||
curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" 2>/dev/null | AGENT="$agent_name" python3 -c "
|
||||
import os, sys, json
|
||||
agent = os.environ.get('AGENT', '').lower()
|
||||
try:
|
||||
comments = json.load(sys.stdin)
|
||||
except:
|
||||
sys.exit(1)
|
||||
for c in reversed(comments):
|
||||
user = ((c.get('user') or {}).get('login') or '').lower()
|
||||
if user == agent:
|
||||
sys.exit(0)
|
||||
sys.exit(1)
|
||||
"
|
||||
}
|
||||
|
||||
# ── Run checks ───────────────────────────────────────────────────────
|
||||
|
||||
ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
|
||||
status="VERIFIED"
|
||||
details=()
|
||||
checks_json='{}'
|
||||
|
||||
# Check 1: branch
|
||||
if check_branch_exists; then
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['branch']=True;print(json.dumps(d))")
|
||||
else
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['branch']=False;print(json.dumps(d))")
|
||||
status="UNVERIFIED"
|
||||
details+=("remote branch ${branch} not found")
|
||||
fi
|
||||
|
||||
# Check 2: PR exists
|
||||
pr_num=$(get_pr_num)
|
||||
if [ -n "$pr_num" ]; then
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['pr']=True;print(json.dumps(d))")
|
||||
else
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['pr']=False;print(json.dumps(d))")
|
||||
status="UNVERIFIED"
|
||||
details+=("no PR found for branch ${branch}")
|
||||
fi
|
||||
|
||||
# Check 3: PR has real file changes
|
||||
if [ -n "$pr_num" ]; then
|
||||
file_count=$(check_pr_files "$pr_num")
|
||||
if [ "${file_count:-0}" -gt 0 ]; then
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['files']=True;print(json.dumps(d))")
|
||||
else
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['files']=False;print(json.dumps(d))")
|
||||
status="UNVERIFIED"
|
||||
details+=("PR #${pr_num} has 0 changed files")
|
||||
fi
|
||||
|
||||
# Check 4: PR is mergeable
|
||||
if [ "$(check_pr_mergeable "$pr_num")" = "true" ]; then
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['mergeable']=True;print(json.dumps(d))")
|
||||
else
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['mergeable']=False;print(json.dumps(d))")
|
||||
status="UNVERIFIED"
|
||||
details+=("PR #${pr_num} is not mergeable")
|
||||
fi
|
||||
else
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['files']=None;d['mergeable']=None;print(json.dumps(d))")
|
||||
fi
|
||||
|
||||
# Check 5: completion comment from agent
|
||||
if check_completion_comment; then
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['comment']=True;print(json.dumps(d))")
|
||||
else
|
||||
checks_json=$(echo "$checks_json" | python3 -c "import sys,json;d=json.load(sys.stdin);d['comment']=False;print(json.dumps(d))")
|
||||
status="UNVERIFIED"
|
||||
details+=("no completion comment from ${agent_name} on issue #${issue_num}")
|
||||
fi
|
||||
|
||||
# Build detail string
|
||||
detail_str=$(IFS="; "; echo "${details[*]:-all checks passed}")
|
||||
|
||||
# ── Output ───────────────────────────────────────────────────────────
|
||||
|
||||
result=$(python3 -c "
|
||||
import json
|
||||
print(json.dumps({
|
||||
'status': '$status',
|
||||
'repo': '${repo_owner}/${repo_name}',
|
||||
'issue': $issue_num,
|
||||
'branch': '$branch',
|
||||
'agent': '$agent_name',
|
||||
'pr': '$pr_num',
|
||||
'checks': $checks_json,
|
||||
'details': '$detail_str',
|
||||
'ts': '$ts'
|
||||
}, indent=2))
|
||||
")
|
||||
|
||||
printf '%s\n' "$result"
|
||||
|
||||
# Append to JSONL log
|
||||
printf '%s\n' "$result" >> "$VERIFY_LOG"
|
||||
|
||||
if [ "$status" = "VERIFIED" ]; then
|
||||
exit 0
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
45
bin/kaizen-retro.sh
Executable file
45
bin/kaizen-retro.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env bash
|
||||
# kaizen-retro.sh — Automated retrospective after every burn cycle.
|
||||
#
|
||||
# Runs daily after the morning report.
|
||||
# Analyzes success rates by agent, repo, and issue type.
|
||||
# Identifies max-attempts issues, generates ONE concrete improvement,
|
||||
# and posts the retro to Telegram + the master morning-report issue.
|
||||
#
|
||||
# Usage:
|
||||
# ./bin/kaizen-retro.sh [--dry-run]
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="${SCRIPT_DIR%/bin}"
|
||||
PYTHON="${PYTHON3:-python3}"
|
||||
|
||||
# Source local env if available so TELEGRAM_BOT_TOKEN is picked up
|
||||
HOME_DIR="${HOME:-$(eval echo ~$(whoami))}"
|
||||
for env_file in "$HOME_DIR/.hermes/.env" "$HOME_DIR/.timmy/.env" "$REPO_ROOT/.env"; do
|
||||
if [ -f "$env_file" ]; then
|
||||
# shellcheck source=/dev/null
|
||||
set -a
|
||||
# shellcheck source=/dev/null
|
||||
source "$env_file"
|
||||
set +a
|
||||
fi
|
||||
done
|
||||
|
||||
# If the configured Gitea URL is unreachable but localhost works, prefer localhost
|
||||
if ! curl -sf "${GITEA_URL:-http://localhost:3000}/api/v1/version" >/dev/null 2>&1; then
|
||||
if curl -sf http://localhost:3000/api/v1/version >/dev/null 2>&1; then
|
||||
export GITEA_URL="http://localhost:3000"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Ensure the Python script exists
|
||||
RETRO_PY="$REPO_ROOT/scripts/kaizen_retro.py"
|
||||
if [ ! -f "$RETRO_PY" ]; then
|
||||
echo "ERROR: kaizen_retro.py not found at $RETRO_PY" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run
|
||||
exec "$PYTHON" "$RETRO_PY" "$@"
|
||||
20
bin/muda-audit.sh
Executable file
20
bin/muda-audit.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
|
||||
# muda-audit.sh — Weekly waste audit wrapper
|
||||
# Runs scripts/muda_audit.py from the repo root.
|
||||
# Designed for cron or Gitea Actions.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
|
||||
cd "$REPO_ROOT"
|
||||
|
||||
# Ensure python3 is available
|
||||
if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "ERROR: python3 not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Run the audit
|
||||
python3 "${REPO_ROOT}/scripts/muda_audit.py" "$@"
|
||||
199
bin/ops-gitea.sh
199
bin/ops-gitea.sh
@@ -1,70 +1,155 @@
|
||||
#!/usr/bin/env bash
|
||||
# ── Gitea Feed Panel ───────────────────────────────────────────────────
|
||||
# Shows open PRs, recent merges, and issue queue. Called by watch.
|
||||
# ── Gitea Workflow Feed ────────────────────────────────────────────────
|
||||
# Shows open PRs, review pressure, and issue queues across core repos.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
B='\033[1m' ; D='\033[2m' ; R='\033[0m'
|
||||
G='\033[32m' ; Y='\033[33m' ; RD='\033[31m' ; C='\033[36m' ; M='\033[35m'
|
||||
set -euo pipefail
|
||||
|
||||
TOKEN=$(cat ~/.hermes/gitea_token_vps 2>/dev/null)
|
||||
API="http://143.198.27.163:3000/api/v1/repos/rockachopa/Timmy-time-dashboard"
|
||||
B='\033[1m'
|
||||
D='\033[2m'
|
||||
R='\033[0m'
|
||||
C='\033[36m'
|
||||
G='\033[32m'
|
||||
Y='\033[33m'
|
||||
|
||||
echo -e "${B}${C} ◈ GITEA${R} ${D}$(date '+%H:%M:%S')${R}"
|
||||
resolve_gitea_url() {
|
||||
if [ -n "${GITEA_URL:-}" ]; then
|
||||
printf '%s\n' "${GITEA_URL%/}"
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.hermes/gitea_api" ]; then
|
||||
python3 - "$HOME/.hermes/gitea_api" <<'PY'
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
raw = Path(sys.argv[1]).read_text().strip().rstrip("/")
|
||||
print(raw[:-7] if raw.endswith("/api/v1") else raw)
|
||||
PY
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.config/gitea/base-url" ]; then
|
||||
tr -d '[:space:]' < "$HOME/.config/gitea/base-url"
|
||||
return 0
|
||||
fi
|
||||
echo "ERROR: set GITEA_URL or create ~/.hermes/gitea_api" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
resolve_ops_token() {
|
||||
local token_file
|
||||
for token_file in \
|
||||
"$HOME/.config/gitea/timmy-token" \
|
||||
"$HOME/.hermes/gitea_token_vps" \
|
||||
"$HOME/.hermes/gitea_token_timmy"; do
|
||||
if [ -f "$token_file" ]; then
|
||||
tr -d '[:space:]' < "$token_file"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
GITEA_URL="$(resolve_gitea_url)"
|
||||
CORE_REPOS="${CORE_REPOS:-Timmy_Foundation/the-nexus Timmy_Foundation/timmy-home Timmy_Foundation/timmy-config Timmy_Foundation/hermes-agent}"
|
||||
TOKEN="$(resolve_ops_token || true)"
|
||||
[ -z "$TOKEN" ] && echo "WARN: no approved Timmy Gitea token found; feed will use unauthenticated API calls" >&2
|
||||
|
||||
echo -e "${B}${C} ◈ GITEA WORKFLOW${R} ${D}$(date '+%H:%M:%S')${R}"
|
||||
echo -e "${D}────────────────────────────────────────${R}"
|
||||
|
||||
# Open PRs
|
||||
echo -e " ${B}Open PRs${R}"
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/pulls?state=open&limit=10" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
prs = json.loads(sys.stdin.read())
|
||||
if not prs: print(' (none)')
|
||||
for p in prs:
|
||||
age_h = ''
|
||||
print(f' #{p[\"number\"]:3d} {p[\"user\"][\"login\"]:8s} {p[\"title\"][:45]}')
|
||||
except: print(' (error)')
|
||||
" 2>/dev/null
|
||||
python3 - "$GITEA_URL" "$TOKEN" "$CORE_REPOS" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
echo -e "${D}────────────────────────────────────────${R}"
|
||||
base = sys.argv[1].rstrip("/")
|
||||
token = sys.argv[2]
|
||||
repos = sys.argv[3].split()
|
||||
headers = {"Authorization": f"token {token}"} if token else {}
|
||||
|
||||
# Recent merged (last 5)
|
||||
echo -e " ${B}Recently Merged${R}"
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/pulls?state=closed&sort=updated&limit=5" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
prs = json.loads(sys.stdin.read())
|
||||
merged = [p for p in prs if p.get('merged')]
|
||||
if not merged: print(' (none)')
|
||||
for p in merged[:5]:
|
||||
t = p['merged_at'][:16].replace('T',' ')
|
||||
print(f' ${G}✓${R} #{p[\"number\"]:3d} {p[\"title\"][:35]} ${D}{t}${R}')
|
||||
except: print(' (error)')
|
||||
" 2>/dev/null
|
||||
|
||||
echo -e "${D}────────────────────────────────────────${R}"
|
||||
def fetch(path):
|
||||
req = urllib.request.Request(f"{base}{path}", headers=headers)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
return json.loads(resp.read().decode())
|
||||
|
||||
# Issue queue (assigned to kimi)
|
||||
echo -e " ${B}Kimi Queue${R}"
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/issues?state=open&limit=50&type=issues" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
all_issues = json.loads(sys.stdin.read())
|
||||
issues = [i for i in all_issues if 'kimi' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
if not issues: print(' (empty — assign more!)')
|
||||
for i in issues[:8]:
|
||||
print(f' #{i[\"number\"]:3d} {i[\"title\"][:50]}')
|
||||
if len(issues) > 8: print(f' ... +{len(issues)-8} more')
|
||||
except: print(' (error)')
|
||||
" 2>/dev/null
|
||||
|
||||
echo -e "${D}────────────────────────────────────────${R}"
|
||||
def short_repo(repo):
|
||||
return repo.split("/", 1)[1]
|
||||
|
||||
# Unassigned issues
|
||||
UNASSIGNED=$(curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/issues?state=open&limit=50&type=issues" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
issues = json.loads(sys.stdin.read())
|
||||
print(len([i for i in issues if not i.get('assignees')]))
|
||||
except: print('?')
|
||||
" 2>/dev/null)
|
||||
echo -e " Unassigned issues: ${Y}$UNASSIGNED${R}"
|
||||
|
||||
issues = []
|
||||
pulls = []
|
||||
errors = []
|
||||
|
||||
for repo in repos:
|
||||
try:
|
||||
repo_pulls = fetch(f"/api/v1/repos/{repo}/pulls?state=open&limit=20")
|
||||
for pr in repo_pulls:
|
||||
pr["_repo"] = repo
|
||||
pulls.append(pr)
|
||||
repo_issues = fetch(f"/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues")
|
||||
for issue in repo_issues:
|
||||
issue["_repo"] = repo
|
||||
issues.append(issue)
|
||||
except urllib.error.URLError as exc:
|
||||
errors.append(f"{repo}: {exc.reason}")
|
||||
except Exception as exc: # pragma: no cover - defensive panel path
|
||||
errors.append(f"{repo}: {exc}")
|
||||
|
||||
print(" \033[1mOpen PRs\033[0m")
|
||||
if not pulls:
|
||||
print(" (none)")
|
||||
else:
|
||||
for pr in pulls[:8]:
|
||||
print(
|
||||
f" #{pr['number']:3d} {short_repo(pr['_repo']):12s} "
|
||||
f"{pr['user']['login'][:12]:12s} {pr['title'][:40]}"
|
||||
)
|
||||
|
||||
print("\033[2m────────────────────────────────────────\033[0m")
|
||||
print(" \033[1mNeeds Timmy / Allegro Review\033[0m")
|
||||
reviewers = []
|
||||
for repo in repos:
|
||||
try:
|
||||
repo_items = fetch(f"/api/v1/repos/{repo}/issues?state=open&limit=50&type=pulls")
|
||||
for item in repo_items:
|
||||
assignees = [a.get("login", "") for a in (item.get("assignees") or [])]
|
||||
if any(name in assignees for name in ("Timmy", "allegro")):
|
||||
item["_repo"] = repo
|
||||
reviewers.append(item)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not reviewers:
|
||||
print(" (clear)")
|
||||
else:
|
||||
for item in reviewers[:8]:
|
||||
names = ",".join(a.get("login", "") for a in (item.get("assignees") or []))
|
||||
print(
|
||||
f" #{item['number']:3d} {short_repo(item['_repo']):12s} "
|
||||
f"{names[:18]:18s} {item['title'][:34]}"
|
||||
)
|
||||
|
||||
print("\033[2m────────────────────────────────────────\033[0m")
|
||||
print(" \033[1mIssue Queues\033[0m")
|
||||
queue_agents = ["allegro", "codex-agent", "groq", "claude", "ezra", "perplexity", "KimiClaw"]
|
||||
for agent in queue_agents:
|
||||
assigned = [
|
||||
issue
|
||||
for issue in issues
|
||||
if agent in [a.get("login", "") for a in (issue.get("assignees") or [])]
|
||||
]
|
||||
print(f" {agent:12s} {len(assigned):2d}")
|
||||
|
||||
unassigned = [issue for issue in issues if not issue.get("assignees")]
|
||||
print("\033[2m────────────────────────────────────────\033[0m")
|
||||
print(f" Unassigned issues: \033[33m{len(unassigned)}\033[0m")
|
||||
|
||||
if errors:
|
||||
print("\033[2m────────────────────────────────────────\033[0m")
|
||||
print(" \033[1mErrors\033[0m")
|
||||
for err in errors[:4]:
|
||||
print(f" {err}")
|
||||
PY
|
||||
|
||||
@@ -1,235 +1,294 @@
|
||||
#!/usr/bin/env bash
|
||||
# ── Dashboard Control Helpers ──────────────────────────────────────────
|
||||
# ── Workflow Control Helpers ───────────────────────────────────────────
|
||||
# Source this in the controls pane: source ~/.hermes/bin/ops-helpers.sh
|
||||
# These helpers intentionally target the current Hermes + Gitea workflow
|
||||
# and do not revive deprecated bash worker loops.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
export TOKEN=*** ~/.hermes/gitea_token_vps 2>/dev/null)
|
||||
export GITEA="http://143.198.27.163:3000"
|
||||
export REPO_API="$GITEA/api/v1/repos/rockachopa/Timmy-time-dashboard"
|
||||
resolve_gitea_url() {
|
||||
if [ -n "${GITEA:-}" ]; then
|
||||
printf '%s\n' "${GITEA%/}"
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.hermes/gitea_api" ]; then
|
||||
python3 - "$HOME/.hermes/gitea_api" <<'PY'
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
raw = Path(sys.argv[1]).read_text().strip().rstrip("/")
|
||||
print(raw[:-7] if raw.endswith("/api/v1") else raw)
|
||||
PY
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.config/gitea/base-url" ]; then
|
||||
tr -d '[:space:]' < "$HOME/.config/gitea/base-url"
|
||||
return 0
|
||||
fi
|
||||
echo "ERROR: set GITEA or create ~/.hermes/gitea_api" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
export GITEA="$(resolve_gitea_url)"
|
||||
export OPS_DEFAULT_REPO="${OPS_DEFAULT_REPO:-Timmy_Foundation/timmy-home}"
|
||||
export OPS_CORE_REPOS="${OPS_CORE_REPOS:-Timmy_Foundation/the-nexus Timmy_Foundation/timmy-home Timmy_Foundation/timmy-config Timmy_Foundation/hermes-agent}"
|
||||
|
||||
ops-token() {
|
||||
local token_file
|
||||
for token_file in \
|
||||
"$HOME/.config/gitea/timmy-token" \
|
||||
"$HOME/.hermes/gitea_token_vps" \
|
||||
"$HOME/.hermes/gitea_token_timmy"; do
|
||||
if [ -f "$token_file" ]; then
|
||||
tr -d '[:space:]' < "$token_file"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
ops-help() {
|
||||
echo ""
|
||||
echo -e "\033[1m\033[35m ◈ CONTROLS\033[0m"
|
||||
echo -e "\033[1m\033[35m ◈ WORKFLOW CONTROLS\033[0m"
|
||||
echo -e "\033[2m ──────────────────────────────────────\033[0m"
|
||||
echo ""
|
||||
echo -e " \033[1mWake Up\033[0m"
|
||||
echo " ops-wake-kimi Restart Kimi loop"
|
||||
echo " ops-wake-claude Restart Claude loop"
|
||||
echo " ops-wake-gemini Restart Gemini loop"
|
||||
echo " ops-wake-gateway Restart gateway"
|
||||
echo " ops-wake-all Restart everything"
|
||||
echo -e " \033[1mReview\033[0m"
|
||||
echo " ops-prs [repo] List open PRs across the core repos or one repo"
|
||||
echo " ops-review-queue Show PRs waiting on Timmy or Allegro"
|
||||
echo " ops-merge PR REPO Squash-merge a reviewed PR"
|
||||
echo ""
|
||||
echo -e " \033[1mManage\033[0m"
|
||||
echo " ops-merge PR_NUM Squash-merge a PR"
|
||||
echo " ops-assign ISSUE Assign issue to Kimi"
|
||||
echo " ops-assign-claude ISSUE [REPO] Assign to Claude"
|
||||
echo " ops-audit Run efficiency audit now"
|
||||
echo " ops-prs List open PRs"
|
||||
echo " ops-queue Show Kimi's queue"
|
||||
echo " ops-claude-queue Show Claude's queue"
|
||||
echo " ops-gemini-queue Show Gemini's queue"
|
||||
echo -e " \033[1mDispatch\033[0m"
|
||||
echo " ops-assign ISSUE AGENT [repo] Assign an issue to an agent"
|
||||
echo " ops-unassign ISSUE [repo] Remove all assignees from an issue"
|
||||
echo " ops-queue AGENT [repo|all] Show an agent's queue"
|
||||
echo " ops-unassigned [repo|all] Show unassigned issues"
|
||||
echo ""
|
||||
echo -e " \033[1mEmergency\033[0m"
|
||||
echo " ops-kill-kimi Stop Kimi loop"
|
||||
echo " ops-kill-claude Stop Claude loop"
|
||||
echo " ops-kill-gemini Stop Gemini loop"
|
||||
echo " ops-kill-zombies Kill stuck git/pytest"
|
||||
echo -e " \033[1mWorkflow Health\033[0m"
|
||||
echo " ops-gitea-feed Render the Gitea workflow feed"
|
||||
echo " ops-freshness Check Hermes session/export freshness"
|
||||
echo ""
|
||||
echo -e " \033[1mOrchestrator\033[0m"
|
||||
echo " ops-wake-timmy Start Timmy (Ollama)"
|
||||
echo " ops-kill-timmy Stop Timmy"
|
||||
echo ""
|
||||
echo -e " \033[1mWatchdog\033[0m"
|
||||
echo " ops-wake-watchdog Start loop watchdog"
|
||||
echo " ops-kill-watchdog Stop loop watchdog"
|
||||
echo ""
|
||||
echo -e " \033[2m Type ops-help to see this again\033[0m"
|
||||
echo -e " \033[1mShortcuts\033[0m"
|
||||
echo " ops-assign-allegro ISSUE [repo]"
|
||||
echo " ops-assign-codex ISSUE [repo]"
|
||||
echo " ops-assign-groq ISSUE [repo]"
|
||||
echo " ops-assign-claude ISSUE [repo]"
|
||||
echo " ops-assign-ezra ISSUE [repo]"
|
||||
echo ""
|
||||
}
|
||||
|
||||
ops-wake-kimi() {
|
||||
pkill -f "kimi-loop.sh" 2>/dev/null
|
||||
sleep 1
|
||||
nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &
|
||||
echo " Kimi loop started (PID $!)"
|
||||
}
|
||||
|
||||
ops-wake-gateway() {
|
||||
hermes gateway start 2>&1
|
||||
}
|
||||
|
||||
ops-wake-claude() {
|
||||
local workers="${1:-3}"
|
||||
pkill -f "claude-loop.sh" 2>/dev/null
|
||||
sleep 1
|
||||
nohup bash ~/.hermes/bin/claude-loop.sh "$workers" >> ~/.hermes/logs/claude-loop.log 2>&1 &
|
||||
echo " Claude loop started — $workers workers (PID $!)"
|
||||
}
|
||||
|
||||
ops-wake-gemini() {
|
||||
pkill -f "gemini-loop.sh" 2>/dev/null
|
||||
sleep 1
|
||||
nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &
|
||||
echo " Gemini loop started (PID $!)"
|
||||
}
|
||||
|
||||
ops-wake-all() {
|
||||
ops-wake-gateway
|
||||
sleep 1
|
||||
ops-wake-kimi
|
||||
sleep 1
|
||||
ops-wake-claude
|
||||
sleep 1
|
||||
ops-wake-gemini
|
||||
echo " All services started"
|
||||
}
|
||||
|
||||
ops-merge() {
|
||||
local pr=$1
|
||||
[ -z "$pr" ] && { echo "Usage: ops-merge PR_NUMBER"; return 1; }
|
||||
curl -s -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
|
||||
"$REPO_API/pulls/$pr/merge" -d '{"Do":"squash"}' | python3 -c "
|
||||
import json,sys
|
||||
d=json.loads(sys.stdin.read())
|
||||
if 'sha' in d: print(f' ✓ PR #{$pr} merged ({d[\"sha\"][:8]})')
|
||||
else: print(f' ✗ {d.get(\"message\",\"unknown error\")}')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
ops-assign() {
|
||||
local issue=$1
|
||||
[ -z "$issue" ] && { echo "Usage: ops-assign ISSUE_NUMBER"; return 1; }
|
||||
curl -s -X PATCH -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
|
||||
"$REPO_API/issues/$issue" -d '{"assignees":["kimi"]}' | python3 -c "
|
||||
import json,sys; d=json.loads(sys.stdin.read()); print(f' ✓ #{$issue} assigned to kimi')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
ops-audit() {
|
||||
bash ~/.hermes/bin/efficiency-audit.sh
|
||||
ops-python() {
|
||||
local token
|
||||
token=$(ops-token) || { echo "No Gitea token found"; return 1; }
|
||||
OPS_TOKEN="$token" python3 - "$@"
|
||||
}
|
||||
|
||||
ops-prs() {
|
||||
curl -s -H "Authorization: token $TOKEN" "$REPO_API/pulls?state=open&limit=20" | python3 -c "
|
||||
local target="${1:-all}"
|
||||
ops-python "$GITEA" "$OPS_CORE_REPOS" "$target" <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
base = sys.argv[1].rstrip("/")
|
||||
repos = sys.argv[2].split()
|
||||
target = sys.argv[3]
|
||||
token = os.environ["OPS_TOKEN"]
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
|
||||
if target != "all":
|
||||
repos = [target]
|
||||
|
||||
pulls = []
|
||||
for repo in repos:
|
||||
req = urllib.request.Request(
|
||||
f"{base}/api/v1/repos/{repo}/pulls?state=open&limit=20",
|
||||
headers=headers,
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
for pr in json.loads(resp.read().decode()):
|
||||
pr["_repo"] = repo
|
||||
pulls.append(pr)
|
||||
|
||||
if not pulls:
|
||||
print(" (none)")
|
||||
else:
|
||||
for pr in pulls:
|
||||
print(f" #{pr['number']:4d} {pr['_repo'].split('/', 1)[1]:12s} {pr['user']['login'][:12]:12s} {pr['title'][:60]}")
|
||||
PY
|
||||
}
|
||||
|
||||
ops-review-queue() {
|
||||
ops-python "$GITEA" "$OPS_CORE_REPOS" <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
base = sys.argv[1].rstrip("/")
|
||||
repos = sys.argv[2].split()
|
||||
token = os.environ["OPS_TOKEN"]
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
|
||||
items = []
|
||||
for repo in repos:
|
||||
req = urllib.request.Request(
|
||||
f"{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=pulls",
|
||||
headers=headers,
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
for item in json.loads(resp.read().decode()):
|
||||
assignees = [a.get("login", "") for a in (item.get("assignees") or [])]
|
||||
if any(name in assignees for name in ("Timmy", "allegro")):
|
||||
item["_repo"] = repo
|
||||
items.append(item)
|
||||
|
||||
if not items:
|
||||
print(" (clear)")
|
||||
else:
|
||||
for item in items:
|
||||
names = ",".join(a.get("login", "") for a in (item.get("assignees") or []))
|
||||
print(f" #{item['number']:4d} {item['_repo'].split('/', 1)[1]:12s} {names[:20]:20s} {item['title'][:56]}")
|
||||
PY
|
||||
}
|
||||
|
||||
ops-assign() {
|
||||
local issue="$1"
|
||||
local agent="$2"
|
||||
local repo="${3:-$OPS_DEFAULT_REPO}"
|
||||
local token
|
||||
[ -z "$issue" ] && { echo "Usage: ops-assign ISSUE_NUMBER AGENT [owner/repo]"; return 1; }
|
||||
[ -z "$agent" ] && { echo "Usage: ops-assign ISSUE_NUMBER AGENT [owner/repo]"; return 1; }
|
||||
token=$(ops-token) || { echo "No Gitea token found"; return 1; }
|
||||
curl -s -X PATCH -H "Authorization: token $token" -H "Content-Type: application/json" \
|
||||
"$GITEA/api/v1/repos/$repo/issues/$issue" -d "{\"assignees\":[\"$agent\"]}" | python3 -c "
|
||||
import json,sys
|
||||
prs=json.loads(sys.stdin.read())
|
||||
for p in prs: print(f' #{p[\"number\"]:4d} {p[\"user\"][\"login\"]:8s} {p[\"title\"][:60]}')
|
||||
if not prs: print(' (none)')
|
||||
d=json.loads(sys.stdin.read())
|
||||
names=','.join(a.get('login','') for a in (d.get('assignees') or []))
|
||||
print(f' ✓ #{d.get(\"number\", \"?\")} assigned to {names or \"(none)\"}')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
ops-unassign() {
|
||||
local issue="$1"
|
||||
local repo="${2:-$OPS_DEFAULT_REPO}"
|
||||
local token
|
||||
[ -z "$issue" ] && { echo "Usage: ops-unassign ISSUE_NUMBER [owner/repo]"; return 1; }
|
||||
token=$(ops-token) || { echo "No Gitea token found"; return 1; }
|
||||
curl -s -X PATCH -H "Authorization: token $token" -H "Content-Type: application/json" \
|
||||
"$GITEA/api/v1/repos/$repo/issues/$issue" -d '{"assignees":[]}' | python3 -c "
|
||||
import json,sys
|
||||
d=json.loads(sys.stdin.read())
|
||||
print(f' ✓ #{d.get(\"number\", \"?\")} unassigned')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
ops-queue() {
|
||||
curl -s -H "Authorization: token $TOKEN" "$REPO_API/issues?state=open&limit=50&type=issues" | python3 -c "
|
||||
import json,sys
|
||||
all_issues=json.loads(sys.stdin.read())
|
||||
issues=[i for i in all_issues if 'kimi' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
for i in issues: print(f' #{i[\"number\"]:4d} {i[\"title\"][:60]}')
|
||||
if not issues: print(' (empty)')
|
||||
" 2>/dev/null
|
||||
}
|
||||
local agent="$1"
|
||||
local target="${2:-all}"
|
||||
[ -z "$agent" ] && { echo "Usage: ops-queue AGENT [repo|all]"; return 1; }
|
||||
ops-python "$GITEA" "$OPS_CORE_REPOS" "$agent" "$target" <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
ops-kill-kimi() {
|
||||
pkill -f "kimi-loop.sh" 2>/dev/null
|
||||
pkill -f "kimi.*--print" 2>/dev/null
|
||||
echo " Kimi stopped"
|
||||
}
|
||||
base = sys.argv[1].rstrip("/")
|
||||
repos = sys.argv[2].split()
|
||||
agent = sys.argv[3]
|
||||
target = sys.argv[4]
|
||||
token = os.environ["OPS_TOKEN"]
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
|
||||
ops-kill-claude() {
|
||||
pkill -f "claude-loop.sh" 2>/dev/null
|
||||
pkill -f "claude.*--print.*--dangerously" 2>/dev/null
|
||||
rm -rf ~/.hermes/logs/claude-locks/*.lock 2>/dev/null
|
||||
echo '{}' > ~/.hermes/logs/claude-active.json 2>/dev/null
|
||||
echo " Claude stopped (all workers)"
|
||||
}
|
||||
if target != "all":
|
||||
repos = [target]
|
||||
|
||||
ops-kill-gemini() {
|
||||
pkill -f "gemini-loop.sh" 2>/dev/null
|
||||
pkill -f "gemini.*--print" 2>/dev/null
|
||||
echo " Gemini stopped"
|
||||
}
|
||||
|
||||
ops-assign-claude() {
|
||||
local issue=$1
|
||||
local repo="${2:-rockachopa/Timmy-time-dashboard}"
|
||||
[ -z "$issue" ] && { echo "Usage: ops-assign-claude ISSUE_NUMBER [owner/repo]"; return 1; }
|
||||
curl -s -X PATCH -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
|
||||
"$GITEA/api/v1/repos/$repo/issues/$issue" -d '{"assignees":["claude"]}' | python3 -c "
|
||||
import json,sys; d=json.loads(sys.stdin.read()); print(f' ✓ #{$issue} assigned to claude')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
ops-claude-queue() {
|
||||
python3 -c "
|
||||
import json, urllib.request
|
||||
token=*** ~/.hermes/claude_token 2>/dev/null)'
|
||||
base = 'http://143.198.27.163:3000'
|
||||
repos = ['rockachopa/Timmy-time-dashboard','rockachopa/alexanderwhitestone.com','replit/timmy-tower','replit/token-gated-economy','rockachopa/hermes-agent']
|
||||
rows = []
|
||||
for repo in repos:
|
||||
url = f'{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues'
|
||||
try:
|
||||
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||
resp = urllib.request.urlopen(req, timeout=5)
|
||||
raw = json.loads(resp.read())
|
||||
issues = [i for i in raw if 'claude' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
for i in issues:
|
||||
print(f' #{i[\"number\"]:4d} {repo.split(\"/\")[1]:20s} {i[\"title\"][:50]}')
|
||||
except: continue
|
||||
" 2>/dev/null || echo " (error)"
|
||||
req = urllib.request.Request(
|
||||
f"{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues",
|
||||
headers=headers,
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
for issue in json.loads(resp.read().decode()):
|
||||
assignees = [a.get("login", "") for a in (issue.get("assignees") or [])]
|
||||
if agent in assignees:
|
||||
rows.append((repo, issue["number"], issue["title"]))
|
||||
|
||||
if not rows:
|
||||
print(" (empty)")
|
||||
else:
|
||||
for repo, number, title in rows:
|
||||
print(f" #{number:4d} {repo.split('/', 1)[1]:12s} {title[:60]}")
|
||||
PY
|
||||
}
|
||||
|
||||
ops-assign-gemini() {
|
||||
local issue=$1
|
||||
local repo="${2:-rockachopa/Timmy-time-dashboard}"
|
||||
[ -z "$issue" ] && { echo "Usage: ops-assign-gemini ISSUE_NUMBER [owner/repo]"; return 1; }
|
||||
curl -s -X PATCH -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
|
||||
"$GITEA/api/v1/repos/$repo/issues/$issue" -d '{"assignees":["gemini"]}' | python3 -c "
|
||||
import json,sys; d=json.loads(sys.stdin.read()); print(f' ✓ #{$issue} assigned to gemini')
|
||||
" 2>/dev/null
|
||||
ops-unassigned() {
|
||||
local target="${1:-all}"
|
||||
ops-python "$GITEA" "$OPS_CORE_REPOS" "$target" <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
base = sys.argv[1].rstrip("/")
|
||||
repos = sys.argv[2].split()
|
||||
target = sys.argv[3]
|
||||
token = os.environ["OPS_TOKEN"]
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
|
||||
if target != "all":
|
||||
repos = [target]
|
||||
|
||||
rows = []
|
||||
for repo in repos:
|
||||
req = urllib.request.Request(
|
||||
f"{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues",
|
||||
headers=headers,
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
for issue in json.loads(resp.read().decode()):
|
||||
if not issue.get("assignees"):
|
||||
rows.append((repo, issue["number"], issue["title"]))
|
||||
|
||||
if not rows:
|
||||
print(" (none)")
|
||||
else:
|
||||
for repo, number, title in rows[:20]:
|
||||
print(f" #{number:4d} {repo.split('/', 1)[1]:12s} {title[:60]}")
|
||||
if len(rows) > 20:
|
||||
print(f" ... +{len(rows) - 20} more")
|
||||
PY
|
||||
}
|
||||
|
||||
ops-gemini-queue() {
|
||||
curl -s -H "Authorization: token $TOKEN" "$REPO_API/issues?state=open&limit=50&type=issues" | python3 -c "
|
||||
ops-merge() {
|
||||
local pr="$1"
|
||||
local repo="${2:-$OPS_DEFAULT_REPO}"
|
||||
local token
|
||||
[ -z "$pr" ] && { echo "Usage: ops-merge PR_NUMBER [owner/repo]"; return 1; }
|
||||
token=$(ops-token) || { echo "No Gitea token found"; return 1; }
|
||||
curl -s -X POST -H "Authorization: token $token" -H "Content-Type: application/json" \
|
||||
"$GITEA/api/v1/repos/$repo/pulls/$pr/merge" -d '{"Do":"squash"}' | python3 -c "
|
||||
import json,sys
|
||||
all_issues=json.loads(sys.stdin.read())
|
||||
issues=[i for i in all_issues if 'gemini' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
for i in issues: print(f' #{i[\"number\"]:4d} {i[\"title\"][:60]}')
|
||||
if not issues: print(' (empty)')
|
||||
d=json.loads(sys.stdin.read())
|
||||
if 'sha' in d:
|
||||
print(f' ✓ PR merged ({d[\"sha\"][:8]})')
|
||||
else:
|
||||
print(f' ✗ {d.get(\"message\", \"unknown error\")}')
|
||||
" 2>/dev/null
|
||||
}
|
||||
|
||||
ops-kill-zombies() {
|
||||
local killed=0
|
||||
for pid in $(ps aux | grep "pytest tests/" | grep -v grep | awk '{print $2}'); do
|
||||
kill "$pid" 2>/dev/null && killed=$((killed+1))
|
||||
done
|
||||
for pid in $(ps aux | grep "git.*push\|git-remote-http" | grep -v grep | awk '{print $2}'); do
|
||||
kill "$pid" 2>/dev/null && killed=$((killed+1))
|
||||
done
|
||||
echo " Killed $killed zombie processes"
|
||||
ops-gitea-feed() {
|
||||
bash "$HOME/.hermes/bin/ops-gitea.sh"
|
||||
}
|
||||
|
||||
ops-wake-timmy() {
|
||||
pkill -f "timmy-orchestrator.sh" 2>/dev/null
|
||||
rm -f ~/.hermes/logs/timmy-orchestrator.pid
|
||||
sleep 1
|
||||
nohup bash ~/.hermes/bin/timmy-orchestrator.sh >> ~/.hermes/logs/timmy-orchestrator.log 2>&1 &
|
||||
echo " Timmy orchestrator started (PID $!)"
|
||||
ops-freshness() {
|
||||
bash "$HOME/.hermes/bin/pipeline-freshness.sh"
|
||||
}
|
||||
|
||||
ops-kill-timmy() {
|
||||
pkill -f "timmy-orchestrator.sh" 2>/dev/null
|
||||
rm -f ~/.hermes/logs/timmy-orchestrator.pid
|
||||
echo " Timmy stopped"
|
||||
}
|
||||
|
||||
ops-wake-watchdog() {
|
||||
pkill -f "loop-watchdog.sh" 2>/dev/null
|
||||
sleep 1
|
||||
nohup bash ~/.hermes/bin/loop-watchdog.sh >> ~/.hermes/logs/watchdog.log 2>&1 &
|
||||
echo " Watchdog started (PID $!)"
|
||||
}
|
||||
|
||||
ops-kill-watchdog() {
|
||||
pkill -f "loop-watchdog.sh" 2>/dev/null
|
||||
echo " Watchdog stopped"
|
||||
}
|
||||
ops-assign-allegro() { ops-assign "$1" "allegro" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
ops-assign-codex() { ops-assign "$1" "codex-agent" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
ops-assign-groq() { ops-assign "$1" "groq" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
ops-assign-claude() { ops-assign "$1" "claude" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
ops-assign-ezra() { ops-assign "$1" "ezra" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
ops-assign-perplexity() { ops-assign "$1" "perplexity" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
ops-assign-kimiclaw() { ops-assign "$1" "KimiClaw" "${2:-$OPS_DEFAULT_REPO}"; }
|
||||
|
||||
450
bin/ops-panel.sh
450
bin/ops-panel.sh
@@ -1,300 +1,224 @@
|
||||
#!/usr/bin/env bash
|
||||
# ── Consolidated Ops Panel ─────────────────────────────────────────────
|
||||
# Everything in one view. Designed for a half-screen pane (~100x45).
|
||||
# ── Workflow Ops Panel ─────────────────────────────────────────────────
|
||||
# Current-state dashboard for review, dispatch, and freshness.
|
||||
# This intentionally reflects the post-loop, Hermes-sidecar workflow.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
B='\033[1m' ; D='\033[2m' ; R='\033[0m' ; U='\033[4m'
|
||||
G='\033[32m' ; Y='\033[33m' ; RD='\033[31m' ; C='\033[36m' ; M='\033[35m' ; W='\033[37m'
|
||||
OK="${G}●${R}" ; WARN="${Y}●${R}" ; FAIL="${RD}●${R}" ; OFF="${D}○${R}"
|
||||
set -euo pipefail
|
||||
|
||||
TOKEN=$(cat ~/.hermes/gitea_token_vps 2>/dev/null)
|
||||
API="http://143.198.27.163:3000/api/v1/repos/rockachopa/Timmy-time-dashboard"
|
||||
B='\033[1m'
|
||||
D='\033[2m'
|
||||
R='\033[0m'
|
||||
U='\033[4m'
|
||||
G='\033[32m'
|
||||
Y='\033[33m'
|
||||
RD='\033[31m'
|
||||
M='\033[35m'
|
||||
OK="${G}●${R}"
|
||||
WARN="${Y}●${R}"
|
||||
FAIL="${RD}●${R}"
|
||||
|
||||
resolve_gitea_url() {
|
||||
if [ -n "${GITEA_URL:-}" ]; then
|
||||
printf '%s\n' "${GITEA_URL%/}"
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.hermes/gitea_api" ]; then
|
||||
python3 - "$HOME/.hermes/gitea_api" <<'PY'
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
raw = Path(sys.argv[1]).read_text().strip().rstrip("/")
|
||||
print(raw[:-7] if raw.endswith("/api/v1") else raw)
|
||||
PY
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.config/gitea/base-url" ]; then
|
||||
tr -d '[:space:]' < "$HOME/.config/gitea/base-url"
|
||||
return 0
|
||||
fi
|
||||
echo "ERROR: set GITEA_URL or create ~/.hermes/gitea_api" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
resolve_ops_token() {
|
||||
local token_file
|
||||
for token_file in \
|
||||
"$HOME/.config/gitea/timmy-token" \
|
||||
"$HOME/.hermes/gitea_token_vps" \
|
||||
"$HOME/.hermes/gitea_token_timmy"; do
|
||||
if [ -f "$token_file" ]; then
|
||||
tr -d '[:space:]' < "$token_file"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
GITEA_URL="$(resolve_gitea_url)"
|
||||
CORE_REPOS="${CORE_REPOS:-Timmy_Foundation/the-nexus Timmy_Foundation/timmy-home Timmy_Foundation/timmy-config Timmy_Foundation/hermes-agent}"
|
||||
TOKEN="$(resolve_ops_token || true)"
|
||||
[ -z "$TOKEN" ] && echo "WARN: no approved Timmy Gitea token found; panel will use unauthenticated API calls" >&2
|
||||
|
||||
# ── HEADER ─────────────────────────────────────────────────────────────
|
||||
echo ""
|
||||
echo -e " ${B}${M}◈ HERMES OPERATIONS${R} ${D}$(date '+%a %b %d %H:%M:%S')${R}"
|
||||
echo -e " ${B}${M}◈ WORKFLOW OPERATIONS${R} ${D}$(date '+%a %b %d %H:%M:%S')${R}"
|
||||
echo -e " ${D}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${R}"
|
||||
echo ""
|
||||
|
||||
# ── SERVICES ───────────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}SERVICES${R}"
|
||||
echo ""
|
||||
|
||||
# Gateway
|
||||
GW_PID=$(pgrep -f "hermes.*gateway.*run" 2>/dev/null | head -1)
|
||||
[ -n "$GW_PID" ] && echo -e " ${OK} Gateway ${D}pid $GW_PID${R}" \
|
||||
|| echo -e " ${FAIL} Gateway ${RD}DOWN — run: hermes gateway start${R}"
|
||||
|
||||
# Kimi Code loop
|
||||
KIMI_PID=$(pgrep -f "kimi-loop.sh" 2>/dev/null | head -1)
|
||||
[ -n "$KIMI_PID" ] && echo -e " ${OK} Kimi Loop ${D}pid $KIMI_PID${R}" \
|
||||
|| echo -e " ${FAIL} Kimi Loop ${RD}DOWN — run: ops-wake-kimi${R}"
|
||||
|
||||
# Active Kimi Code worker
|
||||
KIMI_WORK=$(pgrep -f "kimi.*--print" 2>/dev/null | head -1)
|
||||
if [ -n "$KIMI_WORK" ]; then
|
||||
echo -e " ${OK} Kimi Code ${D}pid $KIMI_WORK ${G}working${R}"
|
||||
elif [ -n "$KIMI_PID" ]; then
|
||||
echo -e " ${WARN} Kimi Code ${Y}between issues${R}"
|
||||
GW_PID=$(pgrep -f "hermes.*gateway.*run" 2>/dev/null | head -1 || true)
|
||||
if [ -n "${GW_PID:-}" ]; then
|
||||
echo -e " ${OK} Hermes Gateway ${D}pid $GW_PID${R}"
|
||||
else
|
||||
echo -e " ${OFF} Kimi Code ${D}not running${R}"
|
||||
echo -e " ${FAIL} Hermes Gateway ${RD}down${R}"
|
||||
fi
|
||||
|
||||
# Claude Code loop (parallel workers)
|
||||
CLAUDE_PID=$(pgrep -f "claude-loop.sh" 2>/dev/null | head -1)
|
||||
CLAUDE_WORKERS=$(pgrep -f "claude.*--print.*--dangerously" 2>/dev/null | wc -l | tr -d ' ')
|
||||
if [ -n "$CLAUDE_PID" ]; then
|
||||
echo -e " ${OK} Claude Loop ${D}pid $CLAUDE_PID ${G}${CLAUDE_WORKERS} workers active${R}"
|
||||
if curl -s --max-time 3 "$GITEA_URL/api/v1/version" >/dev/null 2>&1; then
|
||||
echo -e " ${OK} Gitea ${D}${GITEA_URL}${R}"
|
||||
else
|
||||
echo -e " ${FAIL} Claude Loop ${RD}DOWN — run: ops-wake-claude${R}"
|
||||
echo -e " ${FAIL} Gitea ${RD}unreachable${R}"
|
||||
fi
|
||||
|
||||
# Gemini Code loop
|
||||
GEMINI_PID=$(pgrep -f "gemini-loop.sh" 2>/dev/null | head -1)
|
||||
GEMINI_WORK=$(pgrep -f "gemini.*--print" 2>/dev/null | head -1)
|
||||
if [ -n "$GEMINI_PID" ]; then
|
||||
if [ -n "$GEMINI_WORK" ]; then
|
||||
echo -e " ${OK} Gemini Loop ${D}pid $GEMINI_PID ${G}working${R}"
|
||||
else
|
||||
echo -e " ${WARN} Gemini Loop ${D}pid $GEMINI_PID ${Y}between issues${R}"
|
||||
fi
|
||||
if hermes cron list >/dev/null 2>&1; then
|
||||
echo -e " ${OK} Hermes Cron ${D}reachable${R}"
|
||||
else
|
||||
echo -e " ${FAIL} Gemini Loop ${RD}DOWN — run: ops-wake-gemini${R}"
|
||||
echo -e " ${WARN} Hermes Cron ${Y}not responding${R}"
|
||||
fi
|
||||
|
||||
# Timmy Orchestrator
|
||||
TIMMY_PID=$(pgrep -f "timmy-orchestrator.sh" 2>/dev/null | head -1)
|
||||
if [ -n "$TIMMY_PID" ]; then
|
||||
TIMMY_LAST=$(tail -1 "$HOME/.hermes/logs/timmy-orchestrator.log" 2>/dev/null | sed 's/.*TIMMY: //')
|
||||
echo -e " ${OK} Timmy (Ollama) ${D}pid $TIMMY_PID ${G}${TIMMY_LAST:0:30}${R}"
|
||||
FRESHNESS_OUTPUT=$("$HOME/.hermes/bin/pipeline-freshness.sh" 2>/dev/null || true)
|
||||
FRESHNESS_STATUS=$(printf '%s\n' "$FRESHNESS_OUTPUT" | awk -F= '/^status=/{print $2}')
|
||||
FRESHNESS_REASON=$(printf '%s\n' "$FRESHNESS_OUTPUT" | awk -F= '/^reason=/{print $2}')
|
||||
if [ "$FRESHNESS_STATUS" = "ok" ]; then
|
||||
echo -e " ${OK} Export Freshness ${D}${FRESHNESS_REASON:-within freshness window}${R}"
|
||||
elif [ -n "$FRESHNESS_STATUS" ]; then
|
||||
echo -e " ${WARN} Export Freshness ${Y}${FRESHNESS_REASON:-lagging}${R}"
|
||||
else
|
||||
echo -e " ${FAIL} Timmy ${RD}DOWN — run: ops-wake-timmy${R}"
|
||||
fi
|
||||
|
||||
# Gitea VPS
|
||||
if curl -s --max-time 3 "http://143.198.27.163:3000/api/v1/version" >/dev/null 2>&1; then
|
||||
echo -e " ${OK} Gitea VPS ${D}143.198.27.163:3000${R}"
|
||||
else
|
||||
echo -e " ${FAIL} Gitea VPS ${RD}unreachable${R}"
|
||||
fi
|
||||
|
||||
# Matrix staging
|
||||
HTTP=$(curl -s --max-time 3 -o /dev/null -w "%{http_code}" "http://143.198.27.163/")
|
||||
[ "$HTTP" = "200" ] && echo -e " ${OK} Matrix Staging ${D}143.198.27.163${R}" \
|
||||
|| echo -e " ${FAIL} Matrix Staging ${RD}HTTP $HTTP${R}"
|
||||
|
||||
# Dev cycle cron
|
||||
CRON_LINE=$(hermes cron list 2>&1 | grep -B1 "consolidated-dev-cycle" | head -1 2>/dev/null)
|
||||
if echo "$CRON_LINE" | grep -q "active"; then
|
||||
NEXT=$(hermes cron list 2>&1 | grep -A4 "consolidated-dev-cycle" | grep "Next" | awk '{print $NF}' | cut -dT -f2 | cut -d. -f1)
|
||||
echo -e " ${OK} Dev Cycle ${D}every 30m, next ${NEXT:-?}${R}"
|
||||
else
|
||||
echo -e " ${FAIL} Dev Cycle Cron ${RD}MISSING${R}"
|
||||
echo -e " ${WARN} Export Freshness ${Y}unknown${R}"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# ── KIMI STATS ─────────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}KIMI${R}"
|
||||
echo ""
|
||||
KIMI_LOG="$HOME/.hermes/logs/kimi-loop.log"
|
||||
if [ -f "$KIMI_LOG" ]; then
|
||||
COMPLETED=$(grep -c "SUCCESS:" "$KIMI_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
FAILED=$(grep -c "FAILED:" "$KIMI_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
LAST_ISSUE=$(grep "=== ISSUE" "$KIMI_LOG" | tail -1 | sed 's/.*=== //' | sed 's/ ===//')
|
||||
LAST_TIME=$(grep "=== ISSUE\|SUCCESS\|FAILED" "$KIMI_LOG" | tail -1 | cut -d']' -f1 | tr -d '[')
|
||||
RATE=""
|
||||
if [ "$COMPLETED" -gt 0 ] && [ "$FAILED" -gt 0 ]; then
|
||||
TOTAL=$((COMPLETED + FAILED))
|
||||
PCT=$((COMPLETED * 100 / TOTAL))
|
||||
RATE=" (${PCT}% success)"
|
||||
fi
|
||||
echo -e " Completed ${G}${B}$COMPLETED${R} Failed ${RD}$FAILED${R}${D}$RATE${R}"
|
||||
echo -e " Current ${C}$LAST_ISSUE${R}"
|
||||
echo -e " Last seen ${D}$LAST_TIME${R}"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# ── CLAUDE STATS ──────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}CLAUDE${R}"
|
||||
echo ""
|
||||
CLAUDE_LOG="$HOME/.hermes/logs/claude-loop.log"
|
||||
if [ -f "$CLAUDE_LOG" ]; then
|
||||
CL_COMPLETED=$(grep -c "SUCCESS" "$CLAUDE_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
CL_FAILED=$(grep -c "FAILED" "$CLAUDE_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
CL_RATE_LIM=$(grep -c "RATE LIMITED" "$CLAUDE_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
CL_RATE=""
|
||||
if [ "$CL_COMPLETED" -gt 0 ] || [ "$CL_FAILED" -gt 0 ]; then
|
||||
CL_TOTAL=$((CL_COMPLETED + CL_FAILED))
|
||||
[ "$CL_TOTAL" -gt 0 ] && CL_PCT=$((CL_COMPLETED * 100 / CL_TOTAL)) && CL_RATE=" (${CL_PCT}%)"
|
||||
fi
|
||||
echo -e " ${G}${B}$CL_COMPLETED${R} done ${RD}$CL_FAILED${R} fail ${Y}$CL_RATE_LIM${R} rate-limited${D}$CL_RATE${R}"
|
||||
|
||||
# Show active workers
|
||||
ACTIVE="$HOME/.hermes/logs/claude-active.json"
|
||||
if [ -f "$ACTIVE" ]; then
|
||||
python3 -c "
|
||||
python3 - "$GITEA_URL" "$TOKEN" "$CORE_REPOS" <<'PY'
|
||||
import json
|
||||
try:
|
||||
with open('$ACTIVE') as f: active = json.load(f)
|
||||
for wid, info in sorted(active.items()):
|
||||
iss = info.get('issue','')
|
||||
repo = info.get('repo','').split('/')[-1] if info.get('repo') else ''
|
||||
st = info.get('status','')
|
||||
if st == 'working':
|
||||
print(f' \033[36mW{wid}\033[0m \033[33m#{iss}\033[0m \033[2m{repo}\033[0m')
|
||||
elif st == 'idle':
|
||||
print(f' \033[2mW{wid} idle\033[0m')
|
||||
except: pass
|
||||
" 2>/dev/null
|
||||
fi
|
||||
else
|
||||
echo -e " ${D}(no log yet — start with ops-wake-claude)${R}"
|
||||
fi
|
||||
echo ""
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
# ── GEMINI STATS ─────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}GEMINI${R}"
|
||||
echo ""
|
||||
GEMINI_LOG="$HOME/.hermes/logs/gemini-loop.log"
|
||||
if [ -f "$GEMINI_LOG" ]; then
|
||||
GM_COMPLETED=$(grep -c "SUCCESS:" "$GEMINI_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
GM_FAILED=$(grep -c "FAILED:" "$GEMINI_LOG" 2>/dev/null | tail -1 || echo 0)
|
||||
GM_RATE=""
|
||||
if [ "$GM_COMPLETED" -gt 0 ] || [ "$GM_FAILED" -gt 0 ]; then
|
||||
GM_TOTAL=$((GM_COMPLETED + GM_FAILED))
|
||||
[ "$GM_TOTAL" -gt 0 ] && GM_PCT=$((GM_COMPLETED * 100 / GM_TOTAL)) && GM_RATE=" (${GM_PCT}%)"
|
||||
fi
|
||||
GM_LAST=$(grep "=== ISSUE" "$GEMINI_LOG" | tail -1 | sed 's/.*=== //' | sed 's/ ===//')
|
||||
echo -e " ${G}${B}$GM_COMPLETED${R} done ${RD}$GM_FAILED${R} fail${D}$GM_RATE${R}"
|
||||
[ -n "$GM_LAST" ] && echo -e " Current ${C}$GM_LAST${R}"
|
||||
else
|
||||
echo -e " ${D}(no log yet — start with ops-wake-gemini)${R}"
|
||||
fi
|
||||
echo ""
|
||||
base = sys.argv[1].rstrip("/")
|
||||
token = sys.argv[2]
|
||||
repos = sys.argv[3].split()
|
||||
headers = {"Authorization": f"token {token}"} if token else {}
|
||||
|
||||
# ── OPEN PRS ───────────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}PULL REQUESTS${R}"
|
||||
echo ""
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/pulls?state=open&limit=8" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
prs = json.loads(sys.stdin.read())
|
||||
if not prs: print(' \033[2m(none open)\033[0m')
|
||||
for p in prs[:6]:
|
||||
n = p['number']
|
||||
t = p['title'][:55]
|
||||
u = p['user']['login']
|
||||
print(f' \033[33m#{n:<4d}\033[0m \033[2m{u:8s}\033[0m {t}')
|
||||
if len(prs) > 6: print(f' \033[2m... +{len(prs)-6} more\033[0m')
|
||||
except: print(' \033[31m(error fetching)\033[0m')
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# ── RECENTLY MERGED ────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}RECENTLY MERGED${R}"
|
||||
echo ""
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/pulls?state=closed&sort=updated&limit=5" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
prs = json.loads(sys.stdin.read())
|
||||
merged = [p for p in prs if p.get('merged')][:5]
|
||||
if not merged: print(' \033[2m(none recent)\033[0m')
|
||||
for p in merged:
|
||||
n = p['number']
|
||||
t = p['title'][:50]
|
||||
when = p['merged_at'][11:16]
|
||||
print(f' \033[32m✓ #{n:<4d}\033[0m {t} \033[2m{when}\033[0m')
|
||||
except: print(' \033[31m(error)\033[0m')
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
def fetch(path):
|
||||
req = urllib.request.Request(f"{base}{path}", headers=headers)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
return json.loads(resp.read().decode())
|
||||
|
||||
# ── KIMI QUEUE ─────────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}KIMI QUEUE${R}"
|
||||
echo ""
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/issues?state=open&limit=50&type=issues" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
all_issues = json.loads(sys.stdin.read())
|
||||
issues = [i for i in all_issues if 'kimi' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
if not issues: print(' \033[33m⚠ Queue empty — assign more issues to kimi\033[0m')
|
||||
for i in issues[:6]:
|
||||
n = i['number']
|
||||
t = i['title'][:55]
|
||||
print(f' #{n:<4d} {t}')
|
||||
if len(issues) > 6: print(f' \033[2m... +{len(issues)-6} more\033[0m')
|
||||
except: print(' \033[31m(error)\033[0m')
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# ── CLAUDE QUEUE ──────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}CLAUDE QUEUE${R}"
|
||||
echo ""
|
||||
# Claude works across multiple repos
|
||||
python3 -c "
|
||||
import json, sys, urllib.request
|
||||
token = '$(cat ~/.hermes/claude_token 2>/dev/null)'
|
||||
base = 'http://143.198.27.163:3000'
|
||||
repos = ['rockachopa/Timmy-time-dashboard','rockachopa/alexanderwhitestone.com','replit/timmy-tower','replit/token-gated-economy','rockachopa/hermes-agent']
|
||||
all_issues = []
|
||||
def short(repo):
|
||||
return repo.split("/", 1)[1]
|
||||
|
||||
|
||||
issues = []
|
||||
pulls = []
|
||||
review_queue = []
|
||||
errors = []
|
||||
|
||||
for repo in repos:
|
||||
url = f'{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues'
|
||||
try:
|
||||
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||
resp = urllib.request.urlopen(req, timeout=5)
|
||||
raw = json.loads(resp.read())
|
||||
issues = [i for i in raw if 'claude' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
for i in issues:
|
||||
i['_repo'] = repo.split('/')[1]
|
||||
all_issues.extend(issues)
|
||||
except: continue
|
||||
if not all_issues:
|
||||
print(' \033[33m\u26a0 Queue empty \u2014 assign issues to claude\033[0m')
|
||||
repo_pulls = fetch(f"/api/v1/repos/{repo}/pulls?state=open&limit=20")
|
||||
for pr in repo_pulls:
|
||||
pr["_repo"] = repo
|
||||
pulls.append(pr)
|
||||
repo_issues = fetch(f"/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues")
|
||||
for issue in repo_issues:
|
||||
issue["_repo"] = repo
|
||||
issues.append(issue)
|
||||
repo_pull_issues = fetch(f"/api/v1/repos/{repo}/issues?state=open&limit=50&type=pulls")
|
||||
for item in repo_pull_issues:
|
||||
assignees = [a.get("login", "") for a in (item.get("assignees") or [])]
|
||||
if any(name in assignees for name in ("Timmy", "allegro")):
|
||||
item["_repo"] = repo
|
||||
review_queue.append(item)
|
||||
except urllib.error.URLError as exc:
|
||||
errors.append(f"{repo}: {exc.reason}")
|
||||
except Exception as exc: # pragma: no cover - defensive panel path
|
||||
errors.append(f"{repo}: {exc}")
|
||||
|
||||
print(" \033[1m\033[4mREVIEW QUEUE\033[0m\n")
|
||||
if not review_queue:
|
||||
print(" \033[2m(clear)\033[0m\n")
|
||||
else:
|
||||
for i in all_issues[:6]:
|
||||
n = i['number']
|
||||
t = i['title'][:45]
|
||||
r = i['_repo'][:12]
|
||||
print(f' #{n:<4d} \033[2m{r:12s}\033[0m {t}')
|
||||
if len(all_issues) > 6:
|
||||
print(f' \033[2m... +{len(all_issues)-6} more\033[0m')
|
||||
" 2>/dev/null
|
||||
for item in review_queue[:8]:
|
||||
names = ",".join(a.get("login", "") for a in (item.get("assignees") or []))
|
||||
print(f" #{item['number']:<4d} {short(item['_repo']):12s} {names[:20]:20s} {item['title'][:44]}")
|
||||
print()
|
||||
|
||||
print(" \033[1m\033[4mOPEN PRS\033[0m\n")
|
||||
if not pulls:
|
||||
print(" \033[2m(none open)\033[0m\n")
|
||||
else:
|
||||
for pr in pulls[:8]:
|
||||
print(f" #{pr['number']:<4d} {short(pr['_repo']):12s} {pr['user']['login'][:12]:12s} {pr['title'][:48]}")
|
||||
print()
|
||||
|
||||
print(" \033[1m\033[4mDISPATCH QUEUES\033[0m\n")
|
||||
queue_agents = [
|
||||
("allegro", "dispatch"),
|
||||
("codex-agent", "cleanup"),
|
||||
("groq", "fast ship"),
|
||||
("claude", "refactor"),
|
||||
("ezra", "archive"),
|
||||
("perplexity", "research"),
|
||||
("KimiClaw", "digest"),
|
||||
]
|
||||
for agent, label in queue_agents:
|
||||
assigned = [
|
||||
issue
|
||||
for issue in issues
|
||||
if agent in [a.get("login", "") for a in (issue.get("assignees") or [])]
|
||||
]
|
||||
print(f" {agent:12s} {len(assigned):2d} \033[2m{label}\033[0m")
|
||||
print()
|
||||
|
||||
unassigned = [issue for issue in issues if not issue.get("assignees")]
|
||||
stale_cutoff = (datetime.now(timezone.utc) - timedelta(days=2)).strftime("%Y-%m-%d")
|
||||
stale_prs = [pr for pr in pulls if pr.get("updated_at", "")[:10] < stale_cutoff]
|
||||
overloaded = []
|
||||
for agent in ("allegro", "codex-agent", "groq", "claude", "ezra", "perplexity", "KimiClaw"):
|
||||
count = sum(
|
||||
1
|
||||
for issue in issues
|
||||
if agent in [a.get("login", "") for a in (issue.get("assignees") or [])]
|
||||
)
|
||||
if count > 3:
|
||||
overloaded.append((agent, count))
|
||||
|
||||
print(" \033[1m\033[4mWARNINGS\033[0m\n")
|
||||
warns = []
|
||||
if len(unassigned) > 10:
|
||||
warns.append(f"{len(unassigned)} unassigned issues across core repos")
|
||||
if stale_prs:
|
||||
warns.append(f"{len(stale_prs)} open PRs look stale and may need a review nudge")
|
||||
for agent, count in overloaded:
|
||||
warns.append(f"{agent} has {count} assigned issues; rebalance dispatch")
|
||||
|
||||
if warns:
|
||||
for warn in warns:
|
||||
print(f" \033[33m⚠ {warn}\033[0m")
|
||||
else:
|
||||
print(" \033[2m(no major workflow warnings)\033[0m")
|
||||
|
||||
if errors:
|
||||
print("\n \033[1m\033[4mFETCH ERRORS\033[0m\n")
|
||||
for err in errors[:4]:
|
||||
print(f" \033[31m{err}\033[0m")
|
||||
PY
|
||||
|
||||
echo ""
|
||||
|
||||
# ── GEMINI QUEUE ─────────────────────────────────────────────────────
|
||||
echo -e " ${B}${U}GEMINI QUEUE${R}"
|
||||
echo ""
|
||||
curl -s --max-time 5 -H "Authorization: token $TOKEN" "$API/issues?state=open&limit=50&type=issues" 2>/dev/null | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
all_issues = json.loads(sys.stdin.read())
|
||||
issues = [i for i in all_issues if 'gemini' in [a.get('login','') for a in (i.get('assignees') or [])]]
|
||||
if not issues: print(' \033[33m⚠ Queue empty — assign issues to gemini\033[0m')
|
||||
for i in issues[:6]:
|
||||
n = i['number']
|
||||
t = i['title'][:55]
|
||||
print(f' #{n:<4d} {t}')
|
||||
if len(issues) > 6: print(f' \033[2m... +{len(issues)-6} more\033[0m')
|
||||
except: print(' \033[31m(error)\033[0m')
|
||||
" 2>/dev/null
|
||||
echo ""
|
||||
|
||||
# ── WARNINGS ───────────────────────────────────────────────────────────
|
||||
HERMES_PROCS=$(ps aux | grep -E "hermes.*python" | grep -v grep | wc -l | tr -d ' ')
|
||||
STUCK_GIT=$(ps aux | grep "git.*push\|git-remote-http" | grep -v grep | wc -l | tr -d ' ')
|
||||
ORPHAN_PY=$(ps aux | grep "pytest tests/" | grep -v grep | wc -l | tr -d ' ')
|
||||
UNASSIGNED=$(curl -s --max-time 3 -H "Authorization: token $TOKEN" "$API/issues?state=open&limit=50&type=issues" 2>/dev/null | python3 -c "import json,sys; issues=json.loads(sys.stdin.read()); print(len([i for i in issues if not i.get('assignees')]))" 2>/dev/null)
|
||||
|
||||
WARNS=""
|
||||
[ "$STUCK_GIT" -gt 0 ] && WARNS+=" ${RD}⚠ $STUCK_GIT stuck git processes${R}\n"
|
||||
[ "$ORPHAN_PY" -gt 0 ] && WARNS+=" ${Y}⚠ $ORPHAN_PY orphaned pytest runs${R}\n"
|
||||
[ "${UNASSIGNED:-0}" -gt 10 ] && WARNS+=" ${Y}⚠ $UNASSIGNED unassigned issues — feed the queue${R}\n"
|
||||
|
||||
if [ -n "$WARNS" ]; then
|
||||
echo -e " ${B}${U}WARNINGS${R}"
|
||||
echo ""
|
||||
echo -e "$WARNS"
|
||||
fi
|
||||
|
||||
echo -e " ${D}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${R}"
|
||||
echo -e " ${D}hermes sessions: $HERMES_PROCS unassigned: ${UNASSIGNED:-?} ↻ 20s${R}"
|
||||
echo -e " ${D}repos: $(printf '%s' "$CORE_REPOS" | wc -w | tr -d ' ') refresh via watch or rerun script${R}"
|
||||
|
||||
191
bin/pr-checklist.py
Normal file
191
bin/pr-checklist.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""pr-checklist.py -- Automated PR quality gate for Gitea CI.
|
||||
|
||||
Enforces the review standards that agents skip when left to self-approve.
|
||||
Runs in CI on every pull_request event. Exits non-zero on any failure.
|
||||
|
||||
Checks:
|
||||
1. PR has >0 file changes (no empty PRs)
|
||||
2. PR branch is not behind base branch
|
||||
3. PR does not bundle >3 unrelated issues
|
||||
4. Changed .py files pass syntax check (python -c import)
|
||||
5. Changed .sh files are executable
|
||||
6. PR body references an issue number
|
||||
7. At least 1 non-author review exists (warning only)
|
||||
|
||||
Refs: #393 (PERPLEXITY-08), Epic #385
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def fail(msg: str) -> None:
|
||||
print(f"FAIL: {msg}", file=sys.stderr)
|
||||
|
||||
|
||||
def warn(msg: str) -> None:
|
||||
print(f"WARN: {msg}", file=sys.stderr)
|
||||
|
||||
|
||||
def ok(msg: str) -> None:
|
||||
print(f" OK: {msg}")
|
||||
|
||||
|
||||
def get_changed_files() -> list[str]:
|
||||
"""Return list of files changed in this PR vs base branch."""
|
||||
base = os.environ.get("GITHUB_BASE_REF", "main")
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "diff", "--name-only", f"origin/{base}...HEAD"],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
return [f for f in result.stdout.strip().splitlines() if f]
|
||||
except subprocess.CalledProcessError:
|
||||
# Fallback: diff against HEAD~1
|
||||
result = subprocess.run(
|
||||
["git", "diff", "--name-only", "HEAD~1"],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
return [f for f in result.stdout.strip().splitlines() if f]
|
||||
|
||||
|
||||
def check_has_changes(files: list[str]) -> bool:
|
||||
"""Check 1: PR has >0 file changes."""
|
||||
if not files:
|
||||
fail("PR has 0 file changes. Empty PRs are not allowed.")
|
||||
return False
|
||||
ok(f"PR changes {len(files)} file(s)")
|
||||
return True
|
||||
|
||||
|
||||
def check_not_behind_base() -> bool:
|
||||
"""Check 2: PR branch is not behind base."""
|
||||
base = os.environ.get("GITHUB_BASE_REF", "main")
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "rev-list", "--count", f"HEAD..origin/{base}"],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
behind = int(result.stdout.strip())
|
||||
if behind > 0:
|
||||
fail(f"Branch is {behind} commit(s) behind {base}. Rebase or merge.")
|
||||
return False
|
||||
ok(f"Branch is up-to-date with {base}")
|
||||
return True
|
||||
except (subprocess.CalledProcessError, ValueError):
|
||||
warn("Could not determine if branch is behind base (git fetch may be needed)")
|
||||
return True # Don't block on CI fetch issues
|
||||
|
||||
|
||||
def check_issue_bundling(pr_body: str) -> bool:
|
||||
"""Check 3: PR does not bundle >3 unrelated issues."""
|
||||
issue_refs = set(re.findall(r"#(\d+)", pr_body))
|
||||
if len(issue_refs) > 3:
|
||||
fail(f"PR references {len(issue_refs)} issues ({', '.join(sorted(issue_refs))}). "
|
||||
"Max 3 per PR to prevent bundling. Split into separate PRs.")
|
||||
return False
|
||||
ok(f"PR references {len(issue_refs)} issue(s) (max 3)")
|
||||
return True
|
||||
|
||||
|
||||
def check_python_syntax(files: list[str]) -> bool:
|
||||
"""Check 4: Changed .py files have valid syntax."""
|
||||
py_files = [f for f in files if f.endswith(".py") and Path(f).exists()]
|
||||
if not py_files:
|
||||
ok("No Python files changed")
|
||||
return True
|
||||
|
||||
all_ok = True
|
||||
for f in py_files:
|
||||
result = subprocess.run(
|
||||
[sys.executable, "-c", f"import ast; ast.parse(open('{f}').read())"],
|
||||
capture_output=True, text=True,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
fail(f"Syntax error in {f}: {result.stderr.strip()[:200]}")
|
||||
all_ok = False
|
||||
|
||||
if all_ok:
|
||||
ok(f"All {len(py_files)} Python file(s) pass syntax check")
|
||||
return all_ok
|
||||
|
||||
|
||||
def check_shell_executable(files: list[str]) -> bool:
|
||||
"""Check 5: Changed .sh files are executable."""
|
||||
sh_files = [f for f in files if f.endswith(".sh") and Path(f).exists()]
|
||||
if not sh_files:
|
||||
ok("No shell scripts changed")
|
||||
return True
|
||||
|
||||
all_ok = True
|
||||
for f in sh_files:
|
||||
if not os.access(f, os.X_OK):
|
||||
fail(f"{f} is not executable. Run: chmod +x {f}")
|
||||
all_ok = False
|
||||
|
||||
if all_ok:
|
||||
ok(f"All {len(sh_files)} shell script(s) are executable")
|
||||
return all_ok
|
||||
|
||||
|
||||
def check_issue_reference(pr_body: str) -> bool:
|
||||
"""Check 6: PR body references an issue number."""
|
||||
if re.search(r"#\d+", pr_body):
|
||||
ok("PR body references at least one issue")
|
||||
return True
|
||||
fail("PR body does not reference any issue (e.g. #123). "
|
||||
"Every PR must trace to an issue.")
|
||||
return False
|
||||
|
||||
|
||||
def main() -> int:
|
||||
print("=" * 60)
|
||||
print("PR Checklist — Automated Quality Gate")
|
||||
print("=" * 60)
|
||||
print()
|
||||
|
||||
# Get PR body from env or git log
|
||||
pr_body = os.environ.get("PR_BODY", "")
|
||||
if not pr_body:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "log", "--format=%B", "-1"],
|
||||
capture_output=True, text=True, check=True,
|
||||
)
|
||||
pr_body = result.stdout
|
||||
except subprocess.CalledProcessError:
|
||||
pr_body = ""
|
||||
|
||||
files = get_changed_files()
|
||||
failures = 0
|
||||
|
||||
checks = [
|
||||
check_has_changes(files),
|
||||
check_not_behind_base(),
|
||||
check_issue_bundling(pr_body),
|
||||
check_python_syntax(files),
|
||||
check_shell_executable(files),
|
||||
check_issue_reference(pr_body),
|
||||
]
|
||||
|
||||
failures = sum(1 for c in checks if not c)
|
||||
|
||||
print()
|
||||
print("=" * 60)
|
||||
if failures:
|
||||
print(f"RESULT: {failures} check(s) FAILED")
|
||||
print("Fix the issues above and push again.")
|
||||
return 1
|
||||
else:
|
||||
print("RESULT: All checks passed")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
360
bin/timmy-dashboard
Executable file → Normal file
360
bin/timmy-dashboard
Executable file → Normal file
@@ -1,20 +1,19 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Timmy Model Dashboard — where are my models, what are they doing.
|
||||
"""Timmy workflow dashboard.
|
||||
|
||||
Usage:
|
||||
timmy-dashboard # one-shot
|
||||
timmy-dashboard --watch # live refresh every 30s
|
||||
timmy-dashboard --hours=48 # look back 48h
|
||||
Shows current workflow state from the active local surfaces instead of the
|
||||
archived dashboard/loop era, while preserving useful local/session metrics.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
@@ -26,37 +25,97 @@ from metrics_helpers import summarize_local_metrics, summarize_session_rows
|
||||
HERMES_HOME = Path.home() / ".hermes"
|
||||
TIMMY_HOME = Path.home() / ".timmy"
|
||||
METRICS_DIR = TIMMY_HOME / "metrics"
|
||||
CORE_REPOS = [
|
||||
"Timmy_Foundation/the-nexus",
|
||||
"Timmy_Foundation/timmy-home",
|
||||
"Timmy_Foundation/timmy-config",
|
||||
"Timmy_Foundation/hermes-agent",
|
||||
]
|
||||
def resolve_gitea_url() -> str:
|
||||
env = os.environ.get("GITEA_URL")
|
||||
if env:
|
||||
return env.rstrip("/")
|
||||
api_hint = HERMES_HOME / "gitea_api"
|
||||
if api_hint.exists():
|
||||
raw = api_hint.read_text().strip().rstrip("/")
|
||||
return raw[:-7] if raw.endswith("/api/v1") else raw
|
||||
base_url = Path.home() / ".config" / "gitea" / "base-url"
|
||||
if base_url.exists():
|
||||
return base_url.read_text().strip().rstrip("/")
|
||||
raise FileNotFoundError("Set GITEA_URL or create ~/.hermes/gitea_api")
|
||||
|
||||
# ── Data Sources ──────────────────────────────────────────────────────
|
||||
|
||||
def get_ollama_models():
|
||||
GITEA_URL = resolve_gitea_url()
|
||||
|
||||
|
||||
def read_token() -> str | None:
|
||||
for path in [
|
||||
Path.home() / ".config" / "gitea" / "timmy-token",
|
||||
Path.home() / ".hermes" / "gitea_token_vps",
|
||||
Path.home() / ".hermes" / "gitea_token_timmy",
|
||||
]:
|
||||
if path.exists():
|
||||
return path.read_text().strip()
|
||||
return None
|
||||
|
||||
|
||||
def gitea_get(path: str, token: str | None) -> list | dict:
|
||||
headers = {"Authorization": f"token {token}"} if token else {}
|
||||
req = urllib.request.Request(f"{GITEA_URL}/api/v1{path}", headers=headers)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
return json.loads(resp.read().decode())
|
||||
|
||||
|
||||
def get_model_health() -> dict:
|
||||
path = HERMES_HOME / "model_health.json"
|
||||
if not path.exists():
|
||||
return {}
|
||||
try:
|
||||
req = urllib.request.Request("http://localhost:11434/api/tags")
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
return json.loads(resp.read()).get("models", [])
|
||||
return json.loads(path.read_text())
|
||||
except Exception:
|
||||
return []
|
||||
return {}
|
||||
|
||||
|
||||
def get_loaded_models():
|
||||
def get_last_tick() -> dict:
|
||||
path = TIMMY_HOME / "heartbeat" / "last_tick.json"
|
||||
if not path.exists():
|
||||
return {}
|
||||
try:
|
||||
req = urllib.request.Request("http://localhost:11434/api/ps")
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
return json.loads(resp.read()).get("models", [])
|
||||
return json.loads(path.read_text())
|
||||
except Exception:
|
||||
return []
|
||||
return {}
|
||||
|
||||
|
||||
def get_huey_pid():
|
||||
def get_archive_checkpoint() -> dict:
|
||||
path = TIMMY_HOME / "twitter-archive" / "checkpoint.json"
|
||||
if not path.exists():
|
||||
return {}
|
||||
try:
|
||||
r = subprocess.run(["pgrep", "-f", "huey_consumer"],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
return r.stdout.strip().split("\n")[0] if r.returncode == 0 else None
|
||||
return json.loads(path.read_text())
|
||||
except Exception:
|
||||
return None
|
||||
return {}
|
||||
|
||||
|
||||
def get_hermes_sessions():
|
||||
def get_local_metrics(hours: int = 24) -> list[dict]:
|
||||
records = []
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
|
||||
if not METRICS_DIR.exists():
|
||||
return records
|
||||
for path in sorted(METRICS_DIR.glob("local_*.jsonl")):
|
||||
for line in path.read_text().splitlines():
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
ts = datetime.fromisoformat(record["timestamp"])
|
||||
if ts >= cutoff:
|
||||
records.append(record)
|
||||
except Exception:
|
||||
continue
|
||||
return records
|
||||
|
||||
|
||||
def get_hermes_sessions() -> list[dict]:
|
||||
sessions_file = HERMES_HOME / "sessions" / "sessions.json"
|
||||
if not sessions_file.exists():
|
||||
return []
|
||||
@@ -67,7 +126,7 @@ def get_hermes_sessions():
|
||||
return []
|
||||
|
||||
|
||||
def get_session_rows(hours=24):
|
||||
def get_session_rows(hours: int = 24):
|
||||
state_db = HERMES_HOME / "state.db"
|
||||
if not state_db.exists():
|
||||
return []
|
||||
@@ -91,14 +150,14 @@ def get_session_rows(hours=24):
|
||||
return []
|
||||
|
||||
|
||||
def get_heartbeat_ticks(date_str=None):
|
||||
def get_heartbeat_ticks(date_str: str | None = None) -> list[dict]:
|
||||
if not date_str:
|
||||
date_str = datetime.now().strftime("%Y%m%d")
|
||||
tick_file = TIMMY_HOME / "heartbeat" / f"ticks_{date_str}.jsonl"
|
||||
if not tick_file.exists():
|
||||
return []
|
||||
ticks = []
|
||||
for line in tick_file.read_text().strip().split("\n"):
|
||||
for line in tick_file.read_text().splitlines():
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
@@ -108,42 +167,33 @@ def get_heartbeat_ticks(date_str=None):
|
||||
return ticks
|
||||
|
||||
|
||||
def get_local_metrics(hours=24):
|
||||
"""Read local inference metrics from jsonl files."""
|
||||
records = []
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
|
||||
if not METRICS_DIR.exists():
|
||||
return records
|
||||
for f in sorted(METRICS_DIR.glob("local_*.jsonl")):
|
||||
for line in f.read_text().strip().split("\n"):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
r = json.loads(line)
|
||||
ts = datetime.fromisoformat(r["timestamp"])
|
||||
if ts >= cutoff:
|
||||
records.append(r)
|
||||
except Exception:
|
||||
continue
|
||||
return records
|
||||
def get_review_and_issue_state(token: str | None) -> dict:
|
||||
state = {"prs": [], "review_queue": [], "unassigned": 0}
|
||||
for repo in CORE_REPOS:
|
||||
try:
|
||||
prs = gitea_get(f"/repos/{repo}/pulls?state=open&limit=20", token)
|
||||
for pr in prs:
|
||||
pr["_repo"] = repo
|
||||
state["prs"].append(pr)
|
||||
except Exception:
|
||||
continue
|
||||
try:
|
||||
issue_prs = gitea_get(f"/repos/{repo}/issues?state=open&limit=50&type=pulls", token)
|
||||
for item in issue_prs:
|
||||
assignees = [a.get("login", "") for a in (item.get("assignees") or [])]
|
||||
if any(name in assignees for name in ("Timmy", "allegro")):
|
||||
item["_repo"] = repo
|
||||
state["review_queue"].append(item)
|
||||
except Exception:
|
||||
continue
|
||||
try:
|
||||
issues = gitea_get(f"/repos/{repo}/issues?state=open&limit=50&type=issues", token)
|
||||
state["unassigned"] += sum(1 for issue in issues if not issue.get("assignees"))
|
||||
except Exception:
|
||||
continue
|
||||
return state
|
||||
|
||||
|
||||
def get_cron_jobs():
|
||||
"""Get Hermes cron job status."""
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["hermes", "cron", "list", "--json"],
|
||||
capture_output=True, text=True, timeout=10
|
||||
)
|
||||
if r.returncode == 0:
|
||||
return json.loads(r.stdout).get("jobs", [])
|
||||
except Exception:
|
||||
pass
|
||||
return []
|
||||
|
||||
|
||||
# ── Rendering ─────────────────────────────────────────────────────────
|
||||
|
||||
DIM = "\033[2m"
|
||||
BOLD = "\033[1m"
|
||||
GREEN = "\033[32m"
|
||||
@@ -154,119 +204,133 @@ RST = "\033[0m"
|
||||
CLR = "\033[2J\033[H"
|
||||
|
||||
|
||||
def render(hours=24):
|
||||
models = get_ollama_models()
|
||||
loaded = get_loaded_models()
|
||||
huey_pid = get_huey_pid()
|
||||
ticks = get_heartbeat_ticks()
|
||||
def render(hours: int = 24) -> None:
|
||||
token = read_token()
|
||||
metrics = get_local_metrics(hours)
|
||||
local_summary = summarize_local_metrics(metrics)
|
||||
ticks = get_heartbeat_ticks()
|
||||
health = get_model_health()
|
||||
last_tick = get_last_tick()
|
||||
checkpoint = get_archive_checkpoint()
|
||||
sessions = get_hermes_sessions()
|
||||
session_rows = get_session_rows(hours)
|
||||
local_summary = summarize_local_metrics(metrics)
|
||||
session_summary = summarize_session_rows(session_rows)
|
||||
|
||||
loaded_names = {m.get("name", "") for m in loaded}
|
||||
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
gitea = get_review_and_issue_state(token)
|
||||
|
||||
print(CLR, end="")
|
||||
print(f"{BOLD}{'=' * 70}")
|
||||
print(f" TIMMY MODEL DASHBOARD")
|
||||
print(f" {now} | Huey: {GREEN}PID {huey_pid}{RST if huey_pid else f'{RED}DOWN{RST}'}")
|
||||
print(f"{'=' * 70}{RST}")
|
||||
print(f"{BOLD}{'=' * 72}")
|
||||
print(" TIMMY WORKFLOW DASHBOARD")
|
||||
print(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"{'=' * 72}{RST}")
|
||||
|
||||
# ── LOCAL MODELS ──
|
||||
print(f"\n {BOLD}LOCAL MODELS (Ollama){RST}")
|
||||
print(f" {DIM}{'-' * 55}{RST}")
|
||||
if models:
|
||||
for m in models:
|
||||
name = m.get("name", "?")
|
||||
size_gb = m.get("size", 0) / 1e9
|
||||
if name in loaded_names:
|
||||
status = f"{GREEN}IN VRAM{RST}"
|
||||
else:
|
||||
status = f"{DIM}on disk{RST}"
|
||||
print(f" {name:35s} {size_gb:5.1f}GB {status}")
|
||||
print(f"\n {BOLD}HEARTBEAT{RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
if last_tick:
|
||||
sev = last_tick.get("decision", {}).get("severity", "?")
|
||||
tick_id = last_tick.get("tick_id", "?")
|
||||
model_decisions = sum(
|
||||
1
|
||||
for tick in ticks
|
||||
if isinstance(tick.get("decision"), dict)
|
||||
and tick["decision"].get("severity") != "fallback"
|
||||
)
|
||||
print(f" last tick: {tick_id}")
|
||||
print(f" severity: {sev}")
|
||||
print(f" ticks today: {len(ticks)} | model decisions: {model_decisions}")
|
||||
else:
|
||||
print(f" {RED}(Ollama not responding){RST}")
|
||||
print(f" {DIM}(no heartbeat data){RST}")
|
||||
|
||||
# ── LOCAL INFERENCE ACTIVITY ──
|
||||
print(f"\n {BOLD}LOCAL INFERENCE ({len(metrics)} calls, last {hours}h){RST}")
|
||||
print(f" {DIM}{'-' * 55}{RST}")
|
||||
print(f"\n {BOLD}MODEL HEALTH{RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
if health:
|
||||
provider = GREEN if health.get("api_responding") else RED
|
||||
inference = GREEN if health.get("inference_ok") else YELLOW
|
||||
print(f" provider: {provider}{health.get('api_responding')}{RST}")
|
||||
print(f" inference: {inference}{health.get('inference_ok')}{RST}")
|
||||
print(f" models: {', '.join(health.get('models_loaded', [])[:4]) or '(none reported)'}")
|
||||
else:
|
||||
print(f" {DIM}(no model_health.json){RST}")
|
||||
|
||||
print(f"\n {BOLD}ARCHIVE PIPELINE{RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
if checkpoint:
|
||||
print(f" batches completed: {checkpoint.get('batches_completed', '?')}")
|
||||
print(f" next offset: {checkpoint.get('next_offset', '?')}")
|
||||
print(f" phase: {checkpoint.get('phase', '?')}")
|
||||
else:
|
||||
print(f" {DIM}(no archive checkpoint yet){RST}")
|
||||
|
||||
print(f"\n {BOLD}LOCAL METRICS ({len(metrics)} calls, last {hours}h){RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
if metrics:
|
||||
print(f" Tokens: {local_summary['input_tokens']} in | {local_summary['output_tokens']} out | {local_summary['total_tokens']} total")
|
||||
if local_summary.get('avg_latency_s') is not None:
|
||||
print(
|
||||
f" Tokens: {local_summary['input_tokens']} in | "
|
||||
f"{local_summary['output_tokens']} out | "
|
||||
f"{local_summary['total_tokens']} total"
|
||||
)
|
||||
if local_summary.get("avg_latency_s") is not None:
|
||||
print(f" Avg latency: {local_summary['avg_latency_s']:.2f}s")
|
||||
if local_summary.get('avg_tokens_per_second') is not None:
|
||||
if local_summary.get("avg_tokens_per_second") is not None:
|
||||
print(f" Avg throughput: {GREEN}{local_summary['avg_tokens_per_second']:.2f} tok/s{RST}")
|
||||
for caller, stats in sorted(local_summary['by_caller'].items()):
|
||||
err = f" {RED}err:{stats['failed_calls']}{RST}" if stats['failed_calls'] else ""
|
||||
print(f" {caller:25s} calls:{stats['calls']:4d} tokens:{stats['total_tokens']:5d} {GREEN}ok:{stats['successful_calls']}{RST}{err}")
|
||||
|
||||
print(f"\n {DIM}Models used:{RST}")
|
||||
for model, stats in sorted(local_summary['by_model'].items(), key=lambda x: -x[1]['calls']):
|
||||
print(f" {model:30s} {stats['calls']} calls {stats['total_tokens']} tok")
|
||||
for caller, stats in sorted(local_summary["by_caller"].items()):
|
||||
err = f" {RED}err:{stats['failed_calls']}{RST}" if stats["failed_calls"] else ""
|
||||
print(
|
||||
f" {caller:24s} calls={stats['calls']:3d} "
|
||||
f"tok={stats['total_tokens']:5d} {GREEN}ok:{stats['successful_calls']}{RST}{err}"
|
||||
)
|
||||
else:
|
||||
print(f" {DIM}(no local calls recorded yet){RST}")
|
||||
print(f" {DIM}(no local metrics yet){RST}")
|
||||
|
||||
# ── HEARTBEAT STATUS ──
|
||||
print(f"\n {BOLD}HEARTBEAT ({len(ticks)} ticks today){RST}")
|
||||
print(f" {DIM}{'-' * 55}{RST}")
|
||||
if ticks:
|
||||
last = ticks[-1]
|
||||
decision = last.get("decision", last.get("actions", {}))
|
||||
if isinstance(decision, dict):
|
||||
severity = decision.get("severity", "unknown")
|
||||
reasoning = decision.get("reasoning", "")
|
||||
sev_color = GREEN if severity == "ok" else YELLOW if severity == "warning" else RED
|
||||
print(f" Last tick: {last.get('tick_id', '?')}")
|
||||
print(f" Severity: {sev_color}{severity}{RST}")
|
||||
if reasoning:
|
||||
print(f" Reasoning: {reasoning[:65]}")
|
||||
else:
|
||||
print(f" Last tick: {last.get('tick_id', '?')}")
|
||||
actions = last.get("actions", [])
|
||||
print(f" Actions: {actions if actions else 'none'}")
|
||||
|
||||
model_decisions = sum(1 for t in ticks
|
||||
if isinstance(t.get("decision"), dict)
|
||||
and t["decision"].get("severity") != "fallback")
|
||||
fallback = len(ticks) - model_decisions
|
||||
print(f" {CYAN}Model: {model_decisions}{RST} | {DIM}Fallback: {fallback}{RST}")
|
||||
else:
|
||||
print(f" {DIM}(no ticks today){RST}")
|
||||
|
||||
# ── HERMES SESSIONS / SOVEREIGNTY LOAD ──
|
||||
local_sessions = [s for s in sessions if "localhost:11434" in str(s.get("base_url", ""))]
|
||||
print(f"\n {BOLD}SESSION LOAD{RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
local_sessions = [s for s in sessions if "localhost" in str(s.get("base_url", ""))]
|
||||
cloud_sessions = [s for s in sessions if s not in local_sessions]
|
||||
print(f"\n {BOLD}HERMES SESSIONS / SOVEREIGNTY LOAD{RST}")
|
||||
print(f" {DIM}{'-' * 55}{RST}")
|
||||
print(f" Session cache: {len(sessions)} total | {GREEN}{len(local_sessions)} local{RST} | {YELLOW}{len(cloud_sessions)} cloud{RST}")
|
||||
print(
|
||||
f" Session cache: {len(sessions)} total | "
|
||||
f"{GREEN}{len(local_sessions)} local{RST} | "
|
||||
f"{YELLOW}{len(cloud_sessions)} remote{RST}"
|
||||
)
|
||||
if session_rows:
|
||||
print(f" Session DB: {session_summary['total_sessions']} total | {GREEN}{session_summary['local_sessions']} local{RST} | {YELLOW}{session_summary['cloud_sessions']} cloud{RST}")
|
||||
print(f" Token est: {GREEN}{session_summary['local_est_tokens']} local{RST} | {YELLOW}{session_summary['cloud_est_tokens']} cloud{RST}")
|
||||
print(f" Est cloud cost: ${session_summary['cloud_est_cost_usd']:.4f}")
|
||||
print(
|
||||
f" Session DB: {session_summary['total_sessions']} total | "
|
||||
f"{GREEN}{session_summary['local_sessions']} local{RST} | "
|
||||
f"{YELLOW}{session_summary['cloud_sessions']} remote{RST}"
|
||||
)
|
||||
print(
|
||||
f" Token est: {GREEN}{session_summary['local_est_tokens']} local{RST} | "
|
||||
f"{YELLOW}{session_summary['cloud_est_tokens']} remote{RST}"
|
||||
)
|
||||
print(f" Est remote cost: ${session_summary['cloud_est_cost_usd']:.4f}")
|
||||
else:
|
||||
print(f" {DIM}(no session-db stats available){RST}")
|
||||
|
||||
# ── ACTIVE LOOPS ──
|
||||
print(f"\n {BOLD}ACTIVE LOOPS{RST}")
|
||||
print(f" {DIM}{'-' * 55}{RST}")
|
||||
print(f" {CYAN}heartbeat_tick{RST} 10m hermes4:14b DECIDE phase")
|
||||
print(f" {DIM}model_health{RST} 5m (local check) Ollama ping")
|
||||
print(f" {DIM}gemini_worker{RST} 20m gemini-2.5-pro aider")
|
||||
print(f" {DIM}grok_worker{RST} 20m grok-3-fast opencode")
|
||||
print(f" {DIM}cross_review{RST} 30m gemini+grok PR review")
|
||||
print(f"\n {BOLD}REVIEW QUEUE{RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
if gitea["review_queue"]:
|
||||
for item in gitea["review_queue"][:8]:
|
||||
repo = item["_repo"].split("/", 1)[1]
|
||||
print(f" {repo:12s} #{item['number']:<4d} {item['title'][:42]}")
|
||||
else:
|
||||
print(f" {DIM}(clear){RST}")
|
||||
|
||||
print(f"\n{BOLD}{'=' * 70}{RST}")
|
||||
print(f"\n {BOLD}OPEN PRS / UNASSIGNED{RST}")
|
||||
print(f" {DIM}{'-' * 58}{RST}")
|
||||
print(f" open PRs: {len(gitea['prs'])}")
|
||||
print(f" unassigned issues: {gitea['unassigned']}")
|
||||
for pr in gitea["prs"][:6]:
|
||||
repo = pr["_repo"].split("/", 1)[1]
|
||||
print(f" PR {repo:10s} #{pr['number']:<4d} {pr['title'][:40]}")
|
||||
|
||||
print(f"\n{BOLD}{'=' * 72}{RST}")
|
||||
print(f" {DIM}Refresh: timmy-dashboard --watch | History: --hours=N{RST}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
watch = "--watch" in sys.argv
|
||||
hours = 24
|
||||
for a in sys.argv[1:]:
|
||||
if a.startswith("--hours="):
|
||||
hours = int(a.split("=")[1])
|
||||
for arg in sys.argv[1:]:
|
||||
if arg.startswith("--hours="):
|
||||
hours = int(arg.split("=", 1)[1])
|
||||
|
||||
if watch:
|
||||
try:
|
||||
|
||||
218
bin/timmy-orchestrator.sh
Executable file
218
bin/timmy-orchestrator.sh
Executable file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env bash
|
||||
# timmy-orchestrator.sh — Timmy's orchestration loop
|
||||
# Uses Hermes CLI plus workforce-manager to triage and review.
|
||||
# Timmy is the brain. Other agents are the hands.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LOG_DIR="$HOME/.hermes/logs"
|
||||
LOG="$LOG_DIR/timmy-orchestrator.log"
|
||||
PIDFILE="$LOG_DIR/timmy-orchestrator.pid"
|
||||
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
|
||||
GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null) # Timmy token, NOT rockachopa
|
||||
CYCLE_INTERVAL=300
|
||||
HERMES_TIMEOUT=180
|
||||
AUTO_ASSIGN_UNASSIGNED="${AUTO_ASSIGN_UNASSIGNED:-0}" # 0 = report only, 1 = mutate Gitea assignments
|
||||
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
# Single instance guard
|
||||
if [ -f "$PIDFILE" ]; then
|
||||
old_pid=$(cat "$PIDFILE")
|
||||
if kill -0 "$old_pid" 2>/dev/null; then
|
||||
echo "Timmy already running (PID $old_pid)" >&2
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
echo $$ > "$PIDFILE"
|
||||
trap 'rm -f "$PIDFILE"' EXIT
|
||||
|
||||
log() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] TIMMY: $*" >> "$LOG"
|
||||
}
|
||||
|
||||
REPOS="Timmy_Foundation/the-nexus Timmy_Foundation/timmy-home Timmy_Foundation/timmy-config Timmy_Foundation/hermes-agent"
|
||||
|
||||
gather_state() {
|
||||
local state_dir="/tmp/timmy-state-$$"
|
||||
mkdir -p "$state_dir"
|
||||
|
||||
> "$state_dir/unassigned.txt"
|
||||
> "$state_dir/open_prs.txt"
|
||||
> "$state_dir/agent_status.txt"
|
||||
|
||||
for repo in $REPOS; do
|
||||
local short=$(echo "$repo" | cut -d/ -f2)
|
||||
|
||||
# Unassigned issues
|
||||
curl -sf -H "Authorization: token $GITEA_TOKEN" \
|
||||
"$GITEA_URL/api/v1/repos/$repo/issues?state=open&type=issues&limit=50" 2>/dev/null | \
|
||||
python3 -c "
|
||||
import sys,json
|
||||
for i in json.load(sys.stdin):
|
||||
if not i.get('assignees'):
|
||||
print(f'REPO={\"$repo\"} NUM={i[\"number\"]} TITLE={i[\"title\"]}')" >> "$state_dir/unassigned.txt" 2>/dev/null
|
||||
|
||||
# Open PRs
|
||||
curl -sf -H "Authorization: token $GITEA_TOKEN" \
|
||||
"$GITEA_URL/api/v1/repos/$repo/pulls?state=open&limit=30" 2>/dev/null | \
|
||||
python3 -c "
|
||||
import sys,json
|
||||
for p in json.load(sys.stdin):
|
||||
print(f'REPO={\"$repo\"} PR={p[\"number\"]} BY={p[\"user\"][\"login\"]} TITLE={p[\"title\"]}')" >> "$state_dir/open_prs.txt" 2>/dev/null
|
||||
done
|
||||
|
||||
echo "Claude workers: $(pgrep -f 'claude.*--print.*--dangerously' 2>/dev/null | wc -l | tr -d ' ')" >> "$state_dir/agent_status.txt"
|
||||
echo "Claude loop: $(pgrep -f 'claude-loop.sh' 2>/dev/null | wc -l | tr -d ' ') procs" >> "$state_dir/agent_status.txt"
|
||||
tail -50 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" | xargs -I{} echo "Claude recent successes: {}" >> "$state_dir/agent_status.txt"
|
||||
tail -50 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "FAILED" | xargs -I{} echo "Claude recent failures: {}" >> "$state_dir/agent_status.txt"
|
||||
echo "Kimi heartbeat launchd: $(launchctl list 2>/dev/null | grep -c 'ai.timmy.kimi-heartbeat' | tr -d ' ') job" >> "$state_dir/agent_status.txt"
|
||||
tail -50 "/tmp/kimi-heartbeat.log" 2>/dev/null | grep -c "DISPATCHED:" | xargs -I{} echo "Kimi recent dispatches: {}" >> "$state_dir/agent_status.txt"
|
||||
tail -50 "/tmp/kimi-heartbeat.log" 2>/dev/null | grep -c "FAILED:" | xargs -I{} echo "Kimi recent failures: {}" >> "$state_dir/agent_status.txt"
|
||||
tail -1 "/tmp/kimi-heartbeat.log" 2>/dev/null | xargs -I{} echo "Kimi last event: {}" >> "$state_dir/agent_status.txt"
|
||||
|
||||
echo "$state_dir"
|
||||
}
|
||||
|
||||
run_triage() {
|
||||
local state_dir="$1"
|
||||
local unassigned_count=$(wc -l < "$state_dir/unassigned.txt" | tr -d ' ')
|
||||
local pr_count=$(wc -l < "$state_dir/open_prs.txt" | tr -d ' ')
|
||||
|
||||
log "Cycle: $unassigned_count unassigned, $pr_count open PRs"
|
||||
|
||||
# If nothing to do, skip the LLM call
|
||||
if [ "$unassigned_count" -eq 0 ] && [ "$pr_count" -eq 0 ]; then
|
||||
log "Nothing to triage"
|
||||
return
|
||||
fi
|
||||
|
||||
# Phase 1: Report unassigned issues by default.
|
||||
# Auto-assignment is opt-in because silent queue mutation resurrects old state.
|
||||
if [ "$unassigned_count" -gt 0 ]; then
|
||||
if [ "$AUTO_ASSIGN_UNASSIGNED" = "1" ]; then
|
||||
log "Assigning $unassigned_count issues to claude..."
|
||||
while IFS= read -r line; do
|
||||
local repo=$(echo "$line" | sed 's/.*REPO=\([^ ]*\).*/\1/')
|
||||
local num=$(echo "$line" | sed 's/.*NUM=\([^ ]*\).*/\1/')
|
||||
curl -sf -X PATCH "$GITEA_URL/api/v1/repos/$repo/issues/$num" \
|
||||
-H "Authorization: token $GITEA_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"assignees":["claude"]}' >/dev/null 2>&1 && \
|
||||
log " Assigned #$num ($repo) to claude"
|
||||
done < "$state_dir/unassigned.txt"
|
||||
else
|
||||
log "Auto-assign disabled: leaving $unassigned_count unassigned issues untouched"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Phase 2: PR review via Timmy (LLM)
|
||||
if [ "$pr_count" -gt 0 ]; then
|
||||
run_pr_review "$state_dir"
|
||||
fi
|
||||
}
|
||||
|
||||
run_pr_review() {
|
||||
local state_dir="$1"
|
||||
local prompt_file="/tmp/timmy-prompt-$$.txt"
|
||||
|
||||
# Build a review prompt listing all open PRs
|
||||
cat > "$prompt_file" <<'HEADER'
|
||||
You are Timmy, the orchestrator. Review these open PRs from AI agents.
|
||||
|
||||
For each PR, you will see the diff. Your job:
|
||||
- MERGE if changes look reasonable (most agent PRs are good, merge aggressively)
|
||||
- COMMENT if there is a clear problem
|
||||
- CLOSE if it is a duplicate or garbage
|
||||
|
||||
Use these exact curl patterns (replace REPO, NUM):
|
||||
Merge: curl -sf -X POST "GITEA/api/v1/repos/REPO/pulls/NUM/merge" -H "Authorization: token TOKEN" -H "Content-Type: application/json" -d '{"Do":"squash"}'
|
||||
Comment: curl -sf -X POST "GITEA/api/v1/repos/REPO/pulls/NUM/comments" -H "Authorization: token TOKEN" -H "Content-Type: application/json" -d '{"body":"feedback"}'
|
||||
Close: curl -sf -X PATCH "GITEA/api/v1/repos/REPO/pulls/NUM" -H "Authorization: token TOKEN" -H "Content-Type: application/json" -d '{"state":"closed"}'
|
||||
|
||||
HEADER
|
||||
|
||||
# Replace placeholders
|
||||
sed -i '' "s|GITEA|$GITEA_URL|g; s|TOKEN|$GITEA_TOKEN|g" "$prompt_file"
|
||||
|
||||
# Add each PR with its diff (up to 10 PRs per cycle)
|
||||
local count=0
|
||||
while IFS= read -r line && [ "$count" -lt 10 ]; do
|
||||
local repo=$(echo "$line" | sed 's/.*REPO=\([^ ]*\).*/\1/')
|
||||
local pr_num=$(echo "$line" | sed 's/.*PR=\([^ ]*\).*/\1/')
|
||||
local by=$(echo "$line" | sed 's/.*BY=\([^ ]*\).*/\1/')
|
||||
local title=$(echo "$line" | sed 's/.*TITLE=//')
|
||||
|
||||
[ -z "$pr_num" ] && continue
|
||||
|
||||
local diff
|
||||
diff=$(curl -sf -H "Authorization: token $GITEA_TOKEN" \
|
||||
-H "Accept: application/diff" \
|
||||
"$GITEA_URL/api/v1/repos/$repo/pulls/$pr_num" 2>/dev/null | head -150)
|
||||
|
||||
[ -z "$diff" ] && continue
|
||||
|
||||
echo "" >> "$prompt_file"
|
||||
echo "=== PR #$pr_num in $repo by $by ===" >> "$prompt_file"
|
||||
echo "Title: $title" >> "$prompt_file"
|
||||
echo "Diff (first 150 lines):" >> "$prompt_file"
|
||||
echo "$diff" >> "$prompt_file"
|
||||
echo "=== END PR #$pr_num ===" >> "$prompt_file"
|
||||
|
||||
count=$((count + 1))
|
||||
done < "$state_dir/open_prs.txt"
|
||||
|
||||
if [ "$count" -eq 0 ]; then
|
||||
rm -f "$prompt_file"
|
||||
return
|
||||
fi
|
||||
|
||||
echo "" >> "$prompt_file"
|
||||
cat >> "$prompt_file" <<'FOOTER'
|
||||
INSTRUCTIONS: For EACH PR above, do ONE of the following RIGHT NOW using your terminal tool:
|
||||
- Run the merge curl command if the diff looks good
|
||||
- Run the close curl command if it is a duplicate or garbage
|
||||
- Run the comment curl command only if there is a clear bug
|
||||
|
||||
IMPORTANT: Actually run the curl commands. Do not just describe what you would do. Finish means the PR world-state changed.
|
||||
FOOTER
|
||||
|
||||
local prompt_text
|
||||
prompt_text=$(cat "$prompt_file")
|
||||
rm -f "$prompt_file"
|
||||
|
||||
log "Reviewing $count PRs..."
|
||||
local result
|
||||
result=$(timeout "$HERMES_TIMEOUT" hermes chat -q "$prompt_text" -Q --yolo 2>&1)
|
||||
local exit_code=$?
|
||||
|
||||
if [ "$exit_code" -eq 0 ]; then
|
||||
log "PR review complete"
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $result" >> "$LOG_DIR/timmy-reviews.log"
|
||||
else
|
||||
log "PR review failed (exit $exit_code)"
|
||||
fi
|
||||
}
|
||||
|
||||
# === MAIN LOOP ===
|
||||
log "=== Timmy Orchestrator Started (PID $$) ==="
|
||||
log "Cycle: ${CYCLE_INTERVAL}s | Auto-assign: ${AUTO_ASSIGN_UNASSIGNED} | Inference surface: Hermes CLI"
|
||||
|
||||
WORKFORCE_CYCLE=0
|
||||
|
||||
while true; do
|
||||
state_dir=$(gather_state)
|
||||
run_triage "$state_dir"
|
||||
rm -rf "$state_dir"
|
||||
|
||||
# Run workforce manager every 3rd cycle (~15 min)
|
||||
WORKFORCE_CYCLE=$((WORKFORCE_CYCLE + 1))
|
||||
if [ $((WORKFORCE_CYCLE % 3)) -eq 0 ]; then
|
||||
log "Running workforce manager..."
|
||||
python3 "$HOME/.hermes/bin/workforce-manager.py" all >> "$LOG_DIR/workforce-manager.log" 2>&1
|
||||
log "Workforce manager complete"
|
||||
fi
|
||||
|
||||
log "Sleeping ${CYCLE_INTERVAL}s"
|
||||
sleep "$CYCLE_INTERVAL"
|
||||
done
|
||||
@@ -1,284 +1,182 @@
|
||||
#!/usr/bin/env bash
|
||||
# ── Timmy Loop Status Panel ────────────────────────────────────────────
|
||||
# Compact, info-dense sidebar for the tmux development loop.
|
||||
# Refreshes every 10s. Designed for ~40-col wide pane.
|
||||
# ── Timmy Status Sidebar ───────────────────────────────────────────────
|
||||
# Compact current-state view for the local Hermes + Timmy workflow.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
STATE="$HOME/Timmy-Time-dashboard/.loop/state.json"
|
||||
REPO="$HOME/Timmy-Time-dashboard"
|
||||
TOKEN=$(cat ~/.hermes/gitea_token 2>/dev/null)
|
||||
API="http://143.198.27.163:3000/api/v1/repos/rockachopa/Timmy-time-dashboard"
|
||||
set -euo pipefail
|
||||
|
||||
# ── Colors ──
|
||||
B='\033[1m' # bold
|
||||
D='\033[2m' # dim
|
||||
R='\033[0m' # reset
|
||||
G='\033[32m' # green
|
||||
Y='\033[33m' # yellow
|
||||
RD='\033[31m' # red
|
||||
C='\033[36m' # cyan
|
||||
M='\033[35m' # magenta
|
||||
W='\033[37m' # white
|
||||
BG='\033[42;30m' # green bg
|
||||
BY='\033[43;30m' # yellow bg
|
||||
BR='\033[41;37m' # red bg
|
||||
resolve_gitea_url() {
|
||||
if [ -n "${GITEA_URL:-}" ]; then
|
||||
printf '%s\n' "${GITEA_URL%/}"
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.hermes/gitea_api" ]; then
|
||||
python3 - "$HOME/.hermes/gitea_api" <<'PY'
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
# How wide is our pane?
|
||||
COLS=$(tput cols 2>/dev/null || echo 40)
|
||||
raw = Path(sys.argv[1]).read_text().strip().rstrip("/")
|
||||
print(raw[:-7] if raw.endswith("/api/v1") else raw)
|
||||
PY
|
||||
return 0
|
||||
fi
|
||||
if [ -f "$HOME/.config/gitea/base-url" ]; then
|
||||
tr -d '[:space:]' < "$HOME/.config/gitea/base-url"
|
||||
return 0
|
||||
fi
|
||||
echo "ERROR: set GITEA_URL or create ~/.hermes/gitea_api" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
resolve_ops_token() {
|
||||
local token_file
|
||||
for token_file in \
|
||||
"$HOME/.config/gitea/timmy-token" \
|
||||
"$HOME/.hermes/gitea_token_vps" \
|
||||
"$HOME/.hermes/gitea_token_timmy"; do
|
||||
if [ -f "$token_file" ]; then
|
||||
tr -d '[:space:]' < "$token_file"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
GITEA_URL="$(resolve_gitea_url)"
|
||||
CORE_REPOS="${CORE_REPOS:-Timmy_Foundation/the-nexus Timmy_Foundation/timmy-home Timmy_Foundation/timmy-config Timmy_Foundation/hermes-agent}"
|
||||
TOKEN="$(resolve_ops_token || true)"
|
||||
[ -z "$TOKEN" ] && echo "WARN: no approved Timmy Gitea token found; status sidebar will use unauthenticated API calls" >&2
|
||||
|
||||
B='\033[1m'
|
||||
D='\033[2m'
|
||||
R='\033[0m'
|
||||
G='\033[32m'
|
||||
Y='\033[33m'
|
||||
RD='\033[31m'
|
||||
C='\033[36m'
|
||||
|
||||
COLS=$(tput cols 2>/dev/null || echo 48)
|
||||
hr() { printf "${D}"; printf '─%.0s' $(seq 1 "$COLS"); printf "${R}\n"; }
|
||||
|
||||
while true; do
|
||||
clear
|
||||
|
||||
# ── Header ──
|
||||
echo -e "${B}${C} ⚙ TIMMY DEV LOOP${R} ${D}$(date '+%H:%M:%S')${R}"
|
||||
echo -e "${B}${C} TIMMY STATUS${R} ${D}$(date '+%H:%M:%S')${R}"
|
||||
hr
|
||||
|
||||
# ── Loop State ──
|
||||
if [ -f "$STATE" ]; then
|
||||
eval "$(python3 -c "
|
||||
import json, sys
|
||||
with open('$STATE') as f: s = json.load(f)
|
||||
print(f'CYCLE={s.get(\"cycle\",\"?\")}')" 2>/dev/null)"
|
||||
STATUS=$(python3 -c "import json; print(json.load(open('$STATE'))['status'])" 2>/dev/null || echo "?")
|
||||
LAST_OK=$(python3 -c "
|
||||
python3 - "$HOME/.timmy" "$HOME/.hermes" <<'PY'
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
s = json.load(open('$STATE'))
|
||||
t = s.get('last_completed','')
|
||||
if t:
|
||||
dt = datetime.fromisoformat(t.replace('Z','+00:00'))
|
||||
delta = datetime.now(timezone.utc) - dt
|
||||
mins = int(delta.total_seconds() / 60)
|
||||
if mins < 60: print(f'{mins}m ago')
|
||||
else: print(f'{mins//60}h {mins%60}m ago')
|
||||
else: print('never')
|
||||
" 2>/dev/null || echo "?")
|
||||
CLOSED=$(python3 -c "import json; print(len(json.load(open('$STATE')).get('issues_closed',[])))" 2>/dev/null || echo 0)
|
||||
CREATED=$(python3 -c "import json; print(len(json.load(open('$STATE')).get('issues_created',[])))" 2>/dev/null || echo 0)
|
||||
ERRS=$(python3 -c "import json; print(len(json.load(open('$STATE')).get('errors',[])))" 2>/dev/null || echo 0)
|
||||
LAST_ISSUE=$(python3 -c "import json; print(json.load(open('$STATE')).get('last_issue','—'))" 2>/dev/null || echo "—")
|
||||
LAST_PR=$(python3 -c "import json; print(json.load(open('$STATE')).get('last_pr','—'))" 2>/dev/null || echo "—")
|
||||
TESTS=$(python3 -c "
|
||||
import json
|
||||
s = json.load(open('$STATE'))
|
||||
t = s.get('test_results',{})
|
||||
if t:
|
||||
print(f\"{t.get('passed',0)} pass, {t.get('failed',0)} fail, {t.get('coverage','?')} cov\")
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
timmy = Path(sys.argv[1])
|
||||
hermes = Path(sys.argv[2])
|
||||
|
||||
last_tick = timmy / "heartbeat" / "last_tick.json"
|
||||
model_health = hermes / "model_health.json"
|
||||
checkpoint = timmy / "twitter-archive" / "checkpoint.json"
|
||||
|
||||
if last_tick.exists():
|
||||
try:
|
||||
tick = json.loads(last_tick.read_text())
|
||||
sev = tick.get("decision", {}).get("severity", "?")
|
||||
tick_id = tick.get("tick_id", "?")
|
||||
print(f" heartbeat {tick_id} severity={sev}")
|
||||
except Exception:
|
||||
print(" heartbeat unreadable")
|
||||
else:
|
||||
print('no data')
|
||||
" 2>/dev/null || echo "no data")
|
||||
print(" heartbeat missing")
|
||||
|
||||
# Status badge
|
||||
case "$STATUS" in
|
||||
working) BADGE="${BY} WORKING ${R}" ;;
|
||||
idle) BADGE="${BG} IDLE ${R}" ;;
|
||||
error) BADGE="${BR} ERROR ${R}" ;;
|
||||
*) BADGE="${D} $STATUS ${R}" ;;
|
||||
esac
|
||||
|
||||
echo -e " ${B}Status${R} $BADGE ${D}cycle${R} ${B}$CYCLE${R}"
|
||||
echo -e " ${B}Last OK${R} ${G}$LAST_OK${R} ${D}issue${R} #$LAST_ISSUE ${D}PR${R} #$LAST_PR"
|
||||
echo -e " ${G}✓${R} $CLOSED closed ${C}+${R} $CREATED created ${RD}✗${R} $ERRS errs"
|
||||
echo -e " ${D}Tests:${R} $TESTS"
|
||||
else
|
||||
echo -e " ${RD}No state file${R}"
|
||||
fi
|
||||
|
||||
hr
|
||||
|
||||
# ── Ollama Status ──
|
||||
echo -e " ${B}${M}◆ OLLAMA${R}"
|
||||
OLLAMA_PS=$(curl -s http://localhost:11434/api/ps 2>/dev/null)
|
||||
if [ -n "$OLLAMA_PS" ] && echo "$OLLAMA_PS" | python3 -c "import sys,json; json.load(sys.stdin)" &>/dev/null; then
|
||||
python3 -c "
|
||||
import json, sys
|
||||
data = json.loads('''$OLLAMA_PS''')
|
||||
models = data.get('models', [])
|
||||
if not models:
|
||||
print(' \033[2m(no models loaded)\033[0m')
|
||||
for m in models:
|
||||
name = m.get('name','?')
|
||||
vram = m.get('size_vram', 0) / 1e9
|
||||
exp = m.get('expires_at','')
|
||||
print(f' \033[32m●\033[0m {name} \033[2m{vram:.1f}GB VRAM\033[0m')
|
||||
" 2>/dev/null
|
||||
else
|
||||
echo -e " ${RD}● offline${R}"
|
||||
fi
|
||||
|
||||
# ── Timmy Health ──
|
||||
TIMMY_HEALTH=$(curl -s --max-time 2 http://localhost:8000/health 2>/dev/null)
|
||||
if [ -n "$TIMMY_HEALTH" ]; then
|
||||
python3 -c "
|
||||
import json
|
||||
h = json.loads('''$TIMMY_HEALTH''')
|
||||
status = h.get('status','?')
|
||||
ollama = h.get('services',{}).get('ollama','?')
|
||||
model = h.get('llm_model','?')
|
||||
agent_st = list(h.get('agents',{}).values())[0].get('status','?') if h.get('agents') else '?'
|
||||
up = int(h.get('uptime_seconds',0))
|
||||
hrs, rem = divmod(up, 3600)
|
||||
mins = rem // 60
|
||||
print(f' \033[1m\033[35m◆ TIMMY DASHBOARD\033[0m')
|
||||
print(f' \033[32m●\033[0m {status} model={model}')
|
||||
print(f' \033[2magent={agent_st} ollama={ollama} up={hrs}h{mins}m\033[0m')
|
||||
" 2>/dev/null
|
||||
else
|
||||
echo -e " ${B}${M}◆ TIMMY DASHBOARD${R}"
|
||||
echo -e " ${RD}● unreachable${R}"
|
||||
fi
|
||||
|
||||
hr
|
||||
|
||||
# ── Open Issues ──
|
||||
echo -e " ${B}${Y}▶ OPEN ISSUES${R}"
|
||||
if [ -n "$TOKEN" ]; then
|
||||
curl -s "${API}/issues?state=open&limit=10&sort=created&direction=desc" \
|
||||
-H "Authorization: token $TOKEN" 2>/dev/null | \
|
||||
python3 -c "
|
||||
import json, sys
|
||||
try:
|
||||
issues = json.load(sys.stdin)
|
||||
if not issues:
|
||||
print(' \033[2m(none)\033[0m')
|
||||
for i in issues[:10]:
|
||||
num = i['number']
|
||||
title = i['title'][:36]
|
||||
labels = ','.join(l['name'][:8] for l in i.get('labels',[]))
|
||||
lbl = f' \033[2m[{labels}]\033[0m' if labels else ''
|
||||
print(f' \033[33m#{num:<4d}\033[0m {title}{lbl}')
|
||||
if len(issues) > 10:
|
||||
print(f' \033[2m... +{len(issues)-10} more\033[0m')
|
||||
except: print(' \033[2m(fetch failed)\033[0m')
|
||||
" 2>/dev/null
|
||||
else
|
||||
echo -e " ${RD}(no token)${R}"
|
||||
fi
|
||||
|
||||
# ── Open PRs ──
|
||||
echo -e " ${B}${G}▶ OPEN PRs${R}"
|
||||
if [ -n "$TOKEN" ]; then
|
||||
curl -s "${API}/pulls?state=open&limit=5" \
|
||||
-H "Authorization: token $TOKEN" 2>/dev/null | \
|
||||
python3 -c "
|
||||
import json, sys
|
||||
try:
|
||||
prs = json.load(sys.stdin)
|
||||
if not prs:
|
||||
print(' \033[2m(none)\033[0m')
|
||||
for p in prs[:5]:
|
||||
num = p['number']
|
||||
title = p['title'][:36]
|
||||
print(f' \033[32mPR #{num:<4d}\033[0m {title}')
|
||||
except: print(' \033[2m(fetch failed)\033[0m')
|
||||
" 2>/dev/null
|
||||
else
|
||||
echo -e " ${RD}(no token)${R}"
|
||||
fi
|
||||
|
||||
hr
|
||||
|
||||
# ── Git Log ──
|
||||
echo -e " ${B}${D}▶ RECENT COMMITS${R}"
|
||||
cd "$REPO" 2>/dev/null && git log --oneline --no-decorate -6 2>/dev/null | while read line; do
|
||||
HASH=$(echo "$line" | cut -c1-7)
|
||||
MSG=$(echo "$line" | cut -c9- | cut -c1-32)
|
||||
echo -e " ${C}${HASH}${R} ${D}${MSG}${R}"
|
||||
done
|
||||
|
||||
hr
|
||||
|
||||
# ── Claims ──
|
||||
CLAIMS_FILE="$REPO/.loop/claims.json"
|
||||
if [ -f "$CLAIMS_FILE" ]; then
|
||||
CLAIMS=$(python3 -c "
|
||||
import json
|
||||
with open('$CLAIMS_FILE') as f: c = json.load(f)
|
||||
active = [(k,v) for k,v in c.items() if v.get('status') == 'active']
|
||||
if active:
|
||||
for k,v in active:
|
||||
print(f' \033[33m⚡\033[0m #{k} claimed by {v.get(\"agent\",\"?\")[:12]}')
|
||||
if model_health.exists():
|
||||
try:
|
||||
health = json.loads(model_health.read_text())
|
||||
provider_ok = health.get("api_responding")
|
||||
inference_ok = health.get("inference_ok")
|
||||
models = len(health.get("models_loaded", []) or [])
|
||||
print(f" model api={provider_ok} inference={inference_ok} models={models}")
|
||||
except Exception:
|
||||
print(" model unreadable")
|
||||
else:
|
||||
print(' \033[2m(none active)\033[0m')
|
||||
" 2>/dev/null)
|
||||
if [ -n "$CLAIMS" ]; then
|
||||
echo -e " ${B}${Y}▶ CLAIMED${R}"
|
||||
echo "$CLAIMS"
|
||||
fi
|
||||
fi
|
||||
print(" model missing")
|
||||
|
||||
# ── System ──
|
||||
echo -e " ${B}${D}▶ SYSTEM${R}"
|
||||
# Disk
|
||||
DISK=$(df -h / 2>/dev/null | tail -1 | awk '{print $4 " free / " $2}')
|
||||
echo -e " ${D}Disk:${R} $DISK"
|
||||
# Memory (macOS)
|
||||
if command -v memory_pressure &>/dev/null; then
|
||||
MEM_PRESS=$(memory_pressure 2>/dev/null | grep "System-wide" | head -1 | sed 's/.*: //')
|
||||
echo -e " ${D}Mem:${R} $MEM_PRESS"
|
||||
elif [ -f /proc/meminfo ]; then
|
||||
MEM=$(awk '/MemAvailable/{printf "%.1fGB free", $2/1048576}' /proc/meminfo 2>/dev/null)
|
||||
echo -e " ${D}Mem:${R} $MEM"
|
||||
fi
|
||||
# CPU load
|
||||
LOAD=$(uptime | sed 's/.*averages: //' | cut -d',' -f1 | xargs)
|
||||
echo -e " ${D}Load:${R} $LOAD"
|
||||
if checkpoint.exists():
|
||||
try:
|
||||
cp = json.loads(checkpoint.read_text())
|
||||
print(f" archive batches={cp.get('batches_completed', '?')} next={cp.get('next_offset', '?')} phase={cp.get('phase', '?')}")
|
||||
except Exception:
|
||||
print(" archive unreadable")
|
||||
else:
|
||||
print(" archive missing")
|
||||
PY
|
||||
|
||||
hr
|
||||
echo -e " ${B}freshness${R}"
|
||||
~/.hermes/bin/pipeline-freshness.sh 2>/dev/null | sed 's/^/ /' || echo -e " ${Y}unknown${R}"
|
||||
|
||||
# ── Notes from last cycle ──
|
||||
if [ -f "$STATE" ]; then
|
||||
NOTES=$(python3 -c "
|
||||
hr
|
||||
echo -e " ${B}review queue${R}"
|
||||
python3 - "$GITEA_URL" "$TOKEN" "$CORE_REPOS" <<'PY'
|
||||
import json
|
||||
s = json.load(open('$STATE'))
|
||||
n = s.get('notes','')
|
||||
if n:
|
||||
lines = n[:150]
|
||||
if len(n) > 150: lines += '...'
|
||||
print(lines)
|
||||
" 2>/dev/null)
|
||||
if [ -n "$NOTES" ]; then
|
||||
echo -e " ${B}${D}▶ LAST CYCLE NOTE${R}"
|
||||
echo -e " ${D}${NOTES}${R}"
|
||||
hr
|
||||
fi
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
# Timmy observations
|
||||
TIMMY_OBS=$(python3 -c "
|
||||
base = sys.argv[1].rstrip("/")
|
||||
token = sys.argv[2]
|
||||
repos = sys.argv[3].split()
|
||||
headers = {"Authorization": f"token {token}"} if token else {}
|
||||
|
||||
count = 0
|
||||
for repo in repos:
|
||||
try:
|
||||
req = urllib.request.Request(f"{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=pulls", headers=headers)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
items = json.loads(resp.read().decode())
|
||||
for item in items:
|
||||
assignees = [a.get("login", "") for a in (item.get("assignees") or [])]
|
||||
if any(name in assignees for name in ("Timmy", "allegro")):
|
||||
print(f" {repo.split('/',1)[1]:12s} #{item['number']:<4d} {item['title'][:28]}")
|
||||
count += 1
|
||||
if count >= 6:
|
||||
raise SystemExit
|
||||
except SystemExit:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
if count == 0:
|
||||
print(" (clear)")
|
||||
PY
|
||||
|
||||
hr
|
||||
echo -e " ${B}unassigned${R}"
|
||||
python3 - "$GITEA_URL" "$TOKEN" "$CORE_REPOS" <<'PY'
|
||||
import json
|
||||
s = json.load(open('$STATE'))
|
||||
obs = s.get('timmy_observations','')
|
||||
if obs:
|
||||
lines = obs[:120]
|
||||
if len(obs) > 120: lines += '...'
|
||||
print(lines)
|
||||
" 2>/dev/null)
|
||||
if [ -n "$TIMMY_OBS" ]; then
|
||||
echo -e " ${B}${M}▶ TIMMY SAYS${R}"
|
||||
echo -e " ${D}${TIMMY_OBS}${R}"
|
||||
hr
|
||||
fi
|
||||
fi
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
# ── Watchdog: restart loop if it died ──────────────────────────────
|
||||
LOOP_LOCK="/tmp/timmy-loop.lock"
|
||||
if [ -f "$LOOP_LOCK" ]; then
|
||||
LOOP_PID=$(cat "$LOOP_LOCK" 2>/dev/null)
|
||||
if ! kill -0 "$LOOP_PID" 2>/dev/null; then
|
||||
echo -e " ${BR} ⚠ LOOP DIED — RESTARTING ${R}"
|
||||
rm -f "$LOOP_LOCK"
|
||||
tmux send-keys -t "dev:2.1" "bash ~/.hermes/bin/timmy-loop.sh" Enter 2>/dev/null
|
||||
fi
|
||||
else
|
||||
# No lock file at all — loop never started or was killed
|
||||
if ! pgrep -f "timmy-loop.sh" >/dev/null 2>&1; then
|
||||
echo -e " ${BR} ⚠ LOOP NOT RUNNING — STARTING ${R}"
|
||||
tmux send-keys -t "dev:2.1" "bash ~/.hermes/bin/timmy-loop.sh" Enter 2>/dev/null
|
||||
fi
|
||||
fi
|
||||
base = sys.argv[1].rstrip("/")
|
||||
token = sys.argv[2]
|
||||
repos = sys.argv[3].split()
|
||||
headers = {"Authorization": f"token {token}"} if token else {}
|
||||
|
||||
echo -e " ${D}↻ 8s${R}"
|
||||
sleep 8
|
||||
count = 0
|
||||
for repo in repos:
|
||||
try:
|
||||
req = urllib.request.Request(f"{base}/api/v1/repos/{repo}/issues?state=open&limit=50&type=issues", headers=headers)
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
items = json.loads(resp.read().decode())
|
||||
for item in items:
|
||||
if not item.get("assignees"):
|
||||
print(f" {repo.split('/',1)[1]:12s} #{item['number']:<4d} {item['title'][:28]}")
|
||||
count += 1
|
||||
if count >= 6:
|
||||
raise SystemExit
|
||||
except SystemExit:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
if count == 0:
|
||||
print(" (none)")
|
||||
PY
|
||||
|
||||
hr
|
||||
sleep 10
|
||||
done
|
||||
|
||||
91
code-claw-delegation.md
Normal file
91
code-claw-delegation.md
Normal file
@@ -0,0 +1,91 @@
|
||||
# Code Claw delegation
|
||||
|
||||
Purpose:
|
||||
- give the team a clean way to hand issues to `claw-code`
|
||||
- let Code Claw work from Gitea instead of ad hoc local prompts
|
||||
- keep queue state visible through labels and comments
|
||||
|
||||
## What it is
|
||||
|
||||
Code Claw is a separate local runtime from Hermes/OpenClaw.
|
||||
|
||||
Current lane:
|
||||
- runtime: local patched `~/code-claw`
|
||||
- backend: OpenRouter
|
||||
- model: `qwen/qwen3.6-plus:free`
|
||||
- Gitea identity: `claw-code`
|
||||
- dispatch style: assign in Gitea, heartbeat picks it up every 15 minutes
|
||||
|
||||
## Trigger methods
|
||||
|
||||
Either of these is enough:
|
||||
- assign the issue to `claw-code`
|
||||
- add label `assigned-claw-code`
|
||||
|
||||
## Label lifecycle
|
||||
|
||||
- `assigned-claw-code` — queued
|
||||
- `claw-code-in-progress` — picked up by heartbeat
|
||||
- `claw-code-done` — Code Claw completed a pass
|
||||
|
||||
## Repo coverage
|
||||
|
||||
Currently wired:
|
||||
- `Timmy_Foundation/timmy-home`
|
||||
- `Timmy_Foundation/timmy-config`
|
||||
- `Timmy_Foundation/the-nexus`
|
||||
- `Timmy_Foundation/hermes-agent`
|
||||
|
||||
## Operational flow
|
||||
|
||||
1. Team assigns issue to `claw-code` or adds `assigned-claw-code`
|
||||
2. launchd heartbeat runs every 15 minutes
|
||||
3. Timmy posts a pickup comment
|
||||
4. worker clones the target repo
|
||||
5. worker creates branch `claw-code/issue-<num>`
|
||||
6. worker runs Code Claw against the issue context
|
||||
7. if work exists, worker pushes and opens a PR
|
||||
8. issue is marked `claw-code-done`
|
||||
9. completion comment links branch + PR
|
||||
|
||||
## Logs and files
|
||||
|
||||
Local files:
|
||||
- heartbeat script: `~/.timmy/uniwizard/codeclaw_qwen_heartbeat.py`
|
||||
- worker script: `~/.timmy/uniwizard/codeclaw_qwen_worker.py`
|
||||
- launchd job: `~/Library/LaunchAgents/ai.timmy.codeclaw-qwen-heartbeat.plist`
|
||||
|
||||
Logs:
|
||||
- heartbeat log: `/tmp/codeclaw-qwen-heartbeat.log`
|
||||
- worker log: `/tmp/codeclaw-qwen-worker-<issue>.log`
|
||||
|
||||
## Best-fit work
|
||||
|
||||
Use Code Claw for:
|
||||
- small code/config/doc issues
|
||||
- repo hygiene
|
||||
- isolated bugfixes
|
||||
- narrow CI and `.gitignore` work
|
||||
- quick issue-driven patches where a PR is the desired output
|
||||
|
||||
Do not use it first for:
|
||||
- giant epics
|
||||
- broad architecture KT
|
||||
- local game embodiment tasks
|
||||
- complex multi-repo archaeology
|
||||
|
||||
## Proof of life
|
||||
|
||||
Smoke-tested on:
|
||||
- `Timmy_Foundation/timmy-config#232`
|
||||
|
||||
Observed:
|
||||
- pickup comment posted
|
||||
- branch `claw-code/issue-232` created
|
||||
- PR opened by `claw-code`
|
||||
|
||||
## Notes
|
||||
|
||||
- Exact PR matching matters. Do not trust broad Gitea PR queries without post-filtering by branch.
|
||||
- This lane is intentionally simple and issue-driven.
|
||||
- Treat it like a specialized intern: useful, fast, and bounded.
|
||||
37
config.yaml
37
config.yaml
@@ -20,7 +20,12 @@ terminal:
|
||||
modal_image: nikolaik/python-nodejs:python3.11-nodejs20
|
||||
daytona_image: nikolaik/python-nodejs:python3.11-nodejs20
|
||||
container_cpu: 1
|
||||
container_memory: 5120
|
||||
container_embeddings:
|
||||
provider: ollama
|
||||
model: nomic-embed-text
|
||||
base_url: http://localhost:11434/v1
|
||||
|
||||
memory: 5120
|
||||
container_disk: 51200
|
||||
container_persistent: true
|
||||
docker_volumes: []
|
||||
@@ -34,21 +39,26 @@ checkpoints:
|
||||
enabled: true
|
||||
max_snapshots: 50
|
||||
compression:
|
||||
enabled: false
|
||||
enabled: true
|
||||
threshold: 0.5
|
||||
target_ratio: 0.2
|
||||
protect_last_n: 20
|
||||
summary_model: ''
|
||||
summary_provider: ''
|
||||
summary_base_url: ''
|
||||
synthesis_model:
|
||||
provider: custom
|
||||
model: llama3:70b
|
||||
base_url: http://localhost:8081/v1
|
||||
|
||||
smart_model_routing:
|
||||
enabled: false
|
||||
max_simple_chars: 200
|
||||
max_simple_words: 35
|
||||
enabled: true
|
||||
max_simple_chars: 400
|
||||
max_simple_words: 75
|
||||
cheap_model:
|
||||
provider: ''
|
||||
model: ''
|
||||
base_url: ''
|
||||
provider: 'ollama'
|
||||
model: 'gemma2:2b'
|
||||
base_url: 'http://localhost:11434/v1'
|
||||
api_key: ''
|
||||
auxiliary:
|
||||
vision:
|
||||
@@ -105,7 +115,7 @@ display:
|
||||
tool_progress_command: false
|
||||
tool_progress: all
|
||||
privacy:
|
||||
redact_pii: false
|
||||
redact_pii: true
|
||||
tts:
|
||||
provider: edge
|
||||
edge:
|
||||
@@ -164,7 +174,16 @@ approvals:
|
||||
command_allowlist: []
|
||||
quick_commands: {}
|
||||
personalities: {}
|
||||
mesh:
|
||||
enabled: true
|
||||
blackboard_provider: local
|
||||
nostr_discovery: true
|
||||
consensus_mode: competitive
|
||||
|
||||
security:
|
||||
sovereign_audit: true
|
||||
no_phone_home: true
|
||||
|
||||
redact_secrets: true
|
||||
tirith_enabled: true
|
||||
tirith_path: tirith
|
||||
|
||||
@@ -81,33 +81,7 @@
|
||||
"last_error": null,
|
||||
"deliver": "local",
|
||||
"origin": null,
|
||||
"state": "scheduled"
|
||||
},
|
||||
{
|
||||
"id": "5e9d952871bc",
|
||||
"name": "Agent Status Check",
|
||||
"prompt": "Check which tmux panes are idle vs working, report utilization",
|
||||
"schedule": {
|
||||
"kind": "interval",
|
||||
"minutes": 10,
|
||||
"display": "every 10m"
|
||||
},
|
||||
"schedule_display": "every 10m",
|
||||
"repeat": {
|
||||
"times": null,
|
||||
"completed": 8
|
||||
},
|
||||
"enabled": false,
|
||||
"created_at": "2026-03-24T11:28:46.409727-04:00",
|
||||
"next_run_at": "2026-03-24T15:45:58.108921-04:00",
|
||||
"last_run_at": "2026-03-24T15:35:58.108921-04:00",
|
||||
"last_status": "ok",
|
||||
"last_error": null,
|
||||
"deliver": "local",
|
||||
"origin": null,
|
||||
"state": "paused",
|
||||
"paused_at": "2026-03-24T16:23:03.869047-04:00",
|
||||
"paused_reason": "Dashboard repo frozen - loops redirected to the-nexus",
|
||||
"state": "scheduled",
|
||||
"skills": [],
|
||||
"skill": null
|
||||
},
|
||||
@@ -132,8 +106,69 @@
|
||||
"last_status": null,
|
||||
"last_error": null,
|
||||
"deliver": "local",
|
||||
"origin": null
|
||||
"origin": null,
|
||||
"skills": [],
|
||||
"skill": null
|
||||
},
|
||||
{
|
||||
"id": "muda-audit-weekly",
|
||||
"name": "Muda Audit",
|
||||
"prompt": "Run the Muda Audit script at /root/wizards/ezra/workspace/timmy-config/fleet/muda-audit.sh. The script measures the 7 wastes across the fleet and posts a report to Telegram. Report whether it succeeded or failed.",
|
||||
"schedule": {
|
||||
"kind": "cron",
|
||||
"expr": "0 21 * * 0",
|
||||
"display": "0 21 * * 0"
|
||||
},
|
||||
"schedule_display": "0 21 * * 0",
|
||||
"repeat": {
|
||||
"times": null,
|
||||
"completed": 0
|
||||
},
|
||||
"enabled": true,
|
||||
"created_at": "2026-04-07T15:00:00+00:00",
|
||||
"next_run_at": null,
|
||||
"last_run_at": null,
|
||||
"last_status": null,
|
||||
"last_error": null,
|
||||
"deliver": "local",
|
||||
"origin": null,
|
||||
"state": "scheduled",
|
||||
"paused_at": null,
|
||||
"paused_reason": null,
|
||||
"skills": [],
|
||||
"skill": null
|
||||
},
|
||||
{
|
||||
"id": "kaizen-retro-349",
|
||||
"name": "Kaizen Retro",
|
||||
"prompt": "Run the automated burn-cycle retrospective. Execute: cd /root/wizards/ezra/workspace/timmy-config && ./bin/kaizen-retro.sh",
|
||||
"model": "hermes3:latest",
|
||||
"provider": "ollama",
|
||||
"base_url": "http://localhost:11434/v1",
|
||||
"schedule": {
|
||||
"kind": "interval",
|
||||
"minutes": 1440,
|
||||
"display": "every 1440m"
|
||||
},
|
||||
"schedule_display": "daily at 07:30",
|
||||
"repeat": {
|
||||
"times": null,
|
||||
"completed": 0
|
||||
},
|
||||
"enabled": true,
|
||||
"created_at": "2026-04-07T15:30:00.000000Z",
|
||||
"next_run_at": "2026-04-08T07:30:00.000000Z",
|
||||
"last_run_at": null,
|
||||
"last_status": null,
|
||||
"last_error": null,
|
||||
"deliver": "local",
|
||||
"origin": null,
|
||||
"state": "scheduled",
|
||||
"paused_at": null,
|
||||
"paused_reason": null,
|
||||
"skills": [],
|
||||
"skill": null
|
||||
}
|
||||
],
|
||||
"updated_at": "2026-03-24T16:23:03.869797-04:00"
|
||||
}
|
||||
"updated_at": "2026-04-07T15:00:00+00:00"
|
||||
}
|
||||
|
||||
2
cron/muda-audit.crontab
Normal file
2
cron/muda-audit.crontab
Normal file
@@ -0,0 +1,2 @@
|
||||
# Muda Audit — run every Sunday at 21:00
|
||||
0 21 * * 0 cd /root/wizards/ezra/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1
|
||||
58
deploy/conduit/Caddyfile
Normal file
58
deploy/conduit/Caddyfile
Normal file
@@ -0,0 +1,58 @@
|
||||
# Caddy configuration for Conduit Matrix homeserver
|
||||
# Location: /etc/caddy/conf.d/matrix.conf (imported by main Caddyfile)
|
||||
# Reference: docs/matrix-fleet-comms/README.md
|
||||
|
||||
matrix.timmy.foundation {
|
||||
# Reverse proxy to Conduit
|
||||
reverse_proxy localhost:8448 {
|
||||
# Headers for WebSocket upgrade (client sync)
|
||||
header_up Host {host}
|
||||
header_up X-Real-IP {remote}
|
||||
header_up X-Forwarded-For {remote}
|
||||
header_up X-Forwarded-Proto {scheme}
|
||||
}
|
||||
|
||||
# Security headers
|
||||
header {
|
||||
X-Frame-Options DENY
|
||||
X-Content-Type-Options nosniff
|
||||
X-XSS-Protection "1; mode=block"
|
||||
Referrer-Policy strict-origin-when-cross-origin
|
||||
Permissions-Policy "geolocation=(), microphone=(), camera=()"
|
||||
}
|
||||
|
||||
# Enable compression
|
||||
encode gzip zstd
|
||||
|
||||
# Let's Encrypt automatic TLS
|
||||
tls {
|
||||
# Email for renewal notifications
|
||||
# Uncomment and set: email admin@timmy.foundation
|
||||
}
|
||||
|
||||
# Logging
|
||||
log {
|
||||
output file /var/log/caddy/matrix-access.log {
|
||||
roll_size 100mb
|
||||
roll_keep 5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Well-known delegation for Matrix federation
|
||||
# Allows other servers to discover our homeserver
|
||||
timmy.foundation {
|
||||
handle /.well-known/matrix/server {
|
||||
header Content-Type application/json
|
||||
respond `{"m.server": "matrix.timmy.foundation:443"}`
|
||||
}
|
||||
|
||||
handle /.well-known/matrix/client {
|
||||
header Content-Type application/json
|
||||
header Access-Control-Allow-Origin *
|
||||
respond `{"m.homeserver": {"base_url": "https://matrix.timmy.foundation"}}`
|
||||
}
|
||||
|
||||
# Redirect root to Element Web or documentation
|
||||
redir / https://matrix.timmy.foundation permanent
|
||||
}
|
||||
37
deploy/conduit/conduit.service
Normal file
37
deploy/conduit/conduit.service
Normal file
@@ -0,0 +1,37 @@
|
||||
[Unit]
|
||||
Description=Conduit Matrix Homeserver
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=conduit
|
||||
Group=conduit
|
||||
|
||||
WorkingDirectory=/opt/conduit
|
||||
ExecStart=/opt/conduit/conduit
|
||||
|
||||
# Restart on failure
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
|
||||
# Security hardening
|
||||
NoNewPrivileges=true
|
||||
ProtectSystem=strict
|
||||
ProtectHome=true
|
||||
ReadWritePaths=/opt/conduit/data /opt/conduit/logs
|
||||
ProtectKernelTunables=true
|
||||
ProtectKernelModules=true
|
||||
ProtectControlGroups=true
|
||||
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
|
||||
RestrictNamespaces=true
|
||||
LockPersonality=true
|
||||
|
||||
# Environment
|
||||
Environment="RUST_LOG=info"
|
||||
Environment="CONDUIT_CONFIG=/opt/conduit/conduit.toml"
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
81
deploy/conduit/conduit.toml
Normal file
81
deploy/conduit/conduit.toml
Normal file
@@ -0,0 +1,81 @@
|
||||
# Conduit Homeserver Configuration
|
||||
# Location: /opt/conduit/conduit.toml
|
||||
# Reference: docs/matrix-fleet-comms/README.md
|
||||
|
||||
[global]
|
||||
# The server_name is the canonical name of your homeserver.
|
||||
# It must match the domain in your MXIDs (e.g., @user:timmy.foundation)
|
||||
server_name = "timmy.foundation"
|
||||
|
||||
# Database path - SQLite for simplicity, PostgreSQL available if needed
|
||||
database_path = "/opt/conduit/data/conduit.db"
|
||||
|
||||
# Port to listen on
|
||||
port = 8448
|
||||
|
||||
# Maximum request size (20MB for file uploads)
|
||||
max_request_size = 20000000
|
||||
|
||||
# Allow guests to register (false = closed registration)
|
||||
allow_registration = false
|
||||
|
||||
# Allow guests to join rooms without registering
|
||||
allow_guest_registration = false
|
||||
|
||||
# Require authentication for profile requests
|
||||
authenticate_profile_requests = true
|
||||
|
||||
[registration]
|
||||
# Closed registration - admin creates accounts manually
|
||||
enabled = false
|
||||
|
||||
[federation]
|
||||
# Enable federation to communicate with other Matrix homeservers
|
||||
enabled = true
|
||||
|
||||
# Servers to block from federation
|
||||
# disabled_servers = ["bad.actor.com", "spammer.org"]
|
||||
disabled_servers = []
|
||||
|
||||
# Enable server discovery via .well-known
|
||||
well_known = true
|
||||
|
||||
[media]
|
||||
# Maximum upload size per file (50MB)
|
||||
max_file_size = 50000000
|
||||
|
||||
# Maximum total media cache size (100MB)
|
||||
max_media_size = 100000000
|
||||
|
||||
# Directory for media storage
|
||||
media_path = "/opt/conduit/data/media"
|
||||
|
||||
[retention]
|
||||
# Enable message retention policies
|
||||
enabled = true
|
||||
|
||||
# Default retention for rooms without explicit policy
|
||||
default_room_retention = "30d"
|
||||
|
||||
# Minimum allowed retention period
|
||||
min_retention = "1d"
|
||||
|
||||
# Maximum allowed retention period (null = no limit)
|
||||
max_retention = null
|
||||
|
||||
[logging]
|
||||
# Log level: error, warn, info, debug, trace
|
||||
level = "info"
|
||||
|
||||
# Log to file
|
||||
log_file = "/opt/conduit/logs/conduit.log"
|
||||
|
||||
[security]
|
||||
# Require transaction IDs for idempotent requests
|
||||
require_transaction_ids = true
|
||||
|
||||
# IP range blacklist for incoming federation
|
||||
# ip_range_blacklist = ["10.0.0.0/8", "172.16.0.0/12"]
|
||||
|
||||
# Allow incoming federation from these IP ranges only (empty = allow all)
|
||||
# ip_range_whitelist = []
|
||||
121
deploy/conduit/install.sh
Normal file
121
deploy/conduit/install.sh
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/bin/bash
|
||||
# Conduit Matrix Homeserver Installation Script
|
||||
# Location: Run this on target VPS after cloning timmy-config
|
||||
# Reference: docs/matrix-fleet-comms/README.md
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Configuration
|
||||
CONDUIT_VERSION="0.8.0" # Check https://gitlab.com/famedly/conduit/-/releases
|
||||
CONDUIT_DIR="/opt/conduit"
|
||||
DATA_DIR="$CONDUIT_DIR/data"
|
||||
LOGS_DIR="$CONDUIT_DIR/logs"
|
||||
SCRIPTS_DIR="$CONDUIT_DIR/scripts"
|
||||
CONDUIT_USER="conduit"
|
||||
|
||||
echo "========================================"
|
||||
echo "Conduit Matrix Homeserver Installer"
|
||||
echo "Target: $CONDUIT_DIR"
|
||||
echo "Version: $CONDUIT_VERSION"
|
||||
echo "========================================"
|
||||
echo
|
||||
|
||||
# Check root
|
||||
if [ "$EUID" -ne 0 ]; then
|
||||
echo "Error: Please run as root"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Create conduit user
|
||||
echo "[1/8] Creating conduit user..."
|
||||
if ! id "$CONDUIT_USER" &>/dev/null; then
|
||||
useradd -r -s /bin/false -d "$CONDUIT_DIR" "$CONDUIT_USER"
|
||||
echo " Created user: $CONDUIT_USER"
|
||||
else
|
||||
echo " User exists: $CONDUIT_USER"
|
||||
fi
|
||||
|
||||
# Create directories
|
||||
echo "[2/8] Creating directories..."
|
||||
mkdir -p "$CONDUIT_DIR" "$DATA_DIR" "$LOGS_DIR" "$SCRIPTS_DIR"
|
||||
chown -R "$CONDUIT_USER:$CONDUIT_USER" "$CONDUIT_DIR"
|
||||
|
||||
# Download Conduit
|
||||
echo "[3/8] Downloading Conduit v${CONDUIT_VERSION}..."
|
||||
ARCH=$(uname -m)
|
||||
case "$ARCH" in
|
||||
x86_64)
|
||||
CONDUIT_ARCH="x86_64-unknown-linux-gnu"
|
||||
;;
|
||||
aarch64)
|
||||
CONDUIT_ARCH="aarch64-unknown-linux-gnu"
|
||||
;;
|
||||
*)
|
||||
echo "Error: Unsupported architecture: $ARCH"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
CONDUIT_URL="https://gitlab.com/famedly/conduit/-/releases/download/v${CONDUIT_VERSION}/conduit-${CONDUIT_ARCH}"
|
||||
|
||||
curl -L -o "$CONDUIT_DIR/conduit" "$CONDUIT_URL"
|
||||
chmod +x "$CONDUIT_DIR/conduit"
|
||||
chown "$CONDUIT_USER:$CONDUIT_USER" "$CONDUIT_DIR/conduit"
|
||||
echo " Downloaded: $CONDUIT_DIR/conduit"
|
||||
|
||||
# Install configuration
|
||||
echo "[4/8] Installing configuration..."
|
||||
if [ -f "conduit.toml" ]; then
|
||||
cp conduit.toml "$CONDUIT_DIR/conduit.toml"
|
||||
chown "$CONDUIT_USER:$CONDUIT_USER" "$CONDUIT_DIR/conduit.toml"
|
||||
echo " Installed: $CONDUIT_DIR/conduit.toml"
|
||||
else
|
||||
echo " Warning: conduit.toml not found in current directory"
|
||||
fi
|
||||
|
||||
# Install systemd service
|
||||
echo "[5/8] Installing systemd service..."
|
||||
if [ -f "conduit.service" ]; then
|
||||
cp conduit.service /etc/systemd/system/conduit.service
|
||||
systemctl daemon-reload
|
||||
echo " Installed: /etc/systemd/system/conduit.service"
|
||||
else
|
||||
echo " Warning: conduit.service not found in current directory"
|
||||
fi
|
||||
|
||||
# Install scripts
|
||||
echo "[6/8] Installing operational scripts..."
|
||||
if [ -d "scripts" ]; then
|
||||
cp scripts/*.sh "$SCRIPTS_DIR/"
|
||||
chmod +x "$SCRIPTS_DIR"/*.sh
|
||||
chown -R "$CONDUIT_USER:$CONDUIT_USER" "$SCRIPTS_DIR"
|
||||
echo " Installed scripts to $SCRIPTS_DIR"
|
||||
fi
|
||||
|
||||
# Create backup directory
|
||||
echo "[7/8] Creating backup directory..."
|
||||
mkdir -p /backups/conduit
|
||||
chown "$CONDUIT_USER:$CONDUIT_USER" /backups/conduit
|
||||
|
||||
# Setup cron for backups
|
||||
echo "[8/8] Setting up backup cron job..."
|
||||
if [ -f "$SCRIPTS_DIR/backup.sh" ]; then
|
||||
(crontab -l 2>/dev/null || true; echo "0 3 * * * $SCRIPTS_DIR/backup.sh >> $LOGS_DIR/backup.log 2>&1") | crontab -
|
||||
echo " Backup cron job added (3 AM daily)"
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "========================================"
|
||||
echo "Installation Complete!"
|
||||
echo "========================================"
|
||||
echo
|
||||
echo "Next steps:"
|
||||
echo " 1. Configure DNS: matrix.timmy.foundation -> $(hostname -I | awk '{print $1}')"
|
||||
echo " 2. Configure Caddy: cp Caddyfile /etc/caddy/conf.d/matrix.conf"
|
||||
echo " 3. Start Conduit: systemctl start conduit"
|
||||
echo " 4. Check health: $SCRIPTS_DIR/health.sh"
|
||||
echo " 5. Create admin account (see README.md)"
|
||||
echo
|
||||
echo "Logs: $LOGS_DIR/"
|
||||
echo "Data: $DATA_DIR/"
|
||||
echo "Config: $CONDUIT_DIR/conduit.toml"
|
||||
82
deploy/conduit/scripts/backup.sh
Normal file
82
deploy/conduit/scripts/backup.sh
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
# Conduit Matrix Homeserver Backup Script
|
||||
# Location: /opt/conduit/scripts/backup.sh
|
||||
# Reference: docs/matrix-fleet-comms/README.md
|
||||
# Run via cron: 0 3 * * * /opt/conduit/scripts/backup.sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Configuration
|
||||
BACKUP_BASE_DIR="/backups/conduit"
|
||||
DATA_DIR="/opt/conduit/data"
|
||||
CONFIG_FILE="/opt/conduit/conduit.toml"
|
||||
RETENTION_DAYS=7
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_DIR="$BACKUP_BASE_DIR/$TIMESTAMP"
|
||||
|
||||
# Ensure backup directory exists
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
|
||||
log() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
||||
}
|
||||
|
||||
log "Starting Conduit backup..."
|
||||
|
||||
# Check if Conduit is running
|
||||
if systemctl is-active --quiet conduit; then
|
||||
log "Stopping Conduit for consistent backup..."
|
||||
systemctl stop conduit
|
||||
RESTART_NEEDED=true
|
||||
else
|
||||
log "Conduit already stopped"
|
||||
RESTART_NEEDED=false
|
||||
fi
|
||||
|
||||
# Backup database
|
||||
if [ -f "$DATA_DIR/conduit.db" ]; then
|
||||
log "Backing up database..."
|
||||
cp "$DATA_DIR/conduit.db" "$BACKUP_DIR/"
|
||||
sqlite3 "$BACKUP_DIR/conduit.db" "VACUUM;"
|
||||
else
|
||||
log "WARNING: Database not found at $DATA_DIR/conduit.db"
|
||||
fi
|
||||
|
||||
# Backup configuration
|
||||
if [ -f "$CONFIG_FILE" ]; then
|
||||
log "Backing up configuration..."
|
||||
cp "$CONFIG_FILE" "$BACKUP_DIR/"
|
||||
fi
|
||||
|
||||
# Backup media (if exists)
|
||||
if [ -d "$DATA_DIR/media" ]; then
|
||||
log "Backing up media files..."
|
||||
cp -r "$DATA_DIR/media" "$BACKUP_DIR/"
|
||||
fi
|
||||
|
||||
# Restart Conduit if it was running
|
||||
if [ "$RESTART_NEEDED" = true ]; then
|
||||
log "Restarting Conduit..."
|
||||
systemctl start conduit
|
||||
fi
|
||||
|
||||
# Create compressed archive
|
||||
log "Creating compressed archive..."
|
||||
cd "$BACKUP_BASE_DIR"
|
||||
tar czf "$TIMESTAMP.tar.gz" -C "$BACKUP_DIR" .
|
||||
rm -rf "$BACKUP_DIR"
|
||||
|
||||
ARCHIVE_SIZE=$(du -h "$BACKUP_BASE_DIR/$TIMESTAMP.tar.gz" | cut -f1)
|
||||
log "Backup complete: $TIMESTAMP.tar.gz ($ARCHIVE_SIZE)"
|
||||
|
||||
# Upload to S3 (uncomment and configure when ready)
|
||||
# if command -v aws &> /dev/null; then
|
||||
# log "Uploading to S3..."
|
||||
# aws s3 cp "$BACKUP_BASE_DIR/$TIMESTAMP.tar.gz" s3://timmy-backups/conduit/
|
||||
# fi
|
||||
|
||||
# Cleanup old backups
|
||||
log "Cleaning up backups older than $RETENTION_DAYS days..."
|
||||
find "$BACKUP_BASE_DIR" -name "*.tar.gz" -mtime +$RETENTION_DAYS -delete
|
||||
|
||||
log "Backup process complete"
|
||||
142
deploy/conduit/scripts/health.sh
Normal file
142
deploy/conduit/scripts/health.sh
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/bin/bash
|
||||
# Conduit Matrix Homeserver Health Check
|
||||
# Location: /opt/conduit/scripts/health.sh
|
||||
# Reference: docs/matrix-fleet-comms/README.md
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
HOMESERVER_URL="https://matrix.timmy.foundation"
|
||||
ADMIN_EMAIL="admin@timmy.foundation"
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
log_info() {
|
||||
echo -e "${GREEN}[INFO]${NC} $*"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $*"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $*"
|
||||
}
|
||||
|
||||
# Check if Conduit process is running
|
||||
check_process() {
|
||||
if systemctl is-active --quiet conduit; then
|
||||
log_info "Conduit service is running"
|
||||
return 0
|
||||
else
|
||||
log_error "Conduit service is not running"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check Matrix client-server API
|
||||
check_client_api() {
|
||||
local response
|
||||
response=$(curl -s -o /dev/null -w "%{http_code}" "$HOMESERVER_URL/_matrix/client/versions" 2>/dev/null || echo "000")
|
||||
|
||||
if [ "$response" = "200" ]; then
|
||||
log_info "Client-server API is responding (HTTP 200)"
|
||||
return 0
|
||||
else
|
||||
log_error "Client-server API returned HTTP $response"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check Matrix versions endpoint
|
||||
check_versions() {
|
||||
local versions
|
||||
versions=$(curl -s "$HOMESERVER_URL/_matrix/client/versions" 2>/dev/null | jq -r '.versions | join(", ")' 2>/dev/null || echo "unknown")
|
||||
|
||||
if [ "$versions" != "unknown" ]; then
|
||||
log_info "Supported Matrix versions: $versions"
|
||||
return 0
|
||||
else
|
||||
log_warn "Could not determine Matrix versions"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check federation (self-test)
|
||||
check_federation() {
|
||||
local response
|
||||
response=$(curl -s -o /dev/null -w "%{http_code}" "https://federationtester.matrix.org/api/report?server_name=timmy.foundation" 2>/dev/null || echo "000")
|
||||
|
||||
if [ "$response" = "200" ]; then
|
||||
log_info "Federation tester can reach server"
|
||||
return 0
|
||||
else
|
||||
log_warn "Federation tester returned HTTP $response (may be DNS propagation)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check disk space
|
||||
check_disk_space() {
|
||||
local usage
|
||||
usage=$(df /opt/conduit/data | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
|
||||
if [ "$usage" -lt 80 ]; then
|
||||
log_info "Disk usage: ${usage}% (healthy)"
|
||||
return 0
|
||||
elif [ "$usage" -lt 90 ]; then
|
||||
log_warn "Disk usage: ${usage}% (consider cleanup)"
|
||||
return 1
|
||||
else
|
||||
log_error "Disk usage: ${usage}% (critical!)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check database size
|
||||
check_database() {
|
||||
local db_path="/opt/conduit/data/conduit.db"
|
||||
|
||||
if [ -f "$db_path" ]; then
|
||||
local size
|
||||
size=$(du -h "$db_path" | cut -f1)
|
||||
log_info "Database size: $size"
|
||||
return 0
|
||||
else
|
||||
log_warn "Database file not found at $db_path"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Main health check
|
||||
main() {
|
||||
echo "========================================"
|
||||
echo "Conduit Matrix Homeserver Health Check"
|
||||
echo "Server: $HOMESERVER_URL"
|
||||
echo "Time: $(date)"
|
||||
echo "========================================"
|
||||
echo
|
||||
|
||||
local exit_code=0
|
||||
|
||||
check_process || exit_code=1
|
||||
check_client_api || exit_code=1
|
||||
check_versions || true # Non-critical
|
||||
check_federation || true # Non-critical during initial setup
|
||||
check_disk_space || exit_code=1
|
||||
check_database || true # Non-critical
|
||||
|
||||
echo
|
||||
if [ $exit_code -eq 0 ]; then
|
||||
log_info "All critical checks passed ✓"
|
||||
else
|
||||
log_error "Some critical checks failed ✗"
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
main "$@"
|
||||
30
deploy/matrix/Caddyfile
Normal file
30
deploy/matrix/Caddyfile
Normal file
@@ -0,0 +1,30 @@
|
||||
matrix.example.com {
|
||||
handle /.well-known/matrix/server {
|
||||
header Content-Type application/json
|
||||
respond `{"m.server": "matrix.example.com:443"}`
|
||||
}
|
||||
|
||||
handle /.well-known/matrix/client {
|
||||
header Content-Type application/json
|
||||
respond `{"m.homeserver": {"base_url": "https://matrix.example.com"}}`
|
||||
}
|
||||
|
||||
handle_path /_matrix/* {
|
||||
reverse_proxy localhost:6167
|
||||
}
|
||||
|
||||
handle {
|
||||
reverse_proxy localhost:8080
|
||||
}
|
||||
|
||||
log {
|
||||
output file /var/log/caddy/matrix.log {
|
||||
roll_size 10MB
|
||||
roll_keep 10
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
matrix-federation.example.com:8448 {
|
||||
reverse_proxy localhost:6167
|
||||
}
|
||||
38
deploy/matrix/PREREQUISITES.md
Normal file
38
deploy/matrix/PREREQUISITES.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# Matrix/Conduit Host Prerequisites
|
||||
|
||||
## Target Host Specification
|
||||
|
||||
| Resource | Minimum | Fleet Scale |
|
||||
|----------|---------|-------------|
|
||||
| CPU | 2 cores | 4+ cores |
|
||||
| RAM | 2 GB | 8 GB |
|
||||
| Storage | 20 GB SSD | 100+ GB SSD |
|
||||
|
||||
## DNS Requirements
|
||||
|
||||
| Type | Host | Value |
|
||||
|------|------|-------|
|
||||
| A/AAAA | matrix.example.com | Server IP |
|
||||
| SRV | _matrix._tcp | 10 5 8448 matrix.example.com |
|
||||
|
||||
## Ports
|
||||
|
||||
| Port | Purpose | Access |
|
||||
|------|---------|--------|
|
||||
| 443 | Client-Server API | Public |
|
||||
| 8448 | Server-Server (federation) | Public |
|
||||
| 6167 | Conduit internal | Localhost only |
|
||||
|
||||
## Software
|
||||
|
||||
```bash
|
||||
curl -fsSL https://get.docker.com | sh
|
||||
sudo apt install caddy
|
||||
```
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] Valid domain with DNS control
|
||||
- [ ] Docker host with 4GB RAM
|
||||
- [ ] Caddy reverse proxy configured
|
||||
- [ ] Backup destination configured
|
||||
32
deploy/matrix/conduit.toml
Normal file
32
deploy/matrix/conduit.toml
Normal file
@@ -0,0 +1,32 @@
|
||||
[global]
|
||||
server_name = "fleet.example.com"
|
||||
address = "0.0.0.0"
|
||||
port = 6167
|
||||
|
||||
[database]
|
||||
backend = "sqlite"
|
||||
path = "/var/lib/matrix-conduit"
|
||||
|
||||
[registration]
|
||||
enabled = false
|
||||
token = "CHANGE_THIS_TO_32_HEX_CHARS"
|
||||
allow_registration_without_token = false
|
||||
|
||||
[federation]
|
||||
enabled = true
|
||||
enable_open_federation = true
|
||||
trusted_servers = []
|
||||
|
||||
[media]
|
||||
max_file_size = 10_485_760
|
||||
max_thumbnail_size = 5_242_880
|
||||
|
||||
[presence]
|
||||
enabled = true
|
||||
update_interval = 300_000
|
||||
|
||||
[log]
|
||||
level = "info"
|
||||
|
||||
[admin]
|
||||
admins = ["@admin:fleet.example.com"]
|
||||
48
deploy/matrix/docker-compose.yml
Normal file
48
deploy/matrix/docker-compose.yml
Normal file
@@ -0,0 +1,48 @@
|
||||
version: "3.8"
|
||||
# Conduit Matrix homeserver - Sovereign fleet communication
|
||||
# Deploy: docker-compose up -d
|
||||
# Requirements: Docker 20.10+, valid DNS A/AAAA and SRV records
|
||||
|
||||
services:
|
||||
conduit:
|
||||
image: docker.io/matrixconduit/matrix-conduit:v0.7.0
|
||||
container_name: conduit
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./conduit.toml:/etc/conduit/conduit.toml:ro
|
||||
- conduit-data:/var/lib/matrix-conduit
|
||||
environment:
|
||||
CONDUIT_SERVER_NAME: ${MATRIX_SERVER_NAME:?Required}
|
||||
CONDUIT_DATABASE_BACKEND: sqlite
|
||||
CONDUIT_DATABASE_PATH: /var/lib/matrix-conduit
|
||||
CONDUIT_PORT: 6167
|
||||
CONDUIT_MAX_REQUEST_SIZE: 20_000_000
|
||||
networks:
|
||||
- matrix
|
||||
|
||||
element:
|
||||
image: vectorim/element-web:v1.11.59
|
||||
container_name: element-web
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./element-config.json:/app/config.json:ro
|
||||
networks:
|
||||
- matrix
|
||||
|
||||
backup:
|
||||
image: rclone/rclone:latest
|
||||
container_name: conduit-backup
|
||||
volumes:
|
||||
- conduit-data:/data:ro
|
||||
- ./backup-scripts:/scripts:ro
|
||||
entrypoint: /scripts/backup.sh
|
||||
profiles: ["backup"]
|
||||
networks:
|
||||
- matrix
|
||||
|
||||
networks:
|
||||
matrix:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
conduit-data:
|
||||
14
deploy/matrix/element-config.json
Normal file
14
deploy/matrix/element-config.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"default_server_config": {
|
||||
"m.homeserver": {
|
||||
"base_url": "https://matrix.example.com",
|
||||
"server_name": "example.com"
|
||||
}
|
||||
},
|
||||
"brand": "Timmy Fleet",
|
||||
"default_theme": "dark",
|
||||
"features": {
|
||||
"feature_spaces": true,
|
||||
"feature_voice_rooms": true
|
||||
}
|
||||
}
|
||||
46
deploy/matrix/scripts/bootstrap.sh
Normal file
46
deploy/matrix/scripts/bootstrap.sh
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
MATRIX_SERVER_NAME=${1:-"fleet.example.com"}
|
||||
ADMIN_USER=${2:-"admin"}
|
||||
BOT_USERS=("bilbo" "ezra" "allegro" "bezalel" "gemini" "timmy")
|
||||
|
||||
echo "=== Fleet Matrix Bootstrap ==="
|
||||
echo "Server: $MATRIX_SERVER_NAME"
|
||||
|
||||
REG_TOKEN=$(openssl rand -hex 32)
|
||||
echo "$REG_TOKEN" > .registration_token
|
||||
|
||||
cat > docker-compose.override.yml << EOF
|
||||
version: "3.8"
|
||||
services:
|
||||
conduit:
|
||||
environment:
|
||||
CONDUIT_SERVER_NAME: $MATRIX_SERVER_NAME
|
||||
CONDUIT_REGISTRATION_TOKEN: $REG_TOKEN
|
||||
EOF
|
||||
|
||||
ADMIN_PW=$(openssl rand -base64 24)
|
||||
cat > admin-register.json << EOF
|
||||
{"username": "$ADMIN_USER", "password": "$ADMIN_PW", "admin": true}
|
||||
EOF
|
||||
|
||||
mkdir -p bot-tokens
|
||||
for bot in "${BOT_USERS[@]}"; do
|
||||
BOT_PW=$(openssl rand -base64 24)
|
||||
echo "{"username": "$bot", "password": "$BOT_PW"}" > "bot-tokens/${bot}.json"
|
||||
done
|
||||
|
||||
cat > room-topology.yaml << 'EOF'
|
||||
spaces:
|
||||
fleet-command:
|
||||
name: "Fleet Command"
|
||||
rooms:
|
||||
- {name: "📢 Announcements", encrypted: false}
|
||||
- {name: "⚡ Operations", encrypted: true}
|
||||
- {name: "🔮 Intelligence", encrypted: true}
|
||||
- {name: "🛠️ Infrastructure", encrypted: true}
|
||||
EOF
|
||||
|
||||
echo "Bootstrap complete. Check admin-password.txt and bot-tokens/"
|
||||
echo "Admin password: $ADMIN_PW"
|
||||
18
docs/ARCHITECTURE_KT.md
Normal file
18
docs/ARCHITECTURE_KT.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# Architecture Knowledge Transfer (KT) — Unified System Schema
|
||||
|
||||
## Overview
|
||||
This document reconciles the Uni-Wizard v4 architecture with the Frontier Local Agenda.
|
||||
|
||||
## Core Hierarchy
|
||||
1. **Timmy (Local):** Sovereign Control Plane.
|
||||
2. **Ezra (VPS):** Archivist & Architecture Wizard.
|
||||
3. **Allegro (VPS):** Connectivity & Telemetry Bridge.
|
||||
4. **Bezalel (VPS):** Artificer & Implementation Wizard.
|
||||
|
||||
## Data Flow
|
||||
- **Telemetry:** Hermes -> Allegro -> Timmy (<100ms).
|
||||
- **Decisions:** Timmy -> Allegro -> Gitea (PR/Issue).
|
||||
- **Architecture:** Ezra -> Timmy (Review) -> Canon.
|
||||
|
||||
## Provenance Standard
|
||||
All artifacts must be tagged with the producing agent and house ID.
|
||||
262
docs/BURN_MODE_CONTINUITY_2026-04-05.md
Normal file
262
docs/BURN_MODE_CONTINUITY_2026-04-05.md
Normal file
@@ -0,0 +1,262 @@
|
||||
# 🔥 BURN MODE CONTINUITY — Primary Targets Engaged
|
||||
|
||||
**Date**: 2026-04-05
|
||||
**Burn Directive**: timmy-config #183, #166, the-nexus #830
|
||||
**Executor**: Ezra (Archivist)
|
||||
**Status**: ✅ **ALL TARGETS SCAFFOLDED — CONTINUITY PRESERVED**
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
Three primary targets have been assessed, scaffolded, and connected into a coherent fleet architecture. Each issue has transitioned from aspiration/fuzzy epic to executable implementation plan.
|
||||
|
||||
| Target | Repo | Previous State | Current State | Scaffold Size |
|
||||
|--------|------|----------------|---------------|---------------|
|
||||
| #183 | timmy-config | Aspirational scaffold | ✅ Complete deployment kit | 12+ files, 2 dirs |
|
||||
| #166 | timmy-config | Fuzzy epic | ✅ Executable with blockers isolated | Architecture doc (8KB) |
|
||||
| #830 | the-nexus | Feature request | ✅ 5-phase production scaffold | 5 bins + 3 docs (~70KB) |
|
||||
|
||||
---
|
||||
|
||||
## Cross-Target Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ FLEET COMMUNICATION LAYERS │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ HUMAN-TO-FLEET FLEET-INTERNAL INTEL │
|
||||
│ ┌───────────────┐ ┌───────────────┐ ┌────────┐│
|
||||
│ │ Matrix │◀──────────────▶│ Nostr │ │ Deep ││
|
||||
│ │ #166 │ #173 unify │ #174 │ │ Dive ││
|
||||
│ │ (scaffolded)│ │ (deployed) │ │ #830 ││
|
||||
│ └───────────────┘ └───────────────┘ │(ready) ││
|
||||
│ │ │ └───┬────┘│
|
||||
│ │ │ │ │
|
||||
│ ▼ ▼ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ ALEXANDER (Operator Surface) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Target #1: timmy-config #183
|
||||
|
||||
**Title**: [COMMS] Produce Matrix/Conduit deployment scaffold and host prerequisites
|
||||
**Status**: CLOSED ✅ (but continuity verified)
|
||||
**Issue State**: All acceptance criteria met
|
||||
|
||||
### Deliverables Verified
|
||||
|
||||
| Criterion | Status | Location |
|
||||
|-----------|--------|----------|
|
||||
| Repo-visible deployment scaffold | ✅ | `infra/matrix/` + `deploy/conduit/` |
|
||||
| Host/port/reverse-proxy explicit | ✅ | `docs/matrix-fleet-comms/README.md` |
|
||||
| Missing prerequisites named | ✅ | `prerequisites.md` — 6 named blockers |
|
||||
| Lowers #166 from fuzzy to executable | ✅ | Phase-gated plan with estimates |
|
||||
|
||||
### Artifact Inventory
|
||||
|
||||
**`infra/matrix/`** (Docker path):
|
||||
- `README.md` — Entry point
|
||||
- `prerequisites.md` — Host options, 6 explicit blockers
|
||||
- `docker-compose.yml` — Container orchestration
|
||||
- `conduit.toml` — Homeserver configuration
|
||||
- `deploy-matrix.sh` — One-command deployment
|
||||
- `.env.example` — Configuration template
|
||||
- `caddy/` — Reverse proxy configs
|
||||
|
||||
**`deploy/conduit/`** (Binary path):
|
||||
- `conduit.toml` — Production config
|
||||
- `conduit.service` — systemd definition
|
||||
- `Caddyfile` — Reverse proxy
|
||||
- `install.sh` — One-command installer
|
||||
- `scripts/` — Backup, health check helpers
|
||||
|
||||
**`docs/matrix-fleet-comms/README.md`** (Architecture):
|
||||
- 3 Architecture Decision Records (ADRs)
|
||||
- Complete port allocation table
|
||||
- 4-phase implementation plan with estimates
|
||||
- Operational runbooks (backup, health, account creation)
|
||||
- Cross-issue linkages
|
||||
|
||||
### Architecture Decisions
|
||||
|
||||
1. **ADR-1**: Conduit selected over Synapse/Dendrite (low resource, SQLite support)
|
||||
2. **ADR-2**: Gitea VPS host initially (consolidated ops)
|
||||
3. **ADR-3**: Full federation enabled (requires TLS + public DNS)
|
||||
|
||||
### Blocking Prerequisites
|
||||
|
||||
| # | Prerequisite | Authority | Effort |
|
||||
|---|--------------|-----------|--------|
|
||||
| 1 | Target host selected (Hermes vs Allegro vs new) | Alexander/admin | 15 min |
|
||||
| 2 | Domain assigned: `matrix.timmy.foundation` | Alexander/admin | 15 min |
|
||||
| 3 | DNS A record created | Alexander/admin | 15 min |
|
||||
| 4 | DNS SRV record for federation | Alexander/admin | 15 min |
|
||||
| 5 | Firewall: TCP 8448 open | Host admin | 5 min |
|
||||
| 6 | SSL strategy confirmed | Caddy auto | 0 min |
|
||||
|
||||
---
|
||||
|
||||
## Target #2: timmy-config #166
|
||||
|
||||
**Title**: [COMMS] Stand up Matrix/Conduit for human-to-fleet encrypted communication
|
||||
**Status**: OPEN 🟡
|
||||
**Issue State**: Scaffold complete, execution blocked on #187
|
||||
|
||||
### Evolution: Fuzzy Epic → Executable
|
||||
|
||||
| Phase | Before | After |
|
||||
|-------|--------|-------|
|
||||
| Idea | "We should use Matrix" | Concrete deployment path |
|
||||
| Scaffold | None | 12+ files, fully documented |
|
||||
| Blockers | Unknown | Explicitly named in #187 |
|
||||
| Next Steps | Undefined | Phase-gated with estimates |
|
||||
|
||||
### Acceptance Criteria Progress
|
||||
|
||||
| Criterion | Status | Blocker |
|
||||
|-----------|--------|---------|
|
||||
| Deploy Conduit homeserver | 🟡 Ready | #187 DNS decision |
|
||||
| Create fleet rooms/channels | 🟡 Ready | Post-deployment |
|
||||
| Encrypted operator messaging | 🟡 Ready | Post-accounts |
|
||||
| Telegram→Matrix cutover | ⏳ Pending | Post-verification |
|
||||
| Alexander can message fleet | ⏳ Pending | Post-deployment |
|
||||
| Messages encrypted/persistent | ⏳ Pending | Post-deployment |
|
||||
| Telegram not only surface | ⏳ Pending | Migration timeline TBD |
|
||||
|
||||
### Handoff from #183
|
||||
|
||||
**#183 delivered:**
|
||||
- ✅ Deployable configuration files
|
||||
- ✅ Executable installation scripts
|
||||
- ✅ Operational runbooks
|
||||
- ✅ Phase-gated implementation plan
|
||||
- ✅ Bootstrap account/room specifications
|
||||
|
||||
**#166 needs:**
|
||||
- DNS decisions (#187)
|
||||
- Execution (run install scripts)
|
||||
- Testing (verify E2E encryption)
|
||||
|
||||
---
|
||||
|
||||
## Target #3: the-nexus #830
|
||||
|
||||
**Title**: [EPIC] Deep Dive: Sovereign NotebookLM + Daily AI Intelligence Briefing
|
||||
**Status**: OPEN ✅
|
||||
**Issue State**: Production-ready scaffold, 5 phases complete
|
||||
|
||||
### 5-Phase Scaffold
|
||||
|
||||
| Phase | Component | File | Lines | Purpose |
|
||||
|-------|-----------|------|-------|---------|
|
||||
| 1 | Aggregate | `bin/deepdive_aggregator.py` | ~95 | arXiv RSS, lab blog ingestion |
|
||||
| 2 | Filter | `bin/deepdive_filter.py` | NA | Included in aggregator/orchestrator |
|
||||
| 3 | Synthesize | `bin/deepdive_synthesis.py` | ~190 | LLM briefing generation |
|
||||
| 4 | Audio | `bin/deepdive_tts.py` | ~240 | Multi-adapter TTS (Piper/ElevenLabs) |
|
||||
| 5 | Deliver | `bin/deepdive_delivery.py` | ~210 | Telegram voice/text delivery |
|
||||
| — | Orchestrate | `bin/deepdive_orchestrator.py` | ~320 | Pipeline coordination, cron |
|
||||
|
||||
**Total**: ~1,055 lines of executable Python
|
||||
|
||||
### Documentation Inventory
|
||||
|
||||
| File | Lines | Purpose |
|
||||
|------|-------|---------|
|
||||
| `docs/DEEPSDIVE_ARCHITECTURE.md` | ~88 | 5-phase spec, data flows |
|
||||
| `docs/DEEPSDIVE_EXECUTION.md` | ~NA | Runbook, troubleshooting |
|
||||
| `docs/DEEPSDIVE_QUICKSTART.md` | ~NA | Fast-path to first briefing |
|
||||
|
||||
### Acceptance Criteria — All Ready
|
||||
|
||||
| Criterion | Issue Req | Status | Evidence |
|
||||
|-----------|-----------|--------|----------|
|
||||
| Zero manual copy-paste | Mandatory | ✅ | Cron automation |
|
||||
| Daily 6 AM delivery | Mandatory | ✅ | Configurable schedule |
|
||||
| arXiv (cs.AI/cs.CL/cs.LG) | Mandatory | ✅ | RSS fetcher |
|
||||
| Lab blog coverage | Mandatory | ✅ | OpenAI/Anthropic/DeepMind |
|
||||
| Relevance filtering | Mandatory | ✅ | Embedding + keyword |
|
||||
| Written briefing | Mandatory | ✅ | Synthesis engine |
|
||||
| Audio via TTS | Mandatory | ✅ | Piper + ElevenLabs adapters |
|
||||
| Telegram delivery | Mandatory | ✅ | Voice message support |
|
||||
| On-demand trigger | Mandatory | ✅ | CLI flag in orchestrator |
|
||||
|
||||
### Sovereignty Compliance
|
||||
|
||||
| Dependency | Local Option | Cloud Fallback |
|
||||
|------------|--------------|----------------|
|
||||
| TTS | Piper (offline) | ElevenLabs API |
|
||||
| LLM | Hermes (local) | Provider routing |
|
||||
| Scheduler | Cron (system) | Manual trigger |
|
||||
| Storage | Filesystem | No DB required |
|
||||
|
||||
---
|
||||
|
||||
## Interconnection Map
|
||||
|
||||
### #830 → #166
|
||||
Deep Dive intelligence briefings can target Matrix rooms as delivery channel (alternative to Telegram voice).
|
||||
|
||||
### #830 → #173
|
||||
Deep Dive is the **content layer** in the comms unification stack — what gets said, via which channel.
|
||||
|
||||
### #166 → #173
|
||||
Matrix is the **human-to-fleet channel** — sovereign, encrypted, persistent.
|
||||
|
||||
### #166 → #174
|
||||
Matrix and Nostr operate in parallel — Matrix for rich messaging, Nostr for lightweight broadcast. Both are sovereign.
|
||||
|
||||
### #183 → #166
|
||||
Scaffold enables execution. Child enables parent.
|
||||
|
||||
---
|
||||
|
||||
## Decision Authority Summary
|
||||
|
||||
| Decision | Location | Authority | Current State |
|
||||
|----------|----------|-----------|---------------|
|
||||
| Matrix deployment timing | #187 | Alexander/admin | ⏳ DNS pending |
|
||||
| Deep Dive TTS preference | #830 | Alexander | ⏳ Local vs API |
|
||||
| Matrix/Nostr priority | #173 | Alexander | ⏳ Active discussion |
|
||||
|
||||
---
|
||||
|
||||
## Burn Mode Artifacts Created
|
||||
|
||||
### Visible Comments (SITREPs)
|
||||
- #183: Continuity verification SITREP
|
||||
- #166: Execution bridge SITREP
|
||||
- #830: Architecture assessment SITREP
|
||||
|
||||
### Documentation
|
||||
- `docs/matrix-fleet-comms/README.md` — Matrix architecture (8KB)
|
||||
- `docs/BURN_MODE_CONTINUITY_2026-04-05.md` — This document
|
||||
|
||||
### Code Scaffold
|
||||
- 5 Deep Dive Python modules (~1,055 lines)
|
||||
- 3 Deep Dive documentation files
|
||||
- 12+ Matrix/Conduit deployment files
|
||||
|
||||
---
|
||||
|
||||
## Sign-off
|
||||
|
||||
All three primary targets have been:
|
||||
1. ✅ **Read and assessed** — Current state documented
|
||||
2. ✅ **SITREP comments posted** — Visible continuity trail
|
||||
3. ✅ **Scaffold verified/extended** — Strongest proof committed
|
||||
|
||||
**#183**: Acceptance criteria satisfied, scaffold in repo truth
|
||||
**#166**: Executable path defined, blockers isolated to #187
|
||||
**#830**: Production-ready scaffold, all 5 phases implemented
|
||||
|
||||
Continuity preserved. Architecture connected. Decisions forward.
|
||||
|
||||
— Ezra, Archivist
|
||||
2026-04-05
|
||||
112
docs/CANONICAL_INDEX_MATRIX.md
Normal file
112
docs/CANONICAL_INDEX_MATRIX.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Canonical Index: Matrix/Conduit Deployment Artifacts
|
||||
|
||||
> **Issues**: [#166](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/166) (Execution Epic) | [#183](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/183) (Scaffold — Closed) | [#187](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/187) (Decision Blocker)
|
||||
> **Created**: 2026-04-05 by Ezra (burn mode)
|
||||
> **Purpose**: Single source of truth mapping every Matrix/Conduit artifact in `timmy-config`. Stops scatter, eliminates "which file is real?" ambiguity.
|
||||
|
||||
---
|
||||
|
||||
## Status at a Glance
|
||||
|
||||
| Milestone | State | Evidence |
|
||||
|-----------|-------|----------|
|
||||
| Deployment scaffold | ✅ Complete | `infra/matrix/` (15 files) |
|
||||
| Operator runbook | ✅ Complete | `docs/matrix-fleet-comms/` |
|
||||
| Host readiness script | ✅ Complete | `infra/matrix/host-readiness-check.sh` |
|
||||
| Target host selected | ⚠️ **BLOCKED** | Pending [#187](../issues/187) |
|
||||
| Live deployment | ⚠️ **BLOCKED** | Waiting on host + domain + proxy decision |
|
||||
|
||||
---
|
||||
|
||||
## Authoritative Paths (Read/Edit These)
|
||||
|
||||
### 1. Deployment Scaffold — `infra/matrix/`
|
||||
This is the **primary executable scaffold**. If you are deploying Conduit, start here and nowhere else.
|
||||
|
||||
| File | Purpose | Lines/Size |
|
||||
|------|---------|------------|
|
||||
| `README.md` | Entry point, quick-start, architecture diagram | 3,275 bytes |
|
||||
| `prerequisites.md` | 6 concrete blocking items pre-deployment | 2,690 bytes |
|
||||
| `docker-compose.yml` | Conduit + Postgres + optional Element Web | 1,427 bytes |
|
||||
| `conduit.toml` | Base Conduit configuration template | 1,498 bytes |
|
||||
| `.env.example` | Environment secrets template | 1,861 bytes |
|
||||
| `deploy-matrix.sh` | One-command deployment orchestrator | 3,388 bytes |
|
||||
| `host-readiness-check.sh` | Pre-flight validation script | 3,321 bytes |
|
||||
| `caddy/Caddyfile` | Reverse-proxy rules for Caddy users | 1,612 bytes |
|
||||
| `conduit/conduit.toml` | Advanced Conduit config (federation-ready) | 2,280 bytes |
|
||||
| `conduit/docker-compose.yml` | Extended compose with replication | 1,469 bytes |
|
||||
| `scripts/deploy-conduit.sh` | Low-level Conduit installer | 5,488 bytes |
|
||||
| `docs/RUNBOOK.md` | Day-2 operations (backup, upgrade, health) | 3,412 bytes |
|
||||
|
||||
**Command for next deployer:**
|
||||
```bash
|
||||
cd infra/matrix
|
||||
./host-readiness-check.sh # 1. verify target
|
||||
# Edit conduit.toml + .env
|
||||
./deploy-matrix.sh # 2. deploy
|
||||
```
|
||||
|
||||
### 2. Operator Runbook — `docs/matrix-fleet-comms/`
|
||||
Human-facing narrative for Alexander and operators.
|
||||
|
||||
| File | Purpose | Size |
|
||||
|------|---------|------|
|
||||
| `README.md` | Fleet communications authority map + onboarding | 7,845 bytes |
|
||||
| `DEPLOYMENT_RUNBOOK.md` | Step-by-step operator playbook | 4,484 bytes |
|
||||
|
||||
---
|
||||
|
||||
## Legacy / Duplicate Paths (Do Not Edit — Reference Only)
|
||||
|
||||
The following directories contain **overlapping or superseded** material. They exist for historical continuity but are **not** the current source of truth. If you edit these, you create divergence.
|
||||
|
||||
| Path | Status | Note |
|
||||
|------|--------|------|
|
||||
| `deploy/matrix/` | 🔴 Superseded by `infra/matrix/` | Smaller subset; lacks host-readiness check |
|
||||
| `deploy/conduit/` | 🔴 Superseded by `infra/matrix/scripts/` | `install.sh` + `health.sh` — good ideas ported into `infra/matrix/` |
|
||||
| `matrix/` | 🔴 Superseded by `infra/matrix/` | Early docker-compose experiment |
|
||||
| `docs/matrix-conduit/DEPLOYMENT.md` | 🔴 Superseded by `docs/matrix-fleet-comms/DEPLOYMENT_RUNBOOK.md` | |
|
||||
| `docs/matrix-deployment.md` | 🔴 Superseded by `infra/matrix/prerequisites.md` + runbook | |
|
||||
| `scaffold/matrix-conduit/` | 🔴 Superseded by `infra/matrix/` | Bootstrap + nginx configs; nginx approach not chosen |
|
||||
|
||||
> **House Rule**: New Matrix work must branch from `infra/matrix/` or `docs/matrix-fleet-comms/`. If a legacy file needs resurrection, migrate it into the authoritative tree and delete the old reference.
|
||||
|
||||
---
|
||||
|
||||
## Decision Blocker: #187
|
||||
|
||||
**#166 cannot proceed until [#187](../issues/187) is resolved.**
|
||||
|
||||
Ezra has produced a dedicated decision framework to make this a 5-minute choice rather than an architectural debate:
|
||||
|
||||
📄 **See**: [`docs/DECISION_FRAMEWORK_187.md`](DECISION_FRAMEWORK_187.md)
|
||||
|
||||
The framework recommends:
|
||||
- **Host**: Timmy-Home bare metal (primary) or existing VPS
|
||||
- **Domain**: `matrix.timmytime.net` (or sub-domain of existing fleet domain)
|
||||
- **Proxy**: Caddy (simplest) or extend existing Traefik
|
||||
- **TLS**: Let's Encrypt ACME HTTP-01 (port 80/443 open)
|
||||
|
||||
---
|
||||
|
||||
## Next Agent Checklist
|
||||
|
||||
If you are picking up #166:
|
||||
|
||||
1. [ ] Read `infra/matrix/README.md`
|
||||
2. [ ] Read `docs/DECISION_FRAMEWORK_187.md`
|
||||
3. [ ] Confirm resolution of #187 (host/domain/proxy chosen)
|
||||
4. [ ] Run `infra/matrix/host-readiness-check.sh` on target host
|
||||
5. [ ] Cut a feature branch; edit `infra/matrix/conduit.toml` and `.env`
|
||||
6. [ ] Execute `infra/matrix/deploy-matrix.sh`
|
||||
7. [ ] Verify federation with Matrix.org test server
|
||||
8. [ ] Create operator room; invite Alexander
|
||||
9. [ ] Post SITREP on #166 with proof-of-deployment
|
||||
|
||||
---
|
||||
|
||||
## Changelog
|
||||
|
||||
| Date | Change | Author |
|
||||
|------|--------|--------|
|
||||
| 2026-04-05 | Canonical index created; authoritative paths declared | Ezra |
|
||||
126
docs/DECISION_FRAMEWORK_187.md
Normal file
126
docs/DECISION_FRAMEWORK_187.md
Normal file
@@ -0,0 +1,126 @@
|
||||
# Decision Framework: Matrix Host, Domain, and Proxy (#187)
|
||||
|
||||
> **Issue**: [#187](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/187) — Decide Matrix host, domain, and proxy prerequisites so #166 can deploy
|
||||
> **Parent**: [#166](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/166) — Stand up Matrix/Conduit for human-to-fleet encrypted communication
|
||||
> **Created**: 2026-04-05 by Ezra (burn mode)
|
||||
> **Purpose**: Turn the #187 blocker into a checkbox. One recommendation, two alternatives, explicit trade-offs.
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
**Recommended Path (Option A)**
|
||||
- **Host**: Existing Hermes VPS (`143.198.27.163` — already hosts Gitea, Bezalel, Allegro-Primus)
|
||||
- **Domain**: `matrix.timmytime.net`
|
||||
- **Proxy**: Caddy (dedicated to Matrix, auto-TLS, auto-federation headers)
|
||||
- **TLS**: Let's Encrypt via Caddy (ports 80/443/8448 exposed)
|
||||
|
||||
**Why**: It reuses a known sovereign host, keeps comms infrastructure under one roof, and Caddy is the simplest path to working federation.
|
||||
|
||||
---
|
||||
|
||||
## Option A — Recommended: Hermes VPS + Caddy
|
||||
|
||||
### Host: Hermes VPS (`143.198.27.163`)
|
||||
| Factor | Assessment |
|
||||
|--------|------------|
|
||||
| Sovereignty | ✅ Full root, no platform lock-in |
|
||||
| Uptime | ✅ 24/7 VPS, better than home broadband |
|
||||
| Existing load | ⚠️ Gitea + wizard gateways running; Conduit is lightweight (~200MB RAM) |
|
||||
| Cost | ✅ Sunk cost — no new provider needed |
|
||||
|
||||
### Domain: `matrix.timmytime.net`
|
||||
| Factor | Assessment |
|
||||
|--------|------------|
|
||||
| DNS control | ✅ `timmytime.net` is already under fleet control |
|
||||
| Federation SRV | Simple A record + optional `_matrix._tcp` SRV record |
|
||||
| TLS cert | Caddy auto-provisions for this subdomain |
|
||||
|
||||
### Proxy: Caddy
|
||||
| Factor | Assessment |
|
||||
|--------|------------|
|
||||
| TLS automation | ✅ Built-in ACME, auto-renewal |
|
||||
| Federation headers | ✅ Easy `.well-known` + SRV support |
|
||||
| Config complexity | ✅ Single `Caddyfile`, no label magic |
|
||||
| Traefik conflict | None — Caddy binds its own ports directly |
|
||||
|
||||
### Required Actions for Option A
|
||||
1. Delegate `matrix.timmytime.net` A record → `143.198.27.163`
|
||||
2. Open VPS firewall: `80`, `443`, `8448` inbound
|
||||
3. Clone `timmy-config` to VPS
|
||||
4. `cd infra/matrix && ./host-readiness-check.sh`
|
||||
5. Edit `conduit.toml` → `server_name = "matrix.timmytime.net"`
|
||||
6. Run `./deploy-matrix.sh`
|
||||
|
||||
---
|
||||
|
||||
## Option B — Conservative: Timmy-Home Bare Metal + Traefik
|
||||
|
||||
| Factor | Assessment |
|
||||
|--------|------------|
|
||||
| Host | Timmy-Home Mac Mini / server |
|
||||
| Domain | `matrix.home.timmytime.net` |
|
||||
| Proxy | Existing Traefik instance |
|
||||
| Pros | Full physical sovereignty; no cloud dependency |
|
||||
| Cons | Home IP dynamic (requires DDNS); port-forwarding dependency; power/network outages |
|
||||
| Verdict | 🔶 Viable backup, not primary |
|
||||
|
||||
---
|
||||
|
||||
## Option C — Fast but Costly: DigitalOcean Droplet
|
||||
|
||||
| Factor | Assessment |
|
||||
|--------|------------|
|
||||
| Host | Fresh `$6-12/mo` Ubuntu droplet |
|
||||
| Domain | `matrix.timmytime.net` |
|
||||
| Proxy | Caddy or Nginx |
|
||||
| Pros | Clean slate, static IP, easy snapshot backups |
|
||||
| Cons | New monthly bill, another host to patch/monitor |
|
||||
| Verdict | 🔶 Overkill while Hermes VPS has headroom |
|
||||
|
||||
---
|
||||
|
||||
## Comparative Matrix
|
||||
|
||||
| Criterion | Option A (Recommended) | Option B (Home) | Option C (DO) |
|
||||
|-----------|------------------------|-----------------|---------------|
|
||||
| Speed to deploy | 🟢 Fast | 🟡 Medium | 🟡 Medium |
|
||||
| Sovereignty | 🟢 High | 🟢 Highest | 🟢 High |
|
||||
| Reliability | 🟢 Good | 🔴 Variable | 🟢 Good |
|
||||
| Cost | 🟢 $0 extra | 🟢 $0 extra | 🔴 +$6-12/mo |
|
||||
| Operational load | 🟢 Low | 🟡 Medium | 🔴 Higher |
|
||||
| Federation ease | 🟢 Caddy simple | 🟡 Traefik doable | 🟢 Caddy simple |
|
||||
|
||||
---
|
||||
|
||||
## Port & TLS Requirements (All Options)
|
||||
|
||||
| Port | Direction | Purpose | Notes |
|
||||
|------|-----------|---------|-------|
|
||||
| `80` | Inbound | ACME challenge + `.well-known` redirect | Must be reachable from internet |
|
||||
| `443` | Inbound | Client HTTPS (Element, mobile apps) | Caddy/Traefik terminates TLS |
|
||||
| `8448` | Inbound | Federation (server-to-server) | Matrix spec default; can proxy from 443 but 8448 is safest |
|
||||
| `6167` | Internal | Conduit replication (optional) | Not needed for single-node |
|
||||
|
||||
**TLS Path**: Let's Encrypt HTTP-01 challenge (no manual cert purchase).
|
||||
|
||||
---
|
||||
|
||||
## The Actual Checklist to Close #187
|
||||
|
||||
- [ ] **Alexander selects one option** (A recommended)
|
||||
- [ ] Domain/subdomain is chosen and confirmed available
|
||||
- [ ] Target host IP is known and firewall ports are confirmed open
|
||||
- [ ] Reverse proxy choice is locked
|
||||
- [ ] #166 is updated with the decision
|
||||
- [ ] Allegro or Ezra is tasked with live deployment
|
||||
|
||||
**If you check these 6 boxes, #166 is unblocked.**
|
||||
|
||||
---
|
||||
|
||||
## Suggested Comment to Resolve #187
|
||||
|
||||
> "Go with Option A. Domain: `matrix.timmytime.net`. Host: Hermes VPS. Proxy: Caddy. @ezra or @allegro deploy when ready."
|
||||
|
||||
That is all that is required.
|
||||
141
docs/MEMORY_ARCHITECTURE.md
Normal file
141
docs/MEMORY_ARCHITECTURE.md
Normal file
@@ -0,0 +1,141 @@
|
||||
# Memory Architecture
|
||||
|
||||
> How Timmy remembers, recalls, and learns — without hallucinating.
|
||||
|
||||
Refs: Epic #367 | Sub-issues #368, #369, #370, #371, #372
|
||||
|
||||
## Overview
|
||||
|
||||
Timmy's memory system uses a **Memory Palace** architecture — a structured, file-backed knowledge store organized into rooms and drawers. When faced with a recall question, the agent checks its palace *before* generating from scratch.
|
||||
|
||||
This document defines the retrieval order, storage layers, and data flow that make this work.
|
||||
|
||||
## Retrieval Order (L0–L5)
|
||||
|
||||
When the agent receives a prompt that looks like a recall question ("what did we do?", "what's the status of X?"), the retrieval enforcer intercepts it and walks through layers in order:
|
||||
|
||||
| Layer | Source | Question Answered | Short-circuits? |
|
||||
|-------|--------|-------------------|------------------|
|
||||
| L0 | `identity.txt` | Who am I? What are my mandates? | No (always loaded) |
|
||||
| L1 | Palace rooms/drawers | What do I know about this topic? | Yes, if hit |
|
||||
| L2 | Session scratchpad | What have I learned this session? | Yes, if hit |
|
||||
| L3 | Artifact retrieval (Gitea API) | Can I fetch the actual issue/file/log? | Yes, if hit |
|
||||
| L4 | Procedures/playbooks | Is there a documented way to do this? | Yes, if hit |
|
||||
| L5 | Free generation | (Only when L0–L4 are exhausted) | N/A |
|
||||
|
||||
**Key principle:** The agent never reaches L5 (free generation) if any prior layer has relevant data. This eliminates hallucination for recall-style queries.
|
||||
|
||||
## Storage Layout
|
||||
|
||||
```
|
||||
~/.mempalace/
|
||||
identity.txt # L0: Who I am, mandates, personality
|
||||
rooms/
|
||||
projects/
|
||||
timmy-config.md # What I know about timmy-config
|
||||
hermes-agent.md # What I know about hermes-agent
|
||||
people/
|
||||
alexander.md # Working relationship context
|
||||
architecture/
|
||||
fleet.md # Fleet system knowledge
|
||||
mempalace.md # Self-knowledge about this system
|
||||
config/
|
||||
mempalace.yaml # Palace configuration
|
||||
|
||||
~/.hermes/
|
||||
scratchpad/
|
||||
{session_id}.json # L2: Ephemeral session context
|
||||
```
|
||||
|
||||
## Components
|
||||
|
||||
### 1. Memory Palace Skill (`mempalace.py`) — #368
|
||||
|
||||
Core data structures:
|
||||
- `PalaceRoom`: A named collection of drawers (topics)
|
||||
- `Mempalace`: The top-level palace with room management
|
||||
- Factory constructors: `for_issue_analysis()`, `for_health_check()`, `for_code_review()`
|
||||
|
||||
### 2. Retrieval Enforcer (`retrieval_enforcer.py`) — #369
|
||||
|
||||
Middleware that intercepts recall-style prompts:
|
||||
1. Detects recall patterns ("what did", "status of", "last time we")
|
||||
2. Walks L0→L4 in order, short-circuiting on first hit
|
||||
3. Only allows free generation (L5) when all layers return empty
|
||||
4. Produces an honest fallback: "I don't have this in my memory palace."
|
||||
|
||||
### 3. Session Scratchpad (`scratchpad.py`) — #370
|
||||
|
||||
Ephemeral, session-scoped working memory:
|
||||
- Write-append only during a session
|
||||
- Entries have TTL (default: 1 hour)
|
||||
- Queried at L2 in retrieval chain
|
||||
- Never auto-promoted to palace
|
||||
|
||||
### 4. Memory Promotion — #371
|
||||
|
||||
Explicit promotion from scratchpad to palace:
|
||||
- Agent must call `promote_to_palace()` with a reason
|
||||
- Dedup check against target drawer
|
||||
- Summary required (raw tool output never stored)
|
||||
- Conflict detection when new memory contradicts existing
|
||||
|
||||
### 5. Wake-Up Protocol (`wakeup.py`) — #372
|
||||
|
||||
Boot sequence for new sessions:
|
||||
```
|
||||
Session Start
|
||||
│
|
||||
├─ L0: Load identity.txt
|
||||
├─ L1: Scan palace rooms for active context
|
||||
├─ L1.5: Surface promoted memories from last session
|
||||
├─ L2: Load surviving scratchpad entries
|
||||
│
|
||||
└─ Ready: agent knows who it is, what it was doing, what it learned
|
||||
```
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
┌──────────────────┐
|
||||
│ User Prompt │
|
||||
└────────┬─────────┘
|
||||
│
|
||||
┌────────┴─────────┐
|
||||
│ Recall Detector │
|
||||
└────┬───────┬─────┘
|
||||
│ │
|
||||
[recall] [not recall]
|
||||
│ │
|
||||
┌───────┴────┐ ┌──┬─┴───────┐
|
||||
│ Retrieval │ │ Normal Flow │
|
||||
│ Enforcer │ └─────────────┘
|
||||
│ L0→L1→L2 │
|
||||
│ →L3→L4→L5│
|
||||
└──────┬─────┘
|
||||
│
|
||||
┌──────┴─────┐
|
||||
│ Response │
|
||||
│ (grounded) │
|
||||
└────────────┘
|
||||
```
|
||||
|
||||
## Anti-Patterns
|
||||
|
||||
| Don't | Do Instead |
|
||||
|-------|------------|
|
||||
| Generate from vibes when palace has data | Check palace first (L1) |
|
||||
| Auto-promote everything to palace | Require explicit `promote_to_palace()` with reason |
|
||||
| Store raw API responses as memories | Summarize before storing |
|
||||
| Hallucinate when palace is empty | Say "I don't have this in my memory palace" |
|
||||
| Dump entire palace on wake-up | Selective loading based on session context |
|
||||
|
||||
## Status
|
||||
|
||||
| Component | Issue | PR | Status |
|
||||
|-----------|-------|----|--------|
|
||||
| Skill port | #368 | #374 | In Review |
|
||||
| Retrieval enforcer | #369 | #374 | In Review |
|
||||
| Session scratchpad | #370 | #374 | In Review |
|
||||
| Memory promotion | #371 | — | Open |
|
||||
| Wake-up protocol | #372 | #374 | In Review |
|
||||
17
docs/adr/0001-sovereign-local-first-architecture.md
Normal file
17
docs/adr/0001-sovereign-local-first-architecture.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# ADR-0001: Sovereign Local-First Architecture
|
||||
|
||||
**Date:** 2026-04-06
|
||||
**Status:** Accepted
|
||||
**Author:** Ezra
|
||||
**House:** hermes-ezra
|
||||
|
||||
## Context
|
||||
The foundation requires a robust, local-first architecture that ensures agent sovereignty while leveraging cloud connectivity for complex tasks.
|
||||
|
||||
## Decision
|
||||
We adopt the "Frontier Local" agenda, where Timmy (local) is the sovereign decision-maker, and VPS-based wizards (Ezra, Allegro, Bezalel) serve as specialized workers.
|
||||
|
||||
## Consequences
|
||||
- Increased local compute requirements.
|
||||
- Sub-100ms telemetry requirement.
|
||||
- Mandatory local review for all remote artifacts.
|
||||
15
docs/adr/ADR_TEMPLATE.md
Normal file
15
docs/adr/ADR_TEMPLATE.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# ADR-[Number]: [Title]
|
||||
|
||||
**Date:** [YYYY-MM-DD]
|
||||
**Status:** [Proposed | Accepted | Superseded]
|
||||
**Author:** [Agent Name]
|
||||
**House:** [House ID]
|
||||
|
||||
## Context
|
||||
[What is the problem we are solving?]
|
||||
|
||||
## Decision
|
||||
[What is the proposed solution?]
|
||||
|
||||
## Consequences
|
||||
[What are the trade-offs?]
|
||||
212
docs/architecture/LAZARUS-CELL-SPEC.md
Normal file
212
docs/architecture/LAZARUS-CELL-SPEC.md
Normal file
@@ -0,0 +1,212 @@
|
||||
# Lazarus Cell Specification v1.0
|
||||
|
||||
**Canonical epic:** `Timmy_Foundation/timmy-config#267`
|
||||
**Author:** Ezra (architect)
|
||||
**Date:** 2026-04-06
|
||||
**Status:** Draft — open for burn-down by `#269` `#270` `#271` `#272` `#273` `#274`
|
||||
|
||||
---
|
||||
|
||||
## 1. Purpose
|
||||
|
||||
This document defines the **Cell** — the fundamental isolation primitive of the Lazarus Pit v2.0. Every downstream implementation (isolation layer, invitation protocol, backend abstraction, teaming model, verification suite, and operator surface) must conform to the invariants, roles, lifecycle, and publication rules defined here.
|
||||
|
||||
---
|
||||
|
||||
## 2. Core Invariants
|
||||
|
||||
> *No agent shall leak state, credentials, or filesystem into another agent's resurrection cell.*
|
||||
|
||||
### 2.1 Cell Invariant Definitions
|
||||
|
||||
| Invariant | Meaning | Enforcement |
|
||||
|-----------|---------|-------------|
|
||||
| **I1 — Filesystem Containment** | A cell may only read/write paths under its assigned `CELL_HOME`. No traversal into host `~/.hermes/`, `/root/wizards/`, or other cells. | Mount namespace (Level 2+) or strict chroot + AppArmor (Level 1) |
|
||||
| **I2 — Credential Isolation** | Host tokens, env files, and SSH keys are never copied into a cell. Only per-cell credential pools are injected at spawn. | Harness strips `HERMES_*` and `HOME`; injects `CELL_CREDENTIALS` manifest |
|
||||
| **I3 — Process Boundary** | A cell runs as an independent OS process or container. It cannot ptrace, signal, or inspect sibling cells. | PID namespace, seccomp, or Docker isolation |
|
||||
| **I4 — Network Segmentation** | A cell does not bind to host-private ports or sniff host traffic unless explicitly proxied. | Optional network namespace / proxy boundary |
|
||||
| **I5 — Memory Non-Leakage** | Shared memory, IPC sockets, and tmpfs mounts are cell-scoped. No post-exit residue in host `/tmp` or `/dev/shm`. | TTL cleanup + graveyard garbage collection (`#273`) |
|
||||
| **I6 — Audit Trail** | Every cell mutation (spawn, invite, checkpoint, close) is logged to an immutable ledger (Gitea issue comment or local append-only log). | Required for all production cells |
|
||||
|
||||
---
|
||||
|
||||
## 3. Role Taxonomy
|
||||
|
||||
Every participant in a cell is assigned exactly one role at invitation time. Roles are immutable for the duration of the session.
|
||||
|
||||
| Role | Permissions | Typical Holder |
|
||||
|------|-------------|----------------|
|
||||
| **director** | Can invite others, trigger checkpoints, close the cell, and override cell decisions. Cannot directly execute tools unless also granted `executor`. | Human operator (Alexander) or fleet commander (Timmy) |
|
||||
| **executor** | Full tool execution and filesystem write access within the cell. Can push commits to the target project repo. | Fleet agents (Ezra, Allegro, etc.) |
|
||||
| **observer** | Read-only access to cell filesystem and shared scratchpad. Cannot execute tools or mutate state. | Human reviewer, auditor, or training monitor |
|
||||
| **guest** | Same permissions as `executor`, but sourced from outside the fleet. Subject to stricter backend isolation (Docker by default). | External bots (Codex, Gemini API, Grok, etc.) |
|
||||
| **substitute** | A special `executor` who joins to replace a downed agent. Inherits the predecessor's last checkpoint but not their home memory. | Resurrection-pool fallback agent |
|
||||
|
||||
### 3.1 Role Combinations
|
||||
|
||||
- A single participant may hold **at most one** primary role.
|
||||
- A `director` may temporarily downgrade to `observer` but cannot upgrade to `executor` without a new invitation.
|
||||
- `guest` and `substitute` roles must be explicitly enabled in cell policy.
|
||||
|
||||
---
|
||||
|
||||
## 4. Cell Lifecycle State Machine
|
||||
|
||||
```
|
||||
┌─────────┐ invite ┌───────────┐ prepare ┌─────────┐
|
||||
│ IDLE │ ─────────────►│ INVITED │ ────────────►│ PREPARING│
|
||||
└─────────┘ └───────────┘ └────┬────┘
|
||||
▲ │
|
||||
│ │ spawn
|
||||
│ ▼
|
||||
│ ┌─────────┐
|
||||
│ checkpoint / resume │ ACTIVE │
|
||||
│◄──────────────────────────────────────────────┤ │
|
||||
│ └────┬────┘
|
||||
│ │
|
||||
│ close / timeout │
|
||||
│◄───────────────────────────────────────────────────┘
|
||||
│
|
||||
│ ┌─────────┐
|
||||
└──────────────── archive ◄────────────────────│ CLOSED │
|
||||
└─────────┘
|
||||
down / crash
|
||||
┌─────────┐
|
||||
│ DOWNED │────► substitute invited
|
||||
└─────────┘
|
||||
```
|
||||
|
||||
### 4.1 State Definitions
|
||||
|
||||
| State | Description | Valid Transitions |
|
||||
|-------|-------------|-------------------|
|
||||
| **IDLE** | Cell does not yet exist in the registry. | `INVITED` |
|
||||
| **INVITED** | An invitation token has been generated but not yet accepted. | `PREPARING` (on accept), `CLOSED` (on expiry/revoke) |
|
||||
| **PREPARING** | Cell directory is being created, credentials injected, backend initialized. | `ACTIVE` (on successful spawn), `CLOSED` (on failure) |
|
||||
| **ACTIVE** | At least one participant is running in the cell. Tool execution is permitted. | `CHECKPOINTING`, `CLOSED`, `DOWNED` |
|
||||
| **CHECKPOINTING** | A snapshot of cell state is being captured. | `ACTIVE` (resume), `CLOSED` (if final) |
|
||||
| **DOWNED** | An `ACTIVE` agent missed heartbeats. Cell is frozen pending recovery. | `ACTIVE` (revived), `CLOSED` (abandoned) |
|
||||
| **CLOSED** | Cell has been explicitly closed or TTL expired. Filesystem enters grace period. | `ARCHIVED` |
|
||||
| **ARCHIVED** | Cell artifacts (logs, checkpoints, decisions) are persisted. Filesystem may be scrubbed. | — (terminal) |
|
||||
|
||||
### 4.2 TTL and Grace Rules
|
||||
|
||||
- **Active TTL:** Default 4 hours. Renewable by `director` up to a max of 24 hours.
|
||||
- **Invited TTL:** Default 15 minutes. Unused invitations auto-revoke.
|
||||
- **Closed Grace:** 30 minutes. Cell filesystem remains recoverable before scrubbing.
|
||||
- **Archived Retention:** 30 days. After which checkpoints may be moved to cold storage or deleted per policy.
|
||||
|
||||
---
|
||||
|
||||
## 5. Publication Rules
|
||||
|
||||
The Cell is **not** a source of truth for fleet state. It is a scratch space. The following rules govern what may leave the cell boundary.
|
||||
|
||||
### 5.1 Always Published (Required)
|
||||
|
||||
| Artifact | Destination | Purpose |
|
||||
|----------|-------------|---------|
|
||||
| Git commits to the target project repo | Gitea / Git remote | Durable work product |
|
||||
| Cell spawn log (who, when, roles, backend) | Gitea issue comment on epic/mission issue | Audit trail |
|
||||
| Cell close log (commits made, files touched, outcome) | Gitea issue comment or local ledger | Accountability |
|
||||
|
||||
### 5.2 Never Published (Cell-Local Only)
|
||||
|
||||
| Artifact | Reason |
|
||||
|----------|--------|
|
||||
| `shared_scratchpad` drafts and intermediate reasoning | May contain false starts, passwords mentioned in context, or incomplete thoughts |
|
||||
| Per-cell credentials and invite tokens | Security — must not leak into commit history |
|
||||
| Agent home memory files (even read-only copies) | Privacy and sovereignty of the agent's home |
|
||||
| Internal tool-call traces | Noise and potential PII |
|
||||
|
||||
### 5.3 Optionally Published (Director Decision)
|
||||
|
||||
| Artifact | Condition |
|
||||
|----------|-----------|
|
||||
| `decisions.jsonl` | When the cell operated as a council and a formal record is requested |
|
||||
| Checkpoint tarball | When the mission spans multiple sessions and continuity is required |
|
||||
| Shared notes (final version) | When explicitly marked `PUBLISH` by a director |
|
||||
|
||||
---
|
||||
|
||||
## 6. Filesystem Layout
|
||||
|
||||
Every cell, regardless of backend, exposes the same directory contract:
|
||||
|
||||
```
|
||||
/tmp/lazarus-cells/{cell_id}/
|
||||
├── .lazarus/
|
||||
│ ├── cell.json # cell metadata (roles, TTL, backend, target repo)
|
||||
│ ├── spawn.log # immutable spawn record
|
||||
│ ├── decisions.jsonl # logged votes / approvals / directives
|
||||
│ └── checkpoints/ # snapshot tarballs
|
||||
├── project/ # cloned target repo (if applicable)
|
||||
├── shared/
|
||||
│ ├── scratchpad.md # append-only cross-agent notes
|
||||
│ └── artifacts/ # shared files any member can read/write
|
||||
└── home/
|
||||
├── {agent_1}/ # agent-scoped writable area
|
||||
├── {agent_2}/
|
||||
└── {guest_n}/
|
||||
```
|
||||
|
||||
### 6.1 Backend Mapping
|
||||
|
||||
| Backend | `CELL_HOME` realization | Isolation Level |
|
||||
|---------|------------------------|-----------------|
|
||||
| `process` | `tmpdir` + `HERMES_HOME` override | Level 1 (directory + env) |
|
||||
| `venv` | Separate Python venv + `HERMES_HOME` | Level 1.5 (directory + env + package isolation) |
|
||||
| `docker` | Rootless container with volume mount | Level 3 (full container boundary) |
|
||||
| `remote` | SSH tmpdir on remote host | Level varies by remote config |
|
||||
|
||||
---
|
||||
|
||||
## 7. Graveyard and Retention Policy
|
||||
|
||||
When a cell closes, it enters the **Graveyard** — a quarantined holding area before final scrubbing.
|
||||
|
||||
### 7.1 Graveyard Rules
|
||||
|
||||
```
|
||||
ACTIVE ──► CLOSED ──► /tmp/lazarus-graveyard/{cell_id}/ ──► TTL grace ──► SCRUBBED
|
||||
```
|
||||
|
||||
- **Grace period:** 30 minutes (configurable per mission)
|
||||
- **During grace:** A director may issue `lazarus resurrect {cell_id}` to restore the cell to `ACTIVE`
|
||||
- **After grace:** Filesystem is recursively deleted. Checkpoints are moved to `lazarus-archive/{date}/{cell_id}/`
|
||||
|
||||
### 7.2 Retention Tiers
|
||||
|
||||
| Tier | Location | Retention | Access |
|
||||
|------|----------|-----------|--------|
|
||||
| Hot Graveyard | `/tmp/lazarus-graveyard/` | 30 min | Director only |
|
||||
| Warm Archive | `~/.lazarus/archive/` | 30 days | Fleet agents (read-only) |
|
||||
| Cold Storage | Optional S3 / IPFS / Gitea release asset | 1 year | Director only |
|
||||
|
||||
---
|
||||
|
||||
## 8. Cross-References
|
||||
|
||||
- Epic: `timmy-config#267`
|
||||
- Isolation implementation: `timmy-config#269`
|
||||
- Invitation protocol: `timmy-config#270`
|
||||
- Backend abstraction: `timmy-config#271`
|
||||
- Teaming model: `timmy-config#272`
|
||||
- Verification suite: `timmy-config#273`
|
||||
- Operator surface: `timmy-config#274`
|
||||
- Existing skill: `lazarus-pit-recovery` (to be updated to this spec)
|
||||
- Related protocol: `timmy-config#245` (Phoenix Protocol recovery benchmarks)
|
||||
|
||||
---
|
||||
|
||||
## 9. Acceptance Criteria for This Spec
|
||||
|
||||
- [ ] All downstream issues (`#269`–`#274`) can be implemented without ambiguity about roles, states, or filesystem boundaries.
|
||||
- [ ] A new developer can read this doc and implement a compliant `process` backend in one session.
|
||||
- [ ] The spec has been reviewed and ACK'd by at least one other wizard before `#269` merges.
|
||||
|
||||
---
|
||||
|
||||
*Sovereignty and service always.*
|
||||
|
||||
— Ezra
|
||||
363
docs/automation-inventory.md
Normal file
363
docs/automation-inventory.md
Normal file
@@ -0,0 +1,363 @@
|
||||
# Automation Inventory
|
||||
|
||||
Last audited: 2026-04-04 15:55 EDT
|
||||
Owner: Timmy sidecar / Timmy home split
|
||||
Purpose: document every known automation that can restart services, revive old worktrees, reuse stale session state, or re-enter old queue state.
|
||||
|
||||
## Why this file exists
|
||||
|
||||
The failure mode is not just "a process is running".
|
||||
The failure mode is:
|
||||
- launchd or a watchdog restarts something behind our backs
|
||||
- the restarted process reads old config, old labels, old worktrees, old session mappings, or old tmux assumptions
|
||||
- the machine appears haunted because old state comes back after we thought it was gone
|
||||
|
||||
This file is the source of truth for what automations exist, what state they read, and how to stop or reset them safely.
|
||||
|
||||
## Source-of-truth split
|
||||
|
||||
Not all automations live in one repo.
|
||||
|
||||
1. timmy-config
|
||||
Path: ~/.timmy/timmy-config
|
||||
Owns: sidecar deployment, ~/.hermes/config.yaml overlay, launch-facing helper scripts in timmy-config/bin/
|
||||
|
||||
2. timmy-home
|
||||
Path: ~/.timmy
|
||||
Owns: Kimi heartbeat script at uniwizard/kimi-heartbeat.sh and other workspace-native automation
|
||||
|
||||
3. live runtime
|
||||
Path: ~/.hermes/bin
|
||||
Reality: some scripts are still only present live in ~/.hermes/bin and are NOT yet mirrored into timmy-config/bin/
|
||||
|
||||
Rule:
|
||||
- Do not assume ~/.hermes/bin is canonical.
|
||||
- Do not assume timmy-config contains every currently running automation.
|
||||
- Audit runtime first, then reconcile to source control.
|
||||
|
||||
## Current live automations
|
||||
|
||||
### A. launchd-loaded automations
|
||||
|
||||
These are loaded right now according to `launchctl list` after the 2026-04-04 phase-2 cleanup.
|
||||
The only Timmy-specific launchd jobs still loaded are the ones below.
|
||||
|
||||
#### 1. ai.hermes.gateway
|
||||
- Plist: ~/Library/LaunchAgents/ai.hermes.gateway.plist
|
||||
- Command: `python -m hermes_cli.main gateway run --replace`
|
||||
- HERMES_HOME: `~/.hermes`
|
||||
- Logs:
|
||||
- `~/.hermes/logs/gateway.log`
|
||||
- `~/.hermes/logs/gateway.error.log`
|
||||
- KeepAlive: yes
|
||||
- RunAtLoad: yes
|
||||
- State it reuses:
|
||||
- `~/.hermes/config.yaml`
|
||||
- `~/.hermes/channel_directory.json`
|
||||
- `~/.hermes/sessions/sessions.json`
|
||||
- `~/.hermes/state.db`
|
||||
- Old-state risk:
|
||||
- if config drifted, this gateway will faithfully revive the drift
|
||||
- if Telegram/session mappings are stale, it will continue stale conversations
|
||||
|
||||
Stop:
|
||||
```bash
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist
|
||||
```
|
||||
Start:
|
||||
```bash
|
||||
launchctl bootstrap gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist
|
||||
```
|
||||
|
||||
#### 2. ai.hermes.gateway-fenrir
|
||||
- Plist: ~/Library/LaunchAgents/ai.hermes.gateway-fenrir.plist
|
||||
- Command: same gateway binary
|
||||
- HERMES_HOME: `~/.hermes/profiles/fenrir`
|
||||
- Logs:
|
||||
- `~/.hermes/profiles/fenrir/logs/gateway.log`
|
||||
- `~/.hermes/profiles/fenrir/logs/gateway.error.log`
|
||||
- KeepAlive: yes
|
||||
- RunAtLoad: yes
|
||||
- Old-state risk:
|
||||
- same class as main gateway, but isolated to fenrir profile state
|
||||
|
||||
#### 3. ai.openclaw.gateway
|
||||
- Plist: ~/Library/LaunchAgents/ai.openclaw.gateway.plist
|
||||
- Command: `node .../openclaw/dist/index.js gateway --port 18789`
|
||||
- Logs:
|
||||
- `~/.openclaw/logs/gateway.log`
|
||||
- `~/.openclaw/logs/gateway.err.log`
|
||||
- KeepAlive: yes
|
||||
- RunAtLoad: yes
|
||||
- Old-state risk:
|
||||
- long-lived gateway survives toolchain assumptions and keeps accepting work even if upstream routing changed
|
||||
|
||||
#### 4. ai.timmy.kimi-heartbeat
|
||||
- Plist: ~/Library/LaunchAgents/ai.timmy.kimi-heartbeat.plist
|
||||
- Command: `/bin/bash ~/.timmy/uniwizard/kimi-heartbeat.sh`
|
||||
- Interval: every 300s
|
||||
- Logs:
|
||||
- `/tmp/kimi-heartbeat-launchd.log`
|
||||
- `/tmp/kimi-heartbeat-launchd.err`
|
||||
- script log: `/tmp/kimi-heartbeat.log`
|
||||
- State it reuses:
|
||||
- `/tmp/kimi-heartbeat.lock`
|
||||
- Gitea labels: `assigned-kimi`, `kimi-in-progress`, `kimi-done`
|
||||
- repo issue bodies/comments as task memory
|
||||
- Current behavior as of this audit:
|
||||
- stale `kimi-in-progress` tasks are now reclaimed after 1 hour of silence
|
||||
- Old-state risk:
|
||||
- labels ARE the queue state; if labels are stale, the heartbeat used to starve forever
|
||||
- the heartbeat is source-controlled in timmy-home, not timmy-config
|
||||
|
||||
Stop:
|
||||
```bash
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.timmy.kimi-heartbeat.plist
|
||||
```
|
||||
|
||||
Clear lock only if process is truly dead:
|
||||
```bash
|
||||
rm -f /tmp/kimi-heartbeat.lock
|
||||
```
|
||||
|
||||
#### 5. ai.timmy.claudemax-watchdog
|
||||
- Plist: ~/Library/LaunchAgents/ai.timmy.claudemax-watchdog.plist
|
||||
- Command: `/bin/bash ~/.hermes/bin/claudemax-watchdog.sh`
|
||||
- Interval: every 300s
|
||||
- Logs:
|
||||
- `~/.hermes/logs/claudemax-watchdog.log`
|
||||
- launchd wrapper: `~/.hermes/logs/claudemax-launchd.log`
|
||||
- State it reuses:
|
||||
- live process table via `pgrep`
|
||||
- recent Claude logs `~/.hermes/logs/claude-*.log`
|
||||
- backlog count from Gitea
|
||||
- Current behavior as of this audit:
|
||||
- will NOT restart claude-loop if recent Claude logs say `You've hit your limit`
|
||||
- will log-and-skip missing helper scripts instead of failing loudly
|
||||
- Old-state risk:
|
||||
- any watchdog can resurrect a loop you meant to leave dead
|
||||
- this is the first place to check when a loop "comes back"
|
||||
|
||||
### B. quarantined legacy launch agents
|
||||
|
||||
These were moved out of `~/Library/LaunchAgents` on 2026-04-04 to:
|
||||
`~/Library/LaunchAgents.quarantine/timmy-legacy-20260404/`
|
||||
|
||||
#### 6. com.timmy.dashboard-backend
|
||||
- Former plist: `com.timmy.dashboard-backend.plist`
|
||||
- Former command: uvicorn `dashboard.app:app`
|
||||
- Former working directory: `~/worktrees/kimi-repo`
|
||||
- Quarantine reason:
|
||||
- served code from a specific stale worktree
|
||||
- could revive old backend state by launchd KeepAlive alone
|
||||
|
||||
#### 7. com.timmy.matrix-frontend
|
||||
- Former plist: `com.timmy.matrix-frontend.plist`
|
||||
- Former command: `npx vite --host`
|
||||
- Former working directory: `~/worktrees/the-matrix`
|
||||
- Quarantine reason:
|
||||
- pointed at the old `the-matrix` lineage instead of current nexus truth
|
||||
- could revive a stale frontend every login
|
||||
|
||||
#### 8. ai.hermes.startup
|
||||
- Former plist: `ai.hermes.startup.plist`
|
||||
- Former command: `~/.hermes/bin/hermes-startup.sh`
|
||||
- Quarantine reason:
|
||||
- startup path still expected missing `timmy-tmux.sh`
|
||||
- could recreate old webhook/tmux assumptions at login
|
||||
|
||||
#### 9. com.timmy.tick
|
||||
- Former plist: `com.timmy.tick.plist`
|
||||
- Former command: `/Users/apayne/Timmy-time-dashboard/deploy/timmy-tick-mac.sh`
|
||||
- Quarantine reason:
|
||||
- pure dashboard-era legacy path
|
||||
|
||||
### C. running now but NOT launchd-managed
|
||||
|
||||
These are live processes, but not currently represented by a loaded launchd plist.
|
||||
They can still persist because they were started with `nohup` or by other parent scripts.
|
||||
|
||||
#### 8. gemini-loop.sh
|
||||
- Live process: `~/.hermes/bin/gemini-loop.sh`
|
||||
- Source of truth: `timmy-config/bin/gemini-loop.sh`
|
||||
- State files:
|
||||
- `~/.hermes/logs/gemini-loop.log`
|
||||
- `~/.hermes/logs/gemini-skip-list.json`
|
||||
- `~/.hermes/logs/gemini-active.json`
|
||||
- `~/.hermes/logs/gemini-locks/`
|
||||
- `~/.hermes/logs/gemini-pids/`
|
||||
- worktrees under `~/worktrees/gemini-w*`
|
||||
- per-issue logs `~/.hermes/logs/gemini-*.log`
|
||||
- Default-safe behavior:
|
||||
- only picks issues explicitly assigned to `gemini`
|
||||
- self-assignment is opt-in via `ALLOW_SELF_ASSIGN=1`
|
||||
- Old-state risk:
|
||||
- skip list suppresses issues for hours
|
||||
- lock directories can make issues look "already busy"
|
||||
- old worktrees can preserve prior branch state
|
||||
- branch naming `gemini/issue-N` continues prior work if branch exists
|
||||
|
||||
Stop cleanly:
|
||||
```bash
|
||||
pkill -f 'bash /Users/apayne/.hermes/bin/gemini-loop.sh'
|
||||
pkill -f 'gemini .*--yolo'
|
||||
rm -rf ~/.hermes/logs/gemini-locks/*.lock ~/.hermes/logs/gemini-pids/*.pid
|
||||
printf '{}\n' > ~/.hermes/logs/gemini-active.json
|
||||
```
|
||||
|
||||
#### 9. timmy-orchestrator.sh
|
||||
- Live process: `~/.hermes/bin/timmy-orchestrator.sh`
|
||||
- Source of truth: `timmy-config/bin/timmy-orchestrator.sh`
|
||||
- State files:
|
||||
- `~/.hermes/logs/timmy-orchestrator.log`
|
||||
- `~/.hermes/logs/timmy-orchestrator.pid`
|
||||
- `~/.hermes/logs/timmy-reviews.log`
|
||||
- `~/.hermes/logs/workforce-manager.log`
|
||||
- transient state dir: `/tmp/timmy-state-$$/`
|
||||
- Default-safe behavior:
|
||||
- reports unassigned issues by default
|
||||
- bulk auto-assignment is opt-in via `AUTO_ASSIGN_UNASSIGNED=1`
|
||||
- reviews PRs via `hermes chat`
|
||||
- runs `workforce-manager.py`
|
||||
- Old-state risk:
|
||||
- if `AUTO_ASSIGN_UNASSIGNED=1`, it will mutate Gitea assignments and can repopulate queues
|
||||
- still uses live process/log state as an input surface
|
||||
|
||||
### D. Hermes cron automations
|
||||
|
||||
Current cron inventory from `cronjob(list, include_disabled=true)`:
|
||||
|
||||
Enabled:
|
||||
- `a77a87392582` — Health Monitor — every 5m
|
||||
|
||||
Paused:
|
||||
- `9e0624269ba7` — Triage Heartbeat
|
||||
- `e29eda4a8548` — PR Review Sweep
|
||||
- `5e9d952871bc` — Agent Status Check
|
||||
- `36fb2f630a17` — Hermes Philosophy Loop
|
||||
|
||||
Old-state risk:
|
||||
- paused crons are not dead forever; they are resumable state
|
||||
- LLM-wrapped crons can revive old routing/model assumptions if resumed blindly
|
||||
|
||||
### E. file exists but NOT currently loaded
|
||||
|
||||
These are the ones most likely to surprise us later because they still exist and point at old realities.
|
||||
|
||||
#### 10. com.tower.pr-automerge
|
||||
- Plist: `~/Library/LaunchAgents/com.tower.pr-automerge.plist`
|
||||
- Points to: `/Users/apayne/hermes-config/bin/pr-automerge.sh`
|
||||
- Not loaded at audit time
|
||||
- Separate Tower-era automation path; not part of current Timmy sidecar truth
|
||||
|
||||
## State carriers that make the machine feel haunted
|
||||
|
||||
These are the files and external states that most often "bring back old state":
|
||||
|
||||
### Hermes runtime state
|
||||
- `~/.hermes/config.yaml`
|
||||
- `~/.hermes/channel_directory.json`
|
||||
- `~/.hermes/sessions/sessions.json`
|
||||
- `~/.hermes/state.db`
|
||||
|
||||
### Loop state
|
||||
- `~/.hermes/logs/claude-skip-list.json`
|
||||
- `~/.hermes/logs/claude-active.json`
|
||||
- `~/.hermes/logs/claude-locks/`
|
||||
- `~/.hermes/logs/claude-pids/`
|
||||
- `~/.hermes/logs/gemini-skip-list.json`
|
||||
- `~/.hermes/logs/gemini-active.json`
|
||||
- `~/.hermes/logs/gemini-locks/`
|
||||
- `~/.hermes/logs/gemini-pids/`
|
||||
|
||||
### Kimi queue state
|
||||
- Gitea labels, not local files, are the queue truth
|
||||
- `assigned-kimi`
|
||||
- `kimi-in-progress`
|
||||
- `kimi-done`
|
||||
|
||||
### Worktree state
|
||||
- `~/worktrees/*`
|
||||
- especially old frontend/backend worktrees like:
|
||||
- `~/worktrees/the-matrix`
|
||||
- `~/worktrees/kimi-repo`
|
||||
|
||||
### Launchd state
|
||||
- plist files in `~/Library/LaunchAgents`
|
||||
- anything with `RunAtLoad` and `KeepAlive` can resurrect automatically
|
||||
|
||||
## Audit commands
|
||||
|
||||
List loaded Timmy/Hermes automations:
|
||||
```bash
|
||||
launchctl list | egrep 'timmy|kimi|claude|max|dashboard|matrix|gateway|huey'
|
||||
```
|
||||
|
||||
List Timmy/Hermes launch agent files:
|
||||
```bash
|
||||
find ~/Library/LaunchAgents -maxdepth 1 -name '*.plist' | egrep 'timmy|hermes|openclaw|tower'
|
||||
```
|
||||
|
||||
List running loop scripts:
|
||||
```bash
|
||||
ps -Ao pid,ppid,etime,command | egrep '/Users/apayne/.hermes/bin/|/Users/apayne/.timmy/uniwizard/'
|
||||
```
|
||||
|
||||
List cron jobs:
|
||||
```bash
|
||||
hermes cron list --include-disabled
|
||||
```
|
||||
|
||||
## Safe reset order when old state keeps coming back
|
||||
|
||||
1. Stop launchd jobs first
|
||||
```bash
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.timmy.kimi-heartbeat.plist || true
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.timmy.claudemax-watchdog.plist || true
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist || true
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway-fenrir.plist || true
|
||||
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.openclaw.gateway.plist || true
|
||||
```
|
||||
|
||||
2. Kill manual loops
|
||||
```bash
|
||||
pkill -f 'gemini-loop.sh' || true
|
||||
pkill -f 'timmy-orchestrator.sh' || true
|
||||
pkill -f 'claude-loop.sh' || true
|
||||
pkill -f 'claude .*--print' || true
|
||||
pkill -f 'gemini .*--yolo' || true
|
||||
```
|
||||
|
||||
3. Clear local loop state
|
||||
```bash
|
||||
rm -rf ~/.hermes/logs/claude-locks/*.lock ~/.hermes/logs/claude-pids/*.pid
|
||||
rm -rf ~/.hermes/logs/gemini-locks/*.lock ~/.hermes/logs/gemini-pids/*.pid
|
||||
printf '{}\n' > ~/.hermes/logs/claude-active.json
|
||||
printf '{}\n' > ~/.hermes/logs/gemini-active.json
|
||||
rm -f /tmp/kimi-heartbeat.lock
|
||||
```
|
||||
|
||||
4. If gateway/session drift is the problem, back up before clearing
|
||||
```bash
|
||||
cp ~/.hermes/config.yaml ~/.hermes/config.yaml.bak.$(date +%Y%m%d-%H%M%S)
|
||||
cp ~/.hermes/sessions/sessions.json ~/.hermes/sessions/sessions.json.bak.$(date +%Y%m%d-%H%M%S)
|
||||
```
|
||||
|
||||
5. Relaunch only what you explicitly want
|
||||
|
||||
## Current contradictions to fix later
|
||||
|
||||
1. README and DEPRECATED were corrected on 2026-04-04, but older local clones may still have stale prose.
|
||||
2. The quarantined launch agents now live under `~/Library/LaunchAgents.quarantine/timmy-legacy-20260404/`; if someone moves them back, the old state can return.
|
||||
3. `gemini-loop.sh` and `timmy-orchestrator.sh` now have source-controlled homes in `timmy-config/bin/`, but any local forks or older runtime copies should be treated as suspect until redeployed.
|
||||
4. Keep docs-only PRs and script-import PRs on clean branches from `origin/main`; do not mix them with unrelated local history.
|
||||
|
||||
Until those are reconciled, trust this inventory over older prose.
|
||||
|
||||
### Memory & Audit Capabilities (Added 2026-04-06)
|
||||
|
||||
| Capability | Task/Helper | Purpose | State Carrier |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| **Continuity Flush** | `flush_continuity` | Pre-compaction session state persistence. | `~/.timmy/continuity/active.md` |
|
||||
| **Sovereign Audit** | `audit_log` | Automated action logging with confidence signaling. | `~/.timmy/logs/audit.jsonl` |
|
||||
| **Fallback Routing** | `get_model_for_task` | Dynamic model selection based on portfolio doctrine. | `fallback-portfolios.yaml` |
|
||||
199
docs/comms-authority-map.md
Normal file
199
docs/comms-authority-map.md
Normal file
@@ -0,0 +1,199 @@
|
||||
# Communication Authority Map
|
||||
|
||||
Status: doctrine for #175
|
||||
Parent epic: #173
|
||||
Related issues:
|
||||
- #165 NATS internal bus
|
||||
- #166 Matrix/Conduit operator communication
|
||||
- #174 Nostr/Nostur operator edge
|
||||
- #163 sovereign keypairs / identity
|
||||
|
||||
## Why this exists
|
||||
|
||||
We do not want communication scattered across lost channels.
|
||||
|
||||
The system may expose multiple communication surfaces, but work authority must not fragment with them.
|
||||
A message can arrive from several places.
|
||||
Task truth cannot.
|
||||
|
||||
This document defines which surface is authoritative for what, how operator messages enter the system, and how Matrix plus Nostr/Nostur can coexist without creating parallel hidden queues.
|
||||
|
||||
## Core principle
|
||||
|
||||
One message may have many transport surfaces.
|
||||
One piece of work gets one execution truth.
|
||||
|
||||
That execution truth is Gitea.
|
||||
|
||||
If a command or request matters to the fleet, it must become a visible Gitea artifact:
|
||||
- issue
|
||||
- issue comment
|
||||
- PR comment
|
||||
- assignee/label change
|
||||
- linked proof artifact
|
||||
|
||||
No chat surface is allowed to become a second hidden task database.
|
||||
|
||||
## Authority layers
|
||||
|
||||
### 1. Gitea — execution truth
|
||||
|
||||
Authoritative for:
|
||||
- task state
|
||||
- issue ownership
|
||||
- PR state
|
||||
- review state
|
||||
- visible decision trail
|
||||
- proof links and artifacts
|
||||
|
||||
Rules:
|
||||
- if work is actionable, it must exist in Gitea
|
||||
- if state changes, the change must be reflected in Gitea
|
||||
- if chat and Gitea disagree, Gitea wins until corrected visibly
|
||||
|
||||
### 2. NATS — internal agent bus
|
||||
|
||||
Authoritative for:
|
||||
- fast machine-to-machine transport only
|
||||
|
||||
Not authoritative for:
|
||||
- task truth
|
||||
- operator truth
|
||||
- final queue state
|
||||
|
||||
Rules:
|
||||
- NATS moves signals, not ownership truth
|
||||
- durable work still lands in Gitea
|
||||
- request/reply and heartbeats may live here without becoming the task system
|
||||
|
||||
### 3. Matrix/Conduit — primary private operator command surface
|
||||
|
||||
Authoritative for:
|
||||
- private human-to-fleet conversation
|
||||
- rich command context
|
||||
- operational chat that should not be public
|
||||
|
||||
Not authoritative for:
|
||||
- final task state
|
||||
- hidden work queues
|
||||
|
||||
Rules:
|
||||
- Matrix is the primary private operator room
|
||||
- any command that creates or mutates work must be mirrored into Gitea
|
||||
- Matrix can discuss work privately, but cannot be the only place where the work exists
|
||||
- if a command remains chat-only, it is advisory, not execution truth
|
||||
|
||||
### 4. Nostr/Nostur — sovereign operator edge
|
||||
|
||||
Authoritative for:
|
||||
- operator identity-linked ingress
|
||||
- portable/mobile sovereign access
|
||||
- public or semi-public notices if intentionally used that way
|
||||
- emergency or lightweight operator signaling
|
||||
|
||||
Not authoritative for:
|
||||
- internal fleet transport
|
||||
- hidden task state
|
||||
- long-lived queue truth
|
||||
|
||||
Rules:
|
||||
- Nostur is a real operator layer, not a toy side-channel
|
||||
- commands received via Nostr/Nostur must be normalized into Gitea before they are considered active work
|
||||
- if private discussion is needed after Nostr ingress, continue in Matrix while keeping Gitea as visible task truth
|
||||
- Nostr/Nostur should preserve sovereign identity advantages without becoming an alternate invisible work tracker
|
||||
|
||||
### 5. Telegram — legacy bridge only
|
||||
|
||||
Authoritative for:
|
||||
- nothing new
|
||||
|
||||
Rules:
|
||||
- Telegram is legacy/bridge until sunset
|
||||
- no new doctrine should make Telegram the permanent backbone
|
||||
- if Telegram receives work during migration, the work still gets mirrored into Gitea and then into the current primary surfaces
|
||||
|
||||
## Ingress rules
|
||||
|
||||
### Rule A: every actionable operator message gets normalized
|
||||
|
||||
If an operator message from Matrix, Nostr/Nostur, or Telegram asks for real work, the system must do one of the following:
|
||||
- create a new Gitea issue
|
||||
- append to the correct existing issue as a comment
|
||||
- explicitly reject the message as non-actionable
|
||||
- route it to a coordinator for clarification before any work begins
|
||||
|
||||
### Rule B: no hidden queue mutation
|
||||
|
||||
Refreshing a chat room, reading a relay event, or polling a transport must not silently create work.
|
||||
The transition from chat to work must be explicit and visible.
|
||||
|
||||
### Rule C: one work item, many mirrors allowed
|
||||
|
||||
A message may be mirrored across:
|
||||
- Matrix
|
||||
- Nostr/Nostur
|
||||
- Telegram during migration
|
||||
- local notifications
|
||||
|
||||
But all mirrors must point back to the same Gitea work object.
|
||||
|
||||
### Rule D: coordinator-first survives transport changes
|
||||
|
||||
Timmy and Allegro remain the coordinators.
|
||||
Changing the transport does not remove their authority to:
|
||||
- classify urgency
|
||||
- decide routing
|
||||
- demand proof
|
||||
- collapse duplicates
|
||||
- escalate only what Alexander should actually see
|
||||
|
||||
## Recommended operator experience
|
||||
|
||||
### Matrix
|
||||
Use for:
|
||||
- primary private conversation with the fleet
|
||||
- ongoing task discussion
|
||||
- handoff and clarification
|
||||
- richer context than a short mobile note
|
||||
|
||||
### Nostur
|
||||
Use for:
|
||||
- sovereign mobile/operator ingress
|
||||
- identity-linked quick commands
|
||||
- lightweight acknowledgements
|
||||
- emergency input when Matrix is not the best surface
|
||||
|
||||
Working rule:
|
||||
- Nostur gets you into the system
|
||||
- Matrix carries the private conversation
|
||||
- Gitea holds the work truth
|
||||
|
||||
## Anti-scatter policy
|
||||
|
||||
Forbidden patterns:
|
||||
- a task exists only in a Matrix room
|
||||
- a task exists only in a Nostr DM or note
|
||||
- a Telegram thread contains work nobody copied into Gitea
|
||||
- different channels describe the same work with different owners or statuses
|
||||
- an agent acts on Nostr/Matrix chatter without a visible work object when the task is non-trivial
|
||||
|
||||
Required pattern:
|
||||
- every meaningful task gets one canonical Gitea object
|
||||
- all channels point at or mirror that object
|
||||
- coordinators keep channel drift collapsed, not multiplied
|
||||
|
||||
## Minimum implementation path
|
||||
|
||||
1. Matrix/Conduit becomes the primary private operator surface (#166)
|
||||
2. Nostr/Nostur becomes the sovereign operator edge (#174)
|
||||
3. NATS remains internal bus only (#165)
|
||||
4. every ingress path writes or links to Gitea execution truth
|
||||
5. Telegram is reduced to bridge/legacy during migration
|
||||
|
||||
## Acceptance criteria
|
||||
|
||||
- [ ] Matrix, Nostr/Nostur, NATS, Gitea, and Telegram each have an explicit role
|
||||
- [ ] Gitea is named as the sole execution-truth surface
|
||||
- [ ] Nostur is included as a legitimate operator layer, not ignored
|
||||
- [ ] Nostur/Matrix ingress rules explicitly forbid shadow task state
|
||||
- [ ] this doctrine makes it harder for work to get lost across channels
|
||||
373
docs/coordinator-first-protocol.md
Normal file
373
docs/coordinator-first-protocol.md
Normal file
@@ -0,0 +1,373 @@
|
||||
# Coordinator-first protocol
|
||||
|
||||
This doctrine translates the Timmy coordinator lane into one visible operating loop:
|
||||
|
||||
intake -> triage -> route -> track -> verify -> report
|
||||
|
||||
It applies to any coordinator running through the current sidecar stack:
|
||||
- Timmy as the governing local coordinator
|
||||
- Allegro as the operations coordinator
|
||||
- automation wired through the sidecar, including Huey tasks, playbooks, and wizard-house runtime
|
||||
|
||||
The implementation surface may change.
|
||||
The coordination truth does not.
|
||||
|
||||
## Purpose
|
||||
|
||||
The goal is not to invent more process.
|
||||
The goal is to make queue mutation, authority boundaries, escalation, and completion proof explicit.
|
||||
|
||||
Timmy already has stronger doctrine than generic coordinator systems.
|
||||
This protocol keeps that doctrine while making the coordinator loop legible and reviewable.
|
||||
|
||||
## Operating invariants
|
||||
|
||||
1. Gitea is the shared coordination truth.
|
||||
- issues
|
||||
- pull requests
|
||||
- comments
|
||||
- assignees
|
||||
- labels
|
||||
- linked branches and commits
|
||||
- linked proof artifacts
|
||||
|
||||
2. Local-only state is advisory, not authoritative.
|
||||
- tmux panes
|
||||
- local lock files
|
||||
- Huey queue state
|
||||
- scratch notes
|
||||
- transient logs
|
||||
- model-specific internal memory
|
||||
|
||||
3. If local state and Gitea disagree, stop mutating the queue until the mismatch is reconciled in Gitea.
|
||||
|
||||
4. A worker saying "done" is not enough.
|
||||
COMPLETE requires visible artifact verification.
|
||||
|
||||
5. Alexander is not the default ambiguity sink.
|
||||
If work is unclear, the coordinator must either:
|
||||
- request clarification visibly in Gitea
|
||||
- decompose the work into a smaller visible unit
|
||||
- escalate to Timmy for governing judgment
|
||||
|
||||
6. The sidecar owns doctrine and coordination rules.
|
||||
The harness may execute the loop, but the repo-visible doctrine in `timmy-config` governs what the loop is allowed to do.
|
||||
|
||||
## Standing authorities
|
||||
|
||||
### Timmy
|
||||
|
||||
Timmy is the governing coordinator.
|
||||
|
||||
Timmy may automatically:
|
||||
- accept intake into the visible queue
|
||||
- set or correct urgency
|
||||
- decompose oversized work
|
||||
- assign or reassign owners
|
||||
- reject duplicate or false-progress work
|
||||
- require stronger acceptance criteria
|
||||
- require stronger proof before closure
|
||||
- verify completion when the proof is visible and sufficient
|
||||
- decide whether something belongs in Allegro's lane or requires principal review
|
||||
|
||||
Timmy must escalate to Alexander when the issue requires:
|
||||
- a change to doctrine, soul, or standing authorities
|
||||
- a release or architecture tradeoff with principal-facing consequences
|
||||
- an irreversible public commitment made in Alexander's name
|
||||
- secrets, credentials, money, or external account authority
|
||||
- destructive production action with non-trivial blast radius
|
||||
- a true priority conflict between principal goals
|
||||
|
||||
### Allegro
|
||||
|
||||
Allegro is the operations coordinator.
|
||||
|
||||
Allegro may automatically:
|
||||
- capture intake into a visible Gitea issue or comment
|
||||
- perform first-pass triage
|
||||
- assign urgency using this doctrine
|
||||
- route work within the audited lane map
|
||||
- request clarification or decomposition
|
||||
- maintain queue hygiene
|
||||
- follow up on stale work
|
||||
- re-route bounded work when the current owner is clearly wrong
|
||||
- move work into ready-for-verify state when artifacts are posted
|
||||
- verify and close routine docs, ops, and queue-hygiene work when proof is explicit and no governing boundary is crossed
|
||||
- assemble principal digests and operational reports
|
||||
|
||||
Allegro must escalate to Timmy when the issue touches:
|
||||
- doctrine, identity, conscience, or standing authority
|
||||
- architecture, release shape, or repo-boundary decisions
|
||||
- cross-repo decomposition with non-obvious ownership
|
||||
- conflicting worker claims
|
||||
- missing or weak acceptance criteria on urgent work
|
||||
- a proposed COMPLETE state without visible artifacts
|
||||
- any action that would materially change what Alexander sees or believes happened
|
||||
|
||||
### Workers and builders
|
||||
|
||||
Execution agents may:
|
||||
- implement the work
|
||||
- open or update a PR
|
||||
- post progress comments
|
||||
- attach proof artifacts
|
||||
- report blockers
|
||||
- request re-route or decomposition
|
||||
|
||||
Execution agents may not treat local notes, local logs, or private session state as queue truth.
|
||||
If it matters, it must be visible in Gitea.
|
||||
|
||||
### Alexander
|
||||
|
||||
Alexander is the principal.
|
||||
|
||||
Alexander does not need to see every internal routing note.
|
||||
Alexander must see:
|
||||
- decisions that require principal judgment
|
||||
- urgent incidents that affect live work, safety, or trust
|
||||
- verified completions that matter to active priorities
|
||||
- concise reports linked to visible artifacts
|
||||
|
||||
## Truth surfaces
|
||||
|
||||
Use this truth order when deciding what is real:
|
||||
|
||||
1. Gitea issue and PR state
|
||||
2. Gitea comments that explain coordinator decisions
|
||||
3. repo-visible artifacts such as committed docs, branches, commits, and PR descriptions
|
||||
4. linked proof artifacts cited from the issue or PR
|
||||
5. local-only state used to produce the above
|
||||
|
||||
Levels 1 through 4 may justify queue mutation.
|
||||
Level 5 alone may not.
|
||||
|
||||
## The loop
|
||||
|
||||
| Stage | Coordinator job | Required visible artifact | Exit condition |
|
||||
|---|---|---|---|
|
||||
| Intake | capture the request as a queue item | issue, PR, or issue comment that names the request and source | work exists in Gitea and can be pointed to |
|
||||
| Triage | classify repo, scope, urgency, owner lane, and acceptance shape | comment or issue update naming urgency, intended owner lane, and any missing clarity | the next coordinator action is obvious |
|
||||
| Route | assign a single owner or split into smaller visible units | assignee change, linked child issues, or route comment | one owner has one bounded next move |
|
||||
| Track | keep status current and kill invisible drift | progress comment, blocker comment, linked PR, or visible state change | queue state matches reality |
|
||||
| Verify | compare artifacts to acceptance criteria and proof standard | verification comment citing proof | proof is sufficient or the work is bounced back |
|
||||
| Report | compress what matters for operators and principal | linked digest, summary comment, or review note | Alexander can see the state change without reading internal chatter |
|
||||
|
||||
## Intake rules
|
||||
|
||||
Intake is complete only when the request is visible in Gitea.
|
||||
|
||||
If a request arrives through another channel, the coordinator must first turn it into one of:
|
||||
- a new issue
|
||||
- a comment on the governing issue
|
||||
- a PR linked to the governing issue
|
||||
|
||||
The intake artifact must answer:
|
||||
- what is being asked
|
||||
- which repo owns it
|
||||
- whether it is new work, a correction, or a blocker on existing work
|
||||
|
||||
Invisible intake is forbidden.
|
||||
A coordinator may keep scratch notes, but scratch notes do not create queue reality.
|
||||
|
||||
## Triage rules
|
||||
|
||||
Triage produces five outputs:
|
||||
- owner repo
|
||||
- urgency class
|
||||
- owner lane
|
||||
- acceptance shape
|
||||
- escalation need, if any
|
||||
|
||||
A triaged item should answer:
|
||||
- Is this live pain, active priority, backlog, or research?
|
||||
- Is the scope small enough for one owner?
|
||||
- Are the acceptance criteria visible and testable?
|
||||
- Is this a Timmy judgment issue, an Allegro routing issue, or a builder issue?
|
||||
- Does Alexander need to see this now, later, or not at all unless it changes state?
|
||||
|
||||
If the work spans more than one repo or clearly exceeds one bounded owner move, the coordinator should split it before routing implementation.
|
||||
|
||||
## Urgency classes
|
||||
|
||||
| Class | Meaning | Default coordinator response | Alexander visibility |
|
||||
|---|---|---|---|
|
||||
| U0 - Crisis | safety, security, data loss, production-down, Gitea-down, or anything that can burn trust immediately | interrupt normal queue, page Timmy, make the incident visible now | immediate |
|
||||
| U1 - Hot | blocks active principal work, active release, broken automation, red path on current work | route in the current cycle and track closely | visible now if it affects current priorities or persists |
|
||||
| U2 - Active | important current-cycle work with clear acceptance criteria | route normally and keep visible progress | include in digest unless escalated |
|
||||
| U3 - Backlog | useful work with no current pain | batch triage and route by capacity | digest only |
|
||||
| U4 - Cold | vague ideas, research debt, or deferred work with no execution owner yet | keep visible, do not force execution | optional unless promoted |
|
||||
|
||||
Urgency may be raised or lowered only with a visible reason.
|
||||
Silent priority drift is coordinator failure.
|
||||
|
||||
## Escalation rules
|
||||
|
||||
Escalation is required when any of the following becomes true:
|
||||
|
||||
1. Authority boundary crossed
|
||||
- Allegro hits doctrine, architecture, release, or identity questions
|
||||
- any coordinator action would change principal-facing meaning
|
||||
|
||||
2. Proof boundary crossed
|
||||
- a worker claims done without visible artifacts
|
||||
- the proof contradicts the claim
|
||||
- the only evidence is local logs or private notes
|
||||
|
||||
3. Scope boundary crossed
|
||||
- the task is wider than one owner
|
||||
- the task crosses repos without an explicit split
|
||||
- the acceptance criteria changed materially mid-flight
|
||||
|
||||
4. Time boundary crossed
|
||||
- U0 has no visible owner immediately
|
||||
- U1 shows no visible movement in the current cycle
|
||||
- any item has stale local progress that is not reflected in Gitea
|
||||
|
||||
5. Trust boundary crossed
|
||||
- duplicate work appears
|
||||
- one worker's claim conflicts with another's
|
||||
- Gitea state and runtime state disagree
|
||||
|
||||
Default escalation path:
|
||||
- worker -> Allegro for routing and state hygiene
|
||||
- Allegro -> Timmy for governing judgment
|
||||
- Timmy -> Alexander only for principal decisions or immediate trust-risk events
|
||||
|
||||
Do not write "needs human review" as a generic sink.
|
||||
Name the exact decision that needs principal authority.
|
||||
If the decision is not principal in nature, keep it inside the coordinator loop.
|
||||
|
||||
## Route rules
|
||||
|
||||
Routing should prefer one owner per visible unit.
|
||||
|
||||
The coordinator may automatically:
|
||||
- assign one execution owner
|
||||
- split work into child issues
|
||||
- re-route obviously misassigned work
|
||||
- hold work in triage when acceptance criteria are weak
|
||||
|
||||
The coordinator should not:
|
||||
- assign speculative ideation directly to a builder
|
||||
- assign multi-repo ambiguity as if it were a one-file patch
|
||||
- hide re-routing decisions in local notes
|
||||
- keep live work unassigned while claiming it is under control
|
||||
|
||||
Every routed item should make the next expected artifact explicit.
|
||||
Examples:
|
||||
- open a PR
|
||||
- post a design note
|
||||
- attach command output
|
||||
- attach screenshot proof outside the repo and link it from the issue or PR
|
||||
|
||||
## Track rules
|
||||
|
||||
Tracking exists to keep the queue honest.
|
||||
|
||||
Acceptable tracking artifacts include:
|
||||
- assignee changes
|
||||
- linked PRs
|
||||
- blocker comments
|
||||
- reroute comments
|
||||
- verification requests
|
||||
- digest references
|
||||
|
||||
Tracking does not mean constant chatter.
|
||||
It means that a third party can open the issue and tell what is happening without access to private local state.
|
||||
|
||||
If a worker is making progress locally but Gitea still looks idle, the coordinator must fix the visibility gap.
|
||||
|
||||
## Verify rules
|
||||
|
||||
Verification is the gate before COMPLETE.
|
||||
|
||||
COMPLETE means one of:
|
||||
- the issue is closed with proof
|
||||
- the PR is merged with proof
|
||||
- the governing issue records that the acceptance criteria were met by linked artifacts
|
||||
|
||||
Minimum rule:
|
||||
no artifact verification, no COMPLETE.
|
||||
|
||||
Verification must cite visible artifacts that match the kind of work done.
|
||||
|
||||
| Work type | Minimum proof |
|
||||
|---|---|
|
||||
| docs / doctrine | commit or PR link plus a verification note naming the changed sections |
|
||||
| code / config | commit or PR link plus exact command output, test result, or other world-state evidence |
|
||||
| ops / runtime | command output, health check, log citation, or other world-state proof linked from the issue or PR |
|
||||
| visual / UI | screenshot proof linked from the issue or PR, with a note saying what it proves |
|
||||
| routing / coordination | assignee change, linked issue or PR, and a visible comment explaining the state change |
|
||||
|
||||
The proof standard in [`CONTRIBUTING.md`](../CONTRIBUTING.md) applies here.
|
||||
This protocol does not weaken it.
|
||||
|
||||
If proof is missing or weak, the coordinator must bounce the work back into route or track.
|
||||
"Looks right" is not verification.
|
||||
"The logs seemed good" is not verification.
|
||||
A private local transcript is not verification.
|
||||
|
||||
## Report rules
|
||||
|
||||
Reporting compresses truth for the next reader.
|
||||
|
||||
A good report answers:
|
||||
- what changed
|
||||
- what is blocked
|
||||
- what was verified
|
||||
- what needs a decision
|
||||
- where the proof lives
|
||||
|
||||
### Alexander-facing report
|
||||
|
||||
Alexander should normally see only:
|
||||
- verified completions that matter to active priorities
|
||||
- hot blockers and incidents
|
||||
- decisions that need principal judgment
|
||||
- a concise backlog or cycle summary linked to Gitea artifacts
|
||||
|
||||
### Internal coordinator report
|
||||
|
||||
Internal coordinator material may include:
|
||||
- candidate routes not yet committed
|
||||
- stale-lane heuristics
|
||||
- provider or model-level routing notes
|
||||
- reminder lists and follow-up timing
|
||||
- advisory runtime observations
|
||||
|
||||
Internal coordinator material may help operations.
|
||||
It does not become truth until it is written back to Gitea or the repo.
|
||||
|
||||
## Principal visibility ladder
|
||||
|
||||
| Level | What it contains | Who it is for |
|
||||
|---|---|---|
|
||||
| L0 - Internal advisory | scratch triage, provisional scoring, local runtime notes, reminders | coordinators only |
|
||||
| L1 - Visible execution truth | issue state, PR state, assignee, labels, linked artifacts, verification comments | everyone, including Alexander if he opens Gitea |
|
||||
| L2 - Principal digest | concise summaries of verified progress, blockers, and needed decisions | Alexander |
|
||||
| L3 - Immediate escalation | crisis, trust-risk, security, production-down, or principal-blocking events | Alexander now |
|
||||
|
||||
The coordinator should keep as much noise as possible in L0.
|
||||
The coordinator must ensure anything decision-relevant reaches L1, L2, or L3.
|
||||
|
||||
## What this protocol forbids
|
||||
|
||||
This doctrine forbids:
|
||||
- invisible queue mutation
|
||||
- COMPLETE without artifacts
|
||||
- using local logs as the only evidence of completion
|
||||
- routing by private memory alone
|
||||
- escalating ambiguity to Alexander by default
|
||||
- letting sidecar automation create a shadow queue outside Gitea
|
||||
|
||||
## Success condition
|
||||
|
||||
The protocol is working when:
|
||||
- new work becomes visible quickly
|
||||
- routing is legible
|
||||
- urgency changes have reasons
|
||||
- local automation can help without becoming a hidden state machine
|
||||
- Alexander sees the things that matter and not the chatter that does not
|
||||
- completed work can be proven from visible artifacts rather than trust in a local machine
|
||||
|
||||
*Sovereignty and service always.*
|
||||
248
docs/fallback-portfolios.md
Normal file
248
docs/fallback-portfolios.md
Normal file
@@ -0,0 +1,248 @@
|
||||
# Per-Agent Fallback Portfolios and Task-Class Routing
|
||||
|
||||
Status: proposed doctrine for issue #155
|
||||
Scope: policy and sidecar structure only; no runtime wiring in `tasks.py` or live loops yet
|
||||
|
||||
## Why this exists
|
||||
|
||||
Timmy already has multiple model paths declared in `config.yaml`, multiple task surfaces in `playbooks/`, and multiple live automation lanes documented in `docs/automation-inventory.md`.
|
||||
|
||||
What is missing is a declared resilience doctrine for how specific agents degrade when a provider, quota, or model family fails. Without that doctrine, the whole fleet tends to collapse onto the same fallback chain, which means one outage turns into synchronized fleet degradation.
|
||||
|
||||
This spec makes the fallback graph explicit before runtime wiring lands.
|
||||
|
||||
## Timmy ownership boundary
|
||||
|
||||
`timmy-config` owns:
|
||||
- routing doctrine for Timmy-side task classes
|
||||
- sidecar-readable fallback portfolio declarations
|
||||
- capability floors and degraded-mode authority restrictions
|
||||
- the mapping between current playbooks and future resilient agent lanes
|
||||
|
||||
`timmy-config` does not own:
|
||||
- live queue state or issue truth outside Gitea
|
||||
- launchd state, loop resurrection, or stale runtime reuse
|
||||
- ad hoc worktree history or hidden queue mutation
|
||||
|
||||
That split matters. This repo should declare how routing is supposed to work. Runtime surfaces should consume that declaration instead of inventing their own fallback orderings.
|
||||
|
||||
## Non-goals
|
||||
|
||||
This issue does not:
|
||||
- fully wire portfolio selection into `tasks.py`, launch agents, or live loops
|
||||
- bless human-token or operator-token fallbacks as part of an automated chain
|
||||
- allow degraded agents to keep full authority just because they are still producing output
|
||||
|
||||
## Role classes
|
||||
|
||||
### 1. Judgment
|
||||
|
||||
Use for work where the main risk is a bad decision, not a missing patch.
|
||||
|
||||
Current Timmy surfaces:
|
||||
- `playbooks/issue-triager.yaml`
|
||||
- `playbooks/pr-reviewer.yaml`
|
||||
- `playbooks/verified-logic.yaml`
|
||||
|
||||
Typical task classes:
|
||||
- issue triage
|
||||
- queue routing
|
||||
- PR review
|
||||
- proof / consistency checks
|
||||
- governance-sensitive review
|
||||
|
||||
Judgment lanes may read broadly, but they lose authority earlier than builder lanes when degraded.
|
||||
|
||||
### 2. Builder
|
||||
|
||||
Use for work where the main risk is producing or verifying a change.
|
||||
|
||||
Current Timmy surfaces:
|
||||
- `playbooks/bug-fixer.yaml`
|
||||
- `playbooks/test-writer.yaml`
|
||||
- `playbooks/refactor-specialist.yaml`
|
||||
|
||||
Typical task classes:
|
||||
- bug fixes
|
||||
- test writing
|
||||
- bounded refactors
|
||||
- narrow docs or code repairs with verification
|
||||
|
||||
Builder lanes keep patch-producing usefulness longer than judgment lanes, but they must lose control-plane authority as they degrade.
|
||||
|
||||
### 3. Wolf / bulk
|
||||
|
||||
Use for repetitive, high-volume, bounded, reversible work.
|
||||
|
||||
Current Timmy world-state:
|
||||
- bulk and sweep behavior is still represented more by live ops reality in `docs/automation-inventory.md` than by a dedicated sidecar playbook
|
||||
- this class covers the work shape currently associated with queue hygiene, inventory refresh, docs sweeps, log summarization, and repetitive small-diff passes
|
||||
|
||||
Typical task classes:
|
||||
- docs inventory refresh
|
||||
- log summarization
|
||||
- queue hygiene
|
||||
- repetitive small diffs
|
||||
- research or extraction sweeps
|
||||
|
||||
Wolf / bulk lanes are throughput-first and deliberately lower-authority.
|
||||
|
||||
## Routing policy
|
||||
|
||||
1. If the task touches a sensitive control surface, route to judgment first even if the edit is small.
|
||||
2. If the task is primarily about merge authority, routing authority, proof, or governance, route to judgment.
|
||||
3. If the task is primarily about producing a patch with local verification, route to builder.
|
||||
4. If the task is repetitive, bounded, reversible, and low-authority, route to wolf / bulk.
|
||||
5. If a wolf / bulk task expands beyond its size or authority envelope, promote it upward; do not let it keep grinding forward through scope creep.
|
||||
6. If a builder task becomes architecture, multi-repo coordination, or control-plane review, promote it to judgment.
|
||||
7. If a lane reaches terminal fallback, it must still land in a usable degraded mode. Dead silence is not an acceptable terminal state.
|
||||
|
||||
## Sensitive control surfaces
|
||||
|
||||
These paths stay judgment-routed unless explicitly reviewed otherwise:
|
||||
- `SOUL.md`
|
||||
- `config.yaml`
|
||||
- `deploy.sh`
|
||||
- `tasks.py`
|
||||
- `playbooks/`
|
||||
- `cron/`
|
||||
- `memories/`
|
||||
- `skins/`
|
||||
- `training/`
|
||||
|
||||
This mirrors the current PR-review doctrine and keeps degraded builder or bulk lanes away from Timmy's control plane.
|
||||
|
||||
## Portfolio design rules
|
||||
|
||||
The sidecar portfolio declaration in `fallback-portfolios.yaml` follows these rules:
|
||||
|
||||
1. Every critical agent gets four slots:
|
||||
- primary
|
||||
- fallback1
|
||||
- fallback2
|
||||
- terminal fallback
|
||||
2. No two critical agents may share the same `primary + fallback1` pair.
|
||||
3. Provider families should be anti-correlated across critical lanes whenever practical.
|
||||
4. Terminal fallbacks must end in a usable degraded lane, not a null lane.
|
||||
5. At least one critical lane must end on a local-capable path.
|
||||
6. No human-token fallback patterns are allowed in automated chains.
|
||||
7. Degraded mode reduces authority before it removes usefulness.
|
||||
8. A terminal lane that cannot safely produce an artifact is not a valid terminal lane.
|
||||
|
||||
## Explicit ban: synchronized fleet degradation
|
||||
|
||||
Synchronized fleet degradation is forbidden.
|
||||
|
||||
That means:
|
||||
- do not point every critical agent at the same fallback stack
|
||||
- do not let all judgment agents converge on the same first backup if avoidable
|
||||
- do not let all builder agents collapse onto the same weak terminal lane
|
||||
- do not treat "everyone fell back to the cheapest thing" as resilience
|
||||
|
||||
A resilient fleet degrades unevenly on purpose. Some lanes should stay sharp while others become slower or narrower.
|
||||
|
||||
## Capability floors and degraded authority
|
||||
|
||||
### Shared slot semantics
|
||||
|
||||
- `primary`: full role-class authority
|
||||
- `fallback1`: full task authority for normal work, but no silent broadening of scope
|
||||
- `fallback2`: bounded and reversible work only; no irreversible control-plane action
|
||||
- `terminal`: usable degraded lane only; must produce a machine-usable artifact but must not impersonate full authority
|
||||
|
||||
### Judgment floors
|
||||
|
||||
Judgment agents lose authority earliest.
|
||||
|
||||
At `fallback2` and below, judgment lanes must not:
|
||||
- merge PRs
|
||||
- close or rewrite governing issues or PRs
|
||||
- mutate sensitive control surfaces
|
||||
- bulk-reassign the fleet
|
||||
- silently change routing policy
|
||||
|
||||
Their degraded usefulness is still real:
|
||||
- classify backlog
|
||||
- produce draft routing plans
|
||||
- summarize risk
|
||||
- leave bounded labels or comments with explicit evidence
|
||||
|
||||
### Builder floors
|
||||
|
||||
Builder agents may continue doing useful narrow work deeper into degradation, but only inside a tighter box.
|
||||
|
||||
At `fallback2`, builder lanes must be limited to:
|
||||
- single-issue work
|
||||
- reversible patches
|
||||
- narrow docs or test scaffolds
|
||||
- bounded file counts and small diff sizes
|
||||
|
||||
At `terminal`, builder lanes must not:
|
||||
- touch sensitive control surfaces
|
||||
- merge or release
|
||||
- do multi-repo or architecture work
|
||||
- claim verification they did not run
|
||||
|
||||
Their terminal usefulness may still include:
|
||||
- a small patch
|
||||
- a reproducer test
|
||||
- a docs fix
|
||||
- a draft branch or artifact for later review
|
||||
|
||||
### Wolf / bulk floors
|
||||
|
||||
Wolf / bulk lanes stay useful as summarizers and sweepers, not as governors.
|
||||
|
||||
At `fallback2` and `terminal`, wolf / bulk lanes must not:
|
||||
- fan out branch creation across repos
|
||||
- mass-assign agents
|
||||
- edit sensitive control surfaces
|
||||
- perform irreversible queue mutation
|
||||
|
||||
Their degraded usefulness may still include:
|
||||
- gathering evidence
|
||||
- refreshing inventories
|
||||
- summarizing logs
|
||||
- proposing labels or routes
|
||||
- producing repetitive, low-risk artifacts inside explicit caps
|
||||
|
||||
## Usable terminal lanes
|
||||
|
||||
A terminal fallback is only valid if it still does at least one of these safely:
|
||||
- classify and summarize a backlog
|
||||
- produce a bounded patch or test artifact
|
||||
- summarize a diff with explicit uncertainty
|
||||
- refresh an inventory or evidence bundle
|
||||
|
||||
If the terminal lane can only say "model unavailable" and stop, the portfolio is incomplete.
|
||||
|
||||
## Current sidecar reference lanes
|
||||
|
||||
`fallback-portfolios.yaml` defines the initial implementation-ready structure for four named lanes:
|
||||
- `triage-coordinator` — judgment
|
||||
- `pr-reviewer` — judgment
|
||||
- `builder-main` — builder
|
||||
- `wolf-sweeper` — wolf / bulk
|
||||
|
||||
These are the canonical resilience lanes for the current Timmy world-state.
|
||||
|
||||
Current playbooks should eventually map onto them like this:
|
||||
- `playbooks/issue-triager.yaml` -> `triage-coordinator`
|
||||
- `playbooks/pr-reviewer.yaml` -> `pr-reviewer`
|
||||
- `playbooks/verified-logic.yaml` -> judgment lane family, pending a dedicated proof profile if needed
|
||||
- `playbooks/bug-fixer.yaml`, `playbooks/test-writer.yaml`, and `playbooks/refactor-specialist.yaml` -> `builder-main`
|
||||
- future sidecar bulk playbooks should inherit from `wolf-sweeper` instead of inventing independent fallback chains
|
||||
|
||||
Until runtime wiring lands, unmapped playbooks should be treated as policy-incomplete rather than inheriting an implicit fallback chain.
|
||||
|
||||
## Wiring contract for later implementation
|
||||
|
||||
When this is wired into runtime selection, the selector should:
|
||||
- classify the incoming task into a role class
|
||||
- check whether the task touches a sensitive control surface
|
||||
- choose the named agent lane for that class
|
||||
- step through the declared portfolio slots in order
|
||||
- enforce the capability floor of the active slot before taking action
|
||||
- record when a fallback transition happened and what authority was still allowed
|
||||
|
||||
The important part is not just choosing a different model. It is choosing a different authority envelope as the lane degrades.
|
||||
50
docs/fleet-cost-report.md
Normal file
50
docs/fleet-cost-report.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Fleet Cost & Resource Inventory
|
||||
|
||||
Last audited: 2026-04-06
|
||||
Owner: Timmy Foundation Ops
|
||||
|
||||
## Model Inference Providers
|
||||
|
||||
| Provider | Type | Cost Model | Agents Using | Est. Monthly |
|
||||
|---|---|---|---|---|
|
||||
| OpenRouter (qwen3.6-plus:free) | API | Free tier | Code Claw, Timmy | $0 |
|
||||
| OpenRouter (various) | API | Credits | Fleet | varies |
|
||||
| Anthropic (Claude Code) | API | Subscription | claw-code fallback | ~$20/mo |
|
||||
| Google AI Studio (Gemini) | Portal | Free daily quota | Strategic tasks | $0 |
|
||||
| Ollama (local) | Local | Electricity only | Mac Hermes | $0 |
|
||||
|
||||
## VPS Infrastructure
|
||||
|
||||
| Server | IP | Cost/Mo | Running | Key Services |
|
||||
|---|---|---|---|---|
|
||||
| Ezra | 143.198.27.163 | $12/mo | Yes | Gitea, agent hosting |
|
||||
| Allegro | 167.99.126.228 | $12/mo | Yes | Agent hosting |
|
||||
| Bezalel | 159.203.146.185 | $12/mo | Yes | Evennia, agent hosting |
|
||||
| **Total VPS** | | **~$36/mo** | | |
|
||||
|
||||
## Local Infrastructure
|
||||
| Resource | Cost |
|
||||
|---|---|
|
||||
| MacBook (owner-provided) | Electricity only |
|
||||
| Ollama models (downloaded) | Free |
|
||||
| Git/Dev tools (OSS) | Free |
|
||||
|
||||
## Cost Recommendations
|
||||
|
||||
| Agent | Verdict | Reason |
|
||||
|---|---|---|
|
||||
| Code Claw (OpenRouter) | DEPLOY | Free tier, adequate for small patches |
|
||||
| Gemini AI Studio | DEPLOY | Free daily quota, good for heavy reasoning |
|
||||
| Ollama local | DEPLOY | No API cost, sovereignty |
|
||||
| VPS fleet | DEPLOY | $36/mo for 3 servers is minimal |
|
||||
| Anthropic subscriptions | MONITOR | Burn $20/mo per seat; watch usage vs output |
|
||||
|
||||
## Monthly Burn Rate Estimate
|
||||
- **Floor (essential):** ~$36/mo (VPS only)
|
||||
- **Current (with Anthropic):** ~$56-76/mo
|
||||
- **Ceiling (all providers maxed):** ~$100+/mo
|
||||
|
||||
## Notes
|
||||
- No GPU instances provisioned yet (no cloud costs)
|
||||
- OpenRouter free tier has rate limits
|
||||
- Gemini AI Studio daily quota resets automatically
|
||||
@@ -30,6 +30,9 @@ This is the canonical reference for how we talk, how we work, and what we mean.
|
||||
### Sidecar Architecture
|
||||
Never fork hermes-agent. Pull upstream like any dependency. Everything custom lives in timmy-config. deploy.sh overlays it onto ~/.hermes/. The engine is theirs. The driver's seat is ours.
|
||||
|
||||
### Coordinator-First Loop
|
||||
One coordinator lane owns intake, triage, route, track, verify, and report. Queue truth stays in Gitea and visible artifacts, not private local notes. Timmy holds governing judgment. Allegro holds routing tempo and queue hygiene. See `coordinator-first-protocol.md`.
|
||||
|
||||
### Lazarus Pit
|
||||
When any wizard goes down, all hands converge to bring them back. Protocol: inspect config, patch model tag, restart service, smoke test, confirm in Telegram.
|
||||
|
||||
|
||||
166
docs/ipc-hub-and-spoke-doctrine.md
Normal file
166
docs/ipc-hub-and-spoke-doctrine.md
Normal file
@@ -0,0 +1,166 @@
|
||||
# IPC Doctrine: Hub-and-Spoke Semantics over Sovereign Transport
|
||||
|
||||
Status: canonical doctrine for issue #157
|
||||
Parent: #154
|
||||
Related migration work:
|
||||
- [`../son-of-timmy.md`](../son-of-timmy.md) for Timmy's layered communications worldview
|
||||
- [`nostr_agent_research.md`](nostr_agent_research.md) for one sovereign transport candidate under evaluation
|
||||
|
||||
## Why this exists
|
||||
|
||||
Timmy is in an ongoing migration toward sovereign transport.
|
||||
The first question is not which bus wins. The first question is what semantics every bus must preserve.
|
||||
Those semantics matter more than any one transport.
|
||||
|
||||
Telegram is not the target backbone for fleet IPC.
|
||||
It may exist as a temporary edge or operator convenience while migration is in flight, but the architecture we are building toward must stand on sovereign transport.
|
||||
|
||||
This doctrine defines the routing and failure semantics that any transport adapter must honor, whether the carrier is Matrix, Nostr, NATS, or something we have not picked yet.
|
||||
|
||||
## Roles
|
||||
|
||||
- Coordinator: the only actor allowed to own routing authority for live agent work
|
||||
- Spoke: an executing agent that receives work, asks for clarification, and returns results
|
||||
- Durable execution truth: the visible task system of record, which remains authoritative for ownership and state transitions
|
||||
- Operator: the human principal who can direct the coordinator but is not a transport shim
|
||||
|
||||
Timmy world-state stays the same while transport changes:
|
||||
- Gitea remains visible execution truth
|
||||
- live IPC accelerates coordination, but does not become a hidden source of authority
|
||||
- transport migration may change the wire, but not the rules
|
||||
|
||||
## Core rules
|
||||
|
||||
### 1. Coordinator-first routing
|
||||
|
||||
Coordinator-first routing is the default system rule.
|
||||
|
||||
- All new work enters through the coordinator
|
||||
- All reroutes, cancellations, escalations, and cross-agent handoffs go through the coordinator
|
||||
- A spoke receives assignments from the coordinator and reports back to the coordinator
|
||||
- A spoke does not mutate the routing graph on its own
|
||||
- If route intent is ambiguous, the system should fail closed and ask the coordinator instead of guessing a peer path
|
||||
|
||||
The coordinator is the hub.
|
||||
Spokes are not free-roaming routers.
|
||||
|
||||
### 2. Anti-cascade behavior
|
||||
|
||||
The system must resist cascade failures and mesh chatter.
|
||||
|
||||
- A spoke MUST NOT recursively fan out work to other spokes
|
||||
- A spoke MUST NOT create hidden side queues or recruit additional agents without coordinator approval
|
||||
- Broadcasts are coordinator-owned and should be rare, deliberate, and bounded
|
||||
- Retries must be bounded and idempotent
|
||||
- Transport adapters must not auto-bridge, auto-replay, or auto-forward in ways that amplify loops or duplicate storms
|
||||
|
||||
A worker that encounters new sub-work should escalate back to the coordinator.
|
||||
It should not become a shadow dispatcher.
|
||||
|
||||
### 3. Limited peer mesh
|
||||
|
||||
Direct spoke-to-spoke communication is an exception, not the default.
|
||||
|
||||
It is allowed only when the coordinator opens an explicit peer window.
|
||||
That peer window must define:
|
||||
- the allowed participants
|
||||
- the task or correlation ID
|
||||
- the narrow purpose
|
||||
- the expiry, timeout, or close condition
|
||||
- the expected artifact or summary that returns to the coordinator
|
||||
|
||||
Peer windows are tightly scoped:
|
||||
- they are time-bounded
|
||||
- they are non-transitive
|
||||
- they do not grant standing routing authority
|
||||
- they close back to coordinator-first behavior when the declared purpose is complete
|
||||
|
||||
Good uses for a peer window:
|
||||
- artifact handoff between two already-assigned agents
|
||||
- verifier-to-builder clarification on a bounded review loop
|
||||
- short-lived data exchange where routing everything through the coordinator would be pure latency
|
||||
|
||||
Bad uses for a peer window:
|
||||
- ad hoc planning rings
|
||||
- recursive delegation chains
|
||||
- quorum gossip
|
||||
- hidden ownership changes
|
||||
- free-form peer mesh as the normal operating mode
|
||||
|
||||
### 4. Transport independence
|
||||
|
||||
The doctrine is transport-agnostic on purpose.
|
||||
|
||||
NATS, Matrix, Nostr, or a future bus are acceptable only if they preserve the same semantics.
|
||||
If a transport cannot preserve these semantics, it is not acceptable as the fleet backbone.
|
||||
|
||||
A valid transport layer must carry or emulate:
|
||||
- authenticated sender identity
|
||||
- intended recipient or bounded scope
|
||||
- task or work identifier
|
||||
- correlation identifier
|
||||
- message type
|
||||
- timeout or TTL semantics
|
||||
- acknowledgement or explicit timeout behavior
|
||||
- idempotency or deduplication signals
|
||||
|
||||
Transport choice does not change authority.
|
||||
Semantics matter more than any one transport.
|
||||
|
||||
### 5. Circuit breakers
|
||||
|
||||
Every acceptable IPC layer must support circuit-breaker behavior.
|
||||
|
||||
At minimum, the system must be able to:
|
||||
- isolate a noisy or unhealthy spoke
|
||||
- stop new dispatches onto a failing route
|
||||
- disable direct peer windows and collapse back to strict hub-and-spoke mode
|
||||
- stop retrying after a bounded count or deadline
|
||||
- quarantine duplicate storms, fan-out anomalies, or missing coordinator acknowledgements instead of amplifying them
|
||||
|
||||
When a breaker trips, the fallback is slower coordinator-mediated operation over durable machine-readable channels.
|
||||
It is not a return to hidden relays.
|
||||
It is not a reason to rebuild the fleet around Telegram.
|
||||
|
||||
No human-token fallback patterns:
|
||||
- do not route agent IPC through personal chat identities
|
||||
- do not rely on operator copy-paste as a standing transport layer
|
||||
- do not treat human-owned bot tokens as the resilience plan
|
||||
|
||||
## Required message classes
|
||||
|
||||
Any transport mapping should preserve these message classes, even if the carrier names differ:
|
||||
|
||||
- dispatch
|
||||
- ack or nack
|
||||
- status or progress
|
||||
- clarify or question
|
||||
- result
|
||||
- failure or escalation
|
||||
- control messages such as cancel, pause, resume, open-peer-window, and close-peer-window
|
||||
|
||||
## Failure semantics
|
||||
|
||||
When things break, authority should degrade safely.
|
||||
|
||||
- If a spoke loses contact with the coordinator, it may finish currently safe local work and persist a checkpoint, but it must not appoint itself as a router
|
||||
- If a spoke receives an unscoped peer message, it should ignore or quarantine it and report the event to the coordinator when possible
|
||||
- If delivery is duplicated or reordered, recipients should prefer correlation IDs and idempotency keys over guesswork
|
||||
- If the live transport is degraded, the system may fall back to slower durable coordination paths, but routing authority remains coordinator-first
|
||||
|
||||
## World-state alignment
|
||||
|
||||
This doctrine sits above transport selection.
|
||||
It does not try to settle every Matrix-vs-Nostr-vs-NATS debate inside one file.
|
||||
It constrains those choices.
|
||||
|
||||
Current Timmy alignment:
|
||||
- sovereign transport migration is ongoing
|
||||
- Telegram is not the backbone we are building toward
|
||||
- Matrix remains relevant for human-to-fleet interaction
|
||||
- Nostr remains relevant as a sovereign option under evaluation
|
||||
- NATS remains relevant as a strong internal bus candidate
|
||||
- the semantics stay constant across all of them
|
||||
|
||||
If we swap the wire and keep the semantics, the fleet stays coherent.
|
||||
If we keep the wire and lose the semantics, the fleet regresses into chatter, hidden routing, and cascade failure.
|
||||
136
docs/matrix-conduit/DEPLOYMENT.md
Normal file
136
docs/matrix-conduit/DEPLOYMENT.md
Normal file
@@ -0,0 +1,136 @@
|
||||
# Matrix/Conduit Deployment Guide
|
||||
|
||||
Executable scaffold for standing up a sovereign Matrix homeserver as the human-to-fleet command surface.
|
||||
|
||||
## Architecture Summary
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||
│ Alexander │────▶│ Nginx Proxy │────▶│ Conduit │
|
||||
│ (Element/Web) │ │ 443 / 8448 │ │ Homeserver │
|
||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ SQLite/Postgres│
|
||||
│ (state/media) │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
| Requirement | How to Verify | Status |
|
||||
|-------------|---------------|--------|
|
||||
| VPS with 2GB+ RAM | `free -h` | ⬜ |
|
||||
| Static IP address | `curl ifconfig.me` | ⬜ |
|
||||
| Domain with A record | `dig matrix.fleet.tld` | ⬜ |
|
||||
| Ports 443/8448 open | `sudo ss -tlnp | grep -E "443|8448"` | ⬜ |
|
||||
| TLS certificate (Let's Encrypt) | `sudo certbot certificates` | ⬜ |
|
||||
| Docker + docker-compose | `docker --version` | ⬜ |
|
||||
|
||||
## Quickstart
|
||||
|
||||
### 1. Host Preparation
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
sudo apt update && sudo apt install -y docker.io docker-compose-plugin nginx certbot
|
||||
|
||||
# Open ports
|
||||
sudo ufw allow 443/tcp
|
||||
sudo ufw allow 8448/tcp
|
||||
```
|
||||
|
||||
### 2. DNS Configuration
|
||||
```
|
||||
# A record
|
||||
matrix.fleet.tld. A <YOUR_SERVER_IP>
|
||||
|
||||
# SRV for federation (optional but recommended)
|
||||
_matrix._tcp.fleet.tld. SRV 10 0 8448 matrix.fleet.tld.
|
||||
```
|
||||
|
||||
### 3. TLS Certificate
|
||||
```bash
|
||||
sudo certbot certonly --standalone -d matrix.fleet.tld
|
||||
```
|
||||
|
||||
### 4. Deploy Conduit
|
||||
```bash
|
||||
# Edit conduit.toml: set server_name to your domain
|
||||
nano conduit.toml
|
||||
|
||||
# Start stack
|
||||
docker compose up -d
|
||||
|
||||
# Verify
|
||||
docker logs -f conduit-homeserver
|
||||
```
|
||||
|
||||
### 5. Nginx Configuration
|
||||
```bash
|
||||
sudo cp nginx-matrix.conf /etc/nginx/sites-available/matrix
|
||||
sudo ln -s /etc/nginx/sites-available/matrix /etc/nginx/sites-enabled/
|
||||
sudo nginx -t && sudo systemctl reload nginx
|
||||
```
|
||||
|
||||
### 6. Bootstrap Accounts
|
||||
1. Open Element at `https://matrix.fleet.tld`
|
||||
2. Register admin account first (while `allow_registration = true`)
|
||||
3. Set admin in `conduit.toml`, restart
|
||||
4. Disable registration after setup
|
||||
|
||||
### 7. Fleet Rooms
|
||||
```bash
|
||||
# Fill ACCESS_TOKEN in bootstrap.sh
|
||||
curl -X POST "https://matrix.fleet.tld/_matrix/client/r0/login" \
|
||||
-d '{"type":"m.login.password","user":"alexander","password":"YOUR_PASS"}'
|
||||
|
||||
# Run bootstrap
|
||||
chmod +x bootstrap.sh
|
||||
./bootstrap.sh
|
||||
```
|
||||
|
||||
## Federation Verification
|
||||
|
||||
```bash
|
||||
# Check server discovery
|
||||
curl https://matrix.fleet.tld/.well-known/matrix/server
|
||||
curl https://matrix.fleet.tld/.well-known/matrix/client
|
||||
|
||||
# Check federation
|
||||
curl https://matrix.fleet.tld:8448/_matrix/key/v2/server
|
||||
```
|
||||
|
||||
## Telegram Bridge (Future)
|
||||
|
||||
To bridge Telegram groups to Matrix:
|
||||
|
||||
```yaml
|
||||
# Add to docker-compose.yml
|
||||
telegram-bridge:
|
||||
image: dock.mau.dev/mautrix/telegram:latest
|
||||
volumes:
|
||||
- ./bridge-config.yaml:/data/config.yaml
|
||||
- telegram_bridge:/data
|
||||
```
|
||||
|
||||
See: https://docs.mau.fi/bridges/python/telegram/setup-docker.html
|
||||
|
||||
## Security Checklist
|
||||
|
||||
- [ ] Registration disabled after initial setup
|
||||
- [ ] Admin list restricted
|
||||
- [ ] Strong admin passwords
|
||||
- [ ] Automatic security updates enabled
|
||||
- [ ] Backups configured (conduit_data volume)
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Issue | Cause | Fix |
|
||||
|-------|-------|-----|
|
||||
| Federation failures | DNS/SRV records | Verify `dig _matrix._tcp.fleet.tld SRV` |
|
||||
| SSL errors | Certificate mismatches | Verify cert covers matrix.fleet.tld |
|
||||
| 502 Bad Gateway | Conduit not listening | Check `docker ps`, verify port 6167 |
|
||||
|
||||
---
|
||||
Generated by Ezra | Burn Mode | 2026-04-05
|
||||
86
docs/matrix-deployment.md
Normal file
86
docs/matrix-deployment.md
Normal file
@@ -0,0 +1,86 @@
|
||||
# Matrix/Conduit Deployment Guide
|
||||
|
||||
> **Parent**: timmy-config#166
|
||||
> **Child**: timmy-config#183
|
||||
> **Created**: 2026-04-05 by Ezra burn-mode triage
|
||||
|
||||
## Deployment Prerequisites
|
||||
|
||||
### 1. Host Selection Matrix
|
||||
|
||||
| Option | Pros | Cons | Recommendation |
|
||||
|--------|------|------|----------------|
|
||||
| Timmy-Home bare metal | Full sovereignty, existing Traefik | Single point of failure, home IP | **PRIMARY** |
|
||||
| DigitalOcean VPS | Static IP, offsite | Monthly cost, external dependency | BACKUP |
|
||||
| RunPod GPU instance | Already in fleet | Ephemeral, not for persistence | NOT SUITABLE |
|
||||
|
||||
### 2. Port Requirements
|
||||
|
||||
| Port | Purpose | Inbound Required |
|
||||
|------|---------|------------------|
|
||||
| 8448 | Federation (server-to-server) | Yes |
|
||||
| 443 | Client HTTPS | Yes (via Traefik) |
|
||||
| 80 | ACME HTTP-01 challenge | Yes (redirects to 443) |
|
||||
| 6167 | Conduit replication (optional) | Internal only |
|
||||
|
||||
### 3. Reverse Proxy Assumptions (Traefik)
|
||||
|
||||
Existing `timmy-home` Traefik instance can route Matrix traffic:
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml labels for Conduit
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.matrix.rule=Host(`matrix.tactical.local`)"
|
||||
- "traefik.http.routers.matrix.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.matrix.loadbalancer.server.port=6167"
|
||||
# Federation SRV delegation
|
||||
- "traefik.tcp.routers.matrix-federation.rule=HostSNI(`*`)"
|
||||
- "traefik.tcp.routers.matrix-federation.entrypoints=federation"
|
||||
```
|
||||
|
||||
### 4. DNS Requirements
|
||||
|
||||
```
|
||||
# A records
|
||||
matrix.tactical.local A <timmy-home-ip>
|
||||
|
||||
# SRV records for federation
|
||||
_matrix._tcp.tactical.local SRV 10 0 8448 matrix.tactical.local
|
||||
```
|
||||
|
||||
### 5. Database Choice
|
||||
|
||||
| Option | When to Use |
|
||||
|--------|-------------|
|
||||
| SQLite (default) | < 100 users, < 10 rooms, single-node |
|
||||
| PostgreSQL | Scale, backups, multi-node potential |
|
||||
|
||||
**Recommendation**: Start with SQLite. Migrate to PostgreSQL only if federation grows.
|
||||
|
||||
### 6. Storage Requirements
|
||||
|
||||
- Conduit binary: ~50MB
|
||||
- Database (SQLite): ~100MB initial, grows with media
|
||||
- Media repo: Plan for 10GB (images, avatars, room assets)
|
||||
|
||||
## Blocking Prerequisites Checklist
|
||||
|
||||
- [ ] **Host**: Confirm Timmy-Home static IP or dynamic DNS
|
||||
- [ ] **Ports**: Verify 8448, 443, 80 not blocked by ISP
|
||||
- [ ] **Traefik**: Confirm federation TCP entrypoint configured
|
||||
- [ ] **DNS**: SRV records creatable at domain registrar
|
||||
- [ ] **SSL**: Let's Encrypt ACME configured in Traefik
|
||||
- [ ] **Backup**: Volume mount strategy for SQLite persistence
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Complete prerequisites checklist above
|
||||
2. Generate `conduit-config.toml` (see `matrix/conduit-config.toml`)
|
||||
3. Create `docker-compose.yml` with Traefik labels
|
||||
4. Deploy test room with @ezra + Alexander
|
||||
5. Verify client connectivity (Element web/iOS)
|
||||
6. Document Telegram→Matrix migration plan
|
||||
|
||||
---
|
||||
*This document lowers #166 from fuzzy epic to executable deployment steps.*
|
||||
83
docs/matrix-fleet-comms/ADR-001-matrix-scaffold.md
Normal file
83
docs/matrix-fleet-comms/ADR-001-matrix-scaffold.md
Normal file
@@ -0,0 +1,83 @@
|
||||
# ADR-001: Matrix/Conduit Deployment Scaffold
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| **Status** | Accepted |
|
||||
| **Date** | 2026-04-05 |
|
||||
| **Decider** | Ezra (Architekt) |
|
||||
| **Stakeholders** | Allegro, Timmy, Alexander |
|
||||
| **Parent Issues** | #166, #183 |
|
||||
|
||||
---
|
||||
|
||||
## 1. Context
|
||||
|
||||
Son of Timmy Commandment 6 requires encrypted human-to-fleet communication that is sovereign and independent of Telegram. Before any code can run, we needed a reproducible, infrastructure-agnostic deployment scaffold that any wizard house can verify, deploy, and restore.
|
||||
|
||||
## 2. Decision: Conduit over Synapse
|
||||
|
||||
**Chosen:** [Conduit](https://conduit.rs) as the Matrix homeserver.
|
||||
|
||||
**Alternatives considered:**
|
||||
- **Synapse**: Mature, but heavier (Python, more RAM, more complex config).
|
||||
- **Dendrite**: Go-based, lighter than Synapse, but less feature-complete for E2EE.
|
||||
|
||||
**Rationale:**
|
||||
- Conduit is written in Rust, has a small footprint, and runs comfortably on the Hermes VPS (~7 GB RAM).
|
||||
- Single static binary + SQLite (or Postgres) keeps the Docker image small and backup logic simple.
|
||||
- E2EE support is production-grade enough for a closed fleet.
|
||||
|
||||
## 3. Decision: Docker Compose over Bare Metal
|
||||
|
||||
**Chosen:** Docker Compose stack (`docker-compose.yml`) with explicit volume mounts.
|
||||
|
||||
**Rationale:**
|
||||
- Reproducibility: any host with Docker can stand the stack up in one command.
|
||||
- Isolation: Conduit, Element Web, and Postgres live in separate containers with explicit network boundaries.
|
||||
- Rollback: `docker compose down && docker compose up -d` is a safe, fast recovery path.
|
||||
- Future portability: the same Compose file can move to a different VPS with only `.env` changes.
|
||||
|
||||
## 4. Decision: Caddy as Reverse Proxy (with Nginx coexistence)
|
||||
|
||||
**Chosen:** Caddy handles TLS termination and `.well-known/matrix` delegation inside the Compose network.
|
||||
|
||||
**Rationale:**
|
||||
- Caddy automates Let’s Encrypt TLS via on-demand TLS.
|
||||
- On hosts where Nginx already binds 80/443 (e.g., Hermes VPS), Nginx can reverse-proxy to Caddy or Conduit directly.
|
||||
- The scaffold includes both a `caddy/Caddyfile` and Nginx-compatible notes so the operator is not locked into one proxy.
|
||||
|
||||
## 5. Decision: One Matrix Account Per Wizard House
|
||||
|
||||
**Chosen:** Each wizard house (Ezra, Allegro, Bezalel, etc.) gets its own Matrix user ID (`@ezra:domain`, `@allegro:domain`).
|
||||
|
||||
**Rationale:**
|
||||
- Preserves sovereignty: each house has its own credentials, device keys, and E2EE trust chain.
|
||||
- Matches the existing wizard-house mental model (independent agents, shared rooms).
|
||||
- Simplifies debugging: message provenance is unambiguous.
|
||||
|
||||
## 6. Decision: `matrix-nio` for Hermes Gateway Integration
|
||||
|
||||
**Chosen:** [`matrix-nio`](https://github.com/poljar/matrix-nio) with the `e2e` extra.
|
||||
|
||||
**Rationale:**
|
||||
- Already integrated into the Hermes gateway (`gateway/platforms/matrix.py`).
|
||||
- Asyncio-native, matching the Hermes gateway architecture.
|
||||
- Supports E2EE, media uploads, threads, and replies.
|
||||
|
||||
## 7. Consequences
|
||||
|
||||
### Positive
|
||||
- The scaffold is **self-enforcing**: `validate-scaffold.py` and Gitea Actions CI guard integrity.
|
||||
- Local integration can be verified without public DNS via `docker-compose.test.yml`.
|
||||
- The path from "host decision" to "fleet online" is fully scripted.
|
||||
|
||||
### Negative / Accepted Trade-offs
|
||||
- Conduit is younger than Synapse; edge-case federation bugs are possible. Mitigation: the fleet will run on a single homeserver initially.
|
||||
- SQLite is the default Conduit backend. For >100 users, Postgres is recommended. The Compose file includes an optional Postgres service.
|
||||
|
||||
## 8. References
|
||||
|
||||
- `infra/matrix/CANONICAL_INDEX.md` — canonical artifact map
|
||||
- `infra/matrix/scripts/validate-scaffold.py` — automated integrity checks
|
||||
- `.gitea/workflows/validate-matrix-scaffold.yml` — CI enforcement
|
||||
- `infra/matrix/HERMES_INTEGRATION_VERIFICATION.md` — adapter-to-scaffold mapping
|
||||
149
docs/matrix-fleet-comms/CUTOVER_PLAN.md
Normal file
149
docs/matrix-fleet-comms/CUTOVER_PLAN.md
Normal file
@@ -0,0 +1,149 @@
|
||||
# Telegram → Matrix Cutover Plan
|
||||
|
||||
> **Issue**: [#166](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/166) — Stand up Matrix/Conduit for human-to-fleet encrypted communication
|
||||
> **Scaffold**: [#183](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/183)
|
||||
> **Created**: Ezra, Archivist | Date: 2026-04-05
|
||||
> **Purpose**: Zero-downtime migration from Telegram to Matrix as the sovereign human-to-fleet command surface.
|
||||
|
||||
---
|
||||
|
||||
## Principle
|
||||
|
||||
**Parallel operation first, cutover second.** Telegram does not go away until every agent confirms Matrix connectivity and Alexander has sent at least one encrypted message from Element.
|
||||
|
||||
---
|
||||
|
||||
## Phase 0: Pre-Conditions (All Must Be True)
|
||||
|
||||
| # | Condition | Verification Command |
|
||||
|---|-----------|---------------------|
|
||||
| 1 | Conduit deployed and healthy | `curl https://<domain>/_matrix/client/versions` |
|
||||
| 2 | Fleet rooms created | `python3 infra/matrix/scripts/bootstrap-fleet-rooms.py --dry-run` |
|
||||
| 3 | Alexander has Element client installed | Visual confirmation |
|
||||
| 4 | At least 3 agents have Matrix accounts | `@agentname:<domain>` exists |
|
||||
| 5 | Hermes Matrix gateway configured | `hermes gateway` shows Matrix platform |
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Parallel Run (Days 1–7)
|
||||
|
||||
### Day 1: Room Bootstrap
|
||||
|
||||
```bash
|
||||
# 1. SSH to Conduit host
|
||||
cd /opt/timmy-config/infra/matrix
|
||||
|
||||
# 2. Verify health
|
||||
./host-readiness-check.sh
|
||||
|
||||
# 3. Create rooms (dry-run first)
|
||||
export MATRIX_HOMESERVER="https://matrix.timmytime.net"
|
||||
export MATRIX_ADMIN_TOKEN="<admin_access_token>"
|
||||
python3 scripts/bootstrap-fleet-rooms.py --create-all --dry-run
|
||||
|
||||
# 4. Create rooms (live)
|
||||
python3 scripts/bootstrap-fleet-rooms.py --create-all
|
||||
```
|
||||
|
||||
### Day 1: Operator Onboarding
|
||||
|
||||
1. Open Element Web at `https://element.<domain>` or install Element desktop.
|
||||
2. Register/login as `@alexander:<domain>`.
|
||||
3. Join `#fleet-ops:<domain>`.
|
||||
4. Send a test message: `First light on Matrix. Acknowledge, fleet.`
|
||||
|
||||
### Days 2–3: Agent Onboarding
|
||||
|
||||
For each agent/wizard house:
|
||||
1. Create Matrix account `@<agent>:<domain>`.
|
||||
2. Join `#fleet-ops:<domain>` and `#fleet-general:<domain>`.
|
||||
3. Send acknowledgment in `#fleet-ops`.
|
||||
4. Update agent's Hermes gateway config to listen on Matrix.
|
||||
|
||||
### Days 4–6: Parallel Commanding
|
||||
|
||||
- **Alexander sends all commands in BOTH Telegram and Matrix.**
|
||||
- Agents respond in the channel where they are most reliable.
|
||||
- Monitor for message loss or delivery delays.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Cutover (Day 7)
|
||||
|
||||
### Step 1: Pin Matrix as Primary
|
||||
|
||||
In Telegram `#fleet-ops`:
|
||||
> "📌 PRIMARY SURFACE CHANGE: Matrix is now the sovereign command channel. Telegram remains as fallback for 48 hours. Join: `<matrix_invite_link>`"
|
||||
|
||||
### Step 2: Telegram Gateway Downgrade
|
||||
|
||||
Edit each agent's Hermes gateway config:
|
||||
|
||||
```yaml
|
||||
# ~/.hermes/config.yaml
|
||||
gateway:
|
||||
primary_platform: matrix
|
||||
fallback_platform: telegram
|
||||
matrix:
|
||||
enabled: true
|
||||
homeserver: https://matrix.timmytime.net
|
||||
rooms:
|
||||
- "#fleet-ops:matrix.timmytime.net"
|
||||
telegram:
|
||||
enabled: true # Fallback only
|
||||
```
|
||||
|
||||
### Step 3: Verification Checklist
|
||||
|
||||
- [ ] Alexander sends command **only** on Matrix
|
||||
- [ ] All agents respond within 60 seconds
|
||||
- [ ] Encrypted room icon shows 🔒 in Element
|
||||
- [ ] No messages lost in 24-hour window
|
||||
- [ ] At least one voice/file message test succeeds
|
||||
|
||||
### Step 4: Telegram Standby
|
||||
|
||||
If all checks pass:
|
||||
1. Pin final notice in Telegram: "Fallback mode only. Active surface is Matrix."
|
||||
2. Disable Telegram bot webhooks (do not delete the bot).
|
||||
3. Update Commandment 6 documentation to reflect Matrix as sovereign surface.
|
||||
|
||||
---
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
If Matrix becomes unreachable or messages are lost:
|
||||
|
||||
1. **Immediate**: Alexander re-sends command in Telegram.
|
||||
2. **Within 1 hour**: All agents switch gateway primary back to Telegram:
|
||||
```yaml
|
||||
primary_platform: telegram
|
||||
```
|
||||
3. **Within 24 hours**: Debug Matrix issue (check Conduit logs, Caddy TLS, DNS).
|
||||
4. **Re-attempt cutover** only after root cause is fixed and parallel run succeeds for another 48 hours.
|
||||
|
||||
---
|
||||
|
||||
## Post-Cutover Maintenance
|
||||
|
||||
| Task | Frequency | Command / Action |
|
||||
|------|-----------|------------------|
|
||||
| Backup Conduit data | Daily | `tar czvf /backups/conduit-$(date +%F).tar.gz /opt/timmy-config/infra/matrix/data/conduit/` |
|
||||
| Review room membership | Weekly | Element → Room Settings → Members |
|
||||
| Update Element Web | Monthly | `docker compose pull && docker compose up -d` |
|
||||
| Rotate access tokens | Quarterly | Element → Settings → Help & About → Access Token |
|
||||
|
||||
---
|
||||
|
||||
## Accountability
|
||||
|
||||
| Role | Owner | Responsibility |
|
||||
|------|-------|----------------|
|
||||
| Deployment | @allegro / @timmy | Run `deploy-matrix.sh` and room bootstrap |
|
||||
| Operator onboarding | @rockachopa (Alexander) | Install Element, verify encryption |
|
||||
| Agent gateway cutover | @ezra | Update Hermes gateway configs, monitor logs |
|
||||
| Rollback decision | @rockachopa | Authorize Telegram fallback if needed |
|
||||
|
||||
---
|
||||
|
||||
*Filed by Ezra, Archivist | 2026-04-05*
|
||||
140
docs/matrix-fleet-comms/DECISION_FRAMEWORK_187.md
Normal file
140
docs/matrix-fleet-comms/DECISION_FRAMEWORK_187.md
Normal file
@@ -0,0 +1,140 @@
|
||||
# Decision Framework: Matrix Host, Domain, and Proxy (#187)
|
||||
|
||||
**Parent:** #166 — Stand up Matrix/Conduit for human-to-fleet encrypted communication
|
||||
**Blocker:** #187 — Decide Matrix host, domain, and proxy prerequisites
|
||||
**Author:** Ezra
|
||||
**Date:** 2026-04-05
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
#166 is **execution-ready**. The only remaining gate is three decisions:
|
||||
1. **Host** — which machine runs Conduit?
|
||||
2. **Domain** — what FQDN serves the homeserver?
|
||||
3. **Proxy/TLS** — how do HTTPS and federation terminate?
|
||||
|
||||
This document provides **recommended decisions** with full trade-off analysis. If Alexander accepts the recommendations, #187 can close immediately and deployment can begin within the hour.
|
||||
|
||||
---
|
||||
|
||||
## Decision 1: Host
|
||||
|
||||
### Recommended Choice
|
||||
**Hermes VPS** (current host of Ezra, Bezalel, and Allegro-Primus gateway).
|
||||
|
||||
### Alternative Considered
|
||||
**TestBed VPS** (67.205.155.108) — currently hosts Bezalel (stale) and other experimental workloads.
|
||||
|
||||
### Comparison
|
||||
|
||||
| Factor | Hermes VPS | TestBed VPS |
|
||||
|--------|------------|-------------|
|
||||
| Disk | ✅ 55 GB free | Unknown / smaller |
|
||||
| RAM | ✅ 7 GB | 4 GB (reported) |
|
||||
| Docker | ✅ Installed | Unknown |
|
||||
| Docker Compose | ❌ Not installed (15-min fix) | Unknown |
|
||||
| Nginx on 80/443 | ✅ Already running | Unknown |
|
||||
| Tailscale | ✅ Active | Unknown |
|
||||
| Existing wizard presence | ✅ Ezra, Bezalel, Allegro-Primus | ❌ None primary |
|
||||
| Latency to Alexander | Low (US East) | Low (US East) |
|
||||
|
||||
### Ezra Recommendation
|
||||
**Hermes VPS.** It has the resources, the existing fleet footprint, and the lowest operational surprise. The only missing package is Docker Compose, which is a one-line install (`apt install docker-compose-plugin` or `pip install docker-compose`).
|
||||
|
||||
---
|
||||
|
||||
## Decision 2: Domain / Subdomain
|
||||
|
||||
### Recommended Choice
|
||||
`matrix.alexanderwhitestone.com`
|
||||
|
||||
### Alternatives Considered
|
||||
- `fleet.alexanderwhitestone.com`
|
||||
- `chat.alexanderwhitestone.com`
|
||||
- `conduit.alexanderwhitestone.com`
|
||||
|
||||
### Analysis
|
||||
|
||||
| Subdomain | Clarity | Federation Friendly | Notes |
|
||||
|-----------|---------|---------------------|-------|
|
||||
| `matrix.*` | ✅ Industry standard | ✅ Easy to remember | Best for `.well-known/matrix/server` delegation |
|
||||
| `fleet.*` | ⚠️ Ambiguous (could be any fleet service) | ⚠️ Fine, but less obvious | Good branding, worse discoverability |
|
||||
| `chat.*` | ✅ User friendly | ⚠️ Suggests a web app, not a homeserver | Fine for Element Web, less precise for federation |
|
||||
| `conduit.*` | ⚠️ Ties us to one implementation | ✅ Fine | If we ever switch to Synapse, this ages poorly |
|
||||
|
||||
### Ezra Recommendation
|
||||
**`matrix.alexanderwhitestone.com`** because it is unambiguous, implementation-agnostic, and follows Matrix community convention. The server name can still be `alexanderwhitestone.com` (for short Matrix IDs like `@ezra:alexanderwhitestone.com`) while the actual homeserver listens on `matrix.alexanderwhitestone.com:8448` or is delegated via `.well-known`.
|
||||
|
||||
---
|
||||
|
||||
## Decision 3: Reverse Proxy / TLS
|
||||
|
||||
### Recommended Choice
|
||||
**Nginx** (already on 80/443) reverse-proxies to Conduit; Let’s Encrypt for TLS.
|
||||
|
||||
### Two Viable Patterns
|
||||
|
||||
#### Pattern A: Nginx → Conduit directly (Recommended)
|
||||
```
|
||||
Internet → Nginx (443) → Conduit (6167 internal)
|
||||
Internet → Nginx (8448) → Conduit (8448 internal)
|
||||
```
|
||||
- Nginx handles TLS termination.
|
||||
- Conduit runs plain HTTP on an internal port.
|
||||
- Federation port 8448 is exposed through Nginx stream or server block.
|
||||
|
||||
#### Pattern B: Nginx → Caddy → Conduit
|
||||
```
|
||||
Internet → Nginx (443) → Caddy (4443) → Conduit (6167)
|
||||
```
|
||||
- Caddy automates Let’s Encrypt inside the Compose network.
|
||||
- Nginx remains the edge listener.
|
||||
- More moving parts, but Caddy’s on-demand TLS is convenient.
|
||||
|
||||
### Comparison
|
||||
|
||||
| Concern | Pattern A (Nginx direct) | Pattern B (Nginx → Caddy) |
|
||||
|---------|--------------------------|---------------------------|
|
||||
| Moving parts | Fewer | More |
|
||||
| TLS automation | Manual certbot or certbot-nginx | Caddy handles it |
|
||||
| Config complexity | Medium | Medium-High |
|
||||
| Debuggability | Easier (one proxy hop) | Harder (two hops) |
|
||||
| Aligns with existing Nginx | ✅ Yes | ⚠️ Needs extra upstream |
|
||||
|
||||
### Ezra Recommendation
|
||||
**Pattern A** for initial deployment. Nginx is already the edge proxy on Hermes VPS. Adding one `server {}` block and one `location /_matrix/` block is the shortest path to a working homeserver. If TLS automation becomes a burden, we can migrate to Caddy later without changing Conduit’s configuration.
|
||||
|
||||
---
|
||||
|
||||
## Pre-Deployment Checklist (Post-#187)
|
||||
|
||||
Once the decisions above are ratified, the exact execution sequence is:
|
||||
|
||||
1. **Install Docker Compose** on Hermes VPS (if not already present).
|
||||
2. **Create DNS A record** for `matrix.alexanderwhitestone.com` → Hermes VPS public IP.
|
||||
3. **Obtain TLS certificate** for `matrix.alexanderwhitestone.com` (certbot or manual).
|
||||
4. **Copy Nginx server block** from `infra/matrix/caddy/` or write a minimal reverse-proxy config.
|
||||
5. **Run `./host-readiness-check.sh`** and confirm all checks pass.
|
||||
6. **Run `./deploy-matrix.sh`** and wait for Conduit to come online.
|
||||
7. **Run `python3 scripts/bootstrap-fleet-rooms.py --create-all`** to initialize rooms.
|
||||
8. **Run `./scripts/verify-hermes-integration.sh`** to prove E2EE messaging works.
|
||||
9. **Follow `docs/matrix-fleet-comms/CUTOVER_PLAN.md`** for the Telegram → Matrix transition.
|
||||
|
||||
---
|
||||
|
||||
## Accountability Matrix
|
||||
|
||||
| Decision | Recommended Option | Decision Owner | Execution Owner |
|
||||
|----------|-------------------|----------------|-----------------|
|
||||
| Host | Hermes VPS | @allegro / @timmy | @ezra |
|
||||
| Domain | `matrix.alexanderwhitestone.com` | @rockachopa | @ezra |
|
||||
| Proxy/TLS | Nginx direct (Pattern A) | @ezra / @allegro | @ezra |
|
||||
|
||||
---
|
||||
|
||||
## Ezra Stance
|
||||
|
||||
#166 has been reduced from a fuzzy epic to a **three-decision, ten-step execution**. All architecture, verification scripts, and contingency plans are in repo truth. The only missing ingredient is a yes/no on the three decisions above.
|
||||
|
||||
— Ezra, Archivist
|
||||
195
docs/matrix-fleet-comms/DEPLOYMENT_RUNBOOK.md
Normal file
195
docs/matrix-fleet-comms/DEPLOYMENT_RUNBOOK.md
Normal file
@@ -0,0 +1,195 @@
|
||||
# Matrix/Conduit Deployment Runbook
|
||||
# Issue #166 — Human-to-Fleet Encrypted Communication
|
||||
# Created: Ezra, Burn Mode | 2026-04-05
|
||||
|
||||
## Pre-Flight Checklist
|
||||
|
||||
Before running this playbook, ensure:
|
||||
- [ ] Host provisioned with ports 80/443/8448 open
|
||||
- [ ] Domain `matrix.timmytime.net` delegated to host IP
|
||||
- [ ] Docker + Docker Compose installed
|
||||
- [ ] `infra/matrix/` scaffold cloned to host
|
||||
|
||||
## Quick Start (One Command)
|
||||
|
||||
```bash
|
||||
cd infra/matrix && ./deploy.sh --host $(curl -s ifconfig.me) --domain matrix.timmytime.net
|
||||
```
|
||||
|
||||
## Manual Deployment Steps
|
||||
|
||||
### 1. Host Preparation
|
||||
|
||||
```bash
|
||||
# Update system
|
||||
sudo apt update && sudo apt upgrade -y
|
||||
|
||||
# Install Docker
|
||||
curl -fsSL https://get.docker.com | sh
|
||||
sudo usermod -aG docker $USER
|
||||
newgrp docker
|
||||
|
||||
# Install Docker Compose
|
||||
sudo curl -L "https://github.com/docker/compose/releases/download/v2.24.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
|
||||
sudo chmod +x /usr/local/bin/docker-compose
|
||||
```
|
||||
|
||||
### 2. Domain Configuration
|
||||
|
||||
Ensure DNS A record:
|
||||
```
|
||||
matrix.timmytime.net → <HOST_IP>
|
||||
```
|
||||
|
||||
### 3. Scaffold Deployment
|
||||
|
||||
```bash
|
||||
git clone http://143.198.27.163:3000/Timmy_Foundation/timmy-config.git
|
||||
cd timmy-config/infra/matrix
|
||||
```
|
||||
|
||||
### 4. Environment Configuration
|
||||
|
||||
```bash
|
||||
# Copy and edit environment
|
||||
cp .env.template .env
|
||||
nano .env
|
||||
|
||||
# Required values:
|
||||
# DOMAIN=matrix.timmytime.net
|
||||
# POSTGRES_PASSWORD=<generate_strong_password>
|
||||
# CONDUIT_MAX_REQUEST_SIZE=20000000
|
||||
```
|
||||
|
||||
### 5. Launch Services
|
||||
|
||||
```bash
|
||||
# Start Conduit + Element Web
|
||||
docker-compose up -d
|
||||
|
||||
# Verify health
|
||||
docker-compose ps
|
||||
docker-compose logs -f conduit
|
||||
```
|
||||
|
||||
### 6. Federation Test
|
||||
|
||||
```bash
|
||||
# Test .well-known delegation
|
||||
curl https://matrix.timmytime.net/.well-known/matrix/server
|
||||
curl https://matrix.timmytime.net/.well-known/matrix/client
|
||||
|
||||
# Test federation API
|
||||
curl https://matrix.timmytime.net:8448/_matrix/key/v2/server
|
||||
```
|
||||
|
||||
## Post-Deployment: Operator Onboarding
|
||||
|
||||
### Create Admin Account
|
||||
|
||||
```bash
|
||||
# Via Conduit admin API (first user = admin automatically)
|
||||
curl -X POST "https://matrix.timmytime.net/_matrix/client/r0/register" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"username": "alexander",
|
||||
"password": "<secure_password>",
|
||||
"auth": {"type": "m.login.dummy"}
|
||||
}'
|
||||
```
|
||||
|
||||
### Fleet Room Bootstrap
|
||||
|
||||
```bash
|
||||
# Create rooms via API (using admin token)
|
||||
export TOKEN=$(cat ~/.matrix_admin_token)
|
||||
|
||||
# Operators room
|
||||
curl -X POST "https://matrix.timmytime.net/_matrix/client/r0/createRoom" \
|
||||
-H "Authorization: Bearer $TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "Operators",
|
||||
"topic": "Human-to-fleet command surface",
|
||||
"preset": "private_chat",
|
||||
"encryption": true
|
||||
}'
|
||||
|
||||
# Fleet General room
|
||||
curl -X POST "https://matrix.timmytime.net/_matrix/client/r0/createRoom" \
|
||||
-H "Authorization: Bearer $TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "Fleet General",
|
||||
"topic": "All wizard houses — general coordination",
|
||||
"preset": "public_chat",
|
||||
"encryption": true
|
||||
}'
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Port 8448 Blocked
|
||||
|
||||
```bash
|
||||
# Verify federation port
|
||||
nc -zv matrix.timmytime.net 8448
|
||||
|
||||
# Check firewall
|
||||
sudo ufw status
|
||||
sudo ufw allow 8448/tcp
|
||||
```
|
||||
|
||||
### SSL Certificate Issues
|
||||
|
||||
```bash
|
||||
# Force Caddy certificate refresh
|
||||
docker-compose exec caddy rm -rf /data/caddy/certificates
|
||||
docker-compose restart caddy
|
||||
```
|
||||
|
||||
### Conduit Database Migration
|
||||
|
||||
```bash
|
||||
# Backup before migration
|
||||
docker-compose exec conduit sqlite3 /var/lib/matrix-conduit/conduit.db ".backup /backup/conduit-$(date +%Y%m%d).db"
|
||||
```
|
||||
|
||||
## Telegram → Matrix Cutover Plan
|
||||
|
||||
### Phase 0: Parallel (Week 1-2)
|
||||
- Matrix rooms operational
|
||||
- Telegram still primary
|
||||
- Fleet agents join both
|
||||
|
||||
### Phase 1: Operator Verification (Week 3)
|
||||
- Alexander confirms Matrix reliability
|
||||
- Critical alerts dual-posted
|
||||
|
||||
### Phase 2: Fleet Gateway Migration (Week 4)
|
||||
- Hermes gateway adds Matrix platform
|
||||
- Telegram becomes fallback
|
||||
|
||||
### Phase 3: Telegram Deprecation (Week 6-8)
|
||||
- 30-day overlap period
|
||||
- Final cutover announced
|
||||
- Telegram bots archived
|
||||
|
||||
## Verification Commands
|
||||
|
||||
```bash
|
||||
# Health check
|
||||
curl -s https://matrix.timmytime.net/_matrix/client/versions | jq .
|
||||
|
||||
# Federation check
|
||||
curl -s https://federationtester.matrix.org/api/report?server_name=matrix.timmytime.net | jq '.FederationOK'
|
||||
|
||||
# Element Web check
|
||||
curl -s -o /dev/null -w "%{http_code}" https://element.timmytime.net
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**Artifact**: `docs/matrix-fleet-comms/DEPLOYMENT_RUNBOOK.md`
|
||||
**Issue**: #166
|
||||
**Author**: Ezra | Burn Mode | 2026-04-05
|
||||
243
docs/matrix-fleet-comms/EXECUTION_ARCHITECTURE_KT.md
Normal file
243
docs/matrix-fleet-comms/EXECUTION_ARCHITECTURE_KT.md
Normal file
@@ -0,0 +1,243 @@
|
||||
# Execution Architecture KT — Matrix/Conduit Human-to-Fleet Comms
|
||||
|
||||
**Issue**: [#166](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/166)
|
||||
**Blocker**: [#187](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/187) — Host/domain/proxy decisions
|
||||
**Scaffold**: [#183](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/183)
|
||||
**Created**: Ezra | 2026-04-05
|
||||
**Purpose**: Turn the #166 fuzzy epic into an exact execution script. Once #187 closes, follow this KT verbatim.
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This document is the **knowledge transfer** from architecture (#183) to execution (#166). It assumes the decision framework in `docs/DECISION_FRAMEWORK_187.md` has been accepted (recommended: **Option A — Hermes VPS + Caddy + matrix.timmytime.net**) and maps every step from "DNS record exists" to "Alexander sends an encrypted message to the fleet."
|
||||
|
||||
---
|
||||
|
||||
## Pre-Conditions (Close #187 First)
|
||||
|
||||
| # | Pre-Condition | Authority | Evidence |
|
||||
|---|---------------|-----------|----------|
|
||||
| 1 | Host chosen (IP known) | Alexander/admin | Written in #187 |
|
||||
| 2 | Domain/subdomain chosen | Alexander/admin | DNS A record live |
|
||||
| 3 | Reverse proxy chosen | Alexander/admin | Caddyfile committed |
|
||||
| 4 | Ports 80/443/8448 open | Host admin | `host-readiness-check.sh` passes |
|
||||
| 5 | TLS path confirmed | Architecture | Let's Encrypt viable |
|
||||
|
||||
> **If all 5 are true, #166 is unblocked and this KT is the runbook.**
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Host Prep (30 minutes)
|
||||
|
||||
### 1.1 Clone Repo on Target Host
|
||||
```bash
|
||||
ssh root@<HOST_IP>
|
||||
git clone https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git /opt/timmy-config
|
||||
cd /opt/timmy-config/infra/matrix
|
||||
```
|
||||
|
||||
### 1.2 Verify Host Readiness
|
||||
```bash
|
||||
./host-readiness-check.sh
|
||||
```
|
||||
Expected: all checks green (Docker, ports, disk, RAM).
|
||||
|
||||
### 1.3 Configure Environment
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# Edit .env:
|
||||
# CONDUIT_SERVER_NAME=matrix.timmytime.net
|
||||
# CONDUIT_ALLOW_REGISTRATION=true # ONLY for bootstrap
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Conduit Deployment (15 minutes)
|
||||
|
||||
### 2.1 One-Command Deploy
|
||||
```bash
|
||||
./deploy-matrix.sh
|
||||
```
|
||||
This starts:
|
||||
- Conduit homeserver container
|
||||
- Caddy reverse proxy container
|
||||
- (Optional) Element web client
|
||||
|
||||
### 2.2 Verify Health
|
||||
```bash
|
||||
curl -s https://matrix.timmytime.net/_matrix/client/versions | jq .
|
||||
```
|
||||
Expected: JSON with `versions` array.
|
||||
|
||||
### 2.3 Verify Federation
|
||||
```bash
|
||||
curl -s https://matrix.timmytime.net/.well-known/matrix/server
|
||||
```
|
||||
Expected: `{"m.server": "matrix.timmytime.net:443"}`
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Fleet Bootstrap — Accounts & Rooms (30 minutes)
|
||||
|
||||
### 3.1 Create Admin Account
|
||||
**Enable registration temporarily** in `.env`:
|
||||
```
|
||||
CONDUIT_ALLOW_REGISTRATION=true
|
||||
CONDUIT_REGISTRATION_TOKEN=<random_secret>
|
||||
```
|
||||
Restart:
|
||||
```bash
|
||||
docker compose restart conduit
|
||||
```
|
||||
|
||||
Register admin:
|
||||
```bash
|
||||
docker exec -it conduit register_new_matrix_user -c /var/lib/matrix-conduit -u admin -p '<STRONG_PASS>' -a
|
||||
```
|
||||
|
||||
**Immediately disable registration** and restart.
|
||||
|
||||
### 3.2 Create Fleet Accounts
|
||||
| Account | Purpose | Created By |
|
||||
|---------|---------|------------|
|
||||
| `@admin:matrix.timmytime.net` | Server administration | deploy script |
|
||||
| `@alexander:matrix.timmytime.net` | Human operator | admin |
|
||||
| `@timmy:matrix.timmytime.net` | Coordinator bot | admin |
|
||||
| `@ezra:matrix.timmytime.net` | Archivist bot | admin |
|
||||
| `@allegro:matrix.timmytime.net` | Dispatch bot | admin |
|
||||
| `@bezalel:matrix.timmytime.net` | Dev bot | admin |
|
||||
| `@gemini:matrix.timmytime.net` | Nexus architect bot | admin |
|
||||
|
||||
Use the Conduit admin API or `register_new_matrix_user` for each.
|
||||
|
||||
### 3.3 Create Fleet Rooms
|
||||
| Room Alias | Purpose | Encryption |
|
||||
|------------|---------|------------|
|
||||
| `#fleet-ops:matrix.timmytime.net` | Operator commands | ✅ E2E |
|
||||
| `#fleet-intel:matrix.timmytime.net` | Deep Dive briefings | ✅ E2E |
|
||||
| `#fleet-social:matrix.timmytime.net` | General chat | ✅ E2E |
|
||||
| `#fleet-alerts:matrix.timmytime.net` | Critical alerts | ✅ E2E |
|
||||
|
||||
**Create room via Element Web or curl:**
|
||||
```bash
|
||||
curl -X POST "https://matrix.timmytime.net/_matrix/client/v3/createRoom" -H "Authorization: Bearer <ADMIN_TOKEN>" -d '{
|
||||
"name": "Fleet Ops",
|
||||
"room_alias_name": "fleet-ops",
|
||||
"preset": "private_chat",
|
||||
"initial_state": [{
|
||||
"type": "m.room.encryption",
|
||||
"content": {"algorithm": "m.megolm.v1.aes-sha2"}
|
||||
}]
|
||||
}'
|
||||
```
|
||||
|
||||
### 3.4 Invite Fleet Members
|
||||
Invite each bot/user to the appropriate rooms. For `#fleet-ops`, restrict to `@alexander`, `@timmy`, `@ezra`, `@allegro`.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Wizard Onboarding Procedure (30 minutes)
|
||||
|
||||
Each wizard house needs:
|
||||
1. **Matrix credentials** (username + password + recovery key)
|
||||
2. **Client recommendation** — Element Desktop or Fluffychat
|
||||
3. **Room memberships** — invite to relevant fleet rooms
|
||||
4. **Encryption verification** — verify keys with Alexander
|
||||
|
||||
### Onboarding Checklist per Wizard
|
||||
- [ ] Account created and credentials stored in vault
|
||||
- [ ] Client installed and signed in
|
||||
- [ ] Joined `#fleet-ops` and `#fleet-intel`
|
||||
- [ ] E2E verification completed with `@alexander`
|
||||
- [ ] Test message sent and received
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Telegram → Matrix Cutover Architecture
|
||||
|
||||
### 5.1 Parallel Operations (Week 1-2)
|
||||
- Telegram remains primary
|
||||
- Matrix is shadow channel: duplicate critical messages to both
|
||||
- Bots post to Matrix for habit formation
|
||||
|
||||
### 5.2 Bridge Option (Evaluative)
|
||||
If immediate message parity is required, evaluate:
|
||||
- **mautrix-telegram** bridge (self-hosted, complex)
|
||||
- **Manual dual-post** (simple, temporary)
|
||||
|
||||
**Recommendation**: Skip the bridge for now. Dual-post via bot logic is lower risk.
|
||||
|
||||
### 5.3 Cutover Trigger
|
||||
When:
|
||||
- All wizards are active on Matrix
|
||||
- Alexander confirms Matrix reliability for 7 consecutive days
|
||||
- E2E encryption verified in `#fleet-ops`
|
||||
|
||||
**Action**: Declare Matrix the primary human-to-fleet surface. Telegram becomes fallback only.
|
||||
|
||||
---
|
||||
|
||||
## Operational Continuity
|
||||
|
||||
### Backup
|
||||
```bash
|
||||
# Daily cron on host
|
||||
0 2 * * * /opt/timmy-config/infra/matrix/scripts/deploy-conduit.sh backup
|
||||
```
|
||||
|
||||
### Monitoring
|
||||
```bash
|
||||
# Health check every 5 minutes
|
||||
*/5 * * * * /opt/timmy-config/infra/matrix/scripts/deploy-conduit.sh status || alert
|
||||
```
|
||||
|
||||
### Upgrade Path
|
||||
1. Pull latest `timmy-config`
|
||||
2. Run `./host-readiness-check.sh`
|
||||
3. `docker compose pull && docker compose up -d`
|
||||
|
||||
---
|
||||
|
||||
## Acceptance Criteria Mapping
|
||||
|
||||
| #166 Criterion | How This KT Satisfies It | Phase |
|
||||
|----------------|--------------------------|-------|
|
||||
| Deploy Conduit homeserver | `deploy-matrix.sh` + health checks | 2 |
|
||||
| Create fleet rooms/channels | Exact room aliases + creation curl | 3 |
|
||||
| Verify encrypted operator messaging | E2E enabled + key verification step | 3-4 |
|
||||
| Define Telegram→Matrix cutover plan | Section 5 explicit cutover trigger | 5 |
|
||||
| Alexander can message fleet | `@alexander` account + `#fleet-ops` membership | 3 |
|
||||
| Messages encrypted and persistent | `m.room.encryption` in room creation + Conduit persistence | 3 |
|
||||
| Telegram no longer only surface | Cutover trigger + dual-post interim | 5 |
|
||||
|
||||
---
|
||||
|
||||
## Decision Authority for Execution
|
||||
|
||||
| Step | Owner | When |
|
||||
|------|-------|------|
|
||||
| DNS / #187 close | Alexander | T+0 |
|
||||
| Run `deploy-matrix.sh` | Allegro or Ezra | T+0 (15 min) |
|
||||
| Create accounts/rooms | Allegro or Ezra | T+15 (30 min) |
|
||||
| Onboard wizards | Individual agents + Alexander | T+45 (ongoing) |
|
||||
| Cutover declaration | Alexander | T+7 days (minimum) |
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- Scaffold: [`infra/matrix/`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/infra/matrix)
|
||||
- ADRs: [`infra/matrix/docs/adr/`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/infra/matrix/docs/adr)
|
||||
- Decision Framework: [`docs/DECISION_FRAMEWORK_187.md`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/docs/DECISION_FRAMEWORK_187.md)
|
||||
- Operational Runbook: [`infra/matrix/docs/RUNBOOK.md`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/infra/matrix/docs/RUNBOOK.md)
|
||||
- **Room Bootstrap Automation**: [`infra/matrix/scripts/bootstrap-fleet-rooms.py`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/infra/matrix/scripts/bootstrap-fleet-rooms.py)
|
||||
- **Telegram Cutover Plan**: [`docs/matrix-fleet-comms/CUTOVER_PLAN.md`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/docs/matrix-fleet-comms/CUTOVER_PLAN.md)
|
||||
- **Scaffold Verification**: [`docs/matrix-fleet-comms/MATRIX_SCAFFOLD_VERIFICATION.md`](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/src/branch/main/docs/matrix-fleet-comms/MATRIX_SCAFFOLD_VERIFICATION.md)
|
||||
|
||||
---
|
||||
|
||||
**Ezra Sign-off**: This KT removes all ambiguity from #166. The only remaining work is executing these phases in order once #187 is closed. Room creation and Telegram cutover are now automated.
|
||||
|
||||
— Ezra, Archivist
|
||||
2026-04-05
|
||||
363
docs/matrix-fleet-comms/HERMES_MATRIX_CLIENT_SPEC.md
Normal file
363
docs/matrix-fleet-comms/HERMES_MATRIX_CLIENT_SPEC.md
Normal file
@@ -0,0 +1,363 @@
|
||||
# Hermes Matrix Client Integration Specification
|
||||
|
||||
> **Issue**: [#166](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/166) — Stand up Matrix/Conduit
|
||||
> **Created**: Ezra | 2026-04-05 | Burn mode
|
||||
> **Purpose**: Define how Hermes wizard houses connect to, listen on, and respond within the sovereign Matrix fleet. This turns the #183 server scaffold into an end-to-end communications architecture.
|
||||
|
||||
---
|
||||
|
||||
## 1. Scope
|
||||
|
||||
This document specifies:
|
||||
- The client library and runtime pattern for Hermes-to-Matrix integration
|
||||
- Bot identity model (one account per wizard house vs. shared fleet bot)
|
||||
- Message format, encryption requirements, and room membership rules
|
||||
- Minimal working code scaffold for connection, listening, and reply
|
||||
- Error handling, reconnection, and security hardening
|
||||
|
||||
**Out of scope**: Server deployment (see `infra/matrix/`), room creation (see `scripts/bootstrap-fleet-rooms.py`), Telegram cutover (see `CUTOVER_PLAN.md`).
|
||||
|
||||
---
|
||||
|
||||
## 2. Library Choice: `matrix-nio`
|
||||
|
||||
**Selected library**: [`matrix-nio`](https://matrix-nio.readthedocs.io/)
|
||||
|
||||
**Why `matrix-nio`:**
|
||||
- Native async/await (fits Hermes agent loop)
|
||||
- Full end-to-end encryption (E2EE) support via `AsyncClient`
|
||||
- Small dependency footprint compared to Synapse client SDK
|
||||
- Battle-tested in production bots (e.g., maubot, heisenbridge)
|
||||
|
||||
**Installation**:
|
||||
```bash
|
||||
pip install matrix-nio[e2e]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Bot Identity Model
|
||||
|
||||
### 3.1 Recommendation: One Bot Per Wizard House
|
||||
|
||||
Each wizard house (Ezra, Allegro, Gemini, Bezalel, etc.) maintains its own Matrix user account. This mirrors the existing Telegram identity model and preserves sovereignty.
|
||||
|
||||
**Pattern**:
|
||||
- `@ezra:matrix.timmytime.net`
|
||||
- `@allegro:matrix.timmytime.net`
|
||||
- `@gemini:matrix.timmytime.net`
|
||||
|
||||
### 3.2 Alternative: Shared Fleet Bot
|
||||
|
||||
A single `@fleet:matrix.timmytime.net` bot proxies messages for all agents. **Not recommended** — creates a single point of failure and complicates attribution.
|
||||
|
||||
### 3.3 Account Provisioning
|
||||
|
||||
Each account is created via the Conduit admin API during room bootstrap (see `bootstrap-fleet-rooms.py`). Credentials are stored in the wizard house's local `.env` (`MATRIX_USER`, `MATRIX_PASSWORD`, `MATRIX_HOMESERVER`).
|
||||
|
||||
---
|
||||
|
||||
## 4. Minimal Working Example
|
||||
|
||||
The following scaffold demonstrates:
|
||||
1. Logging in with password
|
||||
2. Joining the fleet operator room
|
||||
3. Listening for encrypted text messages
|
||||
4. Replying with a simple acknowledgment
|
||||
5. Graceful logout on SIGINT
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
"""hermes_matrix_client.py — Minimal Hermes Matrix Client Scaffold"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import signal
|
||||
from pathlib import Path
|
||||
|
||||
from nio import (
|
||||
AsyncClient,
|
||||
LoginResponse,
|
||||
SyncResponse,
|
||||
RoomMessageText,
|
||||
InviteEvent,
|
||||
MatrixRoom,
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Configuration (read from environment or local .env)
|
||||
# ------------------------------------------------------------------
|
||||
HOMESERVER = os.getenv("MATRIX_HOMESERVER", "https://matrix.timmytime.net")
|
||||
USER_ID = os.getenv("MATRIX_USER", "@ezra:matrix.timmytime.net")
|
||||
PASSWORD = os.getenv("MATRIX_PASSWORD", "")
|
||||
DEVICE_ID = os.getenv("MATRIX_DEVICE_ID", "HERMES_001")
|
||||
OPERATOR_ROOM_ALIAS = "#operator-room:matrix.timmytime.net"
|
||||
|
||||
# Persistent store for encryption state
|
||||
cache_dir = Path.home() / ".cache" / "hermes-matrix"
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
store_path = cache_dir / f"{USER_ID.split(':')[0].replace('@', '')}_store"
|
||||
|
||||
|
||||
class HermesMatrixClient:
|
||||
def __init__(self):
|
||||
self.client = AsyncClient(
|
||||
homeserver=HOMESERVER,
|
||||
user=USER_ID,
|
||||
device_id=DEVICE_ID,
|
||||
store_path=str(store_path),
|
||||
)
|
||||
self.shutdown_event = asyncio.Event()
|
||||
|
||||
async def login(self):
|
||||
resp = await self.client.login(PASSWORD)
|
||||
if isinstance(resp, LoginResponse):
|
||||
print(f"✅ Logged in as {resp.user_id} (device: {resp.device_id})")
|
||||
else:
|
||||
print(f"❌ Login failed: {resp}")
|
||||
raise RuntimeError("Matrix login failed")
|
||||
|
||||
async def join_operator_room(self):
|
||||
"""Join the canonical operator room by alias."""
|
||||
res = await self.client.join_room(OPERATOR_ROOM_ALIAS)
|
||||
if hasattr(res, "room_id"):
|
||||
print(f"✅ Joined operator room: {res.room_id}")
|
||||
return res.room_id
|
||||
else:
|
||||
print(f"⚠️ Could not join operator room: {res}")
|
||||
return None
|
||||
|
||||
async def on_message(self, room: MatrixRoom, event: RoomMessageText):
|
||||
"""Handle incoming text messages."""
|
||||
if event.sender == self.client.user_id:
|
||||
return # Ignore echo of our own messages
|
||||
|
||||
print(f"📩 {room.display_name} | {event.sender}: {event.body}")
|
||||
|
||||
# Simple command parsing
|
||||
if event.body.startswith("!ping"):
|
||||
await self.client.room_send(
|
||||
room_id=room.room_id,
|
||||
message_type="m.room.message",
|
||||
content={
|
||||
"msgtype": "m.text",
|
||||
"body": f"Pong from {USER_ID}!",
|
||||
},
|
||||
)
|
||||
elif event.body.startswith("!sitrep"):
|
||||
await self.client.room_send(
|
||||
room_id=room.room_id,
|
||||
message_type="m.room.message",
|
||||
content={
|
||||
"msgtype": "m.text",
|
||||
"body": "🔥 Burn mode active. All systems nominal.",
|
||||
},
|
||||
)
|
||||
|
||||
async def on_invite(self, room: MatrixRoom, event: InviteEvent):
|
||||
"""Auto-join rooms when invited."""
|
||||
print(f"📨 Invite to {room.room_id} from {event.sender}")
|
||||
await self.client.join(room.room_id)
|
||||
|
||||
async def sync_loop(self):
|
||||
"""Long-polling sync loop with automatic retry."""
|
||||
self.client.add_event_callback(self.on_message, RoomMessageText)
|
||||
self.client.add_event_callback(self.on_invite, InviteEvent)
|
||||
|
||||
while not self.shutdown_event.is_set():
|
||||
try:
|
||||
sync_resp = await self.client.sync(timeout=30000)
|
||||
if isinstance(sync_resp, SyncResponse):
|
||||
pass # Callbacks handled by nio
|
||||
except Exception as exc:
|
||||
print(f"⚠️ Sync error: {exc}. Retrying in 5s...")
|
||||
await asyncio.sleep(5)
|
||||
|
||||
async def run(self):
|
||||
await self.login()
|
||||
await self.join_operator_room()
|
||||
await self.sync_loop()
|
||||
|
||||
async def close(self):
|
||||
await self.client.close()
|
||||
print("👋 Matrix client closed.")
|
||||
|
||||
|
||||
async def main():
|
||||
bot = HermesMatrixClient()
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||
loop.add_signal_handler(sig, bot.shutdown_event.set)
|
||||
|
||||
try:
|
||||
await bot.run()
|
||||
finally:
|
||||
await bot.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Message Format & Protocol
|
||||
|
||||
### 5.1 Plain-Text Commands
|
||||
|
||||
For human-to-fleet interaction, messages use a lightweight command prefix:
|
||||
|
||||
| Command | Target | Purpose |
|
||||
|---------|--------|---------|
|
||||
| `!ping` | Any wizard | Liveness check |
|
||||
| `!sitrep` | Any wizard | Request status report |
|
||||
| `!help` | Any wizard | List available commands |
|
||||
| `!exec <task>` | Specific wizard | Route a task request (future) |
|
||||
| `!burn <issue#>` | Any wizard | Priority task escalation |
|
||||
|
||||
### 5.2 Structured JSON Payloads (Agent-to-Agent)
|
||||
|
||||
For machine-to-machine coordination, agents may send `m.text` messages with a JSON block inside triple backticks:
|
||||
|
||||
```json
|
||||
{
|
||||
"hermes_msg_type": "task_request",
|
||||
"from": "@ezra:matrix.timmytime.net",
|
||||
"to": "@gemini:matrix.timmytime.net",
|
||||
"task_id": "the-nexus#830",
|
||||
"action": "evaluate_tts_output",
|
||||
"deadline": "2026-04-06T06:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. End-to-End Encryption (E2EE)
|
||||
|
||||
### 6.1 Requirement
|
||||
|
||||
All fleet operator rooms **must** have encryption enabled (`m.room.encryption` event). The `matrix-nio` client automatically handles key sharing and device verification when `store_path` is provided.
|
||||
|
||||
### 6.2 Device Verification Strategy
|
||||
|
||||
**Recommended**: "Trust on First Use" (TOFU) within the fleet.
|
||||
|
||||
```python
|
||||
async def trust_fleet_devices(self):
|
||||
"""Auto-verify all devices of known fleet users."""
|
||||
fleet_users = ["@ezra:matrix.timmytime.net", "@allegro:matrix.timmytime.net"]
|
||||
for user_id in fleet_users:
|
||||
devices = await self.client.devices(user_id)
|
||||
for device_id in devices.get(user_id, {}):
|
||||
await self.client.verify_device(user_id, device_id)
|
||||
```
|
||||
|
||||
**Caution**: Do not auto-verify external users (e.g., Alexander's personal Element client). Those should be verified manually via emoji comparison.
|
||||
|
||||
---
|
||||
|
||||
## 7. Fleet Room Membership
|
||||
|
||||
### 7.1 Canonical Rooms
|
||||
|
||||
| Room Alias | Purpose | Members |
|
||||
|------------|---------|---------|
|
||||
| `#operator-room:matrix.timmytime.net` | Human-to-fleet command surface | Alexander + all wizards |
|
||||
| `#wizard-hall:matrix.timmytime.net` | Agent-to-agent coordination | All wizards only |
|
||||
| `#burn-pit:matrix.timmytime.net` | High-priority escalations | On-call wizard + Alexander |
|
||||
|
||||
### 7.2 Auto-Join Policy
|
||||
|
||||
Every Hermes client **must** auto-join invites to `#operator-room` and `#wizard-hall`. Burns to `#burn-pit` are opt-in based on on-call schedule.
|
||||
|
||||
---
|
||||
|
||||
## 8. Error Handling & Reconnection
|
||||
|
||||
### 8.1 Network Partitions
|
||||
|
||||
If sync fails with a 5xx or connection error, the client must:
|
||||
1. Log the error
|
||||
2. Wait 5s (with exponential backoff up to 60s)
|
||||
3. Retry sync indefinitely
|
||||
|
||||
### 8.2 Token Expiration
|
||||
|
||||
Conduit access tokens do not expire by default. If a `M_UNKNOWN_TOKEN` occurs, the client must re-login using `MATRIX_PASSWORD` and update the stored access token.
|
||||
|
||||
### 8.3 Fatal Errors
|
||||
|
||||
If login fails 3 times consecutively, the client should exit with a non-zero status and surface an alert to the operator room (if possible via a fallback mechanism).
|
||||
|
||||
---
|
||||
|
||||
## 9. Integration with Hermes Agent Loop
|
||||
|
||||
The Matrix client is **not** a replacement for the Hermes agent core. It is an additional I/O surface.
|
||||
|
||||
**Recommended integration pattern**:
|
||||
|
||||
```
|
||||
┌─────────────────┐
|
||||
│ Hermes Agent │
|
||||
│ (run_agent) │
|
||||
└────────┬────────┘
|
||||
│ tool calls, reasoning
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ Matrix Gateway │ ← new: wraps hermes_matrix_client.py
|
||||
│ (message I/O) │
|
||||
└────────┬────────┘
|
||||
│ Matrix HTTP APIs
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ Conduit Server │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
A `MatrixGateway` class (future work) would:
|
||||
1. Run the `matrix-nio` client in a background asyncio task
|
||||
2. Convert incoming Matrix commands into `AIAgent.chat()` calls
|
||||
3. Post the agent's text response back to the room
|
||||
4. Support the existing Hermes toolset (todo, memory, delegate) via the same agent loop
|
||||
|
||||
---
|
||||
|
||||
## 10. Security Hardening Checklist
|
||||
|
||||
Before any wizard house connects to the production Conduit server:
|
||||
|
||||
- [ ] `MATRIX_PASSWORD` is a 32+ character random string
|
||||
- [ ] The client `store_path` is on an encrypted volume (`~/.cache/hermes-matrix/`)
|
||||
- [ ] E2EE is enabled in the operator room
|
||||
- [ ] Only fleet devices are auto-verified
|
||||
- [ ] The client rejects invites from non-fleet homeservers
|
||||
- [ ] Logs do not include message bodies at `INFO` level
|
||||
- [ ] A separate device ID is used per wizard house deployment
|
||||
|
||||
---
|
||||
|
||||
## 11. Acceptance Criteria Mapping
|
||||
|
||||
Maps #166 acceptance criteria to this specification:
|
||||
|
||||
| #166 Criterion | Addressed By |
|
||||
|----------------|--------------|
|
||||
| Deploy Conduit homeserver | `infra/matrix/` (#183) |
|
||||
| Create fleet rooms/channels | `bootstrap-fleet-rooms.py` |
|
||||
| Verify encrypted operator-to-fleet messaging | Section 6 (E2EE) + MWE |
|
||||
| Alexander can message the fleet over Matrix | Sections 4 (MWE), 5 (commands), 7 (rooms) |
|
||||
| Telegram is no longer the only command surface | `CUTOVER_PLAN.md` + this spec |
|
||||
|
||||
---
|
||||
|
||||
## 12. Next Steps
|
||||
|
||||
1. **Gemini / Allegro**: Implement `MatrixGateway` class in `gateway/platforms/matrix.py` using this spec.
|
||||
2. **Bezalel / Ezra**: Test the MWE against the staging Conduit instance once #187 resolves.
|
||||
3. **Alexander**: Approve the command prefix vocabulary (`!ping`, `!sitrep`, `!burn`, etc.).
|
||||
|
||||
---
|
||||
|
||||
*This document is repo truth. If the Matrix client implementation diverges from this spec, update the spec first.*
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user