feat(matrix): scaffold validator + Hermes client spec

- Add validate-scaffold.py: automated acceptance proof for #183
- Add HERMES_MATRIX_CLIENT_SPEC.md: end-to-end agent integration spec for #166

Refs #183, #166
This commit is contained in:
Ezra
2026-04-05 19:02:41 +00:00
parent 1411fded99
commit 38e7aa0f50
2 changed files with 599 additions and 0 deletions

View File

@@ -0,0 +1,363 @@
# Hermes Matrix Client Integration Specification
> **Issue**: [#166](http://143.198.27.163:3000/Timmy_Foundation/timmy-config/issues/166) — Stand up Matrix/Conduit
> **Created**: Ezra | 2026-04-05 | Burn mode
> **Purpose**: Define how Hermes wizard houses connect to, listen on, and respond within the sovereign Matrix fleet. This turns the #183 server scaffold into an end-to-end communications architecture.
---
## 1. Scope
This document specifies:
- The client library and runtime pattern for Hermes-to-Matrix integration
- Bot identity model (one account per wizard house vs. shared fleet bot)
- Message format, encryption requirements, and room membership rules
- Minimal working code scaffold for connection, listening, and reply
- Error handling, reconnection, and security hardening
**Out of scope**: Server deployment (see `infra/matrix/`), room creation (see `scripts/bootstrap-fleet-rooms.py`), Telegram cutover (see `CUTOVER_PLAN.md`).
---
## 2. Library Choice: `matrix-nio`
**Selected library**: [`matrix-nio`](https://matrix-nio.readthedocs.io/)
**Why `matrix-nio`:**
- Native async/await (fits Hermes agent loop)
- Full end-to-end encryption (E2EE) support via `AsyncClient`
- Small dependency footprint compared to Synapse client SDK
- Battle-tested in production Matrix tooling (e.g., pantalaimon, matrix-commander)
**Installation**:
```bash
pip install matrix-nio[e2e]
```
---
## 3. Bot Identity Model
### 3.1 Recommendation: One Bot Per Wizard House
Each wizard house (Ezra, Allegro, Gemini, Bezalel, etc.) maintains its own Matrix user account. This mirrors the existing Telegram identity model and preserves sovereignty.
**Pattern**:
- `@ezra:matrix.timmytime.net`
- `@allegro:matrix.timmytime.net`
- `@gemini:matrix.timmytime.net`
### 3.2 Alternative: Shared Fleet Bot
A single `@fleet:matrix.timmytime.net` bot proxies messages for all agents. **Not recommended** — creates a single point of failure and complicates attribution.
### 3.3 Account Provisioning
Each account is created via the Conduit admin API during room bootstrap (see `bootstrap-fleet-rooms.py`). Credentials are stored in the wizard house's local `.env` (`MATRIX_USER`, `MATRIX_PASSWORD`, `MATRIX_HOMESERVER`).
---
## 4. Minimal Working Example
The following scaffold demonstrates:
1. Logging in with password
2. Joining the fleet operator room
3. Listening for encrypted text messages
4. Replying with a simple acknowledgment
5. Graceful logout on SIGINT
```python
#!/usr/bin/env python3
"""hermes_matrix_client.py — Minimal Hermes Matrix Client Scaffold"""
import asyncio
import os
import signal
from pathlib import Path
from nio import (
AsyncClient,
LoginResponse,
SyncResponse,
RoomMessageText,
InviteEvent,
MatrixRoom,
)
# ------------------------------------------------------------------
# Runtime settings. Every value except the room alias is taken from the
# process environment, falling back to the default shown when unset.
# ------------------------------------------------------------------
HOMESERVER = os.environ.get("MATRIX_HOMESERVER", "https://matrix.timmytime.net")
USER_ID = os.environ.get("MATRIX_USER", "@ezra:matrix.timmytime.net")
PASSWORD = os.environ.get("MATRIX_PASSWORD", "")
DEVICE_ID = os.environ.get("MATRIX_DEVICE_ID", "HERMES_001")
OPERATOR_ROOM_ALIAS = "#operator-room:matrix.timmytime.net"

# Directory holding the encryption store; created at import time if absent.
cache_dir = Path.home().joinpath(".cache", "hermes-matrix")
cache_dir.mkdir(parents=True, exist_ok=True)

# Store name is derived from the text before the first ':' in USER_ID with
# every '@' removed, e.g. "@ezra:example.org" -> "ezra_store".
store_path = cache_dir / (USER_ID.split(":")[0].replace("@", "") + "_store")
class HermesMatrixClient:
    """Minimal Matrix bot for a single Hermes wizard house.

    Wraps a ``nio.AsyncClient`` built from the module-level configuration
    (``HOMESERVER``, ``USER_ID``, ``DEVICE_ID``, ``store_path``) and provides
    login, operator-room join, message/invite callbacks and a sync loop that
    runs until ``shutdown_event`` is set.
    """

    # Delay (seconds) before retrying a failed sync. Starts at the minimum,
    # doubles after each consecutive failure and is capped at the maximum.
    SYNC_RETRY_MIN_DELAY = 5
    SYNC_RETRY_MAX_DELAY = 60

    def __init__(self):
        self.client = AsyncClient(
            homeserver=HOMESERVER,
            user=USER_ID,
            device_id=DEVICE_ID,
            store_path=str(store_path),
        )
        # Setting this event makes sync_loop() stop after the current sync.
        self.shutdown_event = asyncio.Event()

    async def login(self):
        """Log in with ``PASSWORD``.

        Raises:
            RuntimeError: if the homeserver does not answer with a
                ``LoginResponse``.
        """
        resp = await self.client.login(PASSWORD)
        if not isinstance(resp, LoginResponse):
            print(f"❌ Login failed: {resp}")
            raise RuntimeError("Matrix login failed")
        print(f"✅ Logged in as {resp.user_id} (device: {resp.device_id})")

    async def join_operator_room(self):
        """Join the canonical operator room by alias.

        Returns:
            The joined room's ID, or ``None`` when the join did not succeed.
        """
        # AsyncClient exposes ``join`` (there is no ``join_room`` method).
        res = await self.client.join(OPERATOR_ROOM_ALIAS)
        if hasattr(res, "room_id"):
            print(f"✅ Joined operator room: {res.room_id}")
            return res.room_id
        print(f"⚠️ Could not join operator room: {res}")
        return None

    async def _send_text(self, room_id: str, body: str):
        """Send a plain ``m.text`` message with *body* to *room_id*."""
        await self.client.room_send(
            room_id=room_id,
            message_type="m.room.message",
            content={"msgtype": "m.text", "body": body},
        )

    async def on_message(self, room: MatrixRoom, event: RoomMessageText):
        """Handle incoming text messages and answer ``!ping`` / ``!sitrep``."""
        if event.sender == self.client.user_id:
            return  # Ignore echo of our own messages
        print(f"📩 {room.display_name} | {event.sender}: {event.body}")
        # Simple command parsing
        if event.body.startswith("!ping"):
            await self._send_text(room.room_id, f"Pong from {USER_ID}!")
        elif event.body.startswith("!sitrep"):
            await self._send_text(
                room.room_id, "🔥 Burn mode active. All systems nominal."
            )

    async def on_invite(self, room: MatrixRoom, event: InviteEvent):
        """Auto-join rooms when invited by a user on our own homeserver.

        Invites whose sender lives on a different homeserver than ``USER_ID``
        are logged and ignored, so outside servers cannot pull the bot into
        arbitrary rooms.
        """
        # A Matrix user ID is "@localpart:server"; compare the server parts.
        inviter_server = event.sender.partition(":")[2]
        own_server = USER_ID.partition(":")[2]
        if inviter_server != own_server:
            print(f"🚫 Ignoring invite to {room.room_id} from {event.sender}")
            return
        print(f"📨 Invite to {room.room_id} from {event.sender}")
        await self.client.join(room.room_id)

    async def _wait_or_shutdown(self, seconds: float):
        """Sleep up to *seconds*, returning early if shutdown is requested."""
        try:
            await asyncio.wait_for(self.shutdown_event.wait(), timeout=seconds)
        except asyncio.TimeoutError:
            pass  # Normal case: the full delay elapsed without a shutdown.

    async def sync_loop(self):
        """Long-polling sync loop with exponential-backoff retry.

        Registers the message and invite callbacks, then syncs repeatedly
        until ``shutdown_event`` is set. A sync that raises, or that returns
        anything other than a ``SyncResponse``, counts as a failure and is
        retried after a growing delay.
        """
        self.client.add_event_callback(self.on_message, RoomMessageText)
        self.client.add_event_callback(self.on_invite, InviteEvent)
        delay = self.SYNC_RETRY_MIN_DELAY
        while not self.shutdown_event.is_set():
            try:
                sync_resp = await self.client.sync(timeout=30000)
            except Exception as exc:  # keep the bot alive on any sync failure
                failure = exc
            else:
                if isinstance(sync_resp, SyncResponse):
                    # Callbacks were already dispatched by nio during sync().
                    delay = self.SYNC_RETRY_MIN_DELAY
                    continue
                # Error responses are returned, not raised; without a pause
                # here the loop would hammer the server.
                failure = sync_resp
            print(f"⚠️ Sync error: {failure}. Retrying in {delay}s...")
            await self._wait_or_shutdown(delay)
            delay = min(delay * 2, self.SYNC_RETRY_MAX_DELAY)

    async def run(self):
        """Log in, join the operator room, then sync until shutdown."""
        await self.login()
        await self.join_operator_room()
        await self.sync_loop()

    async def close(self):
        """Close the underlying client connection."""
        await self.client.close()
        print("👋 Matrix client closed.")
async def main():
    """Run the bot until SIGINT/SIGTERM is received, then close the client."""
    bot = HermesMatrixClient()
    # We are inside a coroutine, so ask for the loop that is actually running
    # instead of the deprecated get_event_loop() lookup.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        # The handler only sets the event; sync_loop() notices it and returns.
        loop.add_signal_handler(sig, bot.shutdown_event.set)
    try:
        await bot.run()
    finally:
        # Always release the connection, including when run() raises.
        await bot.close()
if __name__ == "__main__":
asyncio.run(main())
```
---
## 5. Message Format & Protocol
### 5.1 Plain-Text Commands
For human-to-fleet interaction, messages use a lightweight command prefix:
| Command | Target | Purpose |
|---------|--------|---------|
| `!ping` | Any wizard | Liveness check |
| `!sitrep` | Any wizard | Request status report |
| `!help` | Any wizard | List available commands |
| `!exec <task>` | Specific wizard | Route a task request (future) |
| `!burn <issue#>` | Any wizard | Priority task escalation |
### 5.2 Structured JSON Payloads (Agent-to-Agent)
For machine-to-machine coordination, agents may send `m.text` messages with a JSON block inside triple backticks:
```json
{
"hermes_msg_type": "task_request",
"from": "@ezra:matrix.timmytime.net",
"to": "@gemini:matrix.timmytime.net",
"task_id": "the-nexus#830",
"action": "evaluate_tts_output",
"deadline": "2026-04-06T06:00:00Z"
}
```
---
## 6. End-to-End Encryption (E2EE)
### 6.1 Requirement
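All fleet operator rooms **must** have encryption enabled (`m.room.encryption` event). When `store_path` is provided, the `matrix-nio` client persists encryption state and handles key sharing automatically. Device verification is **not** automatic: devices must be explicitly marked as trusted, as described in Section 6.2.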
### 6.2 Device Verification Strategy
**Recommended**: "Trust on First Use" (TOFU) within the fleet.
```python
async def trust_fleet_devices(self):
    """Auto-verify all currently known devices of known fleet users.

    Reads devices from the client's local device store.
    NOTE(review): the store is presumably only populated after a sync and
    key query have run — confirm and call this after the first sync.
    """
    fleet_users = ["@ezra:matrix.timmytime.net", "@allegro:matrix.timmytime.net"]
    for user_id in fleet_users:
        # AsyncClient.devices() only lists *our own* devices and takes no
        # user argument, so other users' devices come from the device store.
        for device in self.client.device_store.active_user_devices(user_id):
            # verify_device() is a plain (non-async) call taking the device.
            self.client.verify_device(device)
```
**Caution**: Do not auto-verify external users (e.g., Alexander's personal Element client). Those should be verified manually via emoji comparison.
---
## 7. Fleet Room Membership
### 7.1 Canonical Rooms
| Room Alias | Purpose | Members |
|------------|---------|---------|
| `#operator-room:matrix.timmytime.net` | Human-to-fleet command surface | Alexander + all wizards |
| `#wizard-hall:matrix.timmytime.net` | Agent-to-agent coordination | All wizards only |
| `#burn-pit:matrix.timmytime.net` | High-priority escalations | On-call wizard + Alexander |
### 7.2 Auto-Join Policy
Every Hermes client **must** auto-join invites to `#operator-room` and `#wizard-hall`. Joining `#burn-pit` is opt-in, based on the on-call schedule.
---
## 8. Error Handling & Reconnection
### 8.1 Network Partitions
If sync fails with a 5xx or connection error, the client must:
1. Log the error
2. Wait 5s (with exponential backoff up to 60s)
3. Retry sync indefinitely
### 8.2 Token Expiration
Conduit access tokens do not expire by default. If an `M_UNKNOWN_TOKEN` error occurs, the client must re-login using `MATRIX_PASSWORD` and update the stored access token.
### 8.3 Fatal Errors
If login fails 3 times consecutively, the client should exit with a non-zero status and surface an alert to the operator room (if possible via a fallback mechanism).
---
## 9. Integration with Hermes Agent Loop
The Matrix client is **not** a replacement for the Hermes agent core. It is an additional I/O surface.
**Recommended integration pattern**:
```
┌─────────────────┐
│ Hermes Agent │
│ (run_agent) │
└────────┬────────┘
│ tool calls, reasoning
┌─────────────────┐
│ Matrix Gateway │ ← new: wraps hermes_matrix_client.py
│ (message I/O) │
└────────┬────────┘
│ Matrix HTTP APIs
┌─────────────────┐
│ Conduit Server │
└─────────────────┘
```
A `MatrixGateway` class (future work) would:
1. Run the `matrix-nio` client in a background asyncio task
2. Convert incoming Matrix commands into `AIAgent.chat()` calls
3. Post the agent's text response back to the room
4. Support the existing Hermes toolset (todo, memory, delegate) via the same agent loop
---
## 10. Security Hardening Checklist
Before any wizard house connects to the production Conduit server:
- [ ] `MATRIX_PASSWORD` is a 32+ character random string
- [ ] The client `store_path` is on an encrypted volume (`~/.cache/hermes-matrix/`)
- [ ] E2EE is enabled in the operator room
- [ ] Only fleet devices are auto-verified
- [ ] The client rejects invites from non-fleet homeservers
- [ ] Logs do not include message bodies at `INFO` level
- [ ] A separate device ID is used per wizard house deployment
---
## 11. Acceptance Criteria Mapping
Maps #166 acceptance criteria to this specification:
| #166 Criterion | Addressed By |
|----------------|--------------|
| Deploy Conduit homeserver | `infra/matrix/` (#183) |
| Create fleet rooms/channels | `bootstrap-fleet-rooms.py` |
| Verify encrypted operator-to-fleet messaging | Section 6 (E2EE) + MWE |
| Alexander can message the fleet over Matrix | Sections 4 (MWE), 5 (commands), 7 (rooms) |
| Telegram is no longer the only command surface | `CUTOVER_PLAN.md` + this spec |
---
## 12. Next Steps
1. **Gemini / Allegro**: Implement `MatrixGateway` class in `gateway/platforms/matrix.py` using this spec.
2. **Bezalel / Ezra**: Test the MWE against the staging Conduit instance once #187 resolves.
3. **Alexander**: Approve the command prefix vocabulary (`!ping`, `!sitrep`, `!burn`, etc.).
---
*This document is repo truth. If the Matrix client implementation diverges from this spec, update the spec first.*

View File

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""Matrix/Conduit Scaffold Validator — Issue #183 Acceptance Proof
Validates that infra/matrix/ contains a complete, well-formed deployment scaffold.
Run this after any scaffold change to ensure #183 acceptance criteria remain met.
Usage:
python3 infra/matrix/scripts/validate-scaffold.py
python3 infra/matrix/scripts/validate-scaffold.py --json
Exit codes:
0 = all checks passed
1 = one or more checks failed
"""
import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path
try:
import yaml
HAS_YAML = True
except ImportError:
HAS_YAML = False
class Validator:
    """Run structural checks against a Matrix/Conduit scaffold directory.

    Each check appends one result dict (``name``, ``status``, ``detail``) to
    ``self.checks`` and increments ``self.passed`` or ``self.failed``.
    """

    def __init__(self, base_dir: Path):
        self.base_dir = base_dir.resolve()
        self.checks = []
        self.passed = 0
        self.failed = 0

    def _add(self, name: str, status: bool, detail: str):
        """Record the outcome of one check and update the counters."""
        self.checks.append(
            {"name": name, "status": "PASS" if status else "FAIL", "detail": detail}
        )
        if status:
            self.passed += 1
        else:
            self.failed += 1

    def _read(self, check_name: str, path: Path):
        """Return the text of *path*, or record a FAIL and return ``None``.

        A missing file or an OS-level read error is reported as a failed
        *check_name* instead of aborting the whole validation run.
        """
        if not path.exists():
            self._add(check_name, False, "File does not exist")
            return None
        try:
            return path.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            self._add(check_name, False, str(exc))
            return None

    def require_files(self):
        """Check that all required scaffold files exist."""
        required = [
            "README.md",
            "prerequisites.md",
            "docker-compose.yml",
            "conduit.toml",
            ".env.example",
            "deploy-matrix.sh",
            "host-readiness-check.sh",
            "caddy/Caddyfile",
            "scripts/deploy-conduit.sh",
            "docs/RUNBOOK.md",
        ]
        missing = [rel for rel in required if not (self.base_dir / rel).exists()]
        self._add(
            "Required files present",
            not missing,
            f"Missing: {missing}" if missing else f"All {len(required)} files found",
        )

    def docker_compose_valid(self):
        """Validate docker-compose.yml parses and mentions Conduit.

        Uses PyYAML when available; otherwise falls back to a crude
        brace-balance check.
        """
        name = "docker-compose.yml valid YAML"
        content = self._read(name, self.base_dir / "docker-compose.yml")
        if content is None:
            return
        try:
            if HAS_YAML:
                yaml.safe_load(content)
            elif content.count("{") != content.count("}"):
                # Basic YAML brace balance check
                raise ValueError("Brace mismatch")
        except Exception as e:  # any parser error is reported as a FAIL
            self._add(name, False, str(e))
            return
        # Must reference conduit image or build
        has_conduit = "conduit" in content.lower()
        self._add(
            name,
            has_conduit,
            "Valid YAML and references Conduit"
            if has_conduit
            else "Valid YAML but missing Conduit reference",
        )

    def conduit_toml_valid(self):
        """Validate conduit.toml assigns every required key.

        A key counts only when it starts an assignment line
        (``key = ...``), so commented-out keys and words that merely contain
        the key (e.g. "support" for ``port``) are not accepted.
        """
        name = "conduit.toml required keys"
        content = self._read(name, self.base_dir / "conduit.toml")
        if content is None:
            return
        required_keys = ["server_name", "port", "database_path"]
        missing = [
            k
            for k in required_keys
            if not re.search(rf'^\s*"?{re.escape(k)}"?\s*=', content, re.MULTILINE)
        ]
        self._add(
            name,
            not missing,
            f"Missing keys: {missing}" if missing else "Required keys present",
        )

    def env_example_complete(self):
        """Validate .env.example mentions the required variables.

        This is a substring match: a commented-out or prefixed occurrence of
        a variable name also counts.
        """
        name = ".env.example required variables"
        content = self._read(name, self.base_dir / ".env.example")
        if content is None:
            return
        required_vars = ["MATRIX_DOMAIN", "ADMIN_USER", "ADMIN_PASSWORD"]
        missing = [v for v in required_vars if v not in content]
        self._add(
            name,
            not missing,
            f"Missing vars: {missing}" if missing else "Required variables present",
        )

    def shell_scripts_executable(self):
        """Check that shell scripts are executable and pass ``bash -n``."""
        scripts = [
            self.base_dir / "deploy-matrix.sh",
            self.base_dir / "host-readiness-check.sh",
            self.base_dir / "scripts" / "deploy-conduit.sh",
        ]
        errors = []
        for script in scripts:
            if not script.exists():
                errors.append(f"{script.name}: missing")
                continue
            if not os.access(script, os.X_OK):
                errors.append(f"{script.name}: not executable")
            try:
                result = subprocess.run(
                    ["bash", "-n", str(script)], capture_output=True, text=True
                )
            except OSError as exc:
                # e.g. bash is not installed; report it rather than crash.
                errors.append(f"{script.name}: could not run bash -n ({exc})")
                continue
            if result.returncode != 0:
                errors.append(f"{script.name}: syntax error — {result.stderr.strip()}")
        self._add(
            "Shell scripts executable & valid",
            not errors,
            "; ".join(errors) if errors else f"All {len(scripts)} scripts OK",
        )

    def caddyfile_well_formed(self):
        """Check Caddyfile has expected tokens."""
        name = "Caddyfile well-formed"
        content = self._read(name, self.base_dir / "caddy" / "Caddyfile")
        if content is None:
            return
        has_reverse_proxy = "reverse_proxy" in content
        has_well_known = ".well-known" in content or "matrix" in content.lower()
        detail = []
        if not has_reverse_proxy:
            detail.append("missing reverse_proxy directive")
        if not has_well_known:
            detail.append("missing .well-known/matrix routing")
        self._add(
            name,
            not detail,
            f"Issues: {', '.join(detail)}" if detail else "Well-formed",
        )

    def runbook_links_valid(self):
        """Check docs/RUNBOOK.md references issues 166 and 183.

        The numbers must stand alone (optionally after ``#``); digits inside
        a longer number such as ``11660`` do not count.
        """
        name = "RUNBOOK.md issue links"
        content = self._read(name, self.base_dir / "docs" / "RUNBOOK.md")
        if content is None:
            return
        ok = all(
            re.search(rf"(?<!\d){issue}(?!\d)", content) for issue in ("166", "183")
        )
        self._add(
            name,
            ok,
            "Links to #166 and #183" if ok else "Missing issue continuity links",
        )

    def run_all(self):
        """Run every check in a fixed order."""
        self.require_files()
        self.docker_compose_valid()
        self.conduit_toml_valid()
        self.env_example_complete()
        self.shell_scripts_executable()
        self.caddyfile_well_formed()
        self.runbook_links_valid()

    def report(self, json_mode: bool = False):
        """Print the collected results as JSON or as a human-readable table."""
        if json_mode:
            print(json.dumps({
                "base_dir": str(self.base_dir),
                "passed": self.passed,
                "failed": self.failed,
                "checks": self.checks,
            }, indent=2))
            return
        print("Matrix/Conduit Scaffold Validator")
        print(f"Base: {self.base_dir}")
        print(f"Checks: {self.passed} passed, {self.failed} failed\n")
        for c in self.checks:
            # Prefix each row with its status so pass/fail is visible at a glance.
            print(f"[{c['status']}] {c['name']:<40} {c['detail']}")
        verdict = "SUCCESS" if self.failed == 0 else "FAILURE"
        total = self.passed + self.failed
        print(f"\n{verdict}: {self.passed}/{total} checks passed")
def main():
    """CLI entry point: validate the scaffold and exit 0 (all passed) or 1.

    Without ``--base`` the scaffold is looked up at ``infra/matrix`` relative
    to the current directory, falling back to the directory above this
    script's own directory. An explicit ``--base`` that is not a directory is
    a usage error (argparse exit status 2) rather than a silent fallback, so
    the report never describes a different tree than the one requested.
    """
    parser = argparse.ArgumentParser(description="Validate Matrix/Conduit deployment scaffold")
    parser.add_argument("--json", action="store_true", help="Output JSON report")
    parser.add_argument(
        "--base",
        default=None,
        help="Path to scaffold directory (default: infra/matrix, "
             "else the parent of this script's directory)",
    )
    args = parser.parse_args()
    if args.base is not None:
        base = Path(args.base)
        if not base.is_dir():
            parser.error(f"--base is not an existing directory: {base}")
    else:
        base = Path("infra/matrix")
        if not base.exists():
            # Try relative to script location
            base = Path(__file__).resolve().parent.parent
    validator = Validator(base)
    validator.run_all()
    validator.report(json_mode=args.json)
    sys.exit(0 if validator.failed == 0 else 1)
if __name__ == "__main__":
main()