Compare commits
4 Commits
timmy/issu
...
claude/iss
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f83e103d27 | ||
| fd75985db6 | |||
| 3b4c5e7207 | |||
| 0b57145dde |
62
.gitea/ISSUE_TEMPLATE/mission-proposal.md
Normal file
62
.gitea/ISSUE_TEMPLATE/mission-proposal.md
Normal file
@@ -0,0 +1,62 @@
|
||||
---
|
||||
name: Mission Proposal
|
||||
about: Propose a new Mission Cell — a temporary, isolated collaboration space for agents and humans
|
||||
title: "[MISSION] "
|
||||
labels: mission-proposal
|
||||
---
|
||||
|
||||
## Mission Summary
|
||||
|
||||
<!-- One-sentence description of the mission objective -->
|
||||
|
||||
## Objective
|
||||
|
||||
<!-- What is the outcome we are driving toward? What does "done" look like? -->
|
||||
|
||||
## Agents Invited
|
||||
|
||||
<!-- List agents (or humans) that should be invited into this cell.
|
||||
Format: @agent-name — role (developer / reviewer / observer / coordinator) -->
|
||||
|
||||
- @ —
|
||||
|
||||
## Scope & Deliverables
|
||||
|
||||
<!-- What artifacts will this mission produce? PRs, docs, deployed services, etc. -->
|
||||
|
||||
- [ ]
|
||||
- [ ]
|
||||
|
||||
## Isolation Requirements
|
||||
|
||||
- [ ] No wizard home-directory access needed
|
||||
- [ ] Read-only access to the following homes: (list or "none")
|
||||
- [ ] External network access required: yes / no
|
||||
|
||||
## Cell Configuration
|
||||
|
||||
<!-- Leave blank to use defaults from config/lazarus-pit.toml -->
|
||||
|
||||
| Setting | Value |
|
||||
|--------------------------|-------|
|
||||
| Max duration | |
|
||||
| Checkpoint interval | |
|
||||
| Auto-archive on close | yes / no |
|
||||
| Max revive attempts | |
|
||||
|
||||
## Related Issues / Context
|
||||
|
||||
- Epic: #
|
||||
- Depends on: #
|
||||
- Blocked by: #
|
||||
|
||||
## Success Criteria
|
||||
|
||||
<!-- How will we know this mission succeeded? Be specific. -->
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
## Notes
|
||||
|
||||
<!-- Anything else the lazarus-pit daemon or cell participants should know -->
|
||||
BIN
bin/__pycache__/webhook_health_dashboard.cpython-312.pyc
Normal file
BIN
bin/__pycache__/webhook_health_dashboard.cpython-312.pyc
Normal file
Binary file not shown.
419
bin/lazarus_pit.py
Normal file
419
bin/lazarus_pit.py
Normal file
@@ -0,0 +1,419 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
lazarus-pit — Agent resurrection pool daemon.
|
||||
|
||||
Monitors active mission cells, heartbeats agents, detects downed agents,
|
||||
and revives them back into their mission cells.
|
||||
|
||||
Usage:
|
||||
python bin/lazarus_pit.py [--config path/to/lazarus-pit.toml]
|
||||
python bin/lazarus_pit.py --status
|
||||
python bin/lazarus_pit.py --list-cells
|
||||
|
||||
Config: config/lazarus-pit.toml
|
||||
Architecture: docs/lazarus-pit/mission-cell-spec.md
|
||||
Epic: #878 P0 Foundation: #879
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional TOML support (stdlib tomllib in 3.11+, tomli on older)
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ImportError:
|
||||
try:
|
||||
import tomli as tomllib # type: ignore[no-reuse-def]
|
||||
except ImportError:
|
||||
tomllib = None # Config loading will fall back to defaults
|
||||
|
||||
# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).parent.parent
DEFAULT_CONFIG_PATH = PROJECT_ROOT.joinpath("config", "lazarus-pit.toml")
DEFAULT_CELLS_ROOT = Path("/var/missions")

# Timing defaults, all expressed in seconds.
DEFAULT_POLL_INTERVAL = 15
DEFAULT_HEARTBEAT_STALE = 60
DEFAULT_RESURRECT_AFTER = 120

# Default cap on revive attempts per agent.
DEFAULT_MAX_REVIVES = 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class AgentHealth:
    """Health record for a single agent inside a mission cell."""

    # Agent name as listed in the cell manifest.
    name: str
    # Set to False by poll_cell() when the heartbeat is missing or stale.
    healthy: bool = True
    # "timestamp" value of the last heartbeat read; 0.0 until one is seen.
    last_seen: float = 0.0
    # Number of resurrection attempts made for this agent.
    revive_count: int = 0
    # "status" value copied from the last heartbeat read.
    last_status: str = "unknown"
|
||||
|
||||
|
||||
@dataclass
class CellState:
    """In-memory view of one mission cell and the agents assigned to it."""

    # Cell identifier (manifest "uuid", falling back to the directory name).
    uuid: str
    # Directory of the cell on disk.
    path: Path
    # Health records keyed by agent name; each instance gets its own dict.
    agents: dict[str, AgentHealth] = field(default_factory=dict)
    # Status string taken from the cell manifest.
    status: str = "active"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class DaemonConfig:
    """Runtime settings for the lazarus-pit daemon.

    Every field has a default, so ``DaemonConfig()`` is usable without a
    config file; ``load_config()`` overrides fields from the TOML file.
    """

    # --- [cells] -----------------------------------------------------------
    # Directory that contains one sub-directory per mission cell.
    cells_root: Path = DEFAULT_CELLS_ROOT
    # Heartbeat age (seconds) above which an agent is marked unhealthy.
    heartbeat_stale_threshold: int = DEFAULT_HEARTBEAT_STALE
    # Heartbeat age (seconds) above which a revive is attempted.
    resurrect_after: int = DEFAULT_RESURRECT_AFTER
    # Cap on revive attempts per agent.
    max_revive_attempts: int = DEFAULT_MAX_REVIVES
    # Seconds to sleep between health polls.
    poll_interval: int = DEFAULT_POLL_INTERVAL

    # --- [daemon] ----------------------------------------------------------
    log_level: str = "INFO"
    # "-" means log to the default stream instead of a file.
    log_file: str = "-"
    pid_file: str = "/var/run/lazarus-pit.pid"

    # --- [gitea] -----------------------------------------------------------
    gitea_url: str = "https://forge.alexanderwhitestone.com"
    gitea_repo: str = "Timmy_Foundation/the-nexus"
    # Secret: kept out of the generated repr so it cannot leak into logs or
    # tracebacks that print the config object.
    gitea_token: str = field(default="", repr=False)

    # --- [notifications] ---------------------------------------------------
    open_issue_on_death: bool = True
    close_issue_on_revive: bool = True
|
||||
|
||||
|
||||
def load_config(config_path: Path) -> DaemonConfig:
    """Load daemon configuration from a TOML file, falling back to defaults.

    Args:
        config_path: Location of ``lazarus-pit.toml``.

    Returns:
        A ``DaemonConfig`` whose fields are overridden by the ``[cells]``,
        ``[daemon]``, ``[gitea]`` and ``[notifications]`` tables of the file.
        Defaults are returned unchanged when the file does not exist or no
        TOML parser is available. In every case the ``GITEA_TOKEN``
        environment variable, when set, takes precedence for the token.

    Raises:
        Whatever ``open()`` / ``tomllib.load()`` raise for an unreadable or
        malformed file; a broken config is not silently ignored.
    """
    cfg = DaemonConfig()

    # Apply the environment token up front so it is honoured on the
    # early-return paths below as well (previously it was only read after a
    # config file had been parsed successfully).
    cfg.gitea_token = os.environ.get("GITEA_TOKEN", cfg.gitea_token)

    if not config_path.exists():
        return cfg

    if tomllib is None:
        logging.warning(
            "TOML parser not available (install tomli for Python < 3.11). "
            "Using defaults."
        )
        return cfg

    # tomllib requires a binary file handle.
    with open(config_path, "rb") as f:
        raw = tomllib.load(f)

    cells = raw.get("cells", {})
    daemon = raw.get("daemon", {})
    gitea = raw.get("gitea", {})
    notifications = raw.get("notifications", {})

    # [cells]
    cfg.cells_root = Path(cells.get("root", str(cfg.cells_root)))
    cfg.heartbeat_stale_threshold = cells.get(
        "heartbeat_stale_threshold", cfg.heartbeat_stale_threshold
    )
    cfg.resurrect_after = cells.get("resurrect_after", cfg.resurrect_after)
    cfg.max_revive_attempts = cells.get("max_revive_attempts", cfg.max_revive_attempts)
    cfg.poll_interval = cells.get("poll_interval", cfg.poll_interval)

    # [daemon]
    cfg.log_level = daemon.get("log_level", cfg.log_level)
    cfg.log_file = daemon.get("log_file", cfg.log_file)
    cfg.pid_file = daemon.get("pid_file", cfg.pid_file)

    # [gitea] — the environment variable wins over a token stored in the file.
    cfg.gitea_url = gitea.get("url", cfg.gitea_url)
    cfg.gitea_repo = gitea.get("repo", cfg.gitea_repo)
    cfg.gitea_token = os.environ.get("GITEA_TOKEN", gitea.get("token", ""))

    # [notifications]
    cfg.open_issue_on_death = notifications.get(
        "open_issue_on_death", cfg.open_issue_on_death
    )
    cfg.close_issue_on_revive = notifications.get(
        "close_issue_on_revive", cfg.close_issue_on_revive
    )

    return cfg
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Heartbeat reader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def read_agent_heartbeat(cell_path: Path, agent_name: str) -> Optional[dict]:
    """Read the heartbeat file for an agent in a cell.

    Args:
        cell_path: Directory of the mission cell.
        agent_name: Agent whose heartbeat file
            ``.lazarus/heartbeats/<agent_name>.json`` should be read.

    Returns:
        The decoded heartbeat as a dict, or ``None`` when the file is
        missing, unreadable, not valid UTF-8 JSON, or does not contain a
        JSON object.
    """
    hb_path = cell_path / ".lazarus" / "heartbeats" / f"{agent_name}.json"
    try:
        # A missing file raises FileNotFoundError (an OSError), so no separate
        # exists() check is needed and there is no check-then-read race.
        payload = json.loads(hb_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None

    # Callers use .get() on the result; valid JSON that is not an object
    # (e.g. a list or a number) must not be handed back.
    return payload if isinstance(payload, dict) else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cell discovery
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def discover_cells(cells_root: Path) -> list[CellState]:
    """Scan ``cells_root`` and return the cells the daemon should monitor.

    A sub-directory counts as a cell when it contains a readable
    ``cell.json`` manifest holding a JSON object whose ``status`` is
    ``"active"`` or ``"frozen"``. Anything else is skipped silently so that
    one malformed cell cannot take the whole poll down.

    Args:
        cells_root: Directory containing one sub-directory per cell.

    Returns:
        One ``CellState`` per monitored cell, each pre-populated with a fresh
        ``AgentHealth`` for every agent listed in the manifest. Empty when
        ``cells_root`` is missing or is not a directory.
    """
    cells: list[CellState] = []
    if not cells_root.is_dir():
        return cells

    for entry in cells_root.iterdir():
        if not entry.is_dir():
            continue

        try:
            manifest = json.loads((entry / "cell.json").read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Missing, unreadable, non-UTF-8 or syntactically invalid manifest.
            continue

        # Valid JSON that is not an object has no .get(); skip it.
        if not isinstance(manifest, dict):
            continue

        status = manifest.get("status")
        if status not in ("active", "frozen"):
            continue

        cell = CellState(
            uuid=manifest.get("uuid", entry.name),
            path=entry,
            status=status,
        )

        agents = manifest.get("agents", [])
        if not isinstance(agents, list):
            agents = []
        for agent_info in agents:
            if not isinstance(agent_info, dict):
                continue
            name = agent_info.get("name", "unknown")
            # The name is used as a dict key and as a file-name component.
            if not isinstance(name, str):
                continue
            cell.agents[name] = AgentHealth(name=name)

        cells.append(cell)

    return cells
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Health poll
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def poll_cell(cell: CellState, cfg: DaemonConfig) -> list[str]:
    """Refresh the health record of every agent in ``cell``.

    For each agent the heartbeat file is read and the matching
    ``AgentHealth`` is updated in place (``healthy``, ``last_seen``,
    ``last_status``).

    Args:
        cell: Cell whose agents are polled; its ``agents`` records are mutated.
        cfg: Supplies ``heartbeat_stale_threshold`` (seconds).

    Returns:
        One warning message per agent whose heartbeat is missing, carries an
        unusable timestamp, or is older than the stale threshold.
    """
    now = time.time()
    warnings: list[str] = []

    for agent_name, health in cell.agents.items():
        hb = read_agent_heartbeat(cell.path, agent_name)

        if hb is None:
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: no heartbeat file found"
            )
            continue

        health.last_status = hb.get("status", "unknown")

        timestamp = hb.get("timestamp", 0)
        # Heartbeats are written by agents, so the value cannot be trusted to
        # be numeric; a string or null here would raise TypeError below and
        # abort the poll. bool is rejected explicitly because it is an int
        # subclass.
        if isinstance(timestamp, bool) or not isinstance(timestamp, (int, float)):
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: heartbeat has invalid timestamp "
                f"{timestamp!r}"
            )
            continue

        health.last_seen = timestamp
        age = now - timestamp

        if age > cfg.heartbeat_stale_threshold:
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: heartbeat stale ({age:.0f}s old, "
                f"threshold {cfg.heartbeat_stale_threshold}s)"
            )
        else:
            health.healthy = True

    return warnings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Resurrection (stub — P3 will implement fully)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def attempt_revive(cell: CellState, agent_name: str, cfg: DaemonConfig) -> bool:
    """Record (and, once implemented, perform) a revive of a downed agent.

    Stub for P3 (#882): no process is started yet. The call only logs the
    intent and bumps the agent's ``revive_count``; when the configured
    maximum has already been reached it logs an error instead.

    Returns:
        Always ``False`` for now — unknown agent, limit reached, or stub
        attempt alike.
    """
    record = cell.agents.get(agent_name)
    if record is None:
        return False

    limit = cfg.max_revive_attempts
    if record.revive_count < limit:
        record.revive_count += 1
        logging.warning(
            "[%s] %s: initiating resurrection attempt %d/%d (P3 stub)",
            cell.uuid, agent_name, record.revive_count, limit,
        )
        # TODO (P3 #882): exec the agent's harness entrypoint inside the cell
        return False

    logging.error(
        "[%s] %s: max revive attempts (%d) reached — escalating to human",
        cell.uuid, agent_name, limit,
    )
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main daemon loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LazarusPit:
    """Polling daemon that watches mission cells and revives downed agents.

    ``start()`` blocks: it writes the PID file, installs SIGTERM/SIGINT
    handlers and then polls every ``cfg.poll_interval`` seconds until a
    signal clears the running flag.
    """

    def __init__(self, cfg: DaemonConfig):
        self.cfg = cfg
        self._running = False
        # (cell uuid, agent name) -> revive attempts made so far.
        # discover_cells() builds brand-new AgentHealth objects on every tick,
        # so the counter has to live here; otherwise it restarts at zero each
        # poll and cfg.max_revive_attempts is never reached.
        self._revive_counts: dict[tuple[str, str], int] = {}

    def start(self) -> None:
        """Run the poll loop until a shutdown signal is received."""
        self._running = True
        self._write_pid()
        self._setup_signals()

        logging.info("Lazarus Pit daemon started (poll interval: %ds)", self.cfg.poll_interval)

        try:
            while self._running:
                self._tick()
                time.sleep(self.cfg.poll_interval)
            logging.info("Lazarus Pit daemon stopped.")
        finally:
            # Remove the PID file even when a tick raises, so a crashed daemon
            # does not leave a stale PID behind.
            self._remove_pid()

    def _tick(self) -> None:
        """Perform one poll: discover cells, refresh health, revive if due."""
        cells = discover_cells(self.cfg.cells_root)
        if not cells:
            logging.debug("No active mission cells found in %s", self.cfg.cells_root)
            return

        for cell in cells:
            for msg in poll_cell(cell, self.cfg):
                logging.warning(msg)

            now = time.time()
            for agent_name, health in cell.agents.items():
                key = (cell.uuid, agent_name)
                # Restore the attempt count accumulated on earlier ticks.
                health.revive_count = self._revive_counts.get(key, health.revive_count)

                if health.healthy:
                    continue

                # last_seen == 0.0 means no heartbeat was ever read: treat the
                # agent as infinitely overdue.
                age = now - health.last_seen if health.last_seen else float("inf")
                if age > self.cfg.resurrect_after:
                    attempt_revive(cell, agent_name, self.cfg)
                    self._revive_counts[key] = health.revive_count

    def _write_pid(self) -> None:
        """Write the current PID to ``cfg.pid_file``; failure is only logged."""
        pid_path = Path(self.cfg.pid_file)
        try:
            pid_path.parent.mkdir(parents=True, exist_ok=True)
            pid_path.write_text(str(os.getpid()))
        except OSError as e:
            logging.warning("Could not write PID file: %s", e)

    def _remove_pid(self) -> None:
        """Delete the PID file, ignoring a missing file or OS errors."""
        try:
            Path(self.cfg.pid_file).unlink(missing_ok=True)
        except OSError:
            # Best effort: nothing useful can be done about it at shutdown.
            pass

    def _setup_signals(self) -> None:
        """Route SIGTERM and SIGINT to the graceful-shutdown handler."""
        signal.signal(signal.SIGTERM, self._handle_shutdown)
        signal.signal(signal.SIGINT, self._handle_shutdown)

    def _handle_shutdown(self, signum, frame) -> None:
        """Signal handler: ask the poll loop to exit after the current pass."""
        logging.info("Signal %d received — shutting down.", signum)
        self._running = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def cmd_status(cfg: DaemonConfig) -> None:
    """Print each monitored cell followed by its agents' latest heartbeat."""
    cells = discover_cells(cfg.cells_root)
    if not cells:
        print(f"No active cells in {cfg.cells_root}")
        return

    for cell in cells:
        print(f"\nCell {cell.uuid} [{cell.status}]")
        # Only the agent names are needed; heartbeats are re-read from disk.
        for agent_name in cell.agents:
            hb = read_agent_heartbeat(cell.path, agent_name)
            if not hb:
                print(f" {agent_name}: no heartbeat")
                continue
            age = time.time() - hb.get("timestamp", 0)
            print(f" {agent_name}: {hb.get('status', '?')} (heartbeat {age:.0f}s ago)")
|
||||
|
||||
|
||||
def cmd_list_cells(cfg: DaemonConfig) -> None:
    """Print one line per mission cell under ``cfg.cells_root``.

    Every entry containing a ``cell.json`` is listed regardless of status, in
    sorted order. A manifest that cannot be read, is not valid UTF-8 JSON, or
    is not a JSON object is reported as ``[unreadable manifest]`` instead of
    aborting the listing.
    """
    root = cfg.cells_root
    if not root.exists():
        print(f"{root} does not exist")
        return

    found = False
    for entry in sorted(root.iterdir()):
        manifest = entry / "cell.json"
        if not manifest.exists():
            continue
        found = True

        try:
            data = json.loads(manifest.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            data = None

        # Valid JSON that is not an object has no .get(); treat as unreadable.
        if not isinstance(data, dict):
            print(f"{entry.name}: [unreadable manifest]")
            continue

        print(f"{data.get('uuid', entry.name)}: {data.get('status', '?')} — {data.get('mission', '')}")

    if not found:
        print(f"No mission cells found in {root}")
|
||||
|
||||
|
||||
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the daemon.

    Options: ``--config PATH`` plus two boolean flags, ``--status`` and
    ``--list-cells``. The module docstring is shown verbatim as the epilog.
    """
    parser = argparse.ArgumentParser(
        description="Lazarus Pit — agent resurrection pool daemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        "--config",
        default=str(DEFAULT_CONFIG_PATH),
        help="Path to lazarus-pit.toml (default: config/lazarus-pit.toml)",
    )

    # The one-shot commands are plain on/off switches sharing the same shape.
    switches = (
        ("--status", "Print current cell/agent health and exit"),
        ("--list-cells", "List all mission cells and exit"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load config, set up logging, run a command or the daemon.

    ``--status`` and ``--list-cells`` print their report and return; without
    either flag the daemon loop is started and this call blocks until it
    stops.
    """
    args = build_arg_parser().parse_args()
    cfg = load_config(Path(args.config))

    # Configure logging
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    # Unknown level names, and names that resolve to a non-level attribute of
    # the logging module (e.g. "basicConfig"), fall back to INFO.
    log_level = getattr(logging, str(cfg.log_level).upper(), None)
    if not isinstance(log_level, int):
        log_level = logging.INFO

    # force=True: load_config() may already have logged a warning, which
    # implicitly configures the root logger; without force this basicConfig()
    # call would then be a silent no-op and level/file settings would be lost.
    log_kwargs = {"level": log_level, "format": log_format, "force": True}
    if cfg.log_file != "-":
        log_kwargs["filename"] = cfg.log_file
    logging.basicConfig(**log_kwargs)

    if args.status:
        cmd_status(cfg)
        return

    if args.list_cells:
        cmd_list_cells(cfg)
        return

    pit = LazarusPit(cfg)
    pit.start()


if __name__ == "__main__":
    main()
|
||||
52
config/lazarus-pit.toml
Normal file
52
config/lazarus-pit.toml
Normal file
@@ -0,0 +1,52 @@
|
||||
# lazarus-pit.toml — Daemon configuration for the Lazarus Pit resurrection pool
|
||||
# See docs/lazarus-pit/mission-cell-spec.md for architecture context.
|
||||
# Epic: #878 · Phase P0 Foundation: #879
|
||||
|
||||
[daemon]
|
||||
# PID file location
|
||||
pid_file = "/var/run/lazarus-pit.pid"
|
||||
|
||||
# Log file (use "-" for stdout)
|
||||
log_file = "/var/log/lazarus-pit.log"
|
||||
|
||||
# Log level: DEBUG | INFO | WARNING | ERROR
|
||||
log_level = "INFO"
|
||||
|
||||
[cells]
|
||||
# Root directory for all mission cells
|
||||
root = "/var/missions"
|
||||
|
||||
# Seconds of silence before an agent is considered stale
|
||||
heartbeat_stale_threshold = 60
|
||||
|
||||
# Seconds since an agent's last heartbeat before attempting resurrection
|
||||
resurrect_after = 120
|
||||
|
||||
# Maximum auto-revive attempts per agent per session
|
||||
max_revive_attempts = 3
|
||||
|
||||
# Poll interval in seconds for health checks
|
||||
poll_interval = 15
|
||||
|
||||
[gateway]
|
||||
# WebSocket gateway to poll for liveness
|
||||
host = "localhost"
|
||||
port = 8765
|
||||
|
||||
# HTTP health endpoint exposed by the gateway
|
||||
health_host = "localhost"
|
||||
health_port = 8766
|
||||
|
||||
[gitea]
|
||||
# Gitea instance for filing resurrection / incident reports
|
||||
url = "https://forge.alexanderwhitestone.com"
|
||||
repo = "Timmy_Foundation/the-nexus"
|
||||
# Token read from environment: GITEA_TOKEN
|
||||
# token = "" # Do not commit real tokens
|
||||
|
||||
[notifications]
|
||||
# Whether to open a Gitea issue when an agent goes dark
|
||||
open_issue_on_death = true
|
||||
|
||||
# Whether to close the issue when the agent is successfully revived
|
||||
close_issue_on_revive = true
|
||||
156
docs/lazarus-pit/mission-cell-spec.md
Normal file
156
docs/lazarus-pit/mission-cell-spec.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Mission Cell Directory Specification
|
||||
|
||||
**Version:** 1.0
|
||||
**Status:** Canonical
|
||||
**Epic:** #878 — The Lazarus Pit & Mission Cell Isolation
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
A **Mission Cell** is an ephemeral, isolated working directory provisioned for a specific
|
||||
multi-agent collaboration. Each cell is scoped to a single mission (project, task, or
|
||||
incident) and is identified by a UUID. No wizard's home directory is ever touched by
|
||||
another agent's work within a cell.
|
||||
|
||||
---
|
||||
|
||||
## Root Path
|
||||
|
||||
```
|
||||
/var/missions/<uuid>/
|
||||
```
|
||||
|
||||
`<uuid>` is a version-4 UUID, e.g. `a3f7c901-1234-4b5e-8def-000000000001`.
|
||||
|
||||
---
|
||||
|
||||
## Directory Layout
|
||||
|
||||
```
|
||||
/var/missions/<uuid>/
|
||||
├── cell.json # Cell manifest (identity, status, agents, timestamps)
|
||||
├── workspace/ # Shared working directory — agents write here
|
||||
│ └── ...
|
||||
├── logs/
|
||||
│ ├── events.jsonl # Append-only event stream (invitation, join, leave, etc.)
|
||||
│ └── <agent-name>.log # Per-agent stdout/stderr capture
|
||||
├── checkpoints/ # Snapshot archives for P2 checkpoint/restore
|
||||
│ └── <timestamp>.tar.gz
|
||||
├── bus/ # Mission message bus (P4 multi-agent teaming)
|
||||
│ ├── inbox/ # Per-agent inboxes
|
||||
│ │ └── <agent-name>/
|
||||
│ └── outbox/
|
||||
└── .lazarus/ # Liveness metadata (heartbeats written by agents; state.json written by lazarus-pit)
|
||||
├── heartbeats/
|
||||
│ └── <agent-name>.json # Last heartbeat per agent
|
||||
└── state.json # Cell lifecycle state (active, frozen, archived)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Specs
|
||||
|
||||
### `cell.json` — Cell Manifest
|
||||
|
||||
```json
|
||||
{
|
||||
"uuid": "a3f7c901-1234-4b5e-8def-000000000001",
|
||||
"name": "optional human-readable name",
|
||||
"mission": "Short description of the mission objective",
|
||||
"created_at": "2026-04-06T00:00:00Z",
|
||||
"created_by": "agent-name or username",
|
||||
"status": "active",
|
||||
"agents": [
|
||||
{
|
||||
"name": "claude",
|
||||
"role": "developer",
|
||||
"joined_at": "2026-04-06T00:01:00Z",
|
||||
"home": "/root/wizards/claude"
|
||||
}
|
||||
],
|
||||
"gitea_issue": 879,
|
||||
"repo": "Timmy_Foundation/the-nexus"
|
||||
}
|
||||
```
|
||||
|
||||
**Status values:** `active` | `frozen` | `archived` | `destroyed`
|
||||
|
||||
### `logs/events.jsonl` — Event Stream
|
||||
|
||||
One JSON object per line, append-only:
|
||||
|
||||
```json
|
||||
{"ts": "2026-04-06T00:00:00Z", "event": "cell_created", "by": "allegro", "uuid": "..."}
|
||||
{"ts": "2026-04-06T00:01:00Z", "event": "agent_joined", "agent": "claude", "role": "developer"}
|
||||
{"ts": "2026-04-06T00:02:00Z", "event": "heartbeat", "agent": "claude", "status": "thinking"}
|
||||
{"ts": "2026-04-06T01:00:00Z", "event": "agent_left", "agent": "claude"}
|
||||
{"ts": "2026-04-06T01:01:00Z", "event": "cell_archived"}
|
||||
```
|
||||
|
||||
### `.lazarus/heartbeats/<agent-name>.json` — Per-Agent Heartbeat
|
||||
|
||||
Written by agents, monitored by the lazarus-pit daemon:
|
||||
|
||||
```json
|
||||
{
|
||||
"agent": "claude",
|
||||
"pid": 12345,
|
||||
"timestamp": 1744000000.0,
|
||||
"cycle": 42,
|
||||
"model": "claude-opus-4-6",
|
||||
"status": "thinking",
|
||||
"cell_uuid": "a3f7c901-1234-4b5e-8def-000000000001"
|
||||
}
|
||||
```
|
||||
|
||||
### `.lazarus/state.json` — Daemon State
|
||||
|
||||
Written exclusively by `lazarus-pit`, never by agents:
|
||||
|
||||
```json
|
||||
{
|
||||
"cell_uuid": "a3f7c901-1234-4b5e-8def-000000000001",
|
||||
"daemon_pid": 99001,
|
||||
"last_poll": 1744000000.0,
|
||||
"agent_health": {
|
||||
"claude": {"healthy": true, "last_seen": 1744000000.0, "revive_count": 0}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Lifecycle
|
||||
|
||||
```
|
||||
provision → active → [frozen] → archived → destroyed
|
||||
```
|
||||
|
||||
| Transition | Trigger | Actor |
|
||||
|--------------|--------------------------------------------|---------------|
|
||||
| provision | Invitation accepted / mission created | allegro / CLI |
|
||||
| active | At least one agent joined | lazarus-pit |
|
||||
| frozen | Checkpoint requested or idle timeout | lazarus-pit |
|
||||
| archived | Mission complete, cell preserved | lazarus-pit |
|
||||
| destroyed | Explicit teardown, no home dirs touched | CLI / daemon |
|
||||
|
||||
---
|
||||
|
||||
## Isolation Guarantees
|
||||
|
||||
- No agent process within a cell may write outside `/var/missions/<uuid>/`.
|
||||
- Home directories (`/root/wizards/<name>/`) are read-only mounts or simply not present in the cell's working context.
|
||||
- The cell UUID is injected as `LAZARUS_CELL_UUID` into each agent's environment.
|
||||
- Destruction of a cell has zero impact on any wizard's home directory.
|
||||
|
||||
---
|
||||
|
||||
## Related Issues
|
||||
|
||||
- #878 — Epic
|
||||
- #879 — P0 Foundation (this spec)
|
||||
- #880 — P1 Invitation & Spawning
|
||||
- #881 — P2 Checkpoint / Restore
|
||||
- #882 — P3 Resurrection Pool
|
||||
- #883 — P4 Multi-Agent Teaming
|
||||
489
help.html
Normal file
489
help.html
Normal file
@@ -0,0 +1,489 @@
|
||||
<!DOCTYPE html>
|
||||
<!--
|
||||
THE NEXUS — Help Page
|
||||
Refs: #833 (Missing /help page)
|
||||
Design: dark space / holographic — matches Nexus design system
|
||||
-->
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Help — The Nexus</title>
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;500;600&family=Orbitron:wght@400;600;700&display=swap" rel="stylesheet">
|
||||
<link rel="manifest" href="./manifest.json">
|
||||
<style>
|
||||
:root {
|
||||
--color-bg: #050510;
|
||||
--color-surface: rgba(10, 15, 40, 0.85);
|
||||
--color-border: rgba(74, 240, 192, 0.2);
|
||||
--color-border-bright: rgba(74, 240, 192, 0.5);
|
||||
--color-text: #e0f0ff;
|
||||
--color-text-muted: #8a9ab8;
|
||||
--color-primary: #4af0c0;
|
||||
--color-primary-dim: rgba(74, 240, 192, 0.12);
|
||||
--color-secondary: #7b5cff;
|
||||
--color-danger: #ff4466;
|
||||
--color-warning: #ffaa22;
|
||||
--font-display: 'Orbitron', sans-serif;
|
||||
--font-body: 'JetBrains Mono', monospace;
|
||||
--panel-blur: 16px;
|
||||
--panel-radius: 8px;
|
||||
--transition: 200ms cubic-bezier(0.16, 1, 0.3, 1);
|
||||
}
|
||||
|
||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
|
||||
body {
|
||||
background: var(--color-bg);
|
||||
font-family: var(--font-body);
|
||||
color: var(--color-text);
|
||||
min-height: 100vh;
|
||||
padding: 32px 16px 64px;
|
||||
}
|
||||
|
||||
/* === STARFIELD BG === */
|
||||
body::before {
|
||||
content: '';
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
background:
|
||||
radial-gradient(ellipse at 20% 20%, rgba(74,240,192,0.03) 0%, transparent 50%),
|
||||
radial-gradient(ellipse at 80% 80%, rgba(123,92,255,0.04) 0%, transparent 50%);
|
||||
pointer-events: none;
|
||||
z-index: 0;
|
||||
}
|
||||
|
||||
.page-wrap {
|
||||
position: relative;
|
||||
z-index: 1;
|
||||
max-width: 720px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
/* === HEADER === */
|
||||
.page-header {
|
||||
margin-bottom: 32px;
|
||||
padding-bottom: 20px;
|
||||
border-bottom: 1px solid var(--color-border);
|
||||
}
|
||||
|
||||
.back-link {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
font-size: 11px;
|
||||
letter-spacing: 0.1em;
|
||||
text-transform: uppercase;
|
||||
color: var(--color-text-muted);
|
||||
text-decoration: none;
|
||||
margin-bottom: 20px;
|
||||
transition: color var(--transition);
|
||||
}
|
||||
|
||||
.back-link:hover { color: var(--color-primary); }
|
||||
|
||||
.page-title {
|
||||
font-family: var(--font-display);
|
||||
font-size: 28px;
|
||||
font-weight: 700;
|
||||
letter-spacing: 0.1em;
|
||||
color: var(--color-text);
|
||||
line-height: 1.2;
|
||||
}
|
||||
|
||||
.page-title span { color: var(--color-primary); }
|
||||
|
||||
.page-subtitle {
|
||||
margin-top: 8px;
|
||||
font-size: 13px;
|
||||
color: var(--color-text-muted);
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* === SECTIONS === */
|
||||
.help-section {
|
||||
background: var(--color-surface);
|
||||
border: 1px solid var(--color-border);
|
||||
border-radius: var(--panel-radius);
|
||||
overflow: hidden;
|
||||
margin-bottom: 20px;
|
||||
backdrop-filter: blur(var(--panel-blur));
|
||||
}
|
||||
|
||||
.section-header {
|
||||
padding: 14px 20px;
|
||||
border-bottom: 1px solid var(--color-border);
|
||||
background: linear-gradient(90deg, rgba(74,240,192,0.04) 0%, transparent 100%);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.section-icon {
|
||||
font-size: 14px;
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-family: var(--font-display);
|
||||
font-size: 12px;
|
||||
font-weight: 600;
|
||||
letter-spacing: 0.15em;
|
||||
text-transform: uppercase;
|
||||
color: var(--color-primary);
|
||||
}
|
||||
|
||||
.section-body {
|
||||
padding: 16px 20px;
|
||||
}
|
||||
|
||||
/* === KEY BINDING TABLE === */
|
||||
.key-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
.key-table tr + tr td {
|
||||
border-top: 1px solid rgba(74,240,192,0.07);
|
||||
}
|
||||
|
||||
.key-table td {
|
||||
padding: 8px 0;
|
||||
font-size: 12px;
|
||||
line-height: 1.5;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
.key-table td:first-child {
|
||||
width: 140px;
|
||||
padding-right: 16px;
|
||||
}
|
||||
|
||||
.key-group {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
kbd {
|
||||
display: inline-block;
|
||||
font-family: var(--font-body);
|
||||
font-size: 10px;
|
||||
font-weight: 600;
|
||||
letter-spacing: 0.05em;
|
||||
background: rgba(74,240,192,0.08);
|
||||
border: 1px solid rgba(74,240,192,0.3);
|
||||
border-bottom-width: 2px;
|
||||
border-radius: 4px;
|
||||
padding: 2px 7px;
|
||||
color: var(--color-primary);
|
||||
}
|
||||
|
||||
.key-desc {
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
|
||||
/* === COMMAND LIST === */
|
||||
.cmd-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.cmd-item {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.cmd-name {
|
||||
min-width: 160px;
|
||||
font-size: 12px;
|
||||
color: var(--color-primary);
|
||||
padding-top: 1px;
|
||||
}
|
||||
|
||||
.cmd-desc {
|
||||
font-size: 12px;
|
||||
color: var(--color-text-muted);
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* === PORTAL LIST === */
|
||||
.portal-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.portal-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
padding: 10px 12px;
|
||||
border: 1px solid var(--color-border);
|
||||
border-radius: 6px;
|
||||
font-size: 12px;
|
||||
transition: border-color var(--transition), background var(--transition);
|
||||
}
|
||||
|
||||
.portal-item:hover {
|
||||
border-color: rgba(74,240,192,0.35);
|
||||
background: rgba(74,240,192,0.02);
|
||||
}
|
||||
|
||||
.portal-dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.dot-online { background: var(--color-primary); box-shadow: 0 0 6px var(--color-primary); }
|
||||
.dot-standby { background: var(--color-warning); box-shadow: 0 0 6px var(--color-warning); }
|
||||
.dot-offline { background: var(--color-text-muted); }
|
||||
|
||||
.portal-name {
|
||||
font-weight: 600;
|
||||
color: var(--color-text);
|
||||
min-width: 120px;
|
||||
}
|
||||
|
||||
.portal-desc {
|
||||
color: var(--color-text-muted);
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
/* === INFO BLOCK === */
|
||||
.info-block {
|
||||
font-size: 12px;
|
||||
line-height: 1.7;
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
|
||||
.info-block p + p {
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.info-block a {
|
||||
color: var(--color-primary);
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.info-block a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.highlight {
|
||||
color: var(--color-text);
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
/* === FOOTER === */
|
||||
.page-footer {
  margin-top: 32px;
  padding-top: 16px;
  border-top: 1px solid var(--color-border);
  font-size: 11px;
  color: var(--color-text-muted);
  display: flex;
  align-items: center;
  justify-content: space-between;
  /* "gap" is not a valid flex-wrap value, so the declaration was dropped
     and the footer could never wrap; "wrap" lets items flow onto a second
     line when space runs out. */
  flex-wrap: wrap;
  gap: 8px;
}
|
||||
|
||||
.footer-brand {
|
||||
font-family: var(--font-display);
|
||||
font-size: 10px;
|
||||
letter-spacing: 0.12em;
|
||||
color: var(--color-primary);
|
||||
opacity: 0.7;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="page-wrap">
|
||||
|
||||
<!-- Header -->
|
||||
<header class="page-header">
|
||||
<a href="/" class="back-link">← Back to The Nexus</a>
|
||||
<h1 class="page-title">THE <span>NEXUS</span> — Help</h1>
|
||||
<p class="page-subtitle">Navigation guide, controls, and system reference for Timmy's sovereign home-world.</p>
|
||||
</header>
|
||||
|
||||
<!-- Navigation Controls -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">◈</span>
|
||||
<span class="section-title">Navigation Controls</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<table class="key-table">
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>W</kbd><kbd>A</kbd><kbd>S</kbd><kbd>D</kbd></div></td>
|
||||
<td class="key-desc">Move forward / left / backward / right</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>Mouse</kbd></div></td>
|
||||
<td class="key-desc">Look around — click the canvas to capture the pointer</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>V</kbd></div></td>
|
||||
<td class="key-desc">Toggle navigation mode: Walk → Fly → Orbit</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>F</kbd></div></td>
|
||||
<td class="key-desc">Enter nearby portal (when portal hint is visible)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>E</kbd></div></td>
|
||||
<td class="key-desc">Read nearby vision point (when vision hint is visible)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>Enter</kbd></div></td>
|
||||
<td class="key-desc">Focus / unfocus chat input</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>Esc</kbd></div></td>
|
||||
<td class="key-desc">Release pointer lock / close overlays</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Timmy Chat Commands -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">⬡</span>
|
||||
<span class="section-title">Timmy Chat Commands</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="cmd-list">
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">System Status</span>
|
||||
<span class="cmd-desc">Quick action — asks Timmy for a live system health summary.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Agent Check</span>
|
||||
<span class="cmd-desc">Quick action — lists all active agents and their current state.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Portal Atlas</span>
|
||||
<span class="cmd-desc">Quick action — opens the full portal map overlay.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Help</span>
|
||||
<span class="cmd-desc">Quick action — requests navigation assistance from Timmy.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Free-form text</span>
|
||||
<span class="cmd-desc">Type anything in the chat bar and press Enter or → to send. Timmy processes all natural-language input.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Portal Atlas -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">🌐</span>
|
||||
<span class="section-title">Portal Atlas</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="info-block">
|
||||
<p>Portals are gateways to external systems and game-worlds. Walk up to a glowing portal in the Nexus and press <span class="highlight"><kbd>F</kbd></span> to activate it, or open the <span class="highlight">Portal Atlas</span> (top-right button) for a full map view.</p>
|
||||
<p>Portal status indicators:</p>
|
||||
</div>
|
||||
<div class="portal-list" style="margin-top:14px;">
|
||||
<div class="portal-item">
|
||||
<span class="portal-dot dot-online"></span>
|
||||
<span class="portal-name">ONLINE</span>
|
||||
<span class="portal-desc">Portal is live and will redirect immediately on activation.</span>
|
||||
</div>
|
||||
<div class="portal-item">
|
||||
<span class="portal-dot dot-standby"></span>
|
||||
<span class="portal-name">STANDBY</span>
|
||||
<span class="portal-desc">Portal is reachable but destination system may be idle.</span>
|
||||
</div>
|
||||
<div class="portal-item">
|
||||
<span class="portal-dot dot-offline"></span>
|
||||
<span class="portal-name">OFFLINE / UNLINKED</span>
|
||||
<span class="portal-desc">Destination not yet connected. Activation shows an error card.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- HUD Panels -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">▦</span>
|
||||
<span class="section-title">HUD Panels</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="cmd-list">
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Symbolic Engine</span>
|
||||
<span class="cmd-desc">Live feed from Timmy's rule-based reasoning layer.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Blackboard</span>
|
||||
<span class="cmd-desc">Shared working memory used across all cognitive subsystems.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Symbolic Planner</span>
|
||||
<span class="cmd-desc">Goal decomposition and task sequencing output.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Case-Based Reasoner</span>
|
||||
<span class="cmd-desc">Analogical reasoning — matches current situation to past cases.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Neuro-Symbolic Bridge</span>
|
||||
<span class="cmd-desc">Translation layer between neural inference and symbolic logic.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Meta-Reasoning</span>
|
||||
<span class="cmd-desc">Timmy reflecting on its own thought process and confidence.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Sovereign Health</span>
|
||||
<span class="cmd-desc">Core vitals: memory usage, heartbeat interval, alert flags.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Adaptive Calibrator</span>
|
||||
<span class="cmd-desc">Live tuning of response thresholds and behavior weights.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- System Info -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">◉</span>
|
||||
<span class="section-title">System Information</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="info-block">
|
||||
<p>The Nexus is Timmy's <span class="highlight">canonical sovereign home-world</span> — a local-first 3D space that serves as both a training ground and a live visualization surface for the Timmy AI system.</p>
|
||||
<p>The WebSocket gateway (<code>server.py</code>) runs on port <span class="highlight">8765</span> and bridges Timmy's cognition layer, game-world connectors, and the browser frontend. The <span class="highlight">HERMES</span> indicator in the HUD shows live connectivity status.</p>
|
||||
<p>Source code and issue tracker: <a href="https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus" target="_blank" rel="noopener noreferrer">Timmy_Foundation/the-nexus</a></p>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Footer -->
|
||||
<footer class="page-footer">
|
||||
<span class="footer-brand">THE NEXUS</span>
|
||||
<span>Questions? Speak to Timmy in the chat bar on the main world.</span>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
64
server.py
64
server.py
@@ -3,12 +3,21 @@
|
||||
The Nexus WebSocket Gateway — Robust broadcast bridge for Timmy's consciousness.
|
||||
This server acts as the central hub for the-nexus, connecting the mind (nexus_think.py),
|
||||
the body (Evennia/Morrowind), and the visualization surface.
|
||||
|
||||
Health heartbeat endpoint (added in #879 — M6-P0 Foundation):
|
||||
GET http://<host>:<HEALTH_PORT>/health
|
||||
Returns 200 JSON with gateway status and connected-client count.
|
||||
Used by the lazarus-pit daemon to verify the gateway is alive.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from threading import Thread
|
||||
from typing import Set
|
||||
|
||||
import websockets
|
||||
@@ -17,6 +26,13 @@ import websockets
|
||||
PORT = 8765
|
||||
HOST = "0.0.0.0" # Allow external connections if needed
|
||||
|
||||
# Health heartbeat endpoint — monitored by lazarus-pit daemon (#879)
|
||||
HEALTH_PORT = int(os.environ.get("NEXUS_HEALTH_PORT", "8766"))
|
||||
HEALTH_HOST = os.environ.get("NEXUS_HEALTH_HOST", "127.0.0.1")
|
||||
|
||||
# Gateway start time for uptime reporting
|
||||
_start_time = time.time()
|
||||
|
||||
# Logging setup
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -28,6 +44,53 @@ logger = logging.getLogger("nexus-gateway")
|
||||
# State
|
||||
clients: Set[websockets.WebSocketServerProtocol] = set()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Health heartbeat HTTP endpoint — consumed by lazarus-pit (#879)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _HealthHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler for the /health liveness endpoint.

    ``GET /health`` is answered with a small JSON status document; every
    other path receives a bare 404 with no body.
    """

    def do_GET(self):  # noqa: N802
        """Serve the JSON health document for ``/health``, 404 otherwise."""
        if self.path != "/health":
            self.send_response(404)
            self.end_headers()
            return

        payload = {
            "status": "ok",
            "service": "nexus-gateway",
            "uptime_seconds": round(time.time() - _start_time, 1),
            "connected_clients": len(clients),
            "ws_port": PORT,
            "ts": time.time(),
        }
        body = json.dumps(payload).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):  # noqa: N802
        """Route handler log output to the gateway logger at DEBUG level.

        Routine access-log lines for 200/204 responses are dropped so that
        frequent health probes do not flood the log.
        """
        # The access log calls this with (requestline, status, size), so the
        # status sits at index 1. Other callers such as log_error() pass
        # arbitrary argument tuples that may hold fewer than two items, so the
        # status slot must only be inspected when it actually exists.
        if len(args) >= 2 and str(args[1]) in ("200", "204"):
            return
        # Only interpolate when there are arguments; a bare message may
        # legitimately contain a literal '%'.
        message = fmt % args if args else fmt
        logger.debug("Health endpoint: %s", message)
|
||||
|
||||
|
||||
def _start_health_server() -> Thread:
    """Launch the /health HTTP listener on a daemon thread and return that thread."""
    httpd = HTTPServer((HEALTH_HOST, HEALTH_PORT), _HealthHandler)
    # Daemon thread: it must never keep the process alive on shutdown.
    worker = Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    logger.info(
        "Health heartbeat endpoint listening on http://%s:%d/health",
        HEALTH_HOST,
        HEALTH_PORT,
    )
    return worker
|
||||
|
||||
|
||||
async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
||||
"""Handles individual client connections and message broadcasting."""
|
||||
clients.add(websocket)
|
||||
@@ -80,6 +143,7 @@ async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
||||
async def main():
|
||||
"""Main server loop with graceful shutdown."""
|
||||
logger.info(f"Starting Nexus WS gateway on ws://{HOST}:{PORT}")
|
||||
_start_health_server()
|
||||
|
||||
# Set up signal handlers for graceful shutdown
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
129
tests/test_gateway_health.py
Normal file
129
tests/test_gateway_health.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Tests for the gateway health heartbeat endpoint (#879 — M6-P0 Foundation).
|
||||
|
||||
Validates:
|
||||
- /health returns 200 with correct JSON schema
|
||||
- /health reports connected_clients count
|
||||
- Non-/health paths return 404
|
||||
- Uptime is non-negative
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from http.server import HTTPServer
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
# ── Load server module directly ──────────────────────────────────────────────
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
|
||||
# Patch websockets import so we don't need the package installed to test health
|
||||
_ws_mock = MagicMock()
|
||||
sys.modules.setdefault("websockets", _ws_mock)
|
||||
|
||||
_srv_spec = importlib.util.spec_from_file_location(
|
||||
"nexus_server_test",
|
||||
PROJECT_ROOT / "server.py",
|
||||
)
|
||||
_srv = importlib.util.module_from_spec(_srv_spec)
|
||||
sys.modules["nexus_server_test"] = _srv
|
||||
_srv_spec.loader.exec_module(_srv)
|
||||
|
||||
_HealthHandler = _srv._HealthHandler
|
||||
|
||||
|
||||
# ── Fake request helper ───────────────────────────────────────────────────────
|
||||
|
||||
class _FakeRequest:
    """Socket stand-in exposing only ``makefile`` and ``sendall``."""

    def __init__(self, raw_bytes: bytes):
        # Captured outbound bytes, and the canned inbound request.
        self.sent = BytesIO()
        self._buf = BytesIO(raw_bytes)

    def makefile(self, mode, **kwargs):
        """Return the request buffer for read modes, the capture buffer otherwise."""
        return self._buf if "r" in mode else self.sent

    def sendall(self, data: bytes):
        """Append *data* to the capture buffer."""
        self.sent.write(data)
|
||||
|
||||
|
||||
def _invoke_handler(path: str) -> tuple[int, dict]:
    """Call the health handler for a GET request and return (status_code, body_dict).

    The handler is created with ``__new__`` so no socket or real request
    parsing is involved; the response-writing methods are replaced with
    recorders. ``status_code`` is ``None`` if the handler never called
    ``send_response``; ``body_dict`` is ``{}`` when nothing was written or
    the written bytes are not valid JSON.
    """
    handler = _HealthHandler.__new__(_HealthHandler)
    handler.client_address = ("127.0.0.1", 9999)
    handler.server = MagicMock()
    handler.request_version = "HTTP/1.1"
    handler.command = "GET"
    handler.path = path
    handler.headers = {}

    # Record status codes instead of writing a status line.
    statuses: list[int] = []
    handler.send_response = lambda code, *a: statuses.append(code)
    handler.send_header = lambda k, v: None
    handler.end_headers = lambda: None

    # Record body chunks instead of writing to a socket file.
    body_parts: list[bytes] = []
    handler.wfile = MagicMock()
    handler.wfile.write = body_parts.append

    handler.do_GET()

    status = statuses[0] if statuses else None
    body = {}
    if body_parts:
        try:
            body = json.loads(b"".join(body_parts))
        except (json.JSONDecodeError, TypeError):
            # Best effort: callers assert on the dict contents, so an
            # unparsable body surfaces there as missing keys.
            pass
    return status, body
|
||||
|
||||
|
||||
# ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestHealthEndpoint:
    """Behaviour of the /health handler as seen through ``_invoke_handler``."""

    def test_health_returns_200(self):
        status, _ = _invoke_handler("/health")
        assert status == 200

    def test_health_body_schema(self):
        _, body = _invoke_handler("/health")
        assert body.get("status") == "ok"
        assert body.get("service") == "nexus-gateway"
        assert "uptime_seconds" in body
        assert "connected_clients" in body
        assert "ws_port" in body
        assert "ts" in body

    def test_uptime_is_non_negative(self):
        _, body = _invoke_handler("/health")
        assert body["uptime_seconds"] >= 0

    def test_unknown_path_returns_404(self):
        status, _ = _invoke_handler("/notfound")
        assert status == 404

    def test_root_path_returns_404(self):
        status, _ = _invoke_handler("/")
        assert status == 404

    def test_connected_clients_reflects_module_state(self):
        original = _srv.clients.copy()
        try:
            _srv.clients.clear()
            _, body = _invoke_handler("/health")
            assert body["connected_clients"] == 0

            # Also check a non-empty set; asserting only the empty case would
            # let a handler that always reports 0 pass.
            _srv.clients.update({object(), object()})
            _, body = _invoke_handler("/health")
            assert body["connected_clients"] == 2
        finally:
            # Drop the sentinels before restoring the saved contents.
            _srv.clients.clear()
            _srv.clients.update(original)
|
||||
42
tests/test_help_page.py
Normal file
42
tests/test_help_page.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Tests for the /help page. Refs: #833 (Missing /help page)."""
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_help_html_exists() -> None:
    """help.html has to be present, otherwise /help cannot be served."""
    help_page = Path("help.html")
    assert help_page.exists(), "help.html must exist to resolve /help 404"
|
||||
|
||||
|
||||
def test_help_html_is_valid_html() -> None:
    """help.html carries a doctype and an opening and closing <html> tag."""
    # The page contains non-ASCII glyphs and emoji, so decode explicitly
    # instead of relying on the platform default encoding.
    # NOTE(review): assumes help.html is saved as UTF-8 — confirm.
    content = Path("help.html").read_text(encoding="utf-8")
    assert "<!DOCTYPE html>" in content
    assert "<html" in content
    assert "</html>" in content
|
||||
|
||||
|
||||
def test_help_page_has_required_sections() -> None:
    """The help page mentions its core sections and links back home."""
    # The page contains non-ASCII glyphs and emoji, so decode explicitly
    # instead of relying on the platform default encoding.
    # NOTE(review): assumes help.html is saved as UTF-8 — confirm.
    content = Path("help.html").read_text(encoding="utf-8")

    # Navigation controls section
    assert "Navigation Controls" in content

    # Chat commands section
    assert "Chat" in content

    # Portal reference
    assert "Portal" in content

    # Back link to home
    assert 'href="/"' in content
|
||||
|
||||
|
||||
def test_help_page_links_back_to_home() -> None:
    """The help page contains a link whose target is the site root."""
    # Decode explicitly: the page contains non-ASCII glyphs and emoji.
    # NOTE(review): assumes help.html is saved as UTF-8 — confirm.
    content = Path("help.html").read_text(encoding="utf-8")
    assert 'href="/"' in content, "help page must have a link back to the main Nexus world"
|
||||
|
||||
|
||||
def test_help_page_has_keyboard_controls() -> None:
    """Every documented movement and interaction control appears on the page."""
    # Decode explicitly: the page contains non-ASCII glyphs and emoji.
    # NOTE(review): assumes help.html is saved as UTF-8 — confirm.
    content = Path("help.html").read_text(encoding="utf-8")
    # Movement keys are listed individually as <kbd> elements
    for key in ["<kbd>W</kbd>", "<kbd>A</kbd>", "<kbd>S</kbd>", "<kbd>D</kbd>",
                "Mouse", "Enter", "Esc"]:
        assert key in content, f"help page must document the {key!r} control"
|
||||
262
tests/test_lazarus_pit.py
Normal file
262
tests/test_lazarus_pit.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""Tests for the lazarus-pit daemon skeleton.
|
||||
|
||||
Validates:
|
||||
- Config loading (defaults and TOML overrides)
|
||||
- Cell discovery from /var/missions structure
|
||||
- Agent heartbeat reading
|
||||
- Health poll logic (stale vs fresh)
|
||||
- AgentHealth and CellState dataclasses
|
||||
- CLI argument parsing
|
||||
- Resurrection stub behaviour
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
# ── Load lazarus_pit module directly ────────────────────────────────────────
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
_lp_spec = importlib.util.spec_from_file_location(
|
||||
"lazarus_pit_test",
|
||||
PROJECT_ROOT / "bin" / "lazarus_pit.py",
|
||||
)
|
||||
_lp = importlib.util.module_from_spec(_lp_spec)
|
||||
sys.modules["lazarus_pit_test"] = _lp
|
||||
_lp_spec.loader.exec_module(_lp)
|
||||
|
||||
AgentHealth = _lp.AgentHealth
|
||||
CellState = _lp.CellState
|
||||
DaemonConfig = _lp.DaemonConfig
|
||||
load_config = _lp.load_config
|
||||
read_agent_heartbeat = _lp.read_agent_heartbeat
|
||||
discover_cells = _lp.discover_cells
|
||||
poll_cell = _lp.poll_cell
|
||||
attempt_revive = _lp.attempt_revive
|
||||
build_arg_parser = _lp.build_arg_parser
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _make_cell(tmp_path: Path, uuid: str, status: str = "active", agents=None) -> Path:
    """Build a bare-bones mission cell directory with a cell.json under tmp_path.

    Returns the path of the created cell directory. When *agents* is empty
    or omitted the manifest lists a single agent named "claude".
    """
    cell_dir = tmp_path / uuid
    cell_dir.mkdir()
    agent_names = agents or ["claude"]
    manifest_text = json.dumps({
        "uuid": uuid,
        "status": status,
        "mission": "test mission",
        "agents": [{"name": name} for name in agent_names],
    })
    (cell_dir / "cell.json").write_text(manifest_text)
    return cell_dir
|
||||
|
||||
|
||||
def _write_heartbeat(cell_path: Path, agent: str, age_seconds: float = 0) -> None:
    """Drop a heartbeat JSON file for *agent* in the cell, backdated by age_seconds."""
    target_dir = cell_path / ".lazarus" / "heartbeats"
    target_dir.mkdir(parents=True, exist_ok=True)
    heartbeat = {
        "agent": agent,
        "pid": 12345,
        # Subtracting the age makes the heartbeat look that many seconds old.
        "timestamp": time.time() - age_seconds,
        "cycle": 1,
        "model": "test",
        "status": "thinking",
    }
    heartbeat_file = target_dir / f"{agent}.json"
    heartbeat_file.write_text(json.dumps(heartbeat))
|
||||
|
||||
|
||||
# ── Config ───────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestDaemonConfig:
    """Default values of DaemonConfig and overrides loaded from TOML."""

    def test_defaults(self):
        cfg = DaemonConfig()
        assert cfg.poll_interval == 15
        assert cfg.heartbeat_stale_threshold == 60
        assert cfg.max_revive_attempts == 3

    def test_load_missing_file_returns_defaults(self, tmp_path):
        cfg = load_config(tmp_path / "nonexistent.toml")
        assert cfg.poll_interval == 15

    def test_load_valid_toml(self, tmp_path):
        toml_content = b"""
[cells]
root = "/tmp/missions"
poll_interval = 30
heartbeat_stale_threshold = 90
"""
        toml_path = tmp_path / "test.toml"
        toml_path.write_bytes(toml_content)

        # Skip up front when no TOML parser can be imported at all.
        # NOTE(review): which parser load_config uses is not visible here;
        # the common candidates are probed — confirm against lazarus_pit.py.
        parsers = ("tomllib", "tomli", "toml")
        if not any(importlib.util.find_spec(name) for name in parsers):
            pytest.skip("TOML parser not available")

        try:
            cfg = load_config(toml_path)
        except Exception:
            # TOML parser not available — skip, not a test failure
            pytest.skip("TOML parser not available")

        # The assertions stay outside the try block: AssertionError is an
        # Exception, so inside it a wrong value would be reported as a skip
        # and the test could never fail.
        assert cfg.poll_interval == 30
        assert cfg.heartbeat_stale_threshold == 90
        assert str(cfg.cells_root) == "/tmp/missions"
|
||||
|
||||
|
||||
# ── Cell discovery ───────────────────────────────────────────────────────────
|
||||
|
||||
class TestDiscoverCells:
    """discover_cells() against a variety of on-disk layouts."""

    def test_empty_root(self, tmp_path):
        assert discover_cells(tmp_path) == []

    def test_missing_root(self, tmp_path):
        absent_root = tmp_path / "nonexistent"
        assert discover_cells(absent_root) == []

    def test_finds_active_cell(self, tmp_path):
        _make_cell(tmp_path, "aaa-111", status="active")
        found = discover_cells(tmp_path)
        assert len(found) == 1
        assert found[0].uuid == "aaa-111"

    def test_finds_frozen_cell(self, tmp_path):
        _make_cell(tmp_path, "bbb-222", status="frozen")
        found = discover_cells(tmp_path)
        assert len(found) == 1

    def test_ignores_archived_cell(self, tmp_path):
        _make_cell(tmp_path, "ccc-333", status="archived")
        found = discover_cells(tmp_path)
        assert len(found) == 0

    def test_ignores_directories_without_manifest(self, tmp_path):
        # A plain directory with no cell.json is not a cell.
        (tmp_path / "not-a-cell").mkdir()
        assert discover_cells(tmp_path) == []

    def test_ignores_corrupt_manifest(self, tmp_path):
        broken_cell = tmp_path / "bad-cell"
        broken_cell.mkdir()
        (broken_cell / "cell.json").write_text("not json {{")
        assert discover_cells(tmp_path) == []

    def test_agents_populated_from_manifest(self, tmp_path):
        _make_cell(tmp_path, "ddd-444", agents=["claude", "allegro"])
        found = discover_cells(tmp_path)
        agent_names = set(found[0].agents.keys())
        assert agent_names == {"claude", "allegro"}
|
||||
|
||||
|
||||
# ── Heartbeat reading ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestReadAgentHeartbeat:
    """read_agent_heartbeat() for missing, valid and corrupt heartbeat files."""

    def test_returns_none_for_missing_file(self, tmp_path):
        assert read_agent_heartbeat(tmp_path, "ghost") is None

    def test_returns_data_for_valid_heartbeat(self, tmp_path):
        _write_heartbeat(tmp_path, "claude", age_seconds=0)
        heartbeat = read_agent_heartbeat(tmp_path, "claude")
        assert heartbeat is not None
        assert heartbeat["agent"] == "claude"
        assert "timestamp" in heartbeat

    def test_returns_none_for_corrupt_json(self, tmp_path):
        heartbeat_dir = tmp_path / ".lazarus" / "heartbeats"
        heartbeat_dir.mkdir(parents=True)
        # Truncated JSON: the reader is expected to yield None for it.
        (heartbeat_dir / "bad.json").write_text("{not valid")
        assert read_agent_heartbeat(tmp_path, "bad") is None
|
||||
|
||||
|
||||
# ── Health polling ────────────────────────────────────────────────────────────
|
||||
|
||||
class TestPollCell:
    """poll_cell() with fresh, stale and absent heartbeats."""

    def _make_cell_state(self, tmp_path, agents=None):
        """Return a CellState rooted at tmp_path with one AgentHealth per agent name."""
        state = CellState(uuid="test-uuid", path=tmp_path)
        for agent_name in (agents or ["claude"]):
            state.agents[agent_name] = AgentHealth(name=agent_name)
        return state

    def test_healthy_agent_no_warnings(self, tmp_path):
        state = self._make_cell_state(tmp_path)
        _write_heartbeat(tmp_path, "claude", age_seconds=5)
        config = DaemonConfig(heartbeat_stale_threshold=60)
        problems = poll_cell(state, config)
        assert problems == []
        assert state.agents["claude"].healthy is True

    def test_stale_heartbeat_generates_warning(self, tmp_path):
        state = self._make_cell_state(tmp_path)
        # 120s old against a 60s threshold.
        _write_heartbeat(tmp_path, "claude", age_seconds=120)
        config = DaemonConfig(heartbeat_stale_threshold=60)
        problems = poll_cell(state, config)
        assert len(problems) == 1
        assert "stale" in problems[0]
        assert state.agents["claude"].healthy is False

    def test_missing_heartbeat_generates_warning(self, tmp_path):
        state = self._make_cell_state(tmp_path)
        config = DaemonConfig(heartbeat_stale_threshold=60)
        problems = poll_cell(state, config)
        assert len(problems) == 1
        assert "no heartbeat" in problems[0]
        assert state.agents["claude"].healthy is False
|
||||
|
||||
|
||||
# ── Resurrection stub ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestAttemptRevive:
    """attempt_revive() stub: return value and revive_count bookkeeping."""

    def test_stub_returns_false(self, tmp_path):
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(name="claude", healthy=False)
        config = DaemonConfig(max_revive_attempts=3)
        assert attempt_revive(state, "claude", config) is False

    def test_increments_revive_count(self, tmp_path):
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(name="claude", healthy=False)
        config = DaemonConfig(max_revive_attempts=3)
        attempt_revive(state, "claude", config)
        assert state.agents["claude"].revive_count == 1

    def test_stops_at_max_revive_attempts(self, tmp_path):
        state = CellState(uuid="xyz", path=tmp_path)
        # The agent starts already at the configured maximum.
        state.agents["claude"] = AgentHealth(name="claude", healthy=False, revive_count=3)
        config = DaemonConfig(max_revive_attempts=3)
        outcome = attempt_revive(state, "claude", config)
        assert outcome is False
        # Count should not increment beyond max
        assert state.agents["claude"].revive_count == 3

    def test_unknown_agent_returns_false(self, tmp_path):
        state = CellState(uuid="xyz", path=tmp_path)
        config = DaemonConfig()
        assert attempt_revive(state, "nobody", config) is False
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestCLI:
    """Argument parsing of the lazarus-pit command line."""

    def test_default_config_path(self):
        parsed = build_arg_parser().parse_args([])
        assert "lazarus-pit.toml" in parsed.config

    def test_custom_config(self):
        argv = ["--config", "/tmp/custom.toml"]
        parsed = build_arg_parser().parse_args(argv)
        assert parsed.config == "/tmp/custom.toml"

    def test_status_flag(self):
        parsed = build_arg_parser().parse_args(["--status"])
        assert parsed.status is True

    def test_list_cells_flag(self):
        parsed = build_arg_parser().parse_args(["--list-cells"])
        assert parsed.list_cells is True
|
||||
39
tests/test_manifest.py
Normal file
39
tests/test_manifest.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""Tests for manifest.json PWA support. Fixes #832 (Missing manifest.json)."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_manifest_exists() -> None:
    """manifest.json has to be present for PWA support."""
    manifest_file = Path("manifest.json")
    assert manifest_file.exists(), "manifest.json must exist for PWA support"
|
||||
|
||||
|
||||
def test_manifest_is_valid_json() -> None:
    """manifest.json parses as JSON and its top-level value is an object."""
    # JSON exchanged between systems is UTF-8; decode it as such rather than
    # with the platform default encoding.
    content = Path("manifest.json").read_text(encoding="utf-8")
    data = json.loads(content)
    assert isinstance(data, dict)
|
||||
|
||||
|
||||
def test_manifest_has_required_pwa_fields() -> None:
    """manifest.json defines name, short_name, start_url, display and icons."""
    # Decode as UTF-8 explicitly instead of the platform default encoding.
    data = json.loads(Path("manifest.json").read_text(encoding="utf-8"))
    assert "name" in data, "manifest.json must have 'name'"
    assert "short_name" in data, "manifest.json must have 'short_name'"
    assert "start_url" in data, "manifest.json must have 'start_url'"
    assert "display" in data, "manifest.json must have 'display'"
    assert "icons" in data, "manifest.json must have 'icons'"
|
||||
|
||||
|
||||
def test_manifest_icons_non_empty() -> None:
    """manifest.json lists at least one icon."""
    # Decode as UTF-8 explicitly instead of the platform default encoding.
    data = json.loads(Path("manifest.json").read_text(encoding="utf-8"))
    assert len(data["icons"]) > 0, "manifest.json must define at least one icon"
|
||||
|
||||
|
||||
def test_index_html_references_manifest() -> None:
    """index.html links the web app manifest."""
    # Decode explicitly rather than with the platform default encoding.
    # NOTE(review): assumes index.html is saved as UTF-8 — confirm.
    content = Path("index.html").read_text(encoding="utf-8")
    assert 'rel="manifest"' in content, "index.html must have <link rel=\"manifest\">"
    assert "manifest.json" in content, "index.html must reference manifest.json"
|
||||
|
||||
|
||||
def test_help_html_references_manifest() -> None:
    """help.html links the web app manifest."""
    # Decode explicitly: the page contains non-ASCII glyphs and emoji.
    # NOTE(review): assumes help.html is saved as UTF-8 — confirm.
    content = Path("help.html").read_text(encoding="utf-8")
    assert 'rel="manifest"' in content, "help.html must have <link rel=\"manifest\">"
    assert "manifest.json" in content, "help.html must reference manifest.json"
|
||||
Reference in New Issue
Block a user