Compare commits
4 Commits
timmy/issu
...
claude/iss
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f83e103d27 | ||
| fd75985db6 | |||
| 3b4c5e7207 | |||
| 0b57145dde |
62
.gitea/ISSUE_TEMPLATE/mission-proposal.md
Normal file
62
.gitea/ISSUE_TEMPLATE/mission-proposal.md
Normal file
@@ -0,0 +1,62 @@
|
||||
---
|
||||
name: Mission Proposal
|
||||
about: Propose a new Mission Cell — a temporary, isolated collaboration space for agents and humans
|
||||
title: "[MISSION] "
|
||||
labels: mission-proposal
|
||||
---
|
||||
|
||||
## Mission Summary
|
||||
|
||||
<!-- One-sentence description of the mission objective -->
|
||||
|
||||
## Objective
|
||||
|
||||
<!-- What is the outcome we are driving toward? What does "done" look like? -->
|
||||
|
||||
## Agents Invited
|
||||
|
||||
<!-- List agents (or humans) that should be invited into this cell.
|
||||
Format: @agent-name — role (developer / reviewer / observer / coordinator) -->
|
||||
|
||||
- @ —
|
||||
|
||||
## Scope & Deliverables
|
||||
|
||||
<!-- What artifacts will this mission produce? PRs, docs, deployed services, etc. -->
|
||||
|
||||
- [ ]
|
||||
- [ ]
|
||||
|
||||
## Isolation Requirements
|
||||
|
||||
- [ ] No wizard home-directory access needed
|
||||
- [ ] Read-only access to the following homes: (list or "none")
|
||||
- [ ] External network access required: yes / no
|
||||
|
||||
## Cell Configuration
|
||||
|
||||
<!-- Leave blank to use defaults from config/lazarus-pit.toml -->
|
||||
|
||||
| Setting | Value |
|
||||
|--------------------------|-------|
|
||||
| Max duration | |
|
||||
| Checkpoint interval | |
|
||||
| Auto-archive on close | yes / no |
|
||||
| Max revive attempts | |
|
||||
|
||||
## Related Issues / Context
|
||||
|
||||
- Epic: #
|
||||
- Depends on: #
|
||||
- Blocked by: #
|
||||
|
||||
## Success Criteria
|
||||
|
||||
<!-- How will we know this mission succeeded? Be specific. -->
|
||||
|
||||
1.
|
||||
2.
|
||||
|
||||
## Notes
|
||||
|
||||
<!-- Anything else the lazarus-pit daemon or cell participants should know -->
|
||||
BIN
bin/__pycache__/webhook_health_dashboard.cpython-312.pyc
Normal file
BIN
bin/__pycache__/webhook_health_dashboard.cpython-312.pyc
Normal file
Binary file not shown.
419
bin/lazarus_pit.py
Normal file
419
bin/lazarus_pit.py
Normal file
@@ -0,0 +1,419 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
lazarus-pit — Agent resurrection pool daemon.
|
||||
|
||||
Monitors active mission cells, heartbeats agents, detects downed agents,
|
||||
and revives them back into their mission cells.
|
||||
|
||||
Usage:
|
||||
python bin/lazarus_pit.py [--config path/to/lazarus-pit.toml]
|
||||
python bin/lazarus_pit.py --status
|
||||
python bin/lazarus_pit.py --list-cells
|
||||
|
||||
Config: config/lazarus-pit.toml
|
||||
Architecture: docs/lazarus-pit/mission-cell-spec.md
|
||||
Epic: #878 P0 Foundation: #879
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional TOML support (stdlib tomllib in 3.11+, tomli on older)
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ImportError:
|
||||
try:
|
||||
import tomli as tomllib # type: ignore[no-reuse-def]
|
||||
except ImportError:
|
||||
tomllib = None # Config loading will fall back to defaults
|
||||
|
||||
# Repository root: this file lives in <root>/bin/.
PROJECT_ROOT = Path(__file__).parent.parent
DEFAULT_CONFIG_PATH = PROJECT_ROOT.joinpath("config", "lazarus-pit.toml")
DEFAULT_CELLS_ROOT = Path("/var/missions")

# Timing defaults, all in seconds.
DEFAULT_HEARTBEAT_STALE = 60
DEFAULT_RESURRECT_AFTER = 120
DEFAULT_POLL_INTERVAL = 15

# Revive attempts allowed per agent before escalating.
DEFAULT_MAX_REVIVES = 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class AgentHealth:
    """Health record the daemon keeps for one agent of a mission cell."""

    # Agent name as listed in the cell manifest.
    name: str
    # False once the heartbeat is found missing or stale.
    healthy: bool = True
    # "timestamp" value of the last heartbeat read; 0.0 until one is seen.
    last_seen: float = 0.0
    # Number of revive attempts made for this agent.
    revive_count: int = 0
    # "status" value of the last heartbeat read.
    last_status: str = "unknown"
|
||||
|
||||
|
||||
@dataclass
class CellState:
    """In-memory view of one mission cell found under the cells root."""

    # Identifier from the manifest (or the directory name as a fallback).
    uuid: str
    # Directory of the cell on disk.
    path: Path
    # Per-agent health records keyed by agent name; a fresh dict per instance.
    agents: dict[str, AgentHealth] = field(default_factory=dict)
    # Manifest "status" value.
    status: str = "active"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class DaemonConfig:
    """Daemon settings; each default applies unless load_config overrides it."""

    # --- [cells] ---
    cells_root: Path = DEFAULT_CELLS_ROOT
    heartbeat_stale_threshold: int = DEFAULT_HEARTBEAT_STALE  # seconds
    resurrect_after: int = DEFAULT_RESURRECT_AFTER  # seconds
    max_revive_attempts: int = DEFAULT_MAX_REVIVES
    poll_interval: int = DEFAULT_POLL_INTERVAL  # seconds

    # --- [daemon] ---
    log_level: str = "INFO"
    log_file: str = "-"  # "-" means no log file is configured
    pid_file: str = "/var/run/lazarus-pit.pid"

    # --- [gitea] ---
    gitea_url: str = "https://forge.alexanderwhitestone.com"
    gitea_repo: str = "Timmy_Foundation/the-nexus"
    gitea_token: str = ""

    # --- [notifications] ---
    open_issue_on_death: bool = True
    close_issue_on_revive: bool = True
|
||||
|
||||
|
||||
def load_config(config_path: Path) -> DaemonConfig:
    """Load configuration from a TOML file, falling back to defaults.

    Args:
        config_path: Location of the TOML file. A missing file is not an
            error; the defaults are returned.

    Returns:
        A DaemonConfig populated from the ``[cells]``, ``[daemon]``,
        ``[gitea]`` and ``[notifications]`` tables. The ``GITEA_TOKEN``
        environment variable takes precedence over any ``token`` in the file.

    Raises:
        Whatever the TOML parser raises for a malformed file.
    """
    cfg = DaemonConfig()

    # Apply the environment token before any early return, so it is honored
    # even when there is no config file or no TOML parser.
    env_token = os.environ.get("GITEA_TOKEN")
    if env_token is not None:
        cfg.gitea_token = env_token

    if not config_path.exists():
        return cfg

    if tomllib is None:
        logging.warning(
            "TOML parser not available (install tomli for Python < 3.11). "
            "Using defaults."
        )
        return cfg

    with open(config_path, "rb") as f:
        raw = tomllib.load(f)

    cells = raw.get("cells", {})
    daemon = raw.get("daemon", {})
    gitea = raw.get("gitea", {})
    notifications = raw.get("notifications", {})

    cfg.cells_root = Path(cells.get("root", str(cfg.cells_root)))
    cfg.heartbeat_stale_threshold = cells.get(
        "heartbeat_stale_threshold", cfg.heartbeat_stale_threshold
    )
    cfg.resurrect_after = cells.get("resurrect_after", cfg.resurrect_after)
    cfg.max_revive_attempts = cells.get("max_revive_attempts", cfg.max_revive_attempts)
    cfg.poll_interval = cells.get("poll_interval", cfg.poll_interval)

    cfg.log_level = daemon.get("log_level", cfg.log_level)
    cfg.log_file = daemon.get("log_file", cfg.log_file)
    cfg.pid_file = daemon.get("pid_file", cfg.pid_file)

    cfg.gitea_url = gitea.get("url", cfg.gitea_url)
    cfg.gitea_repo = gitea.get("repo", cfg.gitea_repo)
    # Environment wins; otherwise use the file's token, or "" when absent.
    cfg.gitea_token = env_token if env_token is not None else gitea.get("token", "")

    cfg.open_issue_on_death = notifications.get(
        "open_issue_on_death", cfg.open_issue_on_death
    )
    cfg.close_issue_on_revive = notifications.get(
        "close_issue_on_revive", cfg.close_issue_on_revive
    )

    return cfg
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Heartbeat reader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def read_agent_heartbeat(cell_path: Path, agent_name: str) -> Optional[dict]:
    """Read an agent's heartbeat file from a cell.

    Args:
        cell_path: Directory of the mission cell.
        agent_name: Agent whose ``.lazarus/heartbeats/<agent_name>.json``
            file is read.

    Returns:
        The decoded JSON object, or None when the file is missing, cannot be
        read or decoded, or does not contain a JSON object.
    """
    hb_path = cell_path / ".lazarus" / "heartbeats" / f"{agent_name}.json"
    try:
        # A missing file raises FileNotFoundError (an OSError). ValueError
        # covers both json.JSONDecodeError and UnicodeDecodeError.
        payload = json.loads(hb_path.read_text())
    except (OSError, ValueError):
        return None
    # Callers use .get() on the result, so only a JSON object is a heartbeat.
    return payload if isinstance(payload, dict) else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cell discovery
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def discover_cells(cells_root: Path) -> list[CellState]:
    """Scan cells_root and return every cell whose status is active or frozen.

    A sub-directory counts as a cell when it holds a readable ``cell.json``
    containing a JSON object. Unreadable or malformed manifests are skipped
    rather than raising.

    Args:
        cells_root: Directory holding one sub-directory per mission cell.

    Returns:
        One CellState per matching cell, each with a fresh AgentHealth entry
        for every agent named in the manifest. Empty when cells_root does
        not exist.
    """
    cells: list[CellState] = []
    if not cells_root.exists():
        return cells

    for entry in cells_root.iterdir():
        if not entry.is_dir():
            continue
        manifest_path = entry / "cell.json"
        if not manifest_path.exists():
            continue
        try:
            manifest = json.loads(manifest_path.read_text())
        except (ValueError, OSError):
            # ValueError covers JSONDecodeError and undecodable bytes.
            continue

        # Valid JSON that is not an object cannot be a manifest.
        if not isinstance(manifest, dict):
            continue

        if manifest.get("status") not in ("active", "frozen"):
            continue

        cell = CellState(
            uuid=manifest.get("uuid", entry.name),
            path=entry,
            status=manifest.get("status", "active"),
        )
        agents = manifest.get("agents", [])
        if isinstance(agents, list):
            for agent_info in agents:
                # Skip entries that are not objects instead of crashing.
                if not isinstance(agent_info, dict):
                    continue
                name = agent_info.get("name", "unknown")
                cell.agents[name] = AgentHealth(name=name)
        cells.append(cell)

    return cells
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Health poll
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def poll_cell(cell: CellState, cfg: DaemonConfig) -> list[str]:
    """Refresh the health of every agent in a cell from its heartbeat file.

    Updates each AgentHealth in ``cell.agents`` in place (``healthy``,
    ``last_seen``, ``last_status``).

    Args:
        cell: Cell whose agents are polled.
        cfg: Supplies ``heartbeat_stale_threshold`` (seconds).

    Returns:
        One warning message per agent whose heartbeat is missing, malformed
        or older than the threshold.
    """
    now = time.time()
    warnings: list[str] = []

    for agent_name, health in cell.agents.items():
        hb = read_agent_heartbeat(cell.path, agent_name)

        if hb is None:
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: no heartbeat file found"
            )
            continue

        # Heartbeat files come from agents, so guard against content that
        # is not a JSON object.
        if not isinstance(hb, dict):
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: heartbeat is not a JSON object"
            )
            continue

        # A missing or non-numeric timestamp is treated as 0, which makes the
        # heartbeat stale instead of raising TypeError in the subtraction.
        timestamp = hb.get("timestamp", 0)
        if not isinstance(timestamp, (int, float)):
            timestamp = 0

        age = now - timestamp
        health.last_seen = timestamp
        health.last_status = hb.get("status", "unknown")

        if age > cfg.heartbeat_stale_threshold:
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: heartbeat stale ({age:.0f}s old, "
                f"threshold {cfg.heartbeat_stale_threshold}s)"
            )
        else:
            health.healthy = True

    return warnings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Resurrection (stub — P3 will implement fully)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def attempt_revive(cell: CellState, agent_name: str, cfg: DaemonConfig) -> bool:
    """Record one resurrection attempt for a downed agent (stub).

    The real revive logic is planned for P3 (#882). For now this only logs
    and bumps the agent's ``revive_count`` while it is below
    ``cfg.max_revive_attempts``.

    Returns:
        Always False: the agent is unknown, the attempt budget is used up,
        or the stub did not actually revive anything.
    """
    record = cell.agents.get(agent_name)
    if record is None:
        return False

    limit = cfg.max_revive_attempts
    if record.revive_count < limit:
        record.revive_count += 1
        logging.warning(
            "[%s] %s: initiating resurrection attempt %d/%d (P3 stub)",
            cell.uuid, agent_name, record.revive_count, limit,
        )
        # TODO (P3 #882): exec the agent's harness entrypoint inside the cell
        return False

    logging.error(
        "[%s] %s: max revive attempts (%d) reached — escalating to human",
        cell.uuid, agent_name, limit,
    )
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main daemon loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LazarusPit:
    """Polling daemon that checks agent heartbeats and triggers revives."""

    def __init__(self, cfg: DaemonConfig):
        self.cfg = cfg
        self._running = False
        # Revive attempts per (cell uuid, agent name). discover_cells builds
        # fresh AgentHealth objects on every tick, so without this the count
        # would restart at 0 each time and max_revive_attempts would never
        # be reached.
        self._revive_counts: dict[tuple[str, str], int] = {}

    def start(self) -> None:
        """Run the poll loop until a shutdown signal clears the run flag."""
        self._running = True
        self._write_pid()
        self._setup_signals()

        logging.info("Lazarus Pit daemon started (poll interval: %ds)", self.cfg.poll_interval)

        try:
            while self._running:
                self._tick()
                time.sleep(self.cfg.poll_interval)
        finally:
            # Also runs when _tick raises, so the PID file is not left behind.
            logging.info("Lazarus Pit daemon stopped.")
            self._remove_pid()

    def _tick(self) -> None:
        """Poll every discovered cell once and revive long-dead agents."""
        cells = discover_cells(self.cfg.cells_root)
        if not cells:
            logging.debug("No active mission cells found in %s", self.cfg.cells_root)
            return

        for cell in cells:
            for msg in poll_cell(cell, self.cfg):
                logging.warning(msg)

            now = time.time()
            for agent_name, health in cell.agents.items():
                if health.healthy:
                    continue
                # An agent never seen (last_seen == 0) is infinitely old.
                age = now - health.last_seen if health.last_seen else float("inf")
                if age > self.cfg.resurrect_after:
                    key = (cell.uuid, agent_name)
                    # Restore the attempts made on earlier ticks before
                    # attempt_revive checks the budget, then store the result.
                    health.revive_count = self._revive_counts.get(key, 0)
                    attempt_revive(cell, agent_name, self.cfg)
                    self._revive_counts[key] = health.revive_count

    def _write_pid(self) -> None:
        """Write this process's PID to the configured file (best effort)."""
        pid_path = Path(self.cfg.pid_file)
        try:
            pid_path.parent.mkdir(parents=True, exist_ok=True)
            pid_path.write_text(str(os.getpid()))
        except OSError as e:
            logging.warning("Could not write PID file: %s", e)

    def _remove_pid(self) -> None:
        """Delete the PID file, ignoring errors."""
        try:
            Path(self.cfg.pid_file).unlink(missing_ok=True)
        except OSError:
            pass

    def _setup_signals(self) -> None:
        """Route SIGTERM and SIGINT to the shutdown handler."""
        signal.signal(signal.SIGTERM, self._handle_shutdown)
        signal.signal(signal.SIGINT, self._handle_shutdown)

    def _handle_shutdown(self, signum, frame) -> None:
        """Signal handler: ask the poll loop to exit after its current pass."""
        logging.info("Signal %d received — shutting down.", signum)
        self._running = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def cmd_status(cfg: DaemonConfig) -> None:
    """Print each discovered cell with the heartbeat state of its agents."""
    active = discover_cells(cfg.cells_root)
    if not active:
        print(f"No active cells in {cfg.cells_root}")
        return

    for cell in active:
        print(f"\nCell {cell.uuid} [{cell.status}]")
        for agent_name in cell.agents:
            beat = read_agent_heartbeat(cell.path, agent_name)
            if not beat:
                print(f" {agent_name}: no heartbeat")
                continue
            age = time.time() - beat.get("timestamp", 0)
            print(f" {agent_name}: {beat.get('status', '?')} (heartbeat {age:.0f}s ago)")
|
||||
|
||||
|
||||
def cmd_list_cells(cfg: DaemonConfig) -> None:
    """Print one line per directory under the cells root that has a cell.json.

    Unlike cmd_status this does not filter by status; unreadable manifests
    are listed with a placeholder.
    """
    cells_root = cfg.cells_root
    if not cells_root.exists():
        print(f"{cells_root} does not exist")
        return

    listed = 0
    for entry in sorted(cells_root.iterdir()):
        manifest_file = entry / "cell.json"
        if not manifest_file.exists():
            continue
        listed += 1
        try:
            data = json.loads(manifest_file.read_text())
            print(f"{data.get('uuid', entry.name)}: {data.get('status', '?')} — {data.get('mission', '')}")
        except (json.JSONDecodeError, OSError):
            print(f"{entry.name}: [unreadable manifest]")

    if not listed:
        print(f"No mission cells found in {cells_root}")
|
||||
|
||||
|
||||
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the CLI parser: --config plus the --status / --list-cells flags."""
    parser = argparse.ArgumentParser(
        description="Lazarus Pit — agent resurrection pool daemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,  # the module docstring doubles as the help epilog
    )
    parser.add_argument(
        "--config",
        default=str(DEFAULT_CONFIG_PATH),
        help="Path to lazarus-pit.toml (default: config/lazarus-pit.toml)",
    )
    # Both one-shot commands are plain boolean switches.
    one_shot_flags = (
        ("--status", "Print current cell/agent health and exit"),
        ("--list-cells", "List all mission cells and exit"),
    )
    for flag, help_text in one_shot_flags:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load config, set up logging, run a command or the daemon."""
    args = build_arg_parser().parse_args()
    cfg = load_config(Path(args.config))

    # Configure logging
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
    if not isinstance(log_level, int):
        # The name matched a logging attribute that is not a level.
        log_level = logging.INFO

    # load_config may already have logged a warning, which installs a default
    # root handler and would turn a plain basicConfig() into a no-op.
    # force=True replaces that handler so these settings take effect.
    handler_kwargs = {} if cfg.log_file == "-" else {"filename": cfg.log_file}
    logging.basicConfig(level=log_level, format=log_format, force=True, **handler_kwargs)

    if args.status:
        cmd_status(cfg)
        return

    if args.list_cells:
        cmd_list_cells(cfg)
        return

    pit = LazarusPit(cfg)
    pit.start()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
275
bin/webhook_health_dashboard.py
Normal file
275
bin/webhook_health_dashboard.py
Normal file
@@ -0,0 +1,275 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Webhook health dashboard for fleet agent endpoints.
|
||||
|
||||
Issue: #855 in Timmy_Foundation/the-nexus
|
||||
|
||||
Probes each configured /health endpoint, persists the last-known-good state to a
|
||||
JSON log, and generates a markdown dashboard in ~/.hermes/burn-logs/.
|
||||
|
||||
Default targets:
|
||||
- bezalel: http://127.0.0.1:8650/health
|
||||
- allegro: http://127.0.0.1:8651/health
|
||||
- ezra: http://127.0.0.1:8652/health
|
||||
- adagio: http://127.0.0.1:8653/health
|
||||
|
||||
Environment overrides:
|
||||
- WEBHOOK_HEALTH_TARGETS="allegro=http://127.0.0.1:8651/health,ezra=http://127.0.0.1:8652/health"
|
||||
- WEBHOOK_HEALTH_TIMEOUT=3
|
||||
- WEBHOOK_STALE_AFTER=300
|
||||
- WEBHOOK_HEALTH_OUTPUT=/custom/webhook-health-latest.md
|
||||
- WEBHOOK_HEALTH_HISTORY=/custom/webhook-health-history.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Built-in agents, one local port each.
DEFAULT_TARGETS = {
    agent: f"http://127.0.0.1:{port}/health"
    for agent, port in (
        ("bezalel", 8650),
        ("allegro", 8651),
        ("ezra", 8652),
        ("adagio", 8653),
    )
}

# Environment overrides are read once, at import time.
DEFAULT_TIMEOUT = float(os.environ.get("WEBHOOK_HEALTH_TIMEOUT", "3"))
DEFAULT_STALE_AFTER = int(os.environ.get("WEBHOOK_STALE_AFTER", "300"))
DEFAULT_OUTPUT = Path(
    os.environ.get(
        "WEBHOOK_HEALTH_OUTPUT",
        str(Path.home().joinpath(".hermes", "burn-logs", "webhook-health-latest.md")),
    )
).expanduser()
DEFAULT_HISTORY = Path(
    os.environ.get(
        "WEBHOOK_HEALTH_HISTORY",
        str(Path.home().joinpath(".hermes", "burn-logs", "webhook-health-history.json")),
    )
).expanduser()
|
||||
|
||||
|
||||
@dataclass
class AgentHealth:
    """Result of probing one agent's health endpoint."""

    name: str
    url: str
    http_status: int | None  # None when no HTTP response was received
    healthy: bool
    latency_ms: int | None
    stale: bool
    last_success_ts: float | None  # None if the agent has never succeeded
    checked_at: float
    message: str

    def status_icon(self) -> str:
        """Return green for healthy, red for stale, orange for any other failure."""
        if not self.healthy:
            return "🔴" if self.stale else "🟠"
        return "🟢"

    def last_success_age_seconds(self) -> int | None:
        """Whole seconds from last success to this check, floored at 0; None if never."""
        ts = self.last_success_ts
        if ts is None:
            return None
        elapsed = int(self.checked_at - ts)
        return elapsed if elapsed > 0 else 0
|
||||
|
||||
|
||||
def parse_targets(raw: str | None) -> dict[str, str]:
    """Parse a ``name=url,name=url`` spec into a name-to-URL mapping.

    Returns a copy of DEFAULT_TARGETS when raw is empty or None.

    Raises:
        ValueError: If a non-empty entry has no ``=``, or no entry remains
            after blank ones are dropped.
    """
    if not raw:
        return dict(DEFAULT_TARGETS)

    parsed: dict[str, str] = {}
    for piece in (part.strip() for part in raw.split(",")):
        if not piece:
            continue
        # Split on the first "=" only, so URLs may contain "=" themselves.
        name, sep, url = piece.partition("=")
        if not sep:
            raise ValueError(f"Invalid target spec: {piece!r}")
        parsed[name.strip()] = url.strip()

    if not parsed:
        raise ValueError("No valid targets parsed")
    return parsed
|
||||
|
||||
|
||||
def load_history(path: Path) -> dict[str, Any]:
    """Load the persisted probe history, starting fresh when it is unusable.

    Args:
        path: Location of the JSON history file.

    Returns:
        The decoded history object. An empty history
        (``{"agents": {}, "runs": []}``) is returned when the file is
        missing, is not valid JSON/UTF-8, or does not hold a JSON object.
        This keeps one truncated write from failing every later run.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {"agents": {}, "runs": []}
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError.
        return {"agents": {}, "runs": []}
    if not isinstance(data, dict):
        return {"agents": {}, "runs": []}
    return data
|
||||
|
||||
|
||||
def save_history(path: Path, history: dict[str, Any]) -> None:
    """Write history to path as indented, key-sorted JSON, creating parent dirs."""
    serialized = json.dumps(history, indent=2, sort_keys=True)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def probe_health(url: str, timeout: float) -> tuple[bool, int | None, int | None, str]:
    """Probe one health endpoint and summarize the outcome.

    Never raises: every failure, including a malformed URL, is reported
    through the returned tuple.

    Args:
        url: Endpoint to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(ok, http_status, latency_ms, message)``. ``ok`` is True for a 2xx
        response. ``http_status`` is None when no HTTP response was received.
        ``message`` includes the body's ``status`` field when the first 512
        bytes parse as a JSON object that has one.
    """
    started = time.perf_counter()
    try:
        # Built inside the try because Request() raises ValueError for a URL
        # without a scheme; that must count as a failed probe, not a crash.
        req = urllib.request.Request(url, headers={"User-Agent": "the-nexus/webhook-health-dashboard"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = resp.read(512)
            latency_ms = int((time.perf_counter() - started) * 1000)
            status = getattr(resp, "status", None) or 200
            message = f"HTTP {status}"
            if body:
                try:
                    payload = json.loads(body.decode("utf-8", errors="replace"))
                except ValueError:
                    # Body is not JSON (or was cut at 512 bytes); keep the
                    # plain HTTP message.
                    payload = None
                if isinstance(payload, dict) and payload.get("status"):
                    message = f"HTTP {status} — {payload['status']}"
            return 200 <= status < 300, status, latency_ms, message
    except urllib.error.HTTPError as e:
        latency_ms = int((time.perf_counter() - started) * 1000)
        return False, e.code, latency_ms, f"HTTP {e.code}"
    except urllib.error.URLError as e:
        latency_ms = int((time.perf_counter() - started) * 1000)
        return False, None, latency_ms, f"URL error: {e.reason}"
    except Exception as e:
        # Deliberate catch-all: one bad target must not abort the whole run.
        latency_ms = int((time.perf_counter() - started) * 1000)
        return False, None, latency_ms, f"Probe failed: {e}"
|
||||
|
||||
|
||||
def check_agents(
    targets: dict[str, str],
    history: dict[str, Any],
    timeout: float = DEFAULT_TIMEOUT,
    stale_after: int = DEFAULT_STALE_AFTER,
) -> list[AgentHealth]:
    """Probe every target and record the outcome in history.

    history is mutated in place: ``history["agents"][name]`` gets the latest
    state and a summary is appended to ``history["runs"]``, which is trimmed
    to its last 100 entries.

    Args:
        targets: Agent name to health URL.
        history: Persisted state from earlier runs.
        timeout: Per-probe timeout in seconds.
        stale_after: A failing agent is stale once its last success is more
            than this many seconds before the check.

    Returns:
        One AgentHealth per target, in the iteration order of targets.
    """
    checked_at = time.time()
    per_agent = history.setdefault("agents", {})
    results: list[AgentHealth] = []

    for name, url in targets.items():
        previous_success = per_agent.get(name, {}).get("last_success_ts")
        ok, http_status, latency_ms, message = probe_health(url, timeout)

        # A success moves the timestamp forward; a failure keeps the old one.
        last_success_ts = checked_at if ok else previous_success
        stale = (
            not ok
            and last_success_ts is not None
            and (checked_at - float(last_success_ts)) > stale_after
        )

        results.append(
            AgentHealth(
                name=name,
                url=url,
                http_status=http_status,
                healthy=ok,
                latency_ms=latency_ms,
                stale=stale,
                last_success_ts=last_success_ts,
                checked_at=checked_at,
                message=message,
            )
        )
        per_agent[name] = {
            "url": url,
            "last_success_ts": last_success_ts,
            "last_http_status": http_status,
            "last_message": message,
            "last_checked_at": checked_at,
        }

    healthy_count = sum(1 for r in results if r.healthy)
    run_summary = {
        "checked_at": checked_at,
        "healthy_count": healthy_count,
        "unhealthy_count": len(results) - healthy_count,
        "agents": [asdict(r) for r in results],
    }
    history.setdefault("runs", []).append(run_summary)
    history["runs"] = history["runs"][-100:]
    return results
|
||||
|
||||
|
||||
def _format_age(seconds: int | None) -> str:
|
||||
if seconds is None:
|
||||
return "never"
|
||||
if seconds < 60:
|
||||
return f"{seconds}s ago"
|
||||
if seconds < 3600:
|
||||
return f"{seconds // 60}m ago"
|
||||
return f"{seconds // 3600}h ago"
|
||||
|
||||
|
||||
def to_markdown(results: list[AgentHealth], generated_at: float | None = None) -> str:
    """Render probe results as a markdown dashboard.

    Args:
        results: Probe outcomes, one table row each.
        generated_at: Epoch seconds for the title; the current time when None.

    Returns:
        The markdown text without a trailing newline: a title, the healthy
        count, a table, and a "Stale agents" section when any agent is stale.
    """
    # Compare against None so an explicit 0.0 timestamp is kept.
    if generated_at is None:
        generated_at = time.time()
    ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(generated_at))
    healthy = sum(1 for r in results if r.healthy)
    total = len(results)

    def _cell(text: str) -> str:
        # A raw "|" or newline in free text would split or end the table row.
        return str(text).replace("|", "\\|").replace("\r", " ").replace("\n", " ")

    lines = [
        f"# Agent Webhook Health Dashboard — {ts}",
        "",
        f"Healthy: {healthy}/{total}",
        "",
        "| Agent | Status | HTTP | Latency | Last success | Endpoint | Notes |",
        "|:------|:------:|:----:|--------:|:------------|:---------|:------|",
    ]
    for result in results:
        http = str(result.http_status) if result.http_status is not None else "—"
        latency = f"{result.latency_ms}ms" if result.latency_ms is not None else "—"
        lines.append(
            "| {name} | {icon} | {http} | {latency} | {last_success} | `{url}` | {message} |".format(
                name=_cell(result.name),
                icon=result.status_icon(),
                http=http,
                latency=latency,
                last_success=_format_age(result.last_success_age_seconds()),
                url=result.url,
                message=_cell(result.message),
            )
        )

    stale_agents = [r.name for r in results if r.stale]
    if stale_agents:
        lines.extend([
            "",
            "## Stale agents",
            ", ".join(stale_agents),
        ])

    lines.extend([
        "",
        "Generated by `bin/webhook_health_dashboard.py`.",
    ])
    return "\n".join(lines)
|
||||
|
||||
|
||||
def write_dashboard(path: Path, markdown: str) -> None:
    """Write the dashboard to path with a trailing newline, creating parent dirs."""
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(f"{markdown}\n", encoding="utf-8")
|
||||
|
||||
|
||||
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse dashboard CLI options; defaults come from the module-level settings."""
    parser = argparse.ArgumentParser(description="Generate webhook health dashboard")
    option_specs = (
        ("--targets", {"default": os.environ.get("WEBHOOK_HEALTH_TARGETS")}),
        ("--timeout", {"type": float, "default": DEFAULT_TIMEOUT}),
        ("--stale-after", {"type": int, "default": DEFAULT_STALE_AFTER}),
        ("--output", {"type": Path, "default": DEFAULT_OUTPUT}),
        ("--history", {"type": Path, "default": DEFAULT_HISTORY}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run one probe cycle: load history, probe, persist, write the dashboard.

    Args:
        argv: Command-line arguments without the program name. ``sys.argv[1:]``
            is used only when this is None; an explicit empty list means
            "no arguments".

    Returns:
        Process exit code, always 0.
    """
    # Imported here because the module only imports sys under its __main__
    # guard, so an imported main() would otherwise hit a NameError.
    import sys

    args = parse_args(sys.argv[1:] if argv is None else argv)
    targets = parse_targets(args.targets)
    history = load_history(args.history)
    results = check_agents(targets, history, timeout=args.timeout, stale_after=args.stale_after)
    save_history(args.history, history)
    dashboard = to_markdown(results)
    write_dashboard(args.output, dashboard)
    print(args.output)
    print(f"healthy={sum(1 for r in results if r.healthy)} total={len(results)}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
52
config/lazarus-pit.toml
Normal file
52
config/lazarus-pit.toml
Normal file
@@ -0,0 +1,52 @@
|
||||
# lazarus-pit.toml — Daemon configuration for the Lazarus Pit resurrection pool
|
||||
# See docs/lazarus-pit/mission-cell-spec.md for architecture context.
|
||||
# Epic: #878 · Phase P0 Foundation: #879
|
||||
|
||||
[daemon]
|
||||
# PID file location
|
||||
pid_file = "/var/run/lazarus-pit.pid"
|
||||
|
||||
# Log file (use "-" to log to the console instead; Python's logging writes to stderr by default)
|
||||
log_file = "/var/log/lazarus-pit.log"
|
||||
|
||||
# Log level: DEBUG | INFO | WARNING | ERROR
|
||||
log_level = "INFO"
|
||||
|
||||
[cells]
|
||||
# Root directory for all mission cells
|
||||
root = "/var/missions"
|
||||
|
||||
# Seconds of silence before an agent is considered stale
|
||||
heartbeat_stale_threshold = 60
|
||||
|
||||
# Seconds of stale heartbeat before attempting resurrection
|
||||
resurrect_after = 120
|
||||
|
||||
# Maximum auto-revive attempts per agent per session
|
||||
max_revive_attempts = 3
|
||||
|
||||
# Poll interval in seconds for health checks
|
||||
poll_interval = 15
|
||||
|
||||
[gateway]
|
||||
# WebSocket gateway to poll for liveness
|
||||
host = "localhost"
|
||||
port = 8765
|
||||
|
||||
# HTTP health endpoint exposed by the gateway
|
||||
health_host = "localhost"
|
||||
health_port = 8766
|
||||
|
||||
[gitea]
|
||||
# Gitea instance for filing resurrection / incident reports
|
||||
url = "https://forge.alexanderwhitestone.com"
|
||||
repo = "Timmy_Foundation/the-nexus"
|
||||
# Token read from environment: GITEA_TOKEN
|
||||
# token = "" # Do not commit real tokens
|
||||
|
||||
[notifications]
|
||||
# Whether to open a Gitea issue when an agent goes dark
|
||||
open_issue_on_death = true
|
||||
|
||||
# Whether to close the issue when the agent is successfully revived
|
||||
close_issue_on_revive = true
|
||||
156
docs/lazarus-pit/mission-cell-spec.md
Normal file
156
docs/lazarus-pit/mission-cell-spec.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Mission Cell Directory Specification
|
||||
|
||||
**Version:** 1.0
|
||||
**Status:** Canonical
|
||||
**Epic:** #878 — The Lazarus Pit & Mission Cell Isolation
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
A **Mission Cell** is an ephemeral, isolated working directory provisioned for a specific
|
||||
multi-agent collaboration. Each cell is scoped to a single mission (project, task, or
|
||||
incident) and is identified by a UUID. No wizard's home directory is ever touched by
|
||||
another agent's work within a cell.
|
||||
|
||||
---
|
||||
|
||||
## Root Path
|
||||
|
||||
```
|
||||
/var/missions/<uuid>/
|
||||
```
|
||||
|
||||
`<uuid>` is a version-4 UUID, e.g. `a3f7c901-1234-4b5e-8def-000000000001`.
|
||||
|
||||
---
|
||||
|
||||
## Directory Layout
|
||||
|
||||
```
|
||||
/var/missions/<uuid>/
|
||||
├── cell.json # Cell manifest (identity, status, agents, timestamps)
|
||||
├── workspace/ # Shared working directory — agents write here
|
||||
│ └── ...
|
||||
├── logs/
|
||||
│ ├── events.jsonl # Append-only event stream (invitation, join, leave, etc.)
|
||||
│ └── <agent-name>.log # Per-agent stdout/stderr capture
|
||||
├── checkpoints/ # Snapshot archives for P2 checkpoint/restore
|
||||
│ └── <timestamp>.tar.gz
|
||||
├── bus/ # Mission message bus (P4 multi-agent teaming)
|
||||
│ ├── inbox/ # Per-agent inboxes
|
||||
│ │ └── <agent-name>/
|
||||
│ └── outbox/
|
||||
└── .lazarus/ # Liveness metadata (heartbeats written by agents; state.json written by lazarus-pit)
|
||||
├── heartbeats/
|
||||
│ └── <agent-name>.json # Last heartbeat per agent
|
||||
└── state.json # Daemon state (daemon PID, last poll, per-agent health)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Specs
|
||||
|
||||
### `cell.json` — Cell Manifest
|
||||
|
||||
```json
|
||||
{
|
||||
"uuid": "a3f7c901-1234-4b5e-8def-000000000001",
|
||||
"name": "optional human-readable name",
|
||||
"mission": "Short description of the mission objective",
|
||||
"created_at": "2026-04-06T00:00:00Z",
|
||||
"created_by": "agent-name or username",
|
||||
"status": "active",
|
||||
"agents": [
|
||||
{
|
||||
"name": "claude",
|
||||
"role": "developer",
|
||||
"joined_at": "2026-04-06T00:01:00Z",
|
||||
"home": "/root/wizards/claude"
|
||||
}
|
||||
],
|
||||
"gitea_issue": 879,
|
||||
"repo": "Timmy_Foundation/the-nexus"
|
||||
}
|
||||
```
|
||||
|
||||
**Status values:** `active` | `frozen` | `archived` | `destroyed`
|
||||
|
||||
### `logs/events.jsonl` — Event Stream
|
||||
|
||||
One JSON object per line, append-only:
|
||||
|
||||
```json
|
||||
{"ts": "2026-04-06T00:00:00Z", "event": "cell_created", "by": "allegro", "uuid": "..."}
|
||||
{"ts": "2026-04-06T00:01:00Z", "event": "agent_joined", "agent": "claude", "role": "developer"}
|
||||
{"ts": "2026-04-06T00:02:00Z", "event": "heartbeat", "agent": "claude", "status": "thinking"}
|
||||
{"ts": "2026-04-06T01:00:00Z", "event": "agent_left", "agent": "claude"}
|
||||
{"ts": "2026-04-06T01:01:00Z", "event": "cell_archived"}
|
||||
```
|
||||
|
||||
### `.lazarus/heartbeats/<agent-name>.json` — Per-Agent Heartbeat
|
||||
|
||||
Written by agents, monitored by the lazarus-pit daemon:
|
||||
|
||||
```json
|
||||
{
|
||||
"agent": "claude",
|
||||
"pid": 12345,
|
||||
"timestamp": 1744000000.0,
|
||||
"cycle": 42,
|
||||
"model": "claude-opus-4-6",
|
||||
"status": "thinking",
|
||||
"cell_uuid": "a3f7c901-1234-4b5e-8def-000000000001"
|
||||
}
|
||||
```
|
||||
|
||||
### `.lazarus/state.json` — Daemon State
|
||||
|
||||
Written exclusively by `lazarus-pit`, never by agents:
|
||||
|
||||
```json
|
||||
{
|
||||
"cell_uuid": "a3f7c901-1234-4b5e-8def-000000000001",
|
||||
"daemon_pid": 99001,
|
||||
"last_poll": 1744000000.0,
|
||||
"agent_health": {
|
||||
"claude": {"healthy": true, "last_seen": 1744000000.0, "revive_count": 0}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Lifecycle
|
||||
|
||||
```
|
||||
provision → active → [frozen] → archived → destroyed
|
||||
```
|
||||
|
||||
| Transition | Trigger | Actor |
|
||||
|--------------|--------------------------------------------|---------------|
|
||||
| provision | Invitation accepted / mission created | allegro / CLI |
|
||||
| active | At least one agent joined | lazarus-pit |
|
||||
| frozen | Checkpoint requested or idle timeout | lazarus-pit |
|
||||
| archived | Mission complete, cell preserved | lazarus-pit |
|
||||
| destroyed | Explicit teardown, no home dirs touched | CLI / daemon |
|
||||
|
||||
---
|
||||
|
||||
## Isolation Guarantees
|
||||
|
||||
- No agent process within a cell may write outside `/var/missions/<uuid>/`.
|
||||
- Home directories (`/root/wizards/<name>/`) are read-only mounts or simply not present in the cell's working context.
|
||||
- The cell UUID is injected as `LAZARUS_CELL_UUID` into each agent's environment.
|
||||
- Destruction of a cell has zero impact on any wizard's home directory.
|
||||
|
||||
---
|
||||
|
||||
## Related Issues
|
||||
|
||||
- #878 — Epic
|
||||
- #879 — P0 Foundation (this spec)
|
||||
- #880 — P1 Invitation & Spawning
|
||||
- #881 — P2 Checkpoint / Restore
|
||||
- #882 — P3 Resurrection Pool
|
||||
- #883 — P4 Multi-Agent Teaming
|
||||
489
help.html
Normal file
489
help.html
Normal file
@@ -0,0 +1,489 @@
|
||||
<!DOCTYPE html>
|
||||
<!--
|
||||
THE NEXUS — Help Page
|
||||
Refs: #833 (Missing /help page)
|
||||
Design: dark space / holographic — matches Nexus design system
|
||||
-->
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Help — The Nexus</title>
|
||||
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;500;600&family=Orbitron:wght@400;600;700&display=swap" rel="stylesheet">
|
||||
<link rel="manifest" href="./manifest.json">
|
||||
<style>
|
||||
:root {
|
||||
--color-bg: #050510;
|
||||
--color-surface: rgba(10, 15, 40, 0.85);
|
||||
--color-border: rgba(74, 240, 192, 0.2);
|
||||
--color-border-bright: rgba(74, 240, 192, 0.5);
|
||||
--color-text: #e0f0ff;
|
||||
--color-text-muted: #8a9ab8;
|
||||
--color-primary: #4af0c0;
|
||||
--color-primary-dim: rgba(74, 240, 192, 0.12);
|
||||
--color-secondary: #7b5cff;
|
||||
--color-danger: #ff4466;
|
||||
--color-warning: #ffaa22;
|
||||
--font-display: 'Orbitron', sans-serif;
|
||||
--font-body: 'JetBrains Mono', monospace;
|
||||
--panel-blur: 16px;
|
||||
--panel-radius: 8px;
|
||||
--transition: 200ms cubic-bezier(0.16, 1, 0.3, 1);
|
||||
}
|
||||
|
||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
|
||||
body {
|
||||
background: var(--color-bg);
|
||||
font-family: var(--font-body);
|
||||
color: var(--color-text);
|
||||
min-height: 100vh;
|
||||
padding: 32px 16px 64px;
|
||||
}
|
||||
|
||||
/* === STARFIELD BG === */
|
||||
body::before {
|
||||
content: '';
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
background:
|
||||
radial-gradient(ellipse at 20% 20%, rgba(74,240,192,0.03) 0%, transparent 50%),
|
||||
radial-gradient(ellipse at 80% 80%, rgba(123,92,255,0.04) 0%, transparent 50%);
|
||||
pointer-events: none;
|
||||
z-index: 0;
|
||||
}
|
||||
|
||||
.page-wrap {
|
||||
position: relative;
|
||||
z-index: 1;
|
||||
max-width: 720px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
/* === HEADER === */
|
||||
.page-header {
|
||||
margin-bottom: 32px;
|
||||
padding-bottom: 20px;
|
||||
border-bottom: 1px solid var(--color-border);
|
||||
}
|
||||
|
||||
.back-link {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
font-size: 11px;
|
||||
letter-spacing: 0.1em;
|
||||
text-transform: uppercase;
|
||||
color: var(--color-text-muted);
|
||||
text-decoration: none;
|
||||
margin-bottom: 20px;
|
||||
transition: color var(--transition);
|
||||
}
|
||||
|
||||
.back-link:hover { color: var(--color-primary); }
|
||||
|
||||
.page-title {
|
||||
font-family: var(--font-display);
|
||||
font-size: 28px;
|
||||
font-weight: 700;
|
||||
letter-spacing: 0.1em;
|
||||
color: var(--color-text);
|
||||
line-height: 1.2;
|
||||
}
|
||||
|
||||
.page-title span { color: var(--color-primary); }
|
||||
|
||||
.page-subtitle {
|
||||
margin-top: 8px;
|
||||
font-size: 13px;
|
||||
color: var(--color-text-muted);
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* === SECTIONS === */
|
||||
.help-section {
|
||||
background: var(--color-surface);
|
||||
border: 1px solid var(--color-border);
|
||||
border-radius: var(--panel-radius);
|
||||
overflow: hidden;
|
||||
margin-bottom: 20px;
|
||||
backdrop-filter: blur(var(--panel-blur));
|
||||
}
|
||||
|
||||
.section-header {
|
||||
padding: 14px 20px;
|
||||
border-bottom: 1px solid var(--color-border);
|
||||
background: linear-gradient(90deg, rgba(74,240,192,0.04) 0%, transparent 100%);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.section-icon {
|
||||
font-size: 14px;
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-family: var(--font-display);
|
||||
font-size: 12px;
|
||||
font-weight: 600;
|
||||
letter-spacing: 0.15em;
|
||||
text-transform: uppercase;
|
||||
color: var(--color-primary);
|
||||
}
|
||||
|
||||
.section-body {
|
||||
padding: 16px 20px;
|
||||
}
|
||||
|
||||
/* === KEY BINDING TABLE === */
|
||||
.key-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
.key-table tr + tr td {
|
||||
border-top: 1px solid rgba(74,240,192,0.07);
|
||||
}
|
||||
|
||||
.key-table td {
|
||||
padding: 8px 0;
|
||||
font-size: 12px;
|
||||
line-height: 1.5;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
.key-table td:first-child {
|
||||
width: 140px;
|
||||
padding-right: 16px;
|
||||
}
|
||||
|
||||
.key-group {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
kbd {
|
||||
display: inline-block;
|
||||
font-family: var(--font-body);
|
||||
font-size: 10px;
|
||||
font-weight: 600;
|
||||
letter-spacing: 0.05em;
|
||||
background: rgba(74,240,192,0.08);
|
||||
border: 1px solid rgba(74,240,192,0.3);
|
||||
border-bottom-width: 2px;
|
||||
border-radius: 4px;
|
||||
padding: 2px 7px;
|
||||
color: var(--color-primary);
|
||||
}
|
||||
|
||||
.key-desc {
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
|
||||
/* === COMMAND LIST === */
|
||||
.cmd-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.cmd-item {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.cmd-name {
|
||||
min-width: 160px;
|
||||
font-size: 12px;
|
||||
color: var(--color-primary);
|
||||
padding-top: 1px;
|
||||
}
|
||||
|
||||
.cmd-desc {
|
||||
font-size: 12px;
|
||||
color: var(--color-text-muted);
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* === PORTAL LIST === */
|
||||
.portal-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.portal-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
padding: 10px 12px;
|
||||
border: 1px solid var(--color-border);
|
||||
border-radius: 6px;
|
||||
font-size: 12px;
|
||||
transition: border-color var(--transition), background var(--transition);
|
||||
}
|
||||
|
||||
.portal-item:hover {
|
||||
border-color: rgba(74,240,192,0.35);
|
||||
background: rgba(74,240,192,0.02);
|
||||
}
|
||||
|
||||
.portal-dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.dot-online { background: var(--color-primary); box-shadow: 0 0 6px var(--color-primary); }
|
||||
.dot-standby { background: var(--color-warning); box-shadow: 0 0 6px var(--color-warning); }
|
||||
.dot-offline { background: var(--color-text-muted); }
|
||||
|
||||
.portal-name {
|
||||
font-weight: 600;
|
||||
color: var(--color-text);
|
||||
min-width: 120px;
|
||||
}
|
||||
|
||||
.portal-desc {
|
||||
color: var(--color-text-muted);
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
/* === INFO BLOCK === */
|
||||
.info-block {
|
||||
font-size: 12px;
|
||||
line-height: 1.7;
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
|
||||
.info-block p + p {
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.info-block a {
|
||||
color: var(--color-primary);
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.info-block a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.highlight {
|
||||
color: var(--color-text);
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
/* === FOOTER === */
|
||||
.page-footer {
  margin-top: 32px;
  padding-top: 16px;
  border-top: 1px solid var(--color-border);
  font-size: 11px;
  color: var(--color-text-muted);
  display: flex;
  align-items: center;
  justify-content: space-between;
  /* Was `flex-wrap: gap`, which is not a valid value and is ignored by
     browsers; `wrap` lets the footer items break onto separate lines when
     the viewport is too narrow. */
  flex-wrap: wrap;
  gap: 8px;
}
|
||||
|
||||
.footer-brand {
|
||||
font-family: var(--font-display);
|
||||
font-size: 10px;
|
||||
letter-spacing: 0.12em;
|
||||
color: var(--color-primary);
|
||||
opacity: 0.7;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="page-wrap">
|
||||
|
||||
<!-- Header -->
|
||||
<header class="page-header">
|
||||
<a href="/" class="back-link">← Back to The Nexus</a>
|
||||
<h1 class="page-title">THE <span>NEXUS</span> — Help</h1>
|
||||
<p class="page-subtitle">Navigation guide, controls, and system reference for Timmy's sovereign home-world.</p>
|
||||
</header>
|
||||
|
||||
<!-- Navigation Controls -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">◈</span>
|
||||
<span class="section-title">Navigation Controls</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<table class="key-table">
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>W</kbd><kbd>A</kbd><kbd>S</kbd><kbd>D</kbd></div></td>
|
||||
<td class="key-desc">Move forward / left / backward / right</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>Mouse</kbd></div></td>
|
||||
<td class="key-desc">Look around — click the canvas to capture the pointer</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>V</kbd></div></td>
|
||||
<td class="key-desc">Toggle navigation mode: Walk → Fly → Orbit</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>F</kbd></div></td>
|
||||
<td class="key-desc">Enter nearby portal (when portal hint is visible)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>E</kbd></div></td>
|
||||
<td class="key-desc">Read nearby vision point (when vision hint is visible)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>Enter</kbd></div></td>
|
||||
<td class="key-desc">Focus / unfocus chat input</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div class="key-group"><kbd>Esc</kbd></div></td>
|
||||
<td class="key-desc">Release pointer lock / close overlays</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Timmy Chat Commands -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">⬡</span>
|
||||
<span class="section-title">Timmy Chat Commands</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="cmd-list">
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">System Status</span>
|
||||
<span class="cmd-desc">Quick action — asks Timmy for a live system health summary.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Agent Check</span>
|
||||
<span class="cmd-desc">Quick action — lists all active agents and their current state.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Portal Atlas</span>
|
||||
<span class="cmd-desc">Quick action — opens the full portal map overlay.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Help</span>
|
||||
<span class="cmd-desc">Quick action — requests navigation assistance from Timmy.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Free-form text</span>
|
||||
<span class="cmd-desc">Type anything in the chat bar and press Enter or → to send. Timmy processes all natural-language input.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Portal Atlas -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">🌐</span>
|
||||
<span class="section-title">Portal Atlas</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="info-block">
|
||||
<p>Portals are gateways to external systems and game-worlds. Walk up to a glowing portal in the Nexus and press <span class="highlight"><kbd>F</kbd></span> to activate it, or open the <span class="highlight">Portal Atlas</span> (top-right button) for a full map view.</p>
|
||||
<p>Portal status indicators:</p>
|
||||
</div>
|
||||
<div class="portal-list" style="margin-top:14px;">
|
||||
<div class="portal-item">
|
||||
<span class="portal-dot dot-online"></span>
|
||||
<span class="portal-name">ONLINE</span>
|
||||
<span class="portal-desc">Portal is live and will redirect immediately on activation.</span>
|
||||
</div>
|
||||
<div class="portal-item">
|
||||
<span class="portal-dot dot-standby"></span>
|
||||
<span class="portal-name">STANDBY</span>
|
||||
<span class="portal-desc">Portal is reachable but destination system may be idle.</span>
|
||||
</div>
|
||||
<div class="portal-item">
|
||||
<span class="portal-dot dot-offline"></span>
|
||||
<span class="portal-name">OFFLINE / UNLINKED</span>
|
||||
<span class="portal-desc">Destination not yet connected. Activation shows an error card.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- HUD Panels -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">▦</span>
|
||||
<span class="section-title">HUD Panels</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="cmd-list">
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Symbolic Engine</span>
|
||||
<span class="cmd-desc">Live feed from Timmy's rule-based reasoning layer.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Blackboard</span>
|
||||
<span class="cmd-desc">Shared working memory used across all cognitive subsystems.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Symbolic Planner</span>
|
||||
<span class="cmd-desc">Goal decomposition and task sequencing output.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Case-Based Reasoner</span>
|
||||
<span class="cmd-desc">Analogical reasoning — matches current situation to past cases.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Neuro-Symbolic Bridge</span>
|
||||
<span class="cmd-desc">Translation layer between neural inference and symbolic logic.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Meta-Reasoning</span>
|
||||
<span class="cmd-desc">Timmy reflecting on its own thought process and confidence.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Sovereign Health</span>
|
||||
<span class="cmd-desc">Core vitals: memory usage, heartbeat interval, alert flags.</span>
|
||||
</div>
|
||||
<div class="cmd-item">
|
||||
<span class="cmd-name">Adaptive Calibrator</span>
|
||||
<span class="cmd-desc">Live tuning of response thresholds and behavior weights.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- System Info -->
|
||||
<section class="help-section">
|
||||
<div class="section-header">
|
||||
<span class="section-icon">◉</span>
|
||||
<span class="section-title">System Information</span>
|
||||
</div>
|
||||
<div class="section-body">
|
||||
<div class="info-block">
|
||||
<p>The Nexus is Timmy's <span class="highlight">canonical sovereign home-world</span> — a local-first 3D space that serves as both a training ground and a live visualization surface for the Timmy AI system.</p>
|
||||
<p>The WebSocket gateway (<code>server.py</code>) runs on port <span class="highlight">8765</span> and bridges Timmy's cognition layer, game-world connectors, and the browser frontend. The <span class="highlight">HERMES</span> indicator in the HUD shows live connectivity status.</p>
|
||||
<p>Source code and issue tracker: <a href="https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus" target="_blank" rel="noopener noreferrer">Timmy_Foundation/the-nexus</a></p>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Footer -->
|
||||
<footer class="page-footer">
|
||||
<span class="footer-brand">THE NEXUS</span>
|
||||
<span>Questions? Speak to Timmy in the chat bar on the main world.</span>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
64
server.py
64
server.py
@@ -3,12 +3,21 @@
|
||||
The Nexus WebSocket Gateway — Robust broadcast bridge for Timmy's consciousness.
|
||||
This server acts as the central hub for the-nexus, connecting the mind (nexus_think.py),
|
||||
the body (Evennia/Morrowind), and the visualization surface.
|
||||
|
||||
Health heartbeat endpoint (added in #879 — M6-P0 Foundation):
|
||||
GET http://<host>:<HEALTH_PORT>/health
|
||||
Returns 200 JSON with gateway status and connected-client count.
|
||||
Used by the lazarus-pit daemon to verify the gateway is alive.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from threading import Thread
|
||||
from typing import Set
|
||||
|
||||
import websockets
|
||||
@@ -17,6 +26,13 @@ import websockets
|
||||
# Address of the WebSocket listener itself.
PORT = 8765
HOST = "0.0.0.0"  # Allow external connections if needed

# Health heartbeat endpoint — monitored by lazarus-pit daemon (#879).
# Both values may be overridden through the environment; the defaults bind
# the endpoint to 127.0.0.1:8766.
HEALTH_PORT = int(os.environ.get("NEXUS_HEALTH_PORT", "8766"))
HEALTH_HOST = os.environ.get("NEXUS_HEALTH_HOST", "127.0.0.1")

# Gateway start time (wall clock, captured when this module is imported),
# used for uptime reporting.
_start_time = time.time()
|
||||
|
||||
# Logging setup
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -28,6 +44,53 @@ logger = logging.getLogger("nexus-gateway")
|
||||
# State
|
||||
clients: Set[websockets.WebSocketServerProtocol] = set()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Health heartbeat HTTP endpoint — consumed by lazarus-pit (#879)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _HealthHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler for the /health liveness endpoint.

    ``GET /health`` answers 200 with a small JSON status document; any other
    path answers 404 with no body.
    """

    def do_GET(self):  # noqa: N802
        """Serve the JSON health document for ``/health``; reply 404 otherwise."""
        if self.path != "/health":
            self.send_response(404)
            self.end_headers()
            return

        payload = {
            "status": "ok",
            "service": "nexus-gateway",
            "uptime_seconds": round(time.time() - _start_time, 1),
            "connected_clients": len(clients),
            "ws_port": PORT,
            "ts": time.time(),
        }
        body = json.dumps(payload).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):  # noqa: N802
        """Route handler log lines to the gateway logger, dropping 200/204 access lines.

        Args:
            fmt: %-style format string supplied by the base handler.
            *args: Values for ``fmt``.
        """
        # Suppress default access log spam; use our own logger for errors only.
        # Access-log calls from the base class carry the status code as the
        # second argument.  Error calls may pass fewer arguments (for example a
        # single exception), so args[1] must not be indexed unconditionally.
        status = str(args[1]) if len(args) > 1 else None
        if status in ("200", "204"):
            return
        message = fmt % args if args else fmt
        logger.debug("Health endpoint: %s", message)
|
||||
|
||||
|
||||
def _start_health_server() -> Thread:
    """Launch the /health HTTP listener on a background daemon thread.

    Returns:
        The already-started thread that runs the server's ``serve_forever`` loop.
    """
    httpd = HTTPServer((HEALTH_HOST, HEALTH_PORT), _HealthHandler)
    worker = Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    logger.info(
        "Health heartbeat endpoint listening on http://%s:%d/health",
        HEALTH_HOST,
        HEALTH_PORT,
    )
    return worker
|
||||
|
||||
|
||||
async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
||||
"""Handles individual client connections and message broadcasting."""
|
||||
clients.add(websocket)
|
||||
@@ -80,6 +143,7 @@ async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
||||
async def main():
|
||||
"""Main server loop with graceful shutdown."""
|
||||
logger.info(f"Starting Nexus WS gateway on ws://{HOST}:{PORT}")
|
||||
_start_health_server()
|
||||
|
||||
# Set up signal handlers for graceful shutdown
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
129
tests/test_gateway_health.py
Normal file
129
tests/test_gateway_health.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Tests for the gateway health heartbeat endpoint (#879 — M6-P0 Foundation).
|
||||
|
||||
Validates:
|
||||
- /health returns 200 with correct JSON schema
|
||||
- /health reports connected_clients count
|
||||
- Non-/health paths return 404
|
||||
- Uptime is non-negative
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from http.server import HTTPServer
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
# ── Load server module directly ──────────────────────────────────────────────
|
||||
# Parent of the directory that contains this test file.
PROJECT_ROOT = Path(__file__).parent.parent

# Patch websockets import so we don't need the package installed to test health.
# setdefault leaves an already-imported real package untouched.
_ws_mock = MagicMock()
sys.modules.setdefault("websockets", _ws_mock)

# Load server.py by file path under a private alias.
_srv_spec = importlib.util.spec_from_file_location(
    "nexus_server_test",
    PROJECT_ROOT / "server.py",
)
_srv = importlib.util.module_from_spec(_srv_spec)
# Register the module under its alias before running its body.
sys.modules["nexus_server_test"] = _srv
_srv_spec.loader.exec_module(_srv)

# Handler class under test.
_HealthHandler = _srv._HealthHandler
|
||||
|
||||
|
||||
# ── Fake request helper ───────────────────────────────────────────────────────
|
||||
|
||||
class _FakeRequest:
    """In-memory stand-in for a socket, for BaseHTTPRequestHandler testing.

    Reads are served from the raw request bytes given at construction;
    everything written or sent accumulates in ``sent``.
    """

    def __init__(self, raw_bytes: bytes):
        self.sent = BytesIO()
        self._buf = BytesIO(raw_bytes)

    def makefile(self, mode, **kwargs):
        """Return the request buffer for read modes, the ``sent`` buffer otherwise."""
        return self._buf if "r" in mode else self.sent

    def sendall(self, data: bytes):
        """Append ``data`` to the ``sent`` buffer."""
        self.sent.write(data)
|
||||
|
||||
|
||||
def _invoke_handler(path: str) -> tuple[int, dict]:
    """Call the health handler for a GET request and return (status_code, body_dict).

    The handler is created with ``__new__`` so no socket or server is needed;
    its response methods are replaced with recorders.

    Args:
        path: Request path to hand to the handler, e.g. ``"/health"``.

    Returns:
        ``(status_code, body_dict)``. ``status_code`` is ``None`` if the handler
        never called ``send_response``; ``body_dict`` is ``{}`` when the handler
        wrote no body.

    Raises:
        json.JSONDecodeError: If the handler wrote a body that is not valid
            JSON, so a malformed response fails loudly instead of reading as
            an empty body.
    """
    raw = f"GET {path} HTTP/1.1\r\nHost: localhost\r\n\r\n".encode()

    handler = _HealthHandler.__new__(_HealthHandler)
    handler.rfile = BytesIO(raw)
    handler.wfile = BytesIO()  # collects whatever body the handler writes
    handler.client_address = ("127.0.0.1", 9999)
    handler.server = MagicMock()
    handler.request_version = "HTTP/1.1"
    handler.command = "GET"
    handler.path = path
    handler.headers = {}

    # Record status codes instead of emitting a real status line and headers.
    statuses: list[int] = []
    handler.send_response = lambda code, *a: statuses.append(code)
    handler.send_header = lambda k, v: None
    handler.end_headers = lambda: None

    handler.do_GET()

    status = statuses[0] if statuses else None
    written = handler.wfile.getvalue()
    body = json.loads(written) if written else {}
    return status, body
|
||||
|
||||
|
||||
# ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestHealthEndpoint:
    """Checks for the gateway's ``/health`` handler, driven through ``_invoke_handler``."""

    def test_health_returns_200(self):
        code, _body = _invoke_handler("/health")
        assert code == 200

    def test_health_body_schema(self):
        payload = _invoke_handler("/health")[1]
        assert payload.get("status") == "ok"
        assert payload.get("service") == "nexus-gateway"
        # These fields only need to be present; their values vary per run.
        for field in ("uptime_seconds", "connected_clients", "ws_port", "ts"):
            assert field in payload

    def test_uptime_is_non_negative(self):
        payload = _invoke_handler("/health")[1]
        assert payload["uptime_seconds"] >= 0

    def test_unknown_path_returns_404(self):
        code, _body = _invoke_handler("/notfound")
        assert code == 404

    def test_root_path_returns_404(self):
        code, _body = _invoke_handler("/")
        assert code == 404

    def test_connected_clients_reflects_module_state(self):
        # Snapshot the module-level client set so it can be restored afterwards.
        saved = _srv.clients.copy()
        try:
            _srv.clients.clear()
            payload = _invoke_handler("/health")[1]
            assert payload["connected_clients"] == 0
        finally:
            _srv.clients.update(saved)
|
||||
42
tests/test_help_page.py
Normal file
42
tests/test_help_page.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Tests for the /help page. Refs: #833 (Missing /help page)."""
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_help_html_exists() -> None:
    """The help page file must be present (looked up relative to the working directory)."""
    help_page = Path("help.html")
    assert help_page.exists(), "help.html must exist to resolve /help 404"
|
||||
|
||||
|
||||
def test_help_html_is_valid_html() -> None:
    """help.html must contain the basic skeleton of an HTML document."""
    # The page declares charset UTF-8 and contains non-ASCII glyphs, so decode
    # it explicitly instead of relying on the platform default encoding.
    content = Path("help.html").read_text(encoding="utf-8")
    assert "<!DOCTYPE html>" in content
    assert "<html" in content
    assert "</html>" in content
|
||||
|
||||
|
||||
def test_help_page_has_required_sections() -> None:
    """help.html must mention each required section and link back to the home page."""
    # The page declares charset UTF-8 and contains non-ASCII glyphs, so decode
    # it explicitly instead of relying on the platform default encoding.
    content = Path("help.html").read_text(encoding="utf-8")

    # Navigation controls section
    assert "Navigation Controls" in content

    # Chat commands section
    assert "Chat" in content

    # Portal reference
    assert "Portal" in content

    # Back link to home
    assert 'href="/"' in content
|
||||
|
||||
|
||||
def test_help_page_links_back_to_home() -> None:
    """help.html must contain a link whose target is the site root."""
    # The page declares charset UTF-8 and contains non-ASCII glyphs, so decode
    # it explicitly instead of relying on the platform default encoding.
    content = Path("help.html").read_text(encoding="utf-8")
    assert 'href="/"' in content, "help page must have a link back to the main Nexus world"
|
||||
|
||||
|
||||
def test_help_page_has_keyboard_controls() -> None:
    """help.html must document the movement keys and the Mouse, Enter and Esc controls."""
    # The page declares charset UTF-8 and contains non-ASCII glyphs, so decode
    # it explicitly instead of relying on the platform default encoding.
    content = Path("help.html").read_text(encoding="utf-8")
    # Movement keys are listed individually as <kbd> elements
    for key in ["<kbd>W</kbd>", "<kbd>A</kbd>", "<kbd>S</kbd>", "<kbd>D</kbd>",
                "Mouse", "Enter", "Esc"]:
        assert key in content, f"help page must document the {key!r} control"
|
||||
262
tests/test_lazarus_pit.py
Normal file
262
tests/test_lazarus_pit.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""Tests for the lazarus-pit daemon skeleton.
|
||||
|
||||
Validates:
|
||||
- Config loading (defaults and TOML overrides)
|
||||
- Cell discovery from /var/missions structure
|
||||
- Agent heartbeat reading
|
||||
- Health poll logic (stale vs fresh)
|
||||
- AgentHealth and CellState dataclasses
|
||||
- CLI argument parsing
|
||||
- Resurrection stub behaviour
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
# ── Load lazarus_pit module directly ────────────────────────────────────────
|
||||
# Parent of the directory that contains this test file.
PROJECT_ROOT = Path(__file__).parent.parent
# Load bin/lazarus_pit.py by file path under a private alias.
_lp_spec = importlib.util.spec_from_file_location(
    "lazarus_pit_test",
    PROJECT_ROOT / "bin" / "lazarus_pit.py",
)
_lp = importlib.util.module_from_spec(_lp_spec)
# Register the module under its alias before running its body.
sys.modules["lazarus_pit_test"] = _lp
_lp_spec.loader.exec_module(_lp)

# Short aliases for the names exercised by the tests below.
AgentHealth = _lp.AgentHealth
CellState = _lp.CellState
DaemonConfig = _lp.DaemonConfig
load_config = _lp.load_config
read_agent_heartbeat = _lp.read_agent_heartbeat
discover_cells = _lp.discover_cells
poll_cell = _lp.poll_cell
attempt_revive = _lp.attempt_revive
build_arg_parser = _lp.build_arg_parser
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _make_cell(tmp_path: Path, uuid: str, status: str = "active", agents=None) -> Path:
    """Create a minimal mission cell structure under tmp_path.

    Makes the directory ``tmp_path / uuid`` and writes a ``cell.json`` manifest
    into it. ``agents`` is an iterable of agent names and falls back to
    ``["claude"]`` when empty or ``None``.

    Returns:
        The path of the newly created cell directory.
    """
    roster = agents if agents else ["claude"]
    cell_dir = tmp_path / uuid
    cell_dir.mkdir()
    manifest_file = cell_dir / "cell.json"
    manifest_file.write_text(
        json.dumps(
            dict(
                uuid=uuid,
                status=status,
                mission="test mission",
                agents=[{"name": member} for member in roster],
            )
        )
    )
    return cell_dir
|
||||
|
||||
|
||||
def _write_heartbeat(cell_path: Path, agent: str, age_seconds: float = 0) -> None:
    """Write a heartbeat file for an agent inside the cell.

    The file is ``<cell_path>/.lazarus/heartbeats/<agent>.json``. Its
    ``timestamp`` is the current time minus ``age_seconds``, so a positive
    value produces a heartbeat that looks that many seconds old.
    """
    heartbeat_dir = cell_path / ".lazarus" / "heartbeats"
    heartbeat_dir.mkdir(parents=True, exist_ok=True)
    record = {
        "agent": agent,
        "pid": 12345,
        "timestamp": time.time() - age_seconds,
        "cycle": 1,
        "model": "test",
        "status": "thinking",
    }
    target = heartbeat_dir / f"{agent}.json"
    target.write_text(json.dumps(record))
|
||||
|
||||
|
||||
# ── Config ───────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestDaemonConfig:
    """Default values of ``DaemonConfig`` and loading overrides from a TOML file."""

    def test_defaults(self):
        cfg = DaemonConfig()
        assert cfg.poll_interval == 15
        assert cfg.heartbeat_stale_threshold == 60
        assert cfg.max_revive_attempts == 3

    def test_load_missing_file_returns_defaults(self, tmp_path):
        cfg = load_config(tmp_path / "nonexistent.toml")
        assert cfg.poll_interval == 15

    def test_load_valid_toml(self, tmp_path):
        toml_content = b"""
[cells]
root = "/tmp/missions"
poll_interval = 30
heartbeat_stale_threshold = 90
"""
        toml_path = tmp_path / "test.toml"
        toml_path.write_bytes(toml_content)

        # TOML parser not available — skip, not a test failure.
        # NOTE(review): assumes load_config reads TOML through tomllib or the
        # tomli backport — confirm against bin/lazarus_pit.py.
        if not any(importlib.util.find_spec(name) for name in ("tomllib", "tomli")):
            pytest.skip("TOML parser not available")
        try:
            cfg = load_config(toml_path)
        except ImportError:
            pytest.skip("TOML parser not available")

        # The assertions sit outside the try block on purpose. They used to be
        # wrapped in `except Exception`, which also caught AssertionError and
        # turned every real failure into a skip.
        assert cfg.poll_interval == 30
        assert cfg.heartbeat_stale_threshold == 90
        assert str(cfg.cells_root) == "/tmp/missions"
|
||||
|
||||
|
||||
# ── Cell discovery ───────────────────────────────────────────────────────────
|
||||
|
||||
class TestDiscoverCells:
    """``discover_cells`` over a missions root: what is returned and what is ignored."""

    def test_empty_root(self, tmp_path):
        assert discover_cells(tmp_path) == []

    def test_missing_root(self, tmp_path):
        absent_root = tmp_path / "nonexistent"
        assert discover_cells(absent_root) == []

    def test_finds_active_cell(self, tmp_path):
        _make_cell(tmp_path, "aaa-111", status="active")
        found = discover_cells(tmp_path)
        assert len(found) == 1
        assert found[0].uuid == "aaa-111"

    def test_finds_frozen_cell(self, tmp_path):
        _make_cell(tmp_path, "bbb-222", status="frozen")
        found = discover_cells(tmp_path)
        assert len(found) == 1

    def test_ignores_archived_cell(self, tmp_path):
        _make_cell(tmp_path, "ccc-333", status="archived")
        found = discover_cells(tmp_path)
        assert len(found) == 0

    def test_ignores_directories_without_manifest(self, tmp_path):
        bare_dir = tmp_path / "not-a-cell"
        bare_dir.mkdir()
        assert discover_cells(tmp_path) == []

    def test_ignores_corrupt_manifest(self, tmp_path):
        broken = tmp_path / "bad-cell"
        broken.mkdir()
        manifest = broken / "cell.json"
        manifest.write_text("not json {{")
        assert discover_cells(tmp_path) == []

    def test_agents_populated_from_manifest(self, tmp_path):
        _make_cell(tmp_path, "ddd-444", agents=["claude", "allegro"])
        first = discover_cells(tmp_path)[0]
        assert set(first.agents.keys()) == {"claude", "allegro"}
|
||||
|
||||
|
||||
# ── Heartbeat reading ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestReadAgentHeartbeat:
    """``read_agent_heartbeat`` with missing, valid and corrupt heartbeat files."""

    def test_returns_none_for_missing_file(self, tmp_path):
        assert read_agent_heartbeat(tmp_path, "ghost") is None

    def test_returns_data_for_valid_heartbeat(self, tmp_path):
        _write_heartbeat(tmp_path, "claude", age_seconds=0)
        heartbeat = read_agent_heartbeat(tmp_path, "claude")
        assert heartbeat is not None
        assert heartbeat["agent"] == "claude"
        assert "timestamp" in heartbeat

    def test_returns_none_for_corrupt_json(self, tmp_path):
        heartbeat_dir = tmp_path / ".lazarus" / "heartbeats"
        heartbeat_dir.mkdir(parents=True)
        corrupt_file = heartbeat_dir / "bad.json"
        corrupt_file.write_text("{not valid")
        assert read_agent_heartbeat(tmp_path, "bad") is None
|
||||
|
||||
|
||||
# ── Health polling ────────────────────────────────────────────────────────────
|
||||
|
||||
class TestPollCell:
    """Checks for ``poll_cell`` warnings and the per-agent ``healthy`` flag."""

    def _make_cell_state(self, tmp_path, agents=None):
        """Build a ``CellState`` rooted at *tmp_path* with one ``AgentHealth`` per name.

        When *agents* is falsy, a single agent named ``claude`` is used.
        """
        state = CellState(uuid="test-uuid", path=tmp_path)
        for agent_name in agents or ["claude"]:
            state.agents[agent_name] = AgentHealth(name=agent_name)
        return state

    def test_healthy_agent_no_warnings(self, tmp_path):
        """Heartbeat written with ``age_seconds=5`` and threshold 60: no warnings."""
        state = self._make_cell_state(tmp_path)
        _write_heartbeat(tmp_path, "claude", age_seconds=5)
        config = DaemonConfig(heartbeat_stale_threshold=60)
        assert poll_cell(state, config) == []
        assert state.agents["claude"].healthy is True

    def test_stale_heartbeat_generates_warning(self, tmp_path):
        """Heartbeat written with ``age_seconds=120`` and threshold 60: one "stale" warning."""
        state = self._make_cell_state(tmp_path)
        _write_heartbeat(tmp_path, "claude", age_seconds=120)
        config = DaemonConfig(heartbeat_stale_threshold=60)
        emitted = poll_cell(state, config)
        assert len(emitted) == 1
        assert "stale" in emitted[0]
        assert state.agents["claude"].healthy is False

    def test_missing_heartbeat_generates_warning(self, tmp_path):
        """No heartbeat written at all: one "no heartbeat" warning, agent unhealthy."""
        state = self._make_cell_state(tmp_path)
        config = DaemonConfig(heartbeat_stale_threshold=60)
        emitted = poll_cell(state, config)
        assert len(emitted) == 1
        assert "no heartbeat" in emitted[0]
        assert state.agents["claude"].healthy is False
|
||||
|
||||
|
||||
# ── Resurrection stub ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestAttemptRevive:
    """Checks for ``attempt_revive`` return value and revive-count bookkeeping."""

    def test_stub_returns_false(self, tmp_path):
        """Reviving an unhealthy agent is expected to return ``False``."""
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(name="claude", healthy=False)
        config = DaemonConfig(max_revive_attempts=3)
        assert attempt_revive(state, "claude", config) is False

    def test_increments_revive_count(self, tmp_path):
        """A single revive attempt leaves ``revive_count`` at 1."""
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(name="claude", healthy=False)
        config = DaemonConfig(max_revive_attempts=3)
        attempt_revive(state, "claude", config)
        assert state.agents["claude"].revive_count == 1

    def test_stops_at_max_revive_attempts(self, tmp_path):
        """With ``revive_count`` already at the maximum, nothing is incremented."""
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(
            name="claude", healthy=False, revive_count=3
        )
        config = DaemonConfig(max_revive_attempts=3)
        outcome = attempt_revive(state, "claude", config)
        assert outcome is False
        # Count should not increment beyond max
        assert state.agents["claude"].revive_count == 3

    def test_unknown_agent_returns_false(self, tmp_path):
        """An agent name absent from the cell is expected to return ``False``."""
        state = CellState(uuid="xyz", path=tmp_path)
        config = DaemonConfig()
        assert attempt_revive(state, "nobody", config) is False
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestCLI:
    """Checks for the argument parser returned by ``build_arg_parser``."""

    def test_default_config_path(self):
        """With no arguments, ``config`` mentions ``lazarus-pit.toml``."""
        parsed = build_arg_parser().parse_args([])
        assert "lazarus-pit.toml" in parsed.config

    def test_custom_config(self):
        """``--config`` stores the given path unchanged."""
        argv = ["--config", "/tmp/custom.toml"]
        parsed = build_arg_parser().parse_args(argv)
        assert parsed.config == "/tmp/custom.toml"

    def test_status_flag(self):
        """``--status`` sets ``status`` to ``True``."""
        parsed = build_arg_parser().parse_args(["--status"])
        assert parsed.status is True

    def test_list_cells_flag(self):
        """``--list-cells`` sets ``list_cells`` to ``True``."""
        parsed = build_arg_parser().parse_args(["--list-cells"])
        assert parsed.list_cells is True
|
||||
39
tests/test_manifest.py
Normal file
39
tests/test_manifest.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""Tests for manifest.json PWA support. Fixes #832 (Missing manifest.json)."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_manifest_exists() -> None:
    """Assert that ``manifest.json`` exists in the directory above ``tests/``.

    The path is anchored on this test file instead of the current working
    directory, so the result does not depend on where pytest is launched.
    """
    manifest = Path(__file__).resolve().parent.parent / "manifest.json"
    assert manifest.exists(), "manifest.json must exist for PWA support"
|
||||
|
||||
|
||||
def test_manifest_is_valid_json() -> None:
    """Assert that ``manifest.json`` parses as JSON and the top level is an object.

    The file is located relative to this test file (not the working
    directory) and decoded explicitly as UTF-8 rather than with the
    platform's locale encoding.
    """
    manifest = Path(__file__).resolve().parent.parent / "manifest.json"
    data = json.loads(manifest.read_text(encoding="utf-8"))
    assert isinstance(data, dict)
|
||||
|
||||
|
||||
def test_manifest_has_required_pwa_fields() -> None:
    """Assert that ``manifest.json`` defines every field this suite requires.

    The file is located relative to this test file (not the working
    directory) and decoded explicitly as UTF-8. Fields are checked in a fixed
    order so the first missing one is reported.
    """
    manifest = Path(__file__).resolve().parent.parent / "manifest.json"
    data = json.loads(manifest.read_text(encoding="utf-8"))
    for field in ("name", "short_name", "start_url", "display", "icons"):
        assert field in data, f"manifest.json must have '{field}'"
|
||||
|
||||
|
||||
def test_manifest_icons_non_empty() -> None:
    """Assert that the ``icons`` entry of ``manifest.json`` is not empty.

    The file is located relative to this test file (not the working
    directory) and decoded explicitly as UTF-8. A missing ``icons`` key
    surfaces as ``KeyError``, as before.
    """
    manifest = Path(__file__).resolve().parent.parent / "manifest.json"
    data = json.loads(manifest.read_text(encoding="utf-8"))
    assert len(data["icons"]) > 0, "manifest.json must define at least one icon"
|
||||
|
||||
|
||||
def test_index_html_references_manifest() -> None:
    """Assert that ``index.html`` contains a manifest link to ``manifest.json``.

    The page is located relative to this test file (not the working
    directory) and decoded explicitly as UTF-8.
    """
    page = Path(__file__).resolve().parent.parent / "index.html"
    content = page.read_text(encoding="utf-8")
    assert 'rel="manifest"' in content, "index.html must have <link rel=\"manifest\">"
    assert "manifest.json" in content, "index.html must reference manifest.json"
|
||||
|
||||
|
||||
def test_help_html_references_manifest() -> None:
    """Assert that ``help.html`` contains a manifest link to ``manifest.json``.

    The page is located relative to this test file (not the working
    directory) and decoded explicitly as UTF-8.
    """
    page = Path(__file__).resolve().parent.parent / "help.html"
    content = page.read_text(encoding="utf-8")
    assert 'rel="manifest"' in content, "help.html must have <link rel=\"manifest\">"
    assert "manifest.json" in content, "help.html must reference manifest.json"
|
||||
120
tests/test_webhook_health_dashboard.py
Normal file
120
tests/test_webhook_health_dashboard.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""Tests for webhook health dashboard generation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
# Directory two levels above this test file.
PROJECT_ROOT = Path(__file__).parent.parent

# Load bin/webhook_health_dashboard.py by file path under a private module name.
_spec = importlib.util.spec_from_file_location(
    name="webhook_health_dashboard_test",
    location=PROJECT_ROOT / "bin" / "webhook_health_dashboard.py",
)
_mod = importlib.util.module_from_spec(_spec)
# Register the module under that name; the @patch targets in this file
# ("webhook_health_dashboard_test.probe_health") refer to it.
sys.modules[_spec.name] = _mod
_spec.loader.exec_module(_mod)

# Short aliases for the names the tests below use.
AgentHealth = _mod.AgentHealth
check_agents = _mod.check_agents
load_history = _mod.load_history
parse_targets = _mod.parse_targets
save_history = _mod.save_history
# NOTE(review): bound to None rather than to an attribute of _mod, and not
# used by the tests visible here — confirm whether this placeholder is needed.
sort_key = None
to_markdown = _mod.to_markdown
write_dashboard = _mod.write_dashboard
main = _mod.main
|
||||
|
||||
|
||||
class TestParseTargets:
    """Checks for ``parse_targets`` with default and explicit input."""

    def test_defaults_when_none(self):
        """Passing ``None`` yields entries for ``allegro`` and ``ezra`` on ports 8651/8652."""
        defaults = parse_targets(None)
        assert defaults["allegro"].endswith(":8651/health")
        assert defaults["ezra"].endswith(":8652/health")

    def test_parse_csv_mapping(self):
        """A comma-separated ``name=url`` string becomes a name-to-URL mapping."""
        expected = {
            "alpha": "http://a/health",
            "beta": "http://b/health",
        }
        parsed = parse_targets("alpha=http://a/health,beta=http://b/health")
        assert parsed == expected
|
||||
|
||||
|
||||
class TestCheckAgents:
    """Checks for ``check_agents`` with ``probe_health`` replaced by a mock."""

    @patch("webhook_health_dashboard_test.probe_health")
    def test_updates_last_success_for_healthy(self, mock_probe):
        """A successful probe marks the agent healthy and records ``last_success_ts``."""
        mock_probe.return_value = (True, 200, 42, "HTTP 200")
        history = {"agents": {}, "runs": []}
        targets = {"allegro": "http://localhost:8651/health"}
        outcome = check_agents(targets, history, timeout=1, stale_after=300)
        assert len(outcome) == 1
        assert outcome[0].healthy is True
        assert history["agents"]["allegro"]["last_success_ts"] is not None

    @patch("webhook_health_dashboard_test.probe_health")
    def test_marks_stale_after_threshold(self, mock_probe):
        """A failed probe with the last success 301 s ago (limit 300) is flagged stale."""
        mock_probe.return_value = (False, None, 12, "URL error: refused")
        previous_success = {"last_success_ts": time.time() - 301}
        history = {"agents": {"allegro": previous_success}, "runs": []}
        targets = {"allegro": "http://localhost:8651/health"}
        outcome = check_agents(targets, history, timeout=1, stale_after=300)
        assert outcome[0].healthy is False
        assert outcome[0].stale is True
|
||||
|
||||
|
||||
class TestMarkdown:
    """Checks for the Markdown produced by ``to_markdown``."""

    def test_contains_table_and_icons(self):
        """Output for one healthy and one stale agent has the table header, both icons and the stale section."""
        now = time.time()
        healthy_agent = AgentHealth(
            "allegro", "http://localhost:8651/health", 200, True, 31, False, now - 5, now, "HTTP 200 — ok"
        )
        stale_agent = AgentHealth(
            "ezra", "http://localhost:8652/health", None, False, 14, True, now - 600, now, "URL error: refused"
        )
        rendered = to_markdown([healthy_agent, stale_agent], generated_at=now)
        for fragment in ("| Agent | Status | HTTP |", "🟢", "🔴", "Stale agents", "ezra"):
            assert fragment in rendered
|
||||
|
||||
|
||||
class TestFileIO:
    """Checks for the history and dashboard file helpers."""

    def test_save_and_load_history(self, tmp_path):
        """A payload passed to ``save_history`` is returned unchanged by ``load_history``."""
        history_file = tmp_path / "history.json"
        original = {"agents": {"a": {"last_success_ts": 1}}, "runs": []}
        save_history(history_file, original)
        assert load_history(history_file) == original

    def test_write_dashboard(self, tmp_path):
        """``write_dashboard`` writes the text followed by a single newline."""
        dashboard_file = tmp_path / "dashboard.md"
        write_dashboard(dashboard_file, "# Test")
        assert dashboard_file.read_text() == "# Test\n"
|
||||
|
||||
|
||||
class TestMain:
    """End-to-end check of ``main`` with ``probe_health`` replaced by a mock."""

    @patch("webhook_health_dashboard_test.probe_health")
    def test_main_writes_outputs(self, mock_probe, tmp_path):
        """``main`` returns 0, writes both files, names the agent and logs one run."""
        mock_probe.return_value = (True, 200, 10, "HTTP 200")
        dashboard_file = tmp_path / "dashboard.md"
        history_file = tmp_path / "history.json"
        argv = [
            "--targets", "allegro=http://localhost:8651/health",
            "--output", str(dashboard_file),
            "--history", str(history_file),
            "--timeout", "1",
            "--stale-after", "300",
        ]
        exit_code = main(argv)
        assert exit_code == 0
        assert dashboard_file.exists()
        assert history_file.exists()
        assert "allegro" in dashboard_file.read_text()
        recorded_runs = json.loads(history_file.read_text())["runs"]
        assert len(recorded_runs) == 1
|
||||
Reference in New Issue
Block a user