Compare commits
1 Commits
mimo/code/
...
claude/iss
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f83e103d27 |
62
.gitea/ISSUE_TEMPLATE/mission-proposal.md
Normal file
62
.gitea/ISSUE_TEMPLATE/mission-proposal.md
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
---
|
||||||
|
name: Mission Proposal
|
||||||
|
about: Propose a new Mission Cell — a temporary, isolated collaboration space for agents and humans
|
||||||
|
title: "[MISSION] "
|
||||||
|
labels: mission-proposal
|
||||||
|
---
|
||||||
|
|
||||||
|
## Mission Summary
|
||||||
|
|
||||||
|
<!-- One-sentence description of the mission objective -->
|
||||||
|
|
||||||
|
## Objective
|
||||||
|
|
||||||
|
<!-- What is the outcome we are driving toward? What does "done" look like? -->
|
||||||
|
|
||||||
|
## Agents Invited
|
||||||
|
|
||||||
|
<!-- List agents (or humans) that should be invited into this cell.
|
||||||
|
Format: @agent-name — role (developer / reviewer / observer / coordinator) -->
|
||||||
|
|
||||||
|
- @ —
|
||||||
|
|
||||||
|
## Scope & Deliverables
|
||||||
|
|
||||||
|
<!-- What artifacts will this mission produce? PRs, docs, deployed services, etc. -->
|
||||||
|
|
||||||
|
- [ ]
|
||||||
|
- [ ]
|
||||||
|
|
||||||
|
## Isolation Requirements
|
||||||
|
|
||||||
|
- [ ] No wizard home-directory access needed
|
||||||
|
- [ ] Read-only access to the following homes: (list or "none")
|
||||||
|
- [ ] External network access required: yes / no
|
||||||
|
|
||||||
|
## Cell Configuration
|
||||||
|
|
||||||
|
<!-- Leave blank to use defaults from config/lazarus-pit.toml -->
|
||||||
|
|
||||||
|
| Setting | Value |
|
||||||
|
|--------------------------|-------|
|
||||||
|
| Max duration | |
|
||||||
|
| Checkpoint interval | |
|
||||||
|
| Auto-archive on close | yes / no |
|
||||||
|
| Max revive attempts | |
|
||||||
|
|
||||||
|
## Related Issues / Context
|
||||||
|
|
||||||
|
- Epic: #
|
||||||
|
- Depends on: #
|
||||||
|
- Blocked by: #
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
<!-- How will we know this mission succeeded? Be specific. -->
|
||||||
|
|
||||||
|
1.
|
||||||
|
2.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
<!-- Anything else the lazarus-pit daemon or cell participants should know -->
|
||||||
419
bin/lazarus_pit.py
Normal file
419
bin/lazarus_pit.py
Normal file
@@ -0,0 +1,419 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
lazarus-pit — Agent resurrection pool daemon.
|
||||||
|
|
||||||
|
Monitors active mission cells, heartbeats agents, detects downed agents,
|
||||||
|
and revives them back into their mission cells.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python bin/lazarus_pit.py [--config path/to/lazarus-pit.toml]
|
||||||
|
python bin/lazarus_pit.py --status
|
||||||
|
python bin/lazarus_pit.py --list-cells
|
||||||
|
|
||||||
|
Config: config/lazarus-pit.toml
|
||||||
|
Architecture: docs/lazarus-pit/mission-cell-spec.md
|
||||||
|
Epic: #878 P0 Foundation: #879
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Optional TOML support (stdlib tomllib in 3.11+, tomli on older)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
try:
|
||||||
|
import tomllib # Python 3.11+
|
||||||
|
except ImportError:
|
||||||
|
try:
|
||||||
|
import tomli as tomllib # type: ignore[no-reuse-def]
|
||||||
|
except ImportError:
|
||||||
|
tomllib = None # Config loading will fall back to defaults
|
||||||
|
|
||||||
|
# Repository root: this file lives in <root>/bin/, so go up two levels.
PROJECT_ROOT = Path(__file__).parent.parent
DEFAULT_CONFIG_PATH = PROJECT_ROOT.joinpath("config", "lazarus-pit.toml")
DEFAULT_CELLS_ROOT = Path("/var/missions")

# Built-in fallbacks used when the config file does not override them.
# All durations are expressed in seconds.
DEFAULT_HEARTBEAT_STALE = 60
DEFAULT_RESURRECT_AFTER = 120
DEFAULT_MAX_REVIVES = 3
DEFAULT_POLL_INTERVAL = 15
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Data models
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
class AgentHealth:
    """Liveness record the daemon keeps for one agent inside a mission cell."""

    # Agent identifier; also the stem of its heartbeat file name.
    name: str
    # Set to False when a poll finds the heartbeat missing or stale.
    healthy: bool = True
    # "timestamp" value copied from the last heartbeat read; 0.0 until then.
    last_seen: float = 0.0
    # Number of resurrection attempts made for this agent.
    revive_count: int = 0
    # "status" value copied from the last heartbeat read.
    last_status: str = "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CellState:
    """In-memory view of one mission cell discovered on disk."""

    # Cell identifier taken from the manifest (or the directory name).
    uuid: str
    # Directory that holds the cell.
    path: Path
    # Health records keyed by agent name; each instance gets its own dict.
    agents: dict[str, AgentHealth] = field(default_factory=dict)
    # Lifecycle status string read from the manifest.
    status: str = "active"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Config
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
class DaemonConfig:
    """Runtime settings for the lazarus-pit daemon.

    Every field has a built-in default so the daemon can start without a
    config file; ``load_config`` overrides them from TOML when available.
    """

    # --- [cells] ---------------------------------------------------------
    cells_root: Path = DEFAULT_CELLS_ROOT
    # Heartbeat age (seconds) beyond which an agent is marked unhealthy.
    heartbeat_stale_threshold: int = DEFAULT_HEARTBEAT_STALE
    # Heartbeat age (seconds) beyond which a revive is attempted.
    resurrect_after: int = DEFAULT_RESURRECT_AFTER
    max_revive_attempts: int = DEFAULT_MAX_REVIVES
    # Seconds between health polls.
    poll_interval: int = DEFAULT_POLL_INTERVAL

    # --- [daemon] --------------------------------------------------------
    log_level: str = "INFO"
    # "-" means: do not log to a file.
    log_file: str = "-"
    pid_file: str = "/var/run/lazarus-pit.pid"

    # --- [gitea] ---------------------------------------------------------
    gitea_url: str = "https://forge.alexanderwhitestone.com"
    gitea_repo: str = "Timmy_Foundation/the-nexus"
    # Secret: excluded from repr() so that logging or printing the config
    # object cannot leak the token.
    gitea_token: str = field(default="", repr=False)

    # --- [notifications] -------------------------------------------------
    open_issue_on_death: bool = True
    close_issue_on_revive: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(config_path: Path) -> DaemonConfig:
    """Load configuration from a TOML file, falling back to defaults.

    Args:
        config_path: Location of ``lazarus-pit.toml``.

    Returns:
        A ``DaemonConfig`` whose fields come from the file where present and
        from the built-in defaults otherwise. The ``GITEA_TOKEN`` environment
        variable, when set, always takes precedence over the file's token —
        including when the file is missing or no TOML parser is installed.

    Raises:
        OSError: If the file exists but cannot be opened.
        tomllib.TOMLDecodeError: If the file is not valid TOML.
    """
    cfg = DaemonConfig()
    raw: dict = {}

    if not config_path.exists():
        pass  # No file: keep defaults (the env token is still applied below).
    elif tomllib is None:
        logging.warning(
            "TOML parser not available (install tomli for Python < 3.11). "
            "Using defaults."
        )
    else:
        with open(config_path, "rb") as f:
            raw = tomllib.load(f)

    cells = raw.get("cells", {})
    daemon = raw.get("daemon", {})
    gitea = raw.get("gitea", {})
    notifications = raw.get("notifications", {})

    cfg.cells_root = Path(cells.get("root", str(cfg.cells_root)))
    cfg.heartbeat_stale_threshold = cells.get(
        "heartbeat_stale_threshold", cfg.heartbeat_stale_threshold
    )
    cfg.resurrect_after = cells.get("resurrect_after", cfg.resurrect_after)
    cfg.max_revive_attempts = cells.get("max_revive_attempts", cfg.max_revive_attempts)
    cfg.poll_interval = cells.get("poll_interval", cfg.poll_interval)

    cfg.log_level = daemon.get("log_level", cfg.log_level)
    cfg.log_file = daemon.get("log_file", cfg.log_file)
    cfg.pid_file = daemon.get("pid_file", cfg.pid_file)

    cfg.gitea_url = gitea.get("url", cfg.gitea_url)
    cfg.gitea_repo = gitea.get("repo", cfg.gitea_repo)
    # Environment wins over the file so real tokens never need committing.
    # This runs on every path: previously the early returns skipped it.
    cfg.gitea_token = os.environ.get("GITEA_TOKEN", gitea.get("token", ""))

    cfg.open_issue_on_death = notifications.get(
        "open_issue_on_death", cfg.open_issue_on_death
    )
    cfg.close_issue_on_revive = notifications.get(
        "close_issue_on_revive", cfg.close_issue_on_revive
    )

    return cfg
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Heartbeat reader
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def read_agent_heartbeat(cell_path: Path, agent_name: str) -> Optional[dict]:
    """Read the heartbeat file for an agent in a cell.

    Args:
        cell_path: Root directory of the mission cell.
        agent_name: Agent whose ``.lazarus/heartbeats/<agent_name>.json``
            file should be read.

    Returns:
        The decoded heartbeat as a dict, or ``None`` when the file is
        missing, unreadable, not valid JSON/UTF-8, or does not contain a
        JSON object.
    """
    hb_path = cell_path / ".lazarus" / "heartbeats" / f"{agent_name}.json"
    try:
        # Read directly instead of checking exists() first: the file may
        # vanish or be mid-rewrite between a check and the read.
        payload = json.loads(hb_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    # Callers use dict methods on the result; reject lists, numbers, etc.
    return payload if isinstance(payload, dict) else None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Cell discovery
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def discover_cells(cells_root: Path) -> list[CellState]:
    """Walk ``cells_root`` and return all active or frozen mission cells.

    A sub-directory counts as a cell when it contains a readable
    ``cell.json`` manifest holding a JSON object whose ``status`` is
    ``"active"`` or ``"frozen"``. Anything else is skipped silently so one
    malformed cell cannot stop the daemon from monitoring the others.

    Args:
        cells_root: Directory that contains one sub-directory per cell.

    Returns:
        Cells in sorted directory order, each with a fresh ``AgentHealth``
        for every agent listed in its manifest. Empty when ``cells_root``
        does not exist.
    """
    cells: list[CellState] = []
    if not cells_root.exists():
        return cells

    # Sorted so that log output and status listings are deterministic.
    for entry in sorted(cells_root.iterdir()):
        if not entry.is_dir():
            continue
        try:
            manifest = json.loads((entry / "cell.json").read_text())
        except (OSError, ValueError):
            # Missing/unreadable manifest, invalid JSON, or undecodable bytes.
            continue
        if not isinstance(manifest, dict):
            continue

        status = manifest.get("status")
        if status not in ("active", "frozen"):
            continue

        cell = CellState(
            uuid=manifest.get("uuid", entry.name),
            path=entry,
            status=status,
        )

        agents = manifest.get("agents")
        if not isinstance(agents, list):
            # Covers a missing key, null, or any non-list value.
            agents = []
        for agent_info in agents:
            if not isinstance(agent_info, dict):
                continue
            name = agent_info.get("name", "unknown")
            cell.agents[name] = AgentHealth(name=name)
        cells.append(cell)

    return cells
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Health poll
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def poll_cell(cell: CellState, cfg: DaemonConfig) -> list[str]:
    """Poll a cell's agents and update their health records in place.

    Args:
        cell: Cell whose ``agents`` records are refreshed.
        cfg: Supplies ``heartbeat_stale_threshold`` (seconds).

    Returns:
        One warning message per agent whose heartbeat is missing or stale.
    """
    now = time.time()
    warnings: list[str] = []

    for agent_name, health in cell.agents.items():
        hb = read_agent_heartbeat(cell.path, agent_name)

        # Anything that is not a JSON object is unusable as a heartbeat.
        if not isinstance(hb, dict):
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: no heartbeat file found"
            )
            continue

        timestamp = hb.get("timestamp", 0)
        # A non-numeric timestamp (string, null, bool, ...) would raise in
        # the subtraction below; treat it like a missing one, i.e. epoch 0,
        # which always reads as stale.
        if isinstance(timestamp, bool) or not isinstance(timestamp, (int, float)):
            timestamp = 0

        age = now - timestamp
        health.last_seen = timestamp
        health.last_status = hb.get("status", "unknown")

        if age > cfg.heartbeat_stale_threshold:
            health.healthy = False
            warnings.append(
                f"[{cell.uuid}] {agent_name}: heartbeat stale ({age:.0f}s old, "
                f"threshold {cfg.heartbeat_stale_threshold}s)"
            )
        else:
            health.healthy = True

    return warnings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Resurrection (stub — P3 will implement fully)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def attempt_revive(cell: CellState, agent_name: str, cfg: DaemonConfig) -> bool:
    """Try to bring a downed agent back into its mission cell (stub).

    The real resurrection logic is scheduled for P3 (#882). For now this
    only records the attempt: it bumps the agent's revive counter and logs
    the intent, or logs an escalation once the attempt budget is spent.

    Returns:
        Always ``False`` — no agent is actually restarted yet.
    """
    record = cell.agents.get(agent_name)
    if record is None:
        return False

    budget = cfg.max_revive_attempts
    if record.revive_count < budget:
        record.revive_count += 1
        logging.warning(
            "[%s] %s: initiating resurrection attempt %d/%d (P3 stub)",
            cell.uuid, agent_name, record.revive_count, budget,
        )
        # TODO (P3 #882): exec the agent's harness entrypoint inside the cell
        return False

    logging.error(
        "[%s] %s: max revive attempts (%d) reached — escalating to human",
        cell.uuid, agent_name, budget,
    )
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main daemon loop
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class LazarusPit:
    """Polling daemon that watches mission cells and revives downed agents."""

    def __init__(self, cfg: DaemonConfig):
        self.cfg = cfg
        self._running = False
        # (cell uuid, agent name) -> revive attempts made so far. Cells are
        # re-discovered from disk on every tick with fresh health records,
        # so the counters must live here or the max-attempts limit would
        # never be reached.
        self._revive_counts = {}

    def start(self) -> None:
        """Run the poll loop until a shutdown signal clears ``_running``."""
        self._running = True
        self._write_pid()
        self._setup_signals()

        logging.info("Lazarus Pit daemon started (poll interval: %ds)", self.cfg.poll_interval)

        try:
            while self._running:
                self._tick()
                self._wait(self.cfg.poll_interval)
        finally:
            # Remove the PID file even when a tick raises unexpectedly.
            logging.info("Lazarus Pit daemon stopped.")
            self._remove_pid()

    def _wait(self, seconds: float) -> None:
        """Sleep up to ``seconds``, returning early once shutdown is requested."""
        deadline = time.monotonic() + seconds
        while self._running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return
            # Short slices: time.sleep resumes after a signal handler
            # returns, so one long sleep would delay shutdown by up to a
            # full poll interval.
            time.sleep(min(remaining, 1.0))

    def _tick(self) -> None:
        """Run one poll cycle over every discovered cell."""
        cells = discover_cells(self.cfg.cells_root)
        if not cells:
            logging.debug("No active mission cells found in %s", self.cfg.cells_root)
            return

        for cell in cells:
            # Restore the attempt counters remembered from earlier ticks.
            for agent_name, health in cell.agents.items():
                health.revive_count = self._revive_counts.get(
                    (cell.uuid, agent_name), 0
                )

            for msg in poll_cell(cell, self.cfg):
                logging.warning(msg)

            now = time.time()
            for agent_name, health in cell.agents.items():
                if health.healthy:
                    continue
                # last_seen == 0 means no heartbeat was ever read.
                age = now - health.last_seen if health.last_seen else float("inf")
                if age > self.cfg.resurrect_after:
                    attempt_revive(cell, agent_name, self.cfg)
                    self._revive_counts[(cell.uuid, agent_name)] = health.revive_count

    def _write_pid(self) -> None:
        """Write this process's PID to the configured file (best effort)."""
        pid_path = Path(self.cfg.pid_file)
        try:
            pid_path.parent.mkdir(parents=True, exist_ok=True)
            pid_path.write_text(str(os.getpid()))
        except OSError as e:
            logging.warning("Could not write PID file: %s", e)

    def _remove_pid(self) -> None:
        """Delete the PID file; a missing file or failure is ignored."""
        try:
            Path(self.cfg.pid_file).unlink(missing_ok=True)
        except OSError:
            pass  # Best effort: a stale PID file must not block shutdown.

    def _setup_signals(self) -> None:
        """Route SIGTERM and SIGINT to the shutdown handler."""
        signal.signal(signal.SIGTERM, self._handle_shutdown)
        signal.signal(signal.SIGINT, self._handle_shutdown)

    def _handle_shutdown(self, signum, frame) -> None:
        """Signal handler: ask the poll loop to exit."""
        logging.info("Signal %d received — shutting down.", signum)
        self._running = False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def cmd_status(cfg: DaemonConfig) -> None:
    """Print a heartbeat summary for each agent of every discovered cell."""
    found_cells = discover_cells(cfg.cells_root)
    if not found_cells:
        print(f"No active cells in {cfg.cells_root}")
        return

    for cell in found_cells:
        print(f"\nCell {cell.uuid} [{cell.status}]")
        for agent_name in cell.agents:
            hb = read_agent_heartbeat(cell.path, agent_name)
            if not hb:
                print(f" {agent_name}: no heartbeat")
                continue
            age = time.time() - hb.get("timestamp", 0)
            print(f" {agent_name}: {hb.get('status', '?')} (heartbeat {age:.0f}s ago)")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_list_cells(cfg: DaemonConfig) -> None:
    """List all mission cells (active and otherwise).

    Prints one line per directory under ``cfg.cells_root`` that contains a
    ``cell.json``. A manifest that cannot be read, is not valid JSON, or
    does not hold a JSON object is reported as ``[unreadable manifest]``
    instead of aborting the listing.
    """
    root = cfg.cells_root
    if not root.exists():
        print(f"{root} does not exist")
        return

    found = False
    for entry in sorted(root.iterdir()):
        manifest = entry / "cell.json"
        if not manifest.exists():
            continue
        found = True

        try:
            data = json.loads(manifest.read_text())
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            data = None

        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, ...) has no
            # fields to show, so treat it like a corrupt manifest.
            print(f"{entry.name}: [unreadable manifest]")
            continue

        print(f"{data.get('uuid', entry.name)}: {data.get('status', '?')} — {data.get('mission', '')}")

    if not found:
        print(f"No mission cells found in {root}")
|
||||||
|
|
||||||
|
|
||||||
|
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the daemon and its one-shot modes."""
    cli = argparse.ArgumentParser(
        description="Lazarus Pit — agent resurrection pool daemon",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument(
        "--config",
        default=str(DEFAULT_CONFIG_PATH),
        help="Path to lazarus-pit.toml (default: config/lazarus-pit.toml)",
    )
    # The two one-shot modes are plain boolean switches.
    switches = (
        ("--status", "Print current cell/agent health and exit"),
        ("--list-cells", "List all mission cells and exit"),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)
    return cli
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: load config, set up logging, then dispatch.

    ``--status`` and ``--list-cells`` print a report and return; with
    neither flag the daemon loop runs until it is signalled to stop.
    """
    args = build_arg_parser().parse_args()
    cfg = load_config(Path(args.config))

    # Configure logging
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    log_level = getattr(logging, str(cfg.log_level).upper(), None)
    if not isinstance(log_level, int):
        # Unknown names — or names that resolve to a non-level attribute of
        # the logging module, such as "shutdown" — fall back to INFO.
        log_level = logging.INFO

    # "-" means console logging; anything else is a log file path.
    target = {} if cfg.log_file == "-" else {"filename": cfg.log_file}
    # force=True: load_config may already have emitted a warning, which
    # implicitly configures the root logger and would otherwise turn this
    # call into a no-op, ignoring the configured level and file.
    logging.basicConfig(level=log_level, format=log_format, force=True, **target)

    if args.status:
        cmd_status(cfg)
        return

    if args.list_cells:
        cmd_list_cells(cfg)
        return

    pit = LazarusPit(cfg)
    pit.start()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
52
config/lazarus-pit.toml
Normal file
52
config/lazarus-pit.toml
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# lazarus-pit.toml — Daemon configuration for the Lazarus Pit resurrection pool
|
||||||
|
# See docs/lazarus-pit/mission-cell-spec.md for architecture context.
|
||||||
|
# Epic: #878 · Phase P0 Foundation: #879
|
||||||
|
|
||||||
|
[daemon]
|
||||||
|
# PID file location
|
||||||
|
pid_file = "/var/run/lazarus-pit.pid"
|
||||||
|
|
||||||
|
# Log file (use "-" to log to the console instead; Python's logging writes to stderr by default)
|
||||||
|
log_file = "/var/log/lazarus-pit.log"
|
||||||
|
|
||||||
|
# Log level: DEBUG | INFO | WARNING | ERROR
|
||||||
|
log_level = "INFO"
|
||||||
|
|
||||||
|
[cells]
|
||||||
|
# Root directory for all mission cells
|
||||||
|
root = "/var/missions"
|
||||||
|
|
||||||
|
# Seconds of silence before an agent is considered stale
|
||||||
|
heartbeat_stale_threshold = 60
|
||||||
|
|
||||||
|
# Seconds of stale heartbeat before attempting resurrection
|
||||||
|
resurrect_after = 120
|
||||||
|
|
||||||
|
# Maximum auto-revive attempts per agent per session
|
||||||
|
max_revive_attempts = 3
|
||||||
|
|
||||||
|
# Poll interval in seconds for health checks
|
||||||
|
poll_interval = 15
|
||||||
|
|
||||||
|
[gateway]
|
||||||
|
# WebSocket gateway to poll for liveness
|
||||||
|
host = "localhost"
|
||||||
|
port = 8765
|
||||||
|
|
||||||
|
# HTTP health endpoint exposed by the gateway
|
||||||
|
health_host = "localhost"
|
||||||
|
health_port = 8766
|
||||||
|
|
||||||
|
[gitea]
|
||||||
|
# Gitea instance for filing resurrection / incident reports
|
||||||
|
url = "https://forge.alexanderwhitestone.com"
|
||||||
|
repo = "Timmy_Foundation/the-nexus"
|
||||||
|
# Token read from environment: GITEA_TOKEN
|
||||||
|
# token = "" # Do not commit real tokens
|
||||||
|
|
||||||
|
[notifications]
|
||||||
|
# Whether to open a Gitea issue when an agent goes dark
|
||||||
|
open_issue_on_death = true
|
||||||
|
|
||||||
|
# Whether to close the issue when the agent is successfully revived
|
||||||
|
close_issue_on_revive = true
|
||||||
156
docs/lazarus-pit/mission-cell-spec.md
Normal file
156
docs/lazarus-pit/mission-cell-spec.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
# Mission Cell Directory Specification
|
||||||
|
|
||||||
|
**Version:** 1.0
|
||||||
|
**Status:** Canonical
|
||||||
|
**Epic:** #878 — The Lazarus Pit & Mission Cell Isolation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
A **Mission Cell** is an ephemeral, isolated working directory provisioned for a specific
|
||||||
|
multi-agent collaboration. Each cell is scoped to a single mission (project, task, or
|
||||||
|
incident) and is identified by a UUID. No wizard's home directory is ever touched by
|
||||||
|
another agent's work within a cell.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Root Path
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/missions/<uuid>/
|
||||||
|
```
|
||||||
|
|
||||||
|
`<uuid>` is a version-4 UUID, e.g. `a3f7c901-1234-4b5e-8def-000000000001`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Directory Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/missions/<uuid>/
|
||||||
|
├── cell.json # Cell manifest (identity, status, agents, timestamps)
|
||||||
|
├── workspace/ # Shared working directory — agents write here
|
||||||
|
│ └── ...
|
||||||
|
├── logs/
|
||||||
|
│ ├── events.jsonl # Append-only event stream (invitation, join, leave, etc.)
|
||||||
|
│ └── <agent-name>.log # Per-agent stdout/stderr capture
|
||||||
|
├── checkpoints/ # Snapshot archives for P2 checkpoint/restore
|
||||||
|
│ └── <timestamp>.tar.gz
|
||||||
|
├── bus/ # Mission message bus (P4 multi-agent teaming)
|
||||||
|
│ ├── inbox/ # Per-agent inboxes
|
||||||
|
│ │ └── <agent-name>/
|
||||||
|
│ └── outbox/
|
||||||
|
└── .lazarus/ # Liveness metadata (heartbeats are written by agents; state.json only by lazarus-pit)
|
||||||
|
├── heartbeats/
|
||||||
|
│ └── <agent-name>.json # Last heartbeat per agent
|
||||||
|
└── state.json # Cell lifecycle state (active, frozen, archived)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## File Specs
|
||||||
|
|
||||||
|
### `cell.json` — Cell Manifest
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"uuid": "a3f7c901-1234-4b5e-8def-000000000001",
|
||||||
|
"name": "optional human-readable name",
|
||||||
|
"mission": "Short description of the mission objective",
|
||||||
|
"created_at": "2026-04-06T00:00:00Z",
|
||||||
|
"created_by": "agent-name or username",
|
||||||
|
"status": "active",
|
||||||
|
"agents": [
|
||||||
|
{
|
||||||
|
"name": "claude",
|
||||||
|
"role": "developer",
|
||||||
|
"joined_at": "2026-04-06T00:01:00Z",
|
||||||
|
"home": "/root/wizards/claude"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gitea_issue": 879,
|
||||||
|
"repo": "Timmy_Foundation/the-nexus"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status values:** `active` | `frozen` | `archived` | `destroyed`
|
||||||
|
|
||||||
|
### `logs/events.jsonl` — Event Stream
|
||||||
|
|
||||||
|
One JSON object per line, append-only:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"ts": "2026-04-06T00:00:00Z", "event": "cell_created", "by": "allegro", "uuid": "..."}
|
||||||
|
{"ts": "2026-04-06T00:01:00Z", "event": "agent_joined", "agent": "claude", "role": "developer"}
|
||||||
|
{"ts": "2026-04-06T00:02:00Z", "event": "heartbeat", "agent": "claude", "status": "thinking"}
|
||||||
|
{"ts": "2026-04-06T01:00:00Z", "event": "agent_left", "agent": "claude"}
|
||||||
|
{"ts": "2026-04-06T01:01:00Z", "event": "cell_archived"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `.lazarus/heartbeats/<agent-name>.json` — Per-Agent Heartbeat
|
||||||
|
|
||||||
|
Written by agents, monitored by the lazarus-pit daemon:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"agent": "claude",
|
||||||
|
"pid": 12345,
|
||||||
|
"timestamp": 1744000000.0,
|
||||||
|
"cycle": 42,
|
||||||
|
"model": "claude-opus-4-6",
|
||||||
|
"status": "thinking",
|
||||||
|
"cell_uuid": "a3f7c901-1234-4b5e-8def-000000000001"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `.lazarus/state.json` — Daemon State
|
||||||
|
|
||||||
|
Written exclusively by `lazarus-pit`, never by agents:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"cell_uuid": "a3f7c901-1234-4b5e-8def-000000000001",
|
||||||
|
"daemon_pid": 99001,
|
||||||
|
"last_poll": 1744000000.0,
|
||||||
|
"agent_health": {
|
||||||
|
"claude": {"healthy": true, "last_seen": 1744000000.0, "revive_count": 0}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Lifecycle
|
||||||
|
|
||||||
|
```
|
||||||
|
provision → active → [frozen] → archived → destroyed
|
||||||
|
```
|
||||||
|
|
||||||
|
| Transition | Trigger | Actor |
|
||||||
|
|--------------|--------------------------------------------|---------------|
|
||||||
|
| provision | Invitation accepted / mission created | allegro / CLI |
|
||||||
|
| active | At least one agent joined | lazarus-pit |
|
||||||
|
| frozen | Checkpoint requested or idle timeout | lazarus-pit |
|
||||||
|
| archived | Mission complete, cell preserved | lazarus-pit |
|
||||||
|
| destroyed | Explicit teardown, no home dirs touched | CLI / daemon |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Isolation Guarantees
|
||||||
|
|
||||||
|
- No agent process within a cell may write outside `/var/missions/<uuid>/`.
|
||||||
|
- Home directories (`/root/wizards/<name>/`) are read-only mounts or simply not present in the cell's working context.
|
||||||
|
- The cell UUID is injected as `LAZARUS_CELL_UUID` into each agent's environment.
|
||||||
|
- Destruction of a cell has zero impact on any wizard's home directory.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Issues
|
||||||
|
|
||||||
|
- #878 — Epic
|
||||||
|
- #879 — P0 Foundation (this spec)
|
||||||
|
- #880 — P1 Invitation & Spawning
|
||||||
|
- #881 — P2 Checkpoint / Restore
|
||||||
|
- #882 — P3 Resurrection Pool
|
||||||
|
- #883 — P4 Multi-Agent Teaming
|
||||||
64
server.py
64
server.py
@@ -3,12 +3,21 @@
|
|||||||
The Nexus WebSocket Gateway — Robust broadcast bridge for Timmy's consciousness.
|
The Nexus WebSocket Gateway — Robust broadcast bridge for Timmy's consciousness.
|
||||||
This server acts as the central hub for the-nexus, connecting the mind (nexus_think.py),
|
This server acts as the central hub for the-nexus, connecting the mind (nexus_think.py),
|
||||||
the body (Evennia/Morrowind), and the visualization surface.
|
the body (Evennia/Morrowind), and the visualization surface.
|
||||||
|
|
||||||
|
Health heartbeat endpoint (added in #879 — M6-P0 Foundation):
|
||||||
|
GET http://<host>:<HEALTH_PORT>/health
|
||||||
|
Returns 200 JSON with gateway status and connected-client count.
|
||||||
|
Used by the lazarus-pit daemon to verify the gateway is alive.
|
||||||
"""
|
"""
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
|
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||||
|
from threading import Thread
|
||||||
from typing import Set
|
from typing import Set
|
||||||
|
|
||||||
import websockets
|
import websockets
|
||||||
@@ -17,6 +26,13 @@ import websockets
|
|||||||
PORT = 8765
|
PORT = 8765
|
||||||
HOST = "0.0.0.0" # Allow external connections if needed
|
HOST = "0.0.0.0" # Allow external connections if needed
|
||||||
|
|
||||||
|
# Health heartbeat endpoint — monitored by lazarus-pit daemon (#879)
|
||||||
|
HEALTH_PORT = int(os.environ.get("NEXUS_HEALTH_PORT", "8766"))
|
||||||
|
HEALTH_HOST = os.environ.get("NEXUS_HEALTH_HOST", "127.0.0.1")
|
||||||
|
|
||||||
|
# Gateway start time for uptime reporting
|
||||||
|
_start_time = time.time()
|
||||||
|
|
||||||
# Logging setup
|
# Logging setup
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
@@ -28,6 +44,53 @@ logger = logging.getLogger("nexus-gateway")
|
|||||||
# State
|
# State
|
||||||
clients: Set[websockets.WebSocketServerProtocol] = set()
|
clients: Set[websockets.WebSocketServerProtocol] = set()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Health heartbeat HTTP endpoint — consumed by lazarus-pit (#879)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class _HealthHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler for the /health liveness endpoint."""

    def do_GET(self):  # noqa: N802
        """Answer ``GET /health`` with a JSON status document; 404 otherwise."""
        if self.path != "/health":
            self.send_response(404)
            self.end_headers()
            return

        payload = {
            "status": "ok",
            "service": "nexus-gateway",
            "uptime_seconds": round(time.time() - _start_time, 1),
            "connected_clients": len(clients),
            "ws_port": PORT,
            "ts": time.time(),
        }
        body = json.dumps(payload).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):  # noqa: N802
        """Drop access-log lines for successful requests; debug-log the rest."""
        if not args:
            return
        # For request logs args is (requestline, code, size). Error logs can
        # carry a single argument, so args[1] must not be assumed to exist.
        status = str(args[1]) if len(args) > 1 else ""
        if status not in ("200", "204"):
            logger.debug("Health endpoint: " + fmt % args)
|
||||||
|
|
||||||
|
|
||||||
|
def _start_health_server() -> Thread:
    """Serve the /health endpoint from a background daemon thread.

    Returns:
        The already-started thread running the HTTP server loop.
    """
    httpd = HTTPServer((HEALTH_HOST, HEALTH_PORT), _HealthHandler)
    worker = Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    logger.info(
        "Health heartbeat endpoint listening on http://%s:%d/health",
        HEALTH_HOST, HEALTH_PORT,
    )
    return worker
|
||||||
|
|
||||||
|
|
||||||
async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
||||||
"""Handles individual client connections and message broadcasting."""
|
"""Handles individual client connections and message broadcasting."""
|
||||||
clients.add(websocket)
|
clients.add(websocket)
|
||||||
@@ -80,6 +143,7 @@ async def broadcast_handler(websocket: websockets.WebSocketServerProtocol):
|
|||||||
async def main():
|
async def main():
|
||||||
"""Main server loop with graceful shutdown."""
|
"""Main server loop with graceful shutdown."""
|
||||||
logger.info(f"Starting Nexus WS gateway on ws://{HOST}:{PORT}")
|
logger.info(f"Starting Nexus WS gateway on ws://{HOST}:{PORT}")
|
||||||
|
_start_health_server()
|
||||||
|
|
||||||
# Set up signal handlers for graceful shutdown
|
# Set up signal handlers for graceful shutdown
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
|
|||||||
129
tests/test_gateway_health.py
Normal file
129
tests/test_gateway_health.py
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
"""Tests for the gateway health heartbeat endpoint (#879 — M6-P0 Foundation).
|
||||||
|
|
||||||
|
Validates:
|
||||||
|
- /health returns 200 with correct JSON schema
|
||||||
|
- /health reports connected_clients count
|
||||||
|
- Non-/health paths return 404
|
||||||
|
- Uptime is non-negative
|
||||||
|
"""
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from http.server import HTTPServer
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# ── Load server module directly ──────────────────────────────────────────────
|
||||||
|
PROJECT_ROOT = Path(__file__).parent.parent

# Patch websockets import so we don't need the package installed to test health.
# ``setdefault`` leaves an already-imported ``websockets`` module untouched.
_ws_mock = MagicMock()
sys.modules.setdefault("websockets", _ws_mock)

# Import server.py by file path under a private module name, registering it in
# ``sys.modules`` before its body is executed.
_srv_spec = importlib.util.spec_from_file_location(
    name="nexus_server_test",
    location=PROJECT_ROOT / "server.py",
)
_srv = importlib.util.module_from_spec(_srv_spec)
sys.modules["nexus_server_test"] = _srv
_srv_spec.loader.exec_module(_srv)

# Handler class under test.
_HealthHandler = _srv._HealthHandler
|
||||||
|
|
||||||
|
|
||||||
|
# ── Fake request helper ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class _FakeRequest:
    """Minimal socket-like object for BaseHTTPRequestHandler testing.

    Holds two in-memory buffers: ``_buf`` is pre-loaded with the raw request
    bytes and serves reads; ``sent`` collects everything written back.
    """

    def __init__(self, raw_bytes: bytes):
        """Store *raw_bytes* as the readable request and start an empty reply."""
        self._buf = BytesIO(raw_bytes)
        self.sent = BytesIO()

    def makefile(self, mode, *args, **kwargs):
        """Return the request buffer for read modes, else the reply buffer.

        Extra positional and keyword arguments are accepted and ignored:
        ``socketserver.StreamRequestHandler.setup`` passes the buffer size
        positionally (``makefile('rb', self.rbufsize)``), which a
        keyword-only signature would reject with ``TypeError``.
        """
        if "r" in mode:
            return self._buf
        return self.sent

    def sendall(self, data: bytes):
        """Append *data* to the reply buffer."""
        self.sent.write(data)
|
||||||
|
|
||||||
|
|
||||||
|
def _invoke_handler(path: str) -> tuple[int, dict]:
    """Call the health handler for a GET request and return (status_code, body_dict).

    The handler is created with ``__new__`` so ``__init__`` (which would try to
    service a real socket) never runs; the attributes ``do_GET`` may consult are
    filled in by hand and the response-writing methods are replaced with
    recorders.

    Args:
        path: Request path to dispatch, e.g. ``"/health"``.

    Returns:
        ``(status, body)`` where ``status`` is the first code passed to
        ``send_response`` (``None`` if it was never called) and ``body`` is the
        JSON-decoded payload written to ``wfile`` (``{}`` when nothing was
        written or the payload is not valid JSON).
    """
    raw = f"GET {path} HTTP/1.1\r\nHost: localhost\r\n\r\n".encode()

    handler = _HealthHandler.__new__(_HealthHandler)
    handler.rfile = BytesIO(raw)
    handler.client_address = ("127.0.0.1", 9999)
    handler.server = MagicMock()
    handler.request_version = "HTTP/1.1"
    handler.command = "GET"
    handler.path = path
    handler.headers = {}

    # Capture response
    responses: list[tuple] = []
    handler.send_response = lambda code, *a: responses.append(("status", code))
    handler.send_header = lambda k, v: None
    handler.end_headers = lambda: None

    # Record every chunk written to the response stream.
    body_parts: list[bytes] = []
    handler.wfile = MagicMock()
    handler.wfile.write = body_parts.append

    handler.do_GET()

    status = responses[0][1] if responses else None
    body = {}
    if body_parts:
        try:
            body = json.loads(b"".join(body_parts))
        except (ValueError, TypeError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for undecodable bytes (e.g. a
            # non-JSON error page); such bodies are reported as empty.
            pass
    return status, body
|
||||||
|
|
||||||
|
|
||||||
|
# ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestHealthEndpoint:
    """Checks for the gateway health handler's routing and JSON payload."""

    def test_health_returns_200(self):
        """GET /health answers with HTTP 200."""
        code, _payload = _invoke_handler("/health")
        assert code == 200

    def test_health_body_schema(self):
        """The /health payload carries the fixed fields and expected keys."""
        _code, payload = _invoke_handler("/health")
        assert payload.get("status") == "ok"
        assert payload.get("service") == "nexus-gateway"
        for key in ("uptime_seconds", "connected_clients", "ws_port", "ts"):
            assert key in payload

    def test_uptime_is_non_negative(self):
        """Reported uptime is never below zero."""
        _code, payload = _invoke_handler("/health")
        assert payload["uptime_seconds"] >= 0

    def test_unknown_path_returns_404(self):
        """Paths other than /health are rejected with 404."""
        code, _payload = _invoke_handler("/notfound")
        assert code == 404

    def test_root_path_returns_404(self):
        """The root path is not an alias for /health."""
        code, _payload = _invoke_handler("/")
        assert code == 404

    def test_connected_clients_reflects_module_state(self):
        """With the module-level client set emptied, the count is zero."""
        saved_clients = _srv.clients.copy()
        try:
            _srv.clients.clear()
            _code, payload = _invoke_handler("/health")
            assert payload["connected_clients"] == 0
        finally:
            # Put back whatever was registered before the test ran.
            _srv.clients.update(saved_clients)
|
||||||
262
tests/test_lazarus_pit.py
Normal file
262
tests/test_lazarus_pit.py
Normal file
@@ -0,0 +1,262 @@
|
|||||||
|
"""Tests for the lazarus-pit daemon skeleton.
|
||||||
|
|
||||||
|
Validates:
|
||||||
|
- Config loading (defaults and TOML overrides)
|
||||||
|
- Cell discovery from /var/missions structure
|
||||||
|
- Agent heartbeat reading
|
||||||
|
- Health poll logic (stale vs fresh)
|
||||||
|
- AgentHealth and CellState dataclasses
|
||||||
|
- CLI argument parsing
|
||||||
|
- Resurrection stub behaviour
|
||||||
|
"""
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# ── Load lazarus_pit module directly ────────────────────────────────────────
|
||||||
|
PROJECT_ROOT = Path(__file__).parent.parent

# Import bin/lazarus_pit.py by file path under a private module name,
# registering it in ``sys.modules`` before its body is executed.
_lp_spec = importlib.util.spec_from_file_location(
    name="lazarus_pit_test",
    location=PROJECT_ROOT / "bin" / "lazarus_pit.py",
)
_lp = importlib.util.module_from_spec(_lp_spec)
sys.modules["lazarus_pit_test"] = _lp
_lp_spec.loader.exec_module(_lp)

# Short aliases for the names exercised by the tests below.
AgentHealth = _lp.AgentHealth
CellState = _lp.CellState
DaemonConfig = _lp.DaemonConfig
load_config = _lp.load_config
read_agent_heartbeat = _lp.read_agent_heartbeat
discover_cells = _lp.discover_cells
poll_cell = _lp.poll_cell
attempt_revive = _lp.attempt_revive
build_arg_parser = _lp.build_arg_parser
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _make_cell(tmp_path: Path, uuid: str, status: str = "active", agents=None) -> Path:
    """Create a minimal mission cell directory with a ``cell.json`` manifest.

    Args:
        tmp_path: Directory under which the cell directory is created.
        uuid: Cell identifier; used both as the directory name and in the manifest.
        status: Value stored under the manifest's ``"status"`` key.
        agents: Agent names to list in the manifest; a falsy value means ``["claude"]``.

    Returns:
        Path of the newly created cell directory.
    """
    agent_names = agents or ["claude"]

    cell_dir = tmp_path / uuid
    cell_dir.mkdir()

    manifest_text = json.dumps({
        "uuid": uuid,
        "status": status,
        "mission": "test mission",
        "agents": [{"name": name} for name in agent_names],
    })
    manifest_file = cell_dir / "cell.json"
    manifest_file.write_text(manifest_text)
    return cell_dir
|
||||||
|
|
||||||
|
|
||||||
|
def _write_heartbeat(cell_path: Path, agent: str, age_seconds: float = 0) -> None:
    """Write ``.lazarus/heartbeats/<agent>.json`` inside the given cell.

    Args:
        cell_path: Cell directory that receives the heartbeat file.
        agent: Agent name; used for the file name and the ``"agent"`` field.
        age_seconds: How far in the past the recorded timestamp lies
            (``time.time() - age_seconds``).
    """
    heartbeat_dir = cell_path / ".lazarus" / "heartbeats"
    heartbeat_dir.mkdir(parents=True, exist_ok=True)

    payload = {
        "agent": agent,
        "pid": 12345,
        "timestamp": time.time() - age_seconds,
        "cycle": 1,
        "model": "test",
        "status": "thinking",
    }
    heartbeat_file = heartbeat_dir / f"{agent}.json"
    heartbeat_file.write_text(json.dumps(payload))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Config ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestDaemonConfig:
    """Checks for ``DaemonConfig`` defaults and ``load_config`` overrides."""

    def test_defaults(self):
        """A bare DaemonConfig carries the documented default values."""
        cfg = DaemonConfig()
        assert cfg.poll_interval == 15
        assert cfg.heartbeat_stale_threshold == 60
        assert cfg.max_revive_attempts == 3

    def test_load_missing_file_returns_defaults(self, tmp_path):
        """Loading a non-existent file falls back to the defaults."""
        cfg = load_config(tmp_path / "nonexistent.toml")
        assert cfg.poll_interval == 15

    def test_load_valid_toml(self, tmp_path):
        """Values in the ``[cells]`` table override the defaults.

        Only the ``load_config`` call is guarded: an exception raised while
        loading is treated as "TOML parser not available" and skips the test.
        The assertions run outside the guard so that a wrong value is reported
        as a failure instead of being swallowed and turned into a skip.
        """
        toml_content = b"""
[cells]
root = "/tmp/missions"
poll_interval = 30
heartbeat_stale_threshold = 90
"""
        toml_path = tmp_path / "test.toml"
        toml_path.write_bytes(toml_content)
        try:
            cfg = load_config(toml_path)
        except Exception:
            # TOML parser not available — skip, not a test failure
            pytest.skip("TOML parser not available")

        assert cfg.poll_interval == 30
        assert cfg.heartbeat_stale_threshold == 90
        assert str(cfg.cells_root) == "/tmp/missions"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Cell discovery ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestDiscoverCells:
    """Checks for ``discover_cells`` against on-disk cell layouts."""

    def test_empty_root(self, tmp_path):
        """An empty root directory yields no cells."""
        assert discover_cells(tmp_path) == []

    def test_missing_root(self, tmp_path):
        """A root directory that does not exist yields no cells."""
        absent_root = tmp_path / "nonexistent"
        assert discover_cells(absent_root) == []

    def test_finds_active_cell(self, tmp_path):
        """A cell whose manifest status is "active" is returned."""
        _make_cell(tmp_path, "aaa-111", status="active")
        found = discover_cells(tmp_path)
        assert len(found) == 1
        assert found[0].uuid == "aaa-111"

    def test_finds_frozen_cell(self, tmp_path):
        """A cell whose manifest status is "frozen" is returned."""
        _make_cell(tmp_path, "bbb-222", status="frozen")
        found = discover_cells(tmp_path)
        assert len(found) == 1

    def test_ignores_archived_cell(self, tmp_path):
        """A cell whose manifest status is "archived" is left out."""
        _make_cell(tmp_path, "ccc-333", status="archived")
        found = discover_cells(tmp_path)
        assert len(found) == 0

    def test_ignores_directories_without_manifest(self, tmp_path):
        """A directory lacking cell.json is not treated as a cell."""
        (tmp_path / "not-a-cell").mkdir()
        assert discover_cells(tmp_path) == []

    def test_ignores_corrupt_manifest(self, tmp_path):
        """A cell.json that is not valid JSON is ignored."""
        broken_cell = tmp_path / "bad-cell"
        broken_cell.mkdir()
        (broken_cell / "cell.json").write_text("not json {{")
        assert discover_cells(tmp_path) == []

    def test_agents_populated_from_manifest(self, tmp_path):
        """Agent names from the manifest become the keys of ``agents``."""
        _make_cell(tmp_path, "ddd-444", agents=["claude", "allegro"])
        found = discover_cells(tmp_path)
        agent_names = set(found[0].agents.keys())
        assert agent_names == {"claude", "allegro"}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Heartbeat reading ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestReadAgentHeartbeat:
    """Checks for ``read_agent_heartbeat`` on missing, valid and corrupt files."""

    def test_returns_none_for_missing_file(self, tmp_path):
        """No heartbeat file for the agent gives ``None``."""
        assert read_agent_heartbeat(tmp_path, "ghost") is None

    def test_returns_data_for_valid_heartbeat(self, tmp_path):
        """A well-formed heartbeat file is returned as parsed data."""
        _write_heartbeat(tmp_path, "claude", age_seconds=0)
        heartbeat = read_agent_heartbeat(tmp_path, "claude")
        assert heartbeat is not None
        assert heartbeat["agent"] == "claude"
        assert "timestamp" in heartbeat

    def test_returns_none_for_corrupt_json(self, tmp_path):
        """A heartbeat file holding invalid JSON gives ``None``."""
        heartbeat_dir = tmp_path / ".lazarus" / "heartbeats"
        heartbeat_dir.mkdir(parents=True)
        (heartbeat_dir / "bad.json").write_text("{not valid")
        assert read_agent_heartbeat(tmp_path, "bad") is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Health polling ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestPollCell:
    """Checks for ``poll_cell`` with fresh, stale and absent heartbeats."""

    def _make_cell_state(self, tmp_path, agents=None):
        """Build a CellState rooted at *tmp_path* with one AgentHealth per agent.

        A falsy *agents* value means a single agent named "claude".
        """
        state = CellState(uuid="test-uuid", path=tmp_path)
        for agent_name in agents or ["claude"]:
            state.agents[agent_name] = AgentHealth(name=agent_name)
        return state

    def test_healthy_agent_no_warnings(self, tmp_path):
        """A 5-second-old heartbeat under a 60-second threshold is healthy."""
        state = self._make_cell_state(tmp_path)
        _write_heartbeat(tmp_path, "claude", age_seconds=5)
        config = DaemonConfig(heartbeat_stale_threshold=60)

        messages = poll_cell(state, config)

        assert messages == []
        assert state.agents["claude"].healthy is True

    def test_stale_heartbeat_generates_warning(self, tmp_path):
        """A 120-second-old heartbeat under a 60-second threshold is stale."""
        state = self._make_cell_state(tmp_path)
        _write_heartbeat(tmp_path, "claude", age_seconds=120)
        config = DaemonConfig(heartbeat_stale_threshold=60)

        messages = poll_cell(state, config)

        assert len(messages) == 1
        assert "stale" in messages[0]
        assert state.agents["claude"].healthy is False

    def test_missing_heartbeat_generates_warning(self, tmp_path):
        """An agent with no heartbeat file is reported and marked unhealthy."""
        state = self._make_cell_state(tmp_path)
        config = DaemonConfig(heartbeat_stale_threshold=60)

        messages = poll_cell(state, config)

        assert len(messages) == 1
        assert "no heartbeat" in messages[0]
        assert state.agents["claude"].healthy is False
|
||||||
|
|
||||||
|
|
||||||
|
# ── Resurrection stub ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestAttemptRevive:
    """Checks for the ``attempt_revive`` resurrection stub."""

    def test_stub_returns_false(self, tmp_path):
        """Reviving an unhealthy agent returns ``False``."""
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(name="claude", healthy=False)
        config = DaemonConfig(max_revive_attempts=3)

        outcome = attempt_revive(state, "claude", config)

        assert outcome is False

    def test_increments_revive_count(self, tmp_path):
        """One revive attempt leaves the agent's counter at 1."""
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(name="claude", healthy=False)
        config = DaemonConfig(max_revive_attempts=3)

        attempt_revive(state, "claude", config)

        assert state.agents["claude"].revive_count == 1

    def test_stops_at_max_revive_attempts(self, tmp_path):
        """An agent already at the attempt limit is not retried."""
        state = CellState(uuid="xyz", path=tmp_path)
        state.agents["claude"] = AgentHealth(
            name="claude", healthy=False, revive_count=3
        )
        config = DaemonConfig(max_revive_attempts=3)

        outcome = attempt_revive(state, "claude", config)

        assert outcome is False
        # Count should not increment beyond max
        assert state.agents["claude"].revive_count == 3

    def test_unknown_agent_returns_false(self, tmp_path):
        """An agent name absent from the cell returns ``False``."""
        state = CellState(uuid="xyz", path=tmp_path)
        config = DaemonConfig()

        outcome = attempt_revive(state, "nobody", config)

        assert outcome is False
|
||||||
|
|
||||||
|
|
||||||
|
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestCLI:
    """Checks for the parser returned by ``build_arg_parser``."""

    def test_default_config_path(self):
        """With no arguments, the config path mentions lazarus-pit.toml."""
        parsed = build_arg_parser().parse_args([])
        assert "lazarus-pit.toml" in parsed.config

    def test_custom_config(self):
        """``--config`` stores the supplied path unchanged."""
        argv = ["--config", "/tmp/custom.toml"]
        parsed = build_arg_parser().parse_args(argv)
        assert parsed.config == "/tmp/custom.toml"

    def test_status_flag(self):
        """``--status`` sets ``status`` to ``True``."""
        parsed = build_arg_parser().parse_args(["--status"])
        assert parsed.status is True

    def test_list_cells_flag(self):
        """``--list-cells`` sets ``list_cells`` to ``True``."""
        parsed = build_arg_parser().parse_args(["--list-cells"])
        assert parsed.list_cells is True
|
||||||
Reference in New Issue
Block a user