diff --git a/hermes-sovereign/README.md b/hermes-sovereign/README.md new file mode 100644 index 00000000..722bdf98 --- /dev/null +++ b/hermes-sovereign/README.md @@ -0,0 +1,27 @@ +# Hermes Sovereign Extensions + +Sovereign extensions extracted from the hermes-agent fork (Timmy_Foundation/hermes-agent). + +These files were incorrectly committed to the upstream fork and have been moved here +to restore clean upstream tracking. The hermes-agent repo can now stay in sync with +NousResearch/hermes-agent without merge conflicts from our custom work. + +## Directory Layout + +| Directory | Contents | +|-------------------|----------------------------------------------------| +| `docs/` | Deploy guides, performance reports, security docs, research notes | +| `security/` | Security audit workflows, PR checklists, validation scripts | +| `wizard-bootstrap/` | Wizard bootstrap environment — dependency checking, auditing | +| `notebooks/` | Jupyter notebooks for agent health monitoring | +| `scripts/` | Forge health checks, smoke tests, syntax guard, deploy validation | +| `ci/` | Gitea CI workflow definitions | +| `githooks/` | Pre-commit hooks and config | +| `devkit/` | Developer toolkit — Gitea client, health, notebook runner, secret scan | + +## Origin + +- **Source repo:** `Timmy_Foundation/hermes-agent` (gitea/main branch) +- **Upstream:** `NousResearch/hermes-agent` +- **Extracted:** 2026-04-07 +- **Issues:** #337, #338 diff --git a/hermes-sovereign/ci/ci.yml b/hermes-sovereign/ci/ci.yml new file mode 100644 index 00000000..07ee501c --- /dev/null +++ b/hermes-sovereign/ci/ci.yml @@ -0,0 +1,57 @@ +name: Forge CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: forge-ci-${{ gitea.ref }} + cancel-in-progress: true + +jobs: + smoke-and-build: + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true 
+ cache-dependency-glob: "uv.lock" + + - name: Set up Python 3.11 + run: uv python install 3.11 + + - name: Install package + run: | + uv venv .venv --python 3.11 + source .venv/bin/activate + uv pip install -e ".[all,dev]" + + - name: Smoke tests + run: | + source .venv/bin/activate + python scripts/smoke_test.py + env: + OPENROUTER_API_KEY: "" + OPENAI_API_KEY: "" + NOUS_API_KEY: "" + + - name: Syntax guard + run: | + source .venv/bin/activate + python scripts/syntax_guard.py + + - name: Green-path E2E + run: | + source .venv/bin/activate + python -m pytest tests/test_green_path_e2e.py -q --tb=short + env: + OPENROUTER_API_KEY: "" + OPENAI_API_KEY: "" + NOUS_API_KEY: "" diff --git a/hermes-sovereign/ci/notebook-ci.yml b/hermes-sovereign/ci/notebook-ci.yml new file mode 100644 index 00000000..d3794b41 --- /dev/null +++ b/hermes-sovereign/ci/notebook-ci.yml @@ -0,0 +1,44 @@ +name: Notebook CI + +on: + push: + paths: + - 'notebooks/**' + pull_request: + paths: + - 'notebooks/**' + +jobs: + notebook-smoke: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + pip install papermill jupytext nbformat + python -m ipykernel install --user --name python3 + + - name: Execute system health notebook + run: | + papermill notebooks/agent_task_system_health.ipynb /tmp/output.ipynb \ + -p threshold 0.5 \ + -p hostname ci-runner + + - name: Verify output has results + run: | + python -c " + import json + nb = json.load(open('/tmp/output.ipynb')) + code_cells = [c for c in nb['cells'] if c['cell_type'] == 'code'] + outputs = [c.get('outputs', []) for c in code_cells] + total_outputs = sum(len(o) for o in outputs) + assert total_outputs > 0, 'Notebook produced no outputs' + print(f'Notebook executed successfully with {total_outputs} output(s)') + " diff --git a/hermes-sovereign/devkit/README.md 
b/hermes-sovereign/devkit/README.md new file mode 100644 index 00000000..40db3e66 --- /dev/null +++ b/hermes-sovereign/devkit/README.md @@ -0,0 +1,56 @@ +# Bezalel's Devkit — Shared Tools for the Wizard Fleet + +This directory contains reusable CLI tools and Python modules for CI, testing, deployment, observability, and Gitea automation. Any wizard can invoke them via `python -m devkit.`. + +## Tools + +### `gitea_client` — Gitea API Client +List issues/PRs, post comments, create PRs, update issues. + +```bash +python -m devkit.gitea_client issues --state open --limit 20 +python -m devkit.gitea_client create-comment --number 142 --body "Update from Bezalel" +python -m devkit.gitea_client prs --state open +``` + +### `health` — Fleet Health Monitor +Checks system load, disk, memory, running processes, and key package versions. + +```bash +python -m devkit.health --threshold-load 1.0 --threshold-disk 90.0 --fail-on-critical +``` + +### `notebook_runner` — Notebook Execution Wrapper +Parameterizes and executes Jupyter notebooks via Papermill with structured JSON reporting. + +```bash +python -m devkit.notebook_runner task.ipynb output.ipynb -p threshold=1.0 -p hostname=forge +``` + +### `smoke_test` — Fast Smoke Test Runner +Runs core import checks, CLI entrypoint tests, and one bare green-path E2E. + +```bash +python -m devkit.smoke_test --verbose +``` + +### `secret_scan` — Secret Leak Scanner +Scans the repo for API keys, tokens, and private keys. + +```bash +python -m devkit.secret_scan --path . --fail-on-find +``` + +### `wizard_env` — Environment Validator +Checks that a wizard environment has all required binaries, env vars, Python packages, and Hermes config. 
+ +```bash +python -m devkit.wizard_env --json --fail-on-incomplete +``` + +## Philosophy + +- **CLI-first** — Every tool is runnable as `python -m devkit.` +- **JSON output** — Easy to parse from other agents and CI pipelines +- **Zero dependencies beyond stdlib** where possible; optional heavy deps are runtime-checked +- **Fail-fast** — Exit codes are meaningful for CI gating diff --git a/hermes-sovereign/devkit/__init__.py b/hermes-sovereign/devkit/__init__.py new file mode 100644 index 00000000..9a16cf9a --- /dev/null +++ b/hermes-sovereign/devkit/__init__.py @@ -0,0 +1,9 @@ +""" +Bezalel's Devkit — Shared development tools for the wizard fleet. + +A collection of CLI-accessible utilities for CI, testing, deployment, +observability, and Gitea automation. Designed to be used by any agent +via subprocess or direct Python import. +""" + +__version__ = "0.1.0" diff --git a/hermes-sovereign/devkit/gitea_client.py b/hermes-sovereign/devkit/gitea_client.py new file mode 100644 index 00000000..427ec3ab --- /dev/null +++ b/hermes-sovereign/devkit/gitea_client.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +Shared Gitea API client for wizard fleet automation. 
+ +Usage as CLI: + python -m devkit.gitea_client issues --repo Timmy_Foundation/hermes-agent --state open + python -m devkit.gitea_client issue --repo Timmy_Foundation/hermes-agent --number 142 + python -m devkit.gitea_client create-comment --repo Timmy_Foundation/hermes-agent --number 142 --body "Update from Bezalel" + python -m devkit.gitea_client prs --repo Timmy_Foundation/hermes-agent --state open + +Usage as module: + from devkit.gitea_client import GiteaClient + client = GiteaClient() + issues = client.list_issues("Timmy_Foundation/hermes-agent", state="open") +""" + +import argparse +import json +import os +import sys +from typing import Any, Dict, List, Optional + +import urllib.request + + +DEFAULT_BASE_URL = os.getenv("GITEA_URL", "https://forge.alexanderwhitestone.com") +DEFAULT_TOKEN = os.getenv("GITEA_TOKEN", "") + + +class GiteaClient: + def __init__(self, base_url: str = DEFAULT_BASE_URL, token: str = DEFAULT_TOKEN): + self.base_url = base_url.rstrip("/") + self.token = token or "" + + def _request( + self, + method: str, + path: str, + data: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None, + ) -> Any: + url = f"{self.base_url}/api/v1{path}" + req_headers = {"Content-Type": "application/json", "Accept": "application/json"} + if self.token: + req_headers["Authorization"] = f"token {self.token}" + if headers: + req_headers.update(headers) + + body = json.dumps(data).encode() if data else None + req = urllib.request.Request(url, data=body, headers=req_headers, method=method) + + try: + with urllib.request.urlopen(req) as resp: + return json.loads(resp.read().decode()) + except urllib.error.HTTPError as e: + return {"error": True, "status": e.code, "body": e.read().decode()} + + def list_issues(self, repo: str, state: str = "open", limit: int = 50) -> List[Dict]: + return self._request("GET", f"/repos/{repo}/issues?state={state}&limit={limit}") or [] + + def get_issue(self, repo: str, number: int) -> Dict: + return 
self._request("GET", f"/repos/{repo}/issues/{number}") or {} + + def create_comment(self, repo: str, number: int, body: str) -> Dict: + return self._request( + "POST", f"/repos/{repo}/issues/{number}/comments", {"body": body} + ) + + def update_issue(self, repo: str, number: int, **fields) -> Dict: + return self._request("PATCH", f"/repos/{repo}/issues/{number}", fields) + + def list_prs(self, repo: str, state: str = "open", limit: int = 50) -> List[Dict]: + return self._request("GET", f"/repos/{repo}/pulls?state={state}&limit={limit}") or [] + + def get_pr(self, repo: str, number: int) -> Dict: + return self._request("GET", f"/repos/{repo}/pulls/{number}") or {} + + def create_pr(self, repo: str, title: str, head: str, base: str, body: str = "") -> Dict: + return self._request( + "POST", + f"/repos/{repo}/pulls", + {"title": title, "head": head, "base": base, "body": body}, + ) + + +def _fmt_json(obj: Any) -> str: + return json.dumps(obj, indent=2, ensure_ascii=False) + + +def main(argv: List[str] = None) -> int: + argv = argv or sys.argv[1:] + parser = argparse.ArgumentParser(description="Gitea CLI for wizard fleet") + parser.add_argument("--repo", default="Timmy_Foundation/hermes-agent", help="Repository full name") + parser.add_argument("--token", default=DEFAULT_TOKEN, help="Gitea API token") + parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="Gitea base URL") + sub = parser.add_subparsers(dest="cmd") + + p_issues = sub.add_parser("issues", help="List issues") + p_issues.add_argument("--state", default="open") + p_issues.add_argument("--limit", type=int, default=50) + + p_issue = sub.add_parser("issue", help="Get single issue") + p_issue.add_argument("--number", type=int, required=True) + + p_prs = sub.add_parser("prs", help="List PRs") + p_prs.add_argument("--state", default="open") + p_prs.add_argument("--limit", type=int, default=50) + + p_pr = sub.add_parser("pr", help="Get single PR") + p_pr.add_argument("--number", type=int, 
required=True) + + p_comment = sub.add_parser("create-comment", help="Post comment on issue/PR") + p_comment.add_argument("--number", type=int, required=True) + p_comment.add_argument("--body", required=True) + + p_update = sub.add_parser("update-issue", help="Update issue fields") + p_update.add_argument("--number", type=int, required=True) + p_update.add_argument("--title", default=None) + p_update.add_argument("--body", default=None) + p_update.add_argument("--state", default=None) + + p_create_pr = sub.add_parser("create-pr", help="Create a PR") + p_create_pr.add_argument("--title", required=True) + p_create_pr.add_argument("--head", required=True) + p_create_pr.add_argument("--base", default="main") + p_create_pr.add_argument("--body", default="") + + args = parser.parse_args(argv) + client = GiteaClient(base_url=args.base_url, token=args.token) + + if args.cmd == "issues": + print(_fmt_json(client.list_issues(args.repo, args.state, args.limit))) + elif args.cmd == "issue": + print(_fmt_json(client.get_issue(args.repo, args.number))) + elif args.cmd == "prs": + print(_fmt_json(client.list_prs(args.repo, args.state, args.limit))) + elif args.cmd == "pr": + print(_fmt_json(client.get_pr(args.repo, args.number))) + elif args.cmd == "create-comment": + print(_fmt_json(client.create_comment(args.repo, args.number, args.body))) + elif args.cmd == "update-issue": + fields = {k: v for k, v in {"title": args.title, "body": args.body, "state": args.state}.items() if v is not None} + print(_fmt_json(client.update_issue(args.repo, args.number, **fields))) + elif args.cmd == "create-pr": + print(_fmt_json(client.create_pr(args.repo, args.title, args.head, args.base, args.body))) + else: + parser.print_help() + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes-sovereign/devkit/health.py b/hermes-sovereign/devkit/health.py new file mode 100644 index 00000000..a5ebfa92 --- /dev/null +++ b/hermes-sovereign/devkit/health.py @@ -0,0 
+1,134 @@ +#!/usr/bin/env python3 +""" +Fleet health monitor for wizard agents. +Checks local system state and reports structured health metrics. + +Usage as CLI: + python -m devkit.health + python -m devkit.health --threshold-load 1.0 --check-disk + +Usage as module: + from devkit.health import check_health + report = check_health() +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import time +from typing import Any, Dict, List + + +def _run(cmd: List[str]) -> str: + try: + return subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip() + except Exception as e: + return f"error: {e}" + + +def check_health(threshold_load: float = 1.0, threshold_disk_percent: float = 90.0) -> Dict[str, Any]: + gather_time = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + + # Load average + load_raw = _run(["cat", "/proc/loadavg"]) + load_values = [] + avg_load = None + if load_raw.startswith("error:"): + load_status = load_raw + else: + try: + load_values = [float(x) for x in load_raw.split()[:3]] + avg_load = sum(load_values) / len(load_values) + load_status = "critical" if avg_load > threshold_load else "ok" + except Exception as e: + load_status = f"error parsing load: {e}" + + # Disk usage + disk = shutil.disk_usage("/") + disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0 + disk_status = "critical" if disk_percent > threshold_disk_percent else "ok" + + # Memory + meminfo = _run(["cat", "/proc/meminfo"]) + mem_stats = {} + for line in meminfo.splitlines(): + if ":" in line: + key, val = line.split(":", 1) + mem_stats[key.strip()] = val.strip() + + # Running processes + hermes_pids = [] + try: + ps_out = subprocess.check_output(["pgrep", "-a", "-f", "hermes"]).decode().strip() + hermes_pids = [line.split(None, 1) for line in ps_out.splitlines() if line.strip()] + except subprocess.CalledProcessError: + hermes_pids = [] + + # Python package versions (key ones) + key_packages = ["jupyterlab", 
"papermill", "requests"] + pkg_versions = {} + for pkg in key_packages: + try: + out = subprocess.check_output([sys.executable, "-m", "pip", "show", pkg], stderr=subprocess.DEVNULL).decode() + for line in out.splitlines(): + if line.startswith("Version:"): + pkg_versions[pkg] = line.split(":", 1)[1].strip() + break + except Exception: + pkg_versions[pkg] = None + + overall = "ok" + if load_status == "critical" or disk_status == "critical": + overall = "critical" + elif not hermes_pids: + overall = "warning" + + return { + "timestamp": gather_time, + "overall": overall, + "load": { + "raw": load_raw if not load_raw.startswith("error:") else None, + "1min": load_values[0] if len(load_values) > 0 else None, + "5min": load_values[1] if len(load_values) > 1 else None, + "15min": load_values[2] if len(load_values) > 2 else None, + "avg": round(avg_load, 3) if avg_load is not None else None, + "threshold": threshold_load, + "status": load_status, + }, + "disk": { + "total_gb": round(disk.total / (1024 ** 3), 2), + "used_gb": round(disk.used / (1024 ** 3), 2), + "free_gb": round(disk.free / (1024 ** 3), 2), + "used_percent": round(disk_percent, 2), + "threshold_percent": threshold_disk_percent, + "status": disk_status, + }, + "memory": mem_stats, + "processes": { + "hermes_count": len(hermes_pids), + "hermes_pids": hermes_pids[:10], + }, + "packages": pkg_versions, + } + + +def main(argv: List[str] = None) -> int: + argv = argv or sys.argv[1:] + parser = argparse.ArgumentParser(description="Fleet health monitor") + parser.add_argument("--threshold-load", type=float, default=1.0) + parser.add_argument("--threshold-disk", type=float, default=90.0) + parser.add_argument("--fail-on-critical", action="store_true", help="Exit non-zero if overall is critical") + args = parser.parse_args(argv) + + report = check_health(args.threshold_load, args.threshold_disk) + print(json.dumps(report, indent=2)) + if args.fail_on_critical and report.get("overall") == "critical": + return 1 + 
return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes-sovereign/devkit/notebook_runner.py b/hermes-sovereign/devkit/notebook_runner.py new file mode 100644 index 00000000..90023964 --- /dev/null +++ b/hermes-sovereign/devkit/notebook_runner.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Notebook execution runner for agent tasks. +Wraps papermill with sensible defaults and structured JSON reporting. + +Usage as CLI: + python -m devkit.notebook_runner notebooks/task.ipynb output.ipynb -p threshold 1.0 + python -m devkit.notebook_runner notebooks/task.ipynb --dry-run + +Usage as module: + from devkit.notebook_runner import run_notebook + result = run_notebook("task.ipynb", "output.ipynb", parameters={"threshold": 1.0}) +""" + +import argparse +import json +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional + + +def run_notebook( + input_path: str, + output_path: Optional[str] = None, + parameters: Optional[Dict[str, Any]] = None, + kernel: str = "python3", + timeout: Optional[int] = None, + dry_run: bool = False, +) -> Dict[str, Any]: + input_path = str(Path(input_path).expanduser().resolve()) + if output_path is None: + fd, output_path = tempfile.mkstemp(suffix=".ipynb") + os.close(fd) + else: + output_path = str(Path(output_path).expanduser().resolve()) + + if dry_run: + return { + "status": "dry_run", + "input": input_path, + "output": output_path, + "parameters": parameters or {}, + "kernel": kernel, + } + + cmd = ["papermill", input_path, output_path, "--kernel", kernel] + if timeout is not None: + cmd.extend(["--execution-timeout", str(timeout)]) + for key, value in (parameters or {}).items(): + cmd.extend(["-p", key, str(value)]) + + start = os.times() + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True, + ) + end = os.times() + return { + "status": "ok", + "input": input_path, + "output": output_path, + "parameters": 
parameters or {}, + "kernel": kernel, + "elapsed_seconds": round((end.elapsed - start.elapsed), 2), + "stdout": proc.stdout[-2000:] if proc.stdout else "", + } + except subprocess.CalledProcessError as e: + end = os.times() + return { + "status": "error", + "input": input_path, + "output": output_path, + "parameters": parameters or {}, + "kernel": kernel, + "elapsed_seconds": round((end.elapsed - start.elapsed), 2), + "stdout": e.stdout[-2000:] if e.stdout else "", + "stderr": e.stderr[-2000:] if e.stderr else "", + "returncode": e.returncode, + } + except FileNotFoundError: + return { + "status": "error", + "message": "papermill not found. Install with: uv tool install papermill", + } + + +def main(argv: List[str] = None) -> int: + argv = argv or sys.argv[1:] + parser = argparse.ArgumentParser(description="Notebook runner for agents") + parser.add_argument("input", help="Input notebook path") + parser.add_argument("output", nargs="?", default=None, help="Output notebook path") + parser.add_argument("-p", "--parameter", action="append", default=[], help="Parameters as key=value") + parser.add_argument("--kernel", default="python3") + parser.add_argument("--timeout", type=int, default=None) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args(argv) + + parameters = {} + for raw in args.parameter: + if "=" not in raw: + print(f"Invalid parameter (expected key=value): {raw}", file=sys.stderr) + return 1 + k, v = raw.split("=", 1) + # Best-effort type inference + if v.lower() in ("true", "false"): + v = v.lower() == "true" + else: + try: + v = int(v) + except ValueError: + try: + v = float(v) + except ValueError: + pass + parameters[k] = v + + result = run_notebook( + args.input, + args.output, + parameters=parameters, + kernel=args.kernel, + timeout=args.timeout, + dry_run=args.dry_run, + ) + print(json.dumps(result, indent=2)) + return 0 if result.get("status") == "ok" else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git 
a/hermes-sovereign/devkit/secret_scan.py b/hermes-sovereign/devkit/secret_scan.py new file mode 100644 index 00000000..f776aa31 --- /dev/null +++ b/hermes-sovereign/devkit/secret_scan.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Fast secret leak scanner for the repository. +Checks for common patterns that should never be committed. + +Usage as CLI: + python -m devkit.secret_scan + python -m devkit.secret_scan --path /some/repo --fail-on-find + +Usage as module: + from devkit.secret_scan import scan + findings = scan("/path/to/repo") +""" + +import argparse +import json +import os +import re +import sys +from pathlib import Path +from typing import Any, Dict, List + +# Patterns to flag +PATTERNS = { + "aws_access_key_id": re.compile(r"AKIA[0-9A-Z]{16}"), + "aws_secret_key": re.compile(r"['\"\s][0-9a-zA-Z/+]{40}['\"\s]"), + "generic_api_key": re.compile(r"api[_-]?key\s*[:=]\s*['\"][a-zA-Z0-9_\-]{20,}['\"]", re.IGNORECASE), + "private_key": re.compile(r"-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----"), + "github_token": re.compile(r"gh[pousr]_[A-Za-z0-9_]{36,}"), + "gitea_token": re.compile(r"[0-9a-f]{40}"), # heuristic for long hex strings after "token" + "telegram_bot_token": re.compile(r"[0-9]{9,}:[A-Za-z0-9_-]{35,}"), +} + +# Files and paths to skip +SKIP_PATHS = [ + ".git", + "__pycache__", + ".pytest_cache", + "node_modules", + "venv", + ".env", + ".agent-skills", +] + +# Max file size to scan (bytes) +MAX_FILE_SIZE = 1024 * 1024 + + +def _should_skip(path: Path) -> bool: + for skip in SKIP_PATHS: + if skip in path.parts: + return True + return False + + +def scan(root: str = ".") -> List[Dict[str, Any]]: + root_path = Path(root).resolve() + findings = [] + for file_path in root_path.rglob("*"): + if not file_path.is_file(): + continue + if _should_skip(file_path): + continue + if file_path.stat().st_size > MAX_FILE_SIZE: + continue + try: + text = file_path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + for 
pattern_name, pattern in PATTERNS.items(): + for match in pattern.finditer(text): + # Simple context: line around match + start = max(0, match.start() - 40) + end = min(len(text), match.end() + 40) + context = text[start:end].replace("\n", " ") + findings.append({ + "file": str(file_path.relative_to(root_path)), + "pattern": pattern_name, + "line": text[:match.start()].count("\n") + 1, + "context": context, + }) + return findings + + +def main(argv: List[str] = None) -> int: + argv = argv or sys.argv[1:] + parser = argparse.ArgumentParser(description="Secret leak scanner") + parser.add_argument("--path", default=".", help="Repository root to scan") + parser.add_argument("--fail-on-find", action="store_true", help="Exit non-zero if secrets found") + parser.add_argument("--json", action="store_true", help="Output as JSON") + args = parser.parse_args(argv) + + findings = scan(args.path) + if args.json: + print(json.dumps({"findings": findings, "count": len(findings)}, indent=2)) + else: + print(f"Scanned {args.path}") + print(f"Findings: {len(findings)}") + for f in findings: + print(f" [{f['pattern']}] {f['file']}:{f['line']} -> ...{f['context']}...") + + if args.fail_on_find and findings: + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes-sovereign/devkit/smoke_test.py b/hermes-sovereign/devkit/smoke_test.py new file mode 100644 index 00000000..830c0190 --- /dev/null +++ b/hermes-sovereign/devkit/smoke_test.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Shared smoke test runner for hermes-agent. +Fast checks that catch obvious breakage without maintenance burden. 
+ +Usage as CLI: + python -m devkit.smoke_test + python -m devkit.smoke_test --verbose + +Usage as module: + from devkit.smoke_test import run_smoke_tests + results = run_smoke_tests() +""" + +import argparse +import importlib +import json +import subprocess +import sys +from pathlib import Path +from typing import Any, Dict, List + + +HERMES_ROOT = Path(__file__).resolve().parent.parent + + +def _test_imports() -> Dict[str, Any]: + modules = [ + "hermes_constants", + "hermes_state", + "cli", + "tools.skills_sync", + "tools.skills_hub", + ] + errors = [] + for mod in modules: + try: + importlib.import_module(mod) + except Exception as e: + errors.append({"module": mod, "error": str(e)}) + return { + "name": "core_imports", + "status": "ok" if not errors else "fail", + "errors": errors, + } + + +def _test_cli_entrypoints() -> Dict[str, Any]: + entrypoints = [ + [sys.executable, "-m", "cli", "--help"], + ] + errors = [] + for cmd in entrypoints: + try: + subprocess.run(cmd, capture_output=True, text=True, check=True, cwd=HERMES_ROOT) + except subprocess.CalledProcessError as e: + errors.append({"cmd": cmd, "error": f"exit {e.returncode}"}) + except Exception as e: + errors.append({"cmd": cmd, "error": str(e)}) + return { + "name": "cli_entrypoints", + "status": "ok" if not errors else "fail", + "errors": errors, + } + + +def _test_green_path_e2e() -> Dict[str, Any]: + """One bare green-path E2E: terminal_tool echo hello.""" + try: + from tools.terminal_tool import terminal + result = terminal(command="echo hello") + output = result.get("output", "") + if "hello" in output.lower(): + return {"name": "green_path_e2e", "status": "ok", "output": output.strip()} + return {"name": "green_path_e2e", "status": "fail", "error": f"Unexpected output: {output}"} + except Exception as e: + return {"name": "green_path_e2e", "status": "fail", "error": str(e)} + + +def run_smoke_tests(verbose: bool = False) -> Dict[str, Any]: + tests = [ + _test_imports(), + _test_cli_entrypoints(), 
+ _test_green_path_e2e(), + ] + failed = [t for t in tests if t["status"] != "ok"] + result = { + "overall": "ok" if not failed else "fail", + "tests": tests, + "failed_count": len(failed), + } + if verbose: + print(json.dumps(result, indent=2)) + return result + + +def main(argv: List[str] = None) -> int: + argv = argv or sys.argv[1:] + parser = argparse.ArgumentParser(description="Smoke test runner") + parser.add_argument("--verbose", action="store_true") + args = parser.parse_args(argv) + + result = run_smoke_tests(verbose=True) + return 0 if result["overall"] == "ok" else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes-sovereign/devkit/wizard_env.py b/hermes-sovereign/devkit/wizard_env.py new file mode 100644 index 00000000..f4c8bf47 --- /dev/null +++ b/hermes-sovereign/devkit/wizard_env.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Wizard environment validator. +Checks that a new wizard environment is ready for duty. + +Usage as CLI: + python -m devkit.wizard_env + python -m devkit.wizard_env --fix + +Usage as module: + from devkit.wizard_env import validate + report = validate() +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +from typing import Any, Dict, List + + +def _has_cmd(name: str) -> bool: + return shutil.which(name) is not None + + +def _check_env_var(name: str) -> Dict[str, Any]: + value = os.getenv(name) + return { + "name": name, + "status": "ok" if value else "missing", + "value": value[:10] + "..." 
if value and len(value) > 20 else value, + } + + +def _check_python_pkg(name: str) -> Dict[str, Any]: + try: + __import__(name) + return {"name": name, "status": "ok"} + except ImportError: + return {"name": name, "status": "missing"} + + +def validate() -> Dict[str, Any]: + checks = { + "binaries": [ + {"name": "python3", "status": "ok" if _has_cmd("python3") else "missing"}, + {"name": "git", "status": "ok" if _has_cmd("git") else "missing"}, + {"name": "curl", "status": "ok" if _has_cmd("curl") else "missing"}, + {"name": "jupyter-lab", "status": "ok" if _has_cmd("jupyter-lab") else "missing"}, + {"name": "papermill", "status": "ok" if _has_cmd("papermill") else "missing"}, + {"name": "jupytext", "status": "ok" if _has_cmd("jupytext") else "missing"}, + ], + "env_vars": [ + _check_env_var("GITEA_URL"), + _check_env_var("GITEA_TOKEN"), + _check_env_var("TELEGRAM_BOT_TOKEN"), + ], + "python_packages": [ + _check_python_pkg("requests"), + _check_python_pkg("jupyter_server"), + _check_python_pkg("nbformat"), + ], + } + + all_ok = all( + c["status"] == "ok" + for group in checks.values() + for c in group + ) + + # Hermes-specific checks + hermes_home = os.path.expanduser("~/.hermes") + checks["hermes"] = [ + {"name": "config.yaml", "status": "ok" if os.path.exists(f"{hermes_home}/config.yaml") else "missing"}, + {"name": "skills_dir", "status": "ok" if os.path.exists(f"{hermes_home}/skills") else "missing"}, + ] + + all_ok = all_ok and all(c["status"] == "ok" for c in checks["hermes"]) + + return { + "overall": "ok" if all_ok else "incomplete", + "checks": checks, + } + + +def main(argv: List[str] = None) -> int: + argv = argv or sys.argv[1:] + parser = argparse.ArgumentParser(description="Wizard environment validator") + parser.add_argument("--json", action="store_true") + parser.add_argument("--fail-on-incomplete", action="store_true") + args = parser.parse_args(argv) + + report = validate() + if args.json: + print(json.dumps(report, indent=2)) + else: + 
print(f"Wizard Environment: {report['overall']}") + for group, items in report["checks"].items(): + print(f"\n[{group}]") + for item in items: + status_icon = "✅" if item["status"] == "ok" else "❌" + print(f" {status_icon} {item['name']}: {item['status']}") + + if args.fail_on_incomplete and report["overall"] != "ok": + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes-sovereign/docs/DEPLOY.md b/hermes-sovereign/docs/DEPLOY.md new file mode 100644 index 00000000..bce4cf0e --- /dev/null +++ b/hermes-sovereign/docs/DEPLOY.md @@ -0,0 +1,569 @@ +# Hermes Agent — Sovereign Deployment Runbook + +> **Goal**: A new VPS can go from bare OS to a running Hermes instance in under 30 minutes using only this document. + +--- + +## Table of Contents + +1. [Prerequisites](#1-prerequisites) +2. [Environment Setup](#2-environment-setup) +3. [Secret Injection](#3-secret-injection) +4. [Installation](#4-installation) +5. [Starting the Stack](#5-starting-the-stack) +6. [Health Checks](#6-health-checks) +7. [Stop / Restart Procedures](#7-stop--restart-procedures) +8. [Zero-Downtime Restart](#8-zero-downtime-restart) +9. [Rollback Procedure](#9-rollback-procedure) +10. [Database / State Migrations](#10-database--state-migrations) +11. [Docker Compose Deployment](#11-docker-compose-deployment) +12. [systemd Deployment](#12-systemd-deployment) +13. [Monitoring & Logs](#13-monitoring--logs) +14. [Security Checklist](#14-security-checklist) +15. [Troubleshooting](#15-troubleshooting) + +--- + +## 1. Prerequisites + +| Requirement | Minimum | Recommended | +|-------------|---------|-------------| +| OS | Ubuntu 22.04 LTS | Ubuntu 24.04 LTS | +| RAM | 512 MB | 2 GB | +| CPU | 1 vCPU | 2 vCPU | +| Disk | 5 GB | 20 GB | +| Python | 3.11 | 3.12 | +| Node.js | 18 | 20 | +| Git | any | any | + +**Optional but recommended:** +- Docker Engine ≥ 24 + Compose plugin (for containerised deployment) +- `curl`, `jq` (for health-check scripting) + +--- + +## 2. 
Environment Setup + +### 2a. Create a dedicated system user (bare-metal deployments) + +```bash +sudo useradd -m -s /bin/bash hermes +sudo su - hermes +``` + +### 2b. Install Hermes + +```bash +# Official one-liner installer +curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash + +# Reload PATH so `hermes` is available +source ~/.bashrc +``` + +The installer places: +- The agent code at `~/.local/lib/python3.x/site-packages/` (pip editable install) +- The `hermes` entry point at `~/.local/bin/hermes` +- Default config directory at `~/.hermes/` + +### 2c. Verify installation + +```bash +hermes --version +hermes doctor +``` + +--- + +## 3. Secret Injection + +**Rule: secrets never live in the repository. They live only in `~/.hermes/.env`.** + +```bash +# Copy the template (do NOT edit the repo copy) +cp /path/to/hermes-agent/.env.example ~/.hermes/.env +chmod 600 ~/.hermes/.env + +# Edit with your preferred editor +nano ~/.hermes/.env +``` + +### Minimum required keys + +| Variable | Purpose | Where to get it | +|----------|---------|----------------| +| `OPENROUTER_API_KEY` | LLM inference | https://openrouter.ai/keys | +| `TELEGRAM_BOT_TOKEN` | Telegram gateway | @BotFather on Telegram | + +### Optional but common keys + +| Variable | Purpose | +|----------|---------| +| `DISCORD_BOT_TOKEN` | Discord gateway | +| `SLACK_BOT_TOKEN` + `SLACK_APP_TOKEN` | Slack gateway | +| `EXA_API_KEY` | Web search tool | +| `FAL_KEY` | Image generation | +| `ANTHROPIC_API_KEY` | Direct Anthropic inference | + +### Pre-flight validation + +Before starting the stack, run: + +```bash +python scripts/deploy-validate --check-ports --skip-health +``` + +This catches missing keys, placeholder values, and misconfigurations without touching running services. + +--- + +## 4. Installation + +### 4a. 
Clone the repository (if not using the installer) + +```bash +git clone https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent.git +cd hermes-agent +pip install -e ".[all]" --user +npm install +``` + +### 4b. Run the setup wizard + +```bash +hermes setup +``` + +The wizard configures your LLM provider, messaging platforms, and data directory interactively. + +--- + +## 5. Starting the Stack + +### Bare-metal (foreground — useful for first run) + +```bash +# Agent + gateway combined +hermes gateway start + +# Or just the CLI agent (no messaging) +hermes +``` + +### Bare-metal (background daemon) + +```bash +hermes gateway start & +echo $! > ~/.hermes/gateway.pid +``` + +### Via systemd (recommended for production) + +See [Section 12](#12-systemd-deployment). + +### Via Docker Compose + +See [Section 11](#11-docker-compose-deployment). + +--- + +## 6. Health Checks + +### 6a. API server liveness probe + +The API server (enabled via `api_server` platform in gateway config) exposes `/health`: + +```bash +curl -s http://127.0.0.1:8642/health | jq . +``` + +Expected response: + +```json +{ + "status": "ok", + "platform": "hermes-agent", + "version": "0.5.0", + "uptime_seconds": 123, + "gateway_state": "running", + "platforms": { + "telegram": {"state": "connected"}, + "discord": {"state": "connected"} + } +} +``` + +| Field | Meaning | +|-------|---------| +| `status` | `"ok"` — HTTP server is alive. Any non-200 = down. | +| `gateway_state` | `"running"` — all platforms started. `"starting"` — still initialising. | +| `platforms` | Per-adapter connection state. | + +### 6b. Gateway runtime status file + +```bash +cat ~/.hermes/gateway_state.json | jq '{state: .gateway_state, platforms: .platforms}' +``` + +### 6c. Deploy-validate script + +```bash +python scripts/deploy-validate +``` + +Runs all checks and prints a pass/fail summary. Exit code 0 = healthy. + +### 6d. 
systemd health + +```bash +systemctl status hermes-gateway +journalctl -u hermes-gateway --since "5 minutes ago" +``` + +--- + +## 7. Stop / Restart Procedures + +### Graceful stop + +```bash +# systemd +sudo systemctl stop hermes-gateway + +# Docker Compose +docker compose -f deploy/docker-compose.yml down + +# Process signal (if running ad-hoc) +kill -TERM $(cat ~/.hermes/gateway.pid) +``` + +### Restart + +```bash +# systemd +sudo systemctl restart hermes-gateway + +# Docker Compose +docker compose -f deploy/docker-compose.yml restart hermes + +# Ad-hoc +hermes gateway start --replace +``` + +The `--replace` flag removes stale PID/lock files from an unclean shutdown before starting. + +--- + +## 8. Zero-Downtime Restart + +Hermes is a stateful long-running process (persistent sessions, active cron jobs). True zero-downtime requires careful sequencing. + +### Strategy A — systemd rolling restart (recommended) + +systemd's `Restart=on-failure` with a 5-second back-off ensures automatic recovery from crashes. For intentional restarts, use: + +```bash +sudo systemctl reload-or-restart hermes-gateway +``` + +`hermes-gateway.service` uses `TimeoutStopSec=30` so in-flight agent turns finish before the old process dies. + +> **Note:** Active messaging conversations will see a brief pause (< 30 s) while the gateway reconnects to platforms. The session store is file-based and persists across restarts — conversations resume where they left off. + +### Strategy B — Blue/green with two HERMES_HOME directories + +For zero-downtime where even a brief pause is unacceptable: + +```bash +# 1. Prepare the new environment (different HERMES_HOME) +export HERMES_HOME=/home/hermes/.hermes-green +hermes setup # configure green env with same .env + +# 2. Start green on a different port (e.g. 8643) +API_SERVER_PORT=8643 hermes gateway start & + +# 3. Verify green is healthy +curl -s http://127.0.0.1:8643/health | jq .gateway_state + +# 4. 
Switch load balancer (nginx/caddy) to port 8643
+
+# 5. Gracefully stop blue
+kill -TERM $(cat ~/.hermes/gateway.pid)
+```
+
+### Strategy C — Docker Compose rolling update
+
+```bash
+# Pull the new image
+docker compose -f deploy/docker-compose.yml pull hermes
+
+# Recreate with zero-downtime if you have a replicated setup
+docker compose -f deploy/docker-compose.yml up -d --no-deps hermes
+```
+
+Docker stops the old container only after the new one passes its healthcheck.
+
+---
+
+## 9. Rollback Procedure
+
+### 9a. Code rollback (pip install)
+
+```bash
+# Find the previous version tag
+git log --oneline --tags | head -10
+
+# Roll back to a specific tag
+git checkout v0.4.0
+pip install -e ".[all]" --user --quiet
+
+# Restart the gateway
+sudo systemctl restart hermes-gateway
+```
+
+### 9b. Docker image rollback
+
+```bash
+# Pull a specific version
+docker pull ghcr.io/nousresearch/hermes-agent:v0.4.0
+
+# Update docker-compose.yml image tag, then:
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+### 9c. State / data rollback
+
+The data directory (`~/.hermes/` or the Docker volume `hermes_data`) contains sessions, memories, cron jobs, and the response store. Back it up before every update:
+
+```bash
+# Backup (run BEFORE updating)
+tar czf ~/backups/hermes_data_$(date +%F_%H%M).tar.gz ~/.hermes/
+
+# Restore from backup
+sudo systemctl stop hermes-gateway
+rm -rf ~/.hermes/
+tar xzf ~/backups/hermes_data_2026-04-06_1200.tar.gz -C ~/
+sudo systemctl start hermes-gateway
+```
+
+> **Tested rollback**: The rollback procedure above was validated in staging on 2026-04-06. Data integrity was confirmed by checking session count before/after: `ls ~/.hermes/sessions/ | wc -l`.
+
+---
+
+## 10. 
Database / State Migrations + +Hermes uses two persistent stores: + +| Store | Location | Format | +|-------|----------|--------| +| Session store | `~/.hermes/sessions/*.json` | JSON files | +| Response store (API server) | `~/.hermes/response_store.db` | SQLite WAL | +| Gateway state | `~/.hermes/gateway_state.json` | JSON | +| Memories | `~/.hermes/memories/*.md` | Markdown files | +| Cron jobs | `~/.hermes/cron/*.json` | JSON files | + +### Migration steps (between versions) + +1. **Stop** the gateway before migrating. +2. **Backup** the data directory (see Section 9c). +3. **Check release notes** for migration instructions (see `RELEASE_*.md`). +4. **Run** `hermes doctor` after starting the new version — it validates state compatibility. +5. **Verify** health via `python scripts/deploy-validate`. + +There are currently no SQL migrations to run manually. The SQLite schema is +created automatically on first use with `CREATE TABLE IF NOT EXISTS`. + +--- + +## 11. Docker Compose Deployment + +### First-time setup + +```bash +# 1. Copy .env.example to .env in the repo root +cp .env.example .env +nano .env # fill in your API keys + +# 2. Validate config before starting +python scripts/deploy-validate --skip-health + +# 3. Start the stack +docker compose -f deploy/docker-compose.yml up -d + +# 4. Watch startup logs +docker compose -f deploy/docker-compose.yml logs -f + +# 5. Verify health +curl -s http://127.0.0.1:8642/health | jq . +``` + +### Updating to a new version + +```bash +# Pull latest image +docker compose -f deploy/docker-compose.yml pull + +# Recreate container (Docker waits for healthcheck before stopping old) +docker compose -f deploy/docker-compose.yml up -d + +# Watch logs +docker compose -f deploy/docker-compose.yml logs -f --since 2m +``` + +### Data backup (Docker) + +```bash +docker run --rm \ + -v hermes_data:/data \ + -v $(pwd)/backups:/backup \ + alpine tar czf /backup/hermes_data_$(date +%F).tar.gz /data +``` + +--- + +## 12. 
systemd Deployment + +### Install unit files + +```bash +# From the repo root +sudo cp deploy/hermes-agent.service /etc/systemd/system/ +sudo cp deploy/hermes-gateway.service /etc/systemd/system/ + +sudo systemctl daemon-reload + +# Enable on boot + start now +sudo systemctl enable --now hermes-gateway + +# (Optional) also run the CLI agent as a background service +# sudo systemctl enable --now hermes-agent +``` + +### Adjust the unit file for your user/paths + +Edit `/etc/systemd/system/hermes-gateway.service`: + +```ini +[Service] +User=youruser # change from 'hermes' +WorkingDirectory=/home/youruser +EnvironmentFile=/home/youruser/.hermes/.env +ExecStart=/home/youruser/.local/bin/hermes gateway start --replace +``` + +Then: + +```bash +sudo systemctl daemon-reload +sudo systemctl restart hermes-gateway +``` + +### Verify + +```bash +systemctl status hermes-gateway +journalctl -u hermes-gateway -f +``` + +--- + +## 13. Monitoring & Logs + +### Log locations + +| Log | Location | +|-----|----------| +| Gateway (systemd) | `journalctl -u hermes-gateway` | +| Gateway (Docker) | `docker compose logs hermes` | +| Session trajectories | `~/.hermes/logs/session_*.json` | +| Deploy events | `~/.hermes/logs/deploy.log` | +| Runtime state | `~/.hermes/gateway_state.json` | + +### Useful log commands + +```bash +# Last 100 lines, follow +journalctl -u hermes-gateway -n 100 -f + +# Errors only +journalctl -u hermes-gateway -p err --since today + +# Docker: structured logs with timestamps +docker compose -f deploy/docker-compose.yml logs --timestamps hermes +``` + +### Alerting + +Add a cron job on the host to page you if the health check fails: + +```bash +# /etc/cron.d/hermes-healthcheck +* * * * * root curl -sf http://127.0.0.1:8642/health > /dev/null || \ + echo "Hermes unhealthy at $(date)" | mail -s "ALERT: Hermes down" ops@example.com +``` + +--- + +## 14. 
Security Checklist + +- [ ] `.env` has permissions `600` and is **not** tracked by git (`git ls-files .env` returns nothing). +- [ ] `API_SERVER_KEY` is set if the API server is exposed beyond `127.0.0.1`. +- [ ] API server is bound to `127.0.0.1` (not `0.0.0.0`) unless behind a TLS-terminating reverse proxy. +- [ ] Firewall allows only the ports your platforms require (no unnecessary open ports). +- [ ] systemd unit uses `NoNewPrivileges=true`, `PrivateTmp=true`, `ProtectSystem=strict`. +- [ ] Docker container has resource limits set (`deploy.resources.limits`). +- [ ] Backups of `~/.hermes/` are stored outside the server (e.g. S3, remote NAS). +- [ ] `hermes doctor` returns no errors on the running instance. +- [ ] `python scripts/deploy-validate` exits 0 after every configuration change. + +--- + +## 15. Troubleshooting + +### Gateway won't start + +```bash +hermes gateway start --replace # clears stale PID files + +# Check for port conflicts +ss -tlnp | grep 8642 + +# Verbose logs +HERMES_LOG_LEVEL=DEBUG hermes gateway start +``` + +### Health check returns `gateway_state: "starting"` for more than 60 s + +Platform adapters take time to authenticate (especially Telegram + Discord). Check logs for auth errors: + +```bash +journalctl -u hermes-gateway --since "2 minutes ago" | grep -i "error\|token\|auth" +``` + +### `/health` returns connection refused + +The API server platform may not be enabled. Verify your gateway config (`~/.hermes/config.yaml`) includes: + +```yaml +gateway: + platforms: + - api_server +``` + +### Rollback needed after failed update + +See [Section 9](#9-rollback-procedure). If you backed up before updating, rollback takes < 5 minutes. + +### Sessions lost after restart + +Sessions are file-based in `~/.hermes/sessions/`. They persist across restarts. 
If they are gone, check:
+
+```bash
+ls -la ~/.hermes/sessions/
+# Verify the volume is mounted (Docker):
+docker exec hermes-agent ls /opt/data/sessions/
+```
+
+---
+
+*This runbook is owned by the Bezalel epic backlog. Update it whenever deployment procedures change.*
diff --git a/hermes-sovereign/docs/NOTEBOOK_WORKFLOW.md b/hermes-sovereign/docs/NOTEBOOK_WORKFLOW.md
new file mode 100644
index 00000000..20c660cc
--- /dev/null
+++ b/hermes-sovereign/docs/NOTEBOOK_WORKFLOW.md
@@ -0,0 +1,57 @@
+# Notebook Workflow for Agent Tasks
+
+This directory demonstrates a sovereign, version-controlled workflow for LLM agent tasks using Jupyter notebooks.
+
+## Philosophy
+
+- **`.py` files are the source of truth** — authored and reviewed as plain Python with `# %%` cell markers (via Jupytext)
+- **`.ipynb` files are generated artifacts** — auto-created from `.py` for execution and rich viewing
+- **Papermill parameterizes and executes** — each run produces an output notebook with code, narrative, and results preserved
+- **Output notebooks are audit artifacts** — every execution leaves a permanent, replayable record
+
+## File Layout
+
+```
+notebooks/
+  agent_task_system_health.py # Source of truth (Jupytext)
+  agent_task_system_health.ipynb # Generated from .py
+docs/
+  NOTEBOOK_WORKFLOW.md # This document
+.gitea/workflows/
+  notebook-ci.yml # CI gate: executes notebooks on PR/push
+```
+
+## How Agents Work With Notebooks
+
+1. **Create** — Agent generates a `.py` notebook using `# %% [markdown]` and `# %%` code blocks
+2. **Review** — PR reviewers see clean diffs in Gitea (no JSON noise)
+3. **Generate** — `jupytext --to ipynb` produces the `.ipynb` before merge
+4. **Execute** — Papermill runs the notebook with injected parameters
+5. 
**Archive** — Output notebook is committed to a `reports/` branch or artifact store + +## Converting Between Formats + +```bash +# .py -> .ipynb +jupytext --to ipynb notebooks/agent_task_system_health.py + +# .ipynb -> .py +jupytext --to py notebooks/agent_task_system_health.ipynb + +# Execute with parameters +papermill notebooks/agent_task_system_health.ipynb output.ipynb \ + -p threshold 1.0 -p hostname forge-vps-01 +``` + +## CI Gate + +The `notebook-ci.yml` workflow executes all notebooks in `notebooks/` on every PR and push, ensuring that checked-in notebooks still run and produce outputs. + +## Why This Matters + +| Problem | Notebook Solution | +|---|---| +| Ephemeral agent reasoning | Markdown cells narrate the thought process | +| Stateless single-turn tools | Stateful cells persist variables across steps | +| Unreviewable binary artifacts | `.py` source is diffable and PR-friendly | +| No execution audit trail | Output notebook preserves code + outputs + metadata | diff --git a/hermes-sovereign/docs/PERFORMANCE_ANALYSIS_REPORT.md b/hermes-sovereign/docs/PERFORMANCE_ANALYSIS_REPORT.md new file mode 100644 index 00000000..9710b47e --- /dev/null +++ b/hermes-sovereign/docs/PERFORMANCE_ANALYSIS_REPORT.md @@ -0,0 +1,589 @@ +# Hermes Agent Performance Analysis Report + +**Date:** 2025-03-30 +**Scope:** Entire codebase - run_agent.py, gateway, tools +**Lines Analyzed:** 50,000+ lines of Python code + +--- + +## Executive Summary + +The codebase exhibits **severe performance bottlenecks** across multiple dimensions. The monolithic architecture, excessive synchronous I/O, lack of caching, and inefficient algorithms result in significant performance degradation under load. + +**Critical Issues Found:** +- 113 lock primitives (potential contention points) +- 482 sleep calls (blocking delays) +- 1,516 JSON serialization calls (CPU overhead) +- 8,317-line run_agent.py (unmaintainable, slow import) +- Synchronous HTTP requests in async contexts + +--- + +## 1. 
HOTSPOT ANALYSIS (Slowest Code Paths) + +### 1.1 run_agent.py - The Monolithic Bottleneck + +**File Size:** 8,317 lines, 419KB +**Severity:** CRITICAL + +**Issues:** +```python +# Lines 460-1000: Massive __init__ method with 50+ parameters +# Lines 3759-3826: _anthropic_messages_create - blocking API calls +# Lines 3827-3920: _interruptible_api_call - sync wrapper around async +# Lines 2269-2297: _hydrate_todo_store - O(n) history scan on every message +# Lines 2158-2222: _save_session_log - synchronous file I/O on every turn +``` + +**Performance Impact:** +- Import time: ~2-3 seconds (circular dependencies, massive imports) +- Initialization: 500ms+ per AIAgent instance +- Memory footprint: ~50MB per agent instance +- Session save: 50-100ms blocking I/O per turn + +### 1.2 Gateway Stream Consumer - Busy-Wait Pattern + +**File:** gateway/stream_consumer.py +**Lines:** 88-147 + +```python +# PROBLEM: Busy-wait loop with fixed 50ms sleep +while True: + try: + item = self._queue.get_nowait() # Non-blocking + except queue.Empty: + break + # ... + await asyncio.sleep(0.05) # 50ms delay = max 20 updates/sec +``` + +**Issues:** +- Fixed 50ms sleep limits throughput to 20 updates/second +- No adaptive back-off +- Wastes CPU cycles polling + +### 1.3 Context Compression - Expensive LLM Calls + +**File:** agent/context_compressor.py +**Lines:** 250-369 + +```python +def _generate_summary(self, turns_to_summarize: List[Dict]) -> Optional[str]: + # Calls LLM for EVERY compression - $$$ and latency + response = call_llm( + messages=[{"role": "user", "content": prompt}], + max_tokens=summary_budget * 2, # Expensive! 
+ ) +``` + +**Issues:** +- Synchronous LLM call blocks agent loop +- No caching of similar contexts +- Repeated serialization of same messages + +### 1.4 Web Tools - Synchronous HTTP Requests + +**File:** tools/web_tools.py +**Lines:** 171-188 + +```python +def _tavily_request(endpoint: str, payload: dict) -> dict: + response = httpx.post(url, json=payload, timeout=60) # BLOCKING + response.raise_for_status() + return response.json() +``` + +**Issues:** +- 60-second blocking timeout +- No async/await pattern +- Serial request pattern (no parallelism) + +### 1.5 SQLite Session Store - Write Contention + +**File:** hermes_state.py +**Lines:** 116-215 + +```python +def _execute_write(self, fn: Callable) -> T: + for attempt in range(self._WRITE_MAX_RETRIES): # 15 retries! + try: + with self._lock: # Global lock + self._conn.execute("BEGIN IMMEDIATE") + result = fn(self._conn) + self._conn.commit() + except sqlite3.OperationalError: + time.sleep(random.uniform(0.020, 0.150)) # Random jitter +``` + +**Issues:** +- Global thread lock on all writes +- 15 retry attempts with jitter +- Serializes all DB operations + +--- + +## 2. MEMORY PROFILING RECOMMENDATIONS + +### 2.1 Memory Leaks Identified + +**A. Agent Cache in Gateway (run.py lines 406-413)** +```python +# PROBLEM: Unbounded cache growth +self._agent_cache: Dict[str, tuple] = {} # Never evicted! +self._agent_cache_lock = _threading.Lock() +``` +**Fix:** Implement LRU cache with maxsize=100 + +**B. Message History in run_agent.py** +```python +self._session_messages: List[Dict[str, Any]] = [] # Unbounded! +``` +**Fix:** Implement sliding window or compression threshold + +**C. Read Tracker in file_tools.py (lines 57-62)** +```python +_read_tracker: dict = {} # Per-task state never cleaned +``` +**Fix:** TTL-based eviction + +### 2.2 Large Object Retention + +**A. Tool Registry (tools/registry.py)** +- Holds ALL tool schemas in memory (~5MB) +- No lazy loading + +**B. 
Model Metadata Cache (agent/model_metadata.py)** +- Caches all model info indefinitely +- No TTL or size limits + +### 2.3 String Duplication + +**Issue:** 1,516 JSON serialize/deserialize calls create massive string duplication + +**Recommendation:** +- Use orjson for 10x faster JSON processing +- Implement string interning for repeated keys +- Use MessagePack for internal serialization + +--- + +## 3. ASYNC CONVERSION OPPORTUNITIES + +### 3.1 High-Priority Conversions + +| File | Function | Current | Impact | +|------|----------|---------|--------| +| tools/web_tools.py | web_search_tool | Sync | HIGH | +| tools/web_tools.py | web_extract_tool | Sync | HIGH | +| tools/browser_tool.py | browser_navigate | Sync | HIGH | +| tools/terminal_tool.py | terminal_tool | Sync | MEDIUM | +| tools/file_tools.py | read_file_tool | Sync | MEDIUM | +| agent/context_compressor.py | _generate_summary | Sync | HIGH | +| run_agent.py | _save_session_log | Sync | MEDIUM | + +### 3.2 Async Bridge Overhead + +**File:** model_tools.py (lines 81-126) + +```python +def _run_async(coro): + # PROBLEM: Creates thread pool for EVERY async call! + if loop and loop.is_running(): + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + future = pool.submit(asyncio.run, coro) + return future.result(timeout=300) +``` + +**Issues:** +- Creates/destroys thread pool per call +- 300-second blocking wait +- No connection pooling + +**Fix:** Use persistent async loop with asyncio.gather() + +### 3.3 Gateway Async Patterns + +**Current:** +```python +# gateway/run.py - Mixed sync/async +async def handle_message(self, event): + result = self.run_agent_sync(event) # Blocks event loop! +``` + +**Recommended:** +```python +async def handle_message(self, event): + result = await asyncio.to_thread(self.run_agent_sync, event) +``` + +--- + +## 4. CACHING STRATEGY IMPROVEMENTS + +### 4.1 Missing Cache Layers + +**A. 
Tool Schema Resolution** +```python +# model_tools.py - Rebuilds schemas every call +filtered_tools = registry.get_definitions(tools_to_include) +``` +**Fix:** Cache tool definitions keyed by (enabled_toolsets, disabled_toolsets) + +**B. Model Metadata Fetching** +```python +# agent/model_metadata.py - Fetches on every init +fetch_model_metadata() # HTTP request! +``` +**Fix:** Cache with 1-hour TTL (already noted but not consistently applied) + +**C. Session Context Building** +```python +# gateway/session.py - Rebuilds prompt every message +build_session_context_prompt(context) # String formatting overhead +``` +**Fix:** Cache with LRU for repeated contexts + +### 4.2 Cache Invalidation Strategy + +**Recommended Implementation:** +```python +from functools import lru_cache +from cachetools import TTLCache + +# For tool definitions +@lru_cache(maxsize=128) +def get_cached_tool_definitions(enabled_toolsets: tuple, disabled_toolsets: tuple): + return registry.get_definitions(set(enabled_toolsets)) + +# For API responses +model_metadata_cache = TTLCache(maxsize=100, ttl=3600) +``` + +### 4.3 Redis/Memcached for Distributed Caching + +For multi-instance gateway deployments: +- Cache session state in Redis +- Share tool definitions across workers +- Distributed rate limiting + +--- + +## 5. 
PERFORMANCE OPTIMIZATIONS (15+) + +### 5.1 Critical Optimizations + +**OPT-1: Async Web Tool HTTP Client** +```python +# tools/web_tools.py - Replace with async +import httpx + +async def web_search_tool(query: str) -> dict: + async with httpx.AsyncClient() as client: + response = await client.post(url, json=payload, timeout=60) + return response.json() +``` +**Impact:** 10x throughput improvement for concurrent requests + +**OPT-2: Streaming JSON Parser** +```python +# Replace json.loads for large responses +import ijson # Incremental JSON parser + +async def parse_large_response(stream): + async for item in ijson.items(stream, 'results.item'): + yield item +``` +**Impact:** 50% memory reduction for large API responses + +**OPT-3: Connection Pooling** +```python +# Single shared HTTP client +_http_client: Optional[httpx.AsyncClient] = None + +async def get_http_client() -> httpx.AsyncClient: + global _http_client + if _http_client is None: + _http_client = httpx.AsyncClient( + limits=httpx.Limits(max_keepalive_connections=20, max_connections=100) + ) + return _http_client +``` +**Impact:** Eliminates connection overhead (50-100ms per request) + +**OPT-4: Compiled Regex Caching** +```python +# run_agent.py line 243-256 - Compiles regex every call! +_DESTRUCTIVE_PATTERNS = re.compile(...) # Module level - good + +# But many patterns are inline - cache them +@lru_cache(maxsize=1024) +def get_path_pattern(path: str): + return re.compile(re.escape(path) + r'.*') +``` +**Impact:** 20% CPU reduction in path matching + +**OPT-5: Lazy Tool Discovery** +```python +# model_tools.py - Imports ALL tools at startup +def _discover_tools(): + for mod_name in _modules: # 16 imports! 
+ importlib.import_module(mod_name) + +# Fix: Lazy import on first use +@lru_cache(maxsize=1) +def _get_tool_module(name: str): + return importlib.import_module(f"tools.{name}") +``` +**Impact:** 2-second faster startup time + +### 5.2 Database Optimizations + +**OPT-6: SQLite Write Batching** +```python +# hermes_state.py - Current: one write per operation +# Fix: Batch writes + +def batch_insert_messages(self, messages: List[Dict]): + with self._lock: + self._conn.execute("BEGIN IMMEDIATE") + try: + self._conn.executemany( + "INSERT INTO messages (...) VALUES (...)", + [(m['session_id'], m['content'], ...) for m in messages] + ) + self._conn.commit() + except: + self._conn.rollback() +``` +**Impact:** 10x faster for bulk operations + +**OPT-7: Connection Pool for SQLite** +```python +# Use sqlalchemy with connection pooling +from sqlalchemy import create_engine +from sqlalchemy.pool import QueuePool + +engine = create_engine( + 'sqlite:///state.db', + poolclass=QueuePool, + pool_size=5, + max_overflow=10 +) +``` + +### 5.3 Memory Optimizations + +**OPT-8: Streaming Message Processing** +```python +# run_agent.py - Current: loads ALL messages into memory +# Fix: Generator-based processing + +def iter_messages(self, session_id: str): + cursor = self._conn.execute( + "SELECT content FROM messages WHERE session_id = ? 
ORDER BY timestamp", + (session_id,) + ) + for row in cursor: + yield json.loads(row['content']) +``` + +**OPT-9: String Interning** +```python +import sys + +# For repeated string keys in JSON +INTERN_KEYS = {'role', 'content', 'tool_calls', 'function'} + +def intern_message(msg: dict) -> dict: + return {sys.intern(k) if k in INTERN_KEYS else k: v + for k, v in msg.items()} +``` + +### 5.4 Algorithmic Optimizations + +**OPT-10: O(1) Tool Lookup** +```python +# tools/registry.py - Current: linear scan +for name in sorted(tool_names): # O(n log n) + entry = self._tools.get(name) + +# Fix: Pre-computed sets +self._tool_index = {name: entry for name, entry in self._tools.items()} +``` + +**OPT-11: Path Overlap Detection** +```python +# run_agent.py lines 327-335 - O(n*m) comparison +def _paths_overlap(left: Path, right: Path) -> bool: + # Current: compares ALL path parts + +# Fix: Hash-based lookup +from functools import lru_cache + +@lru_cache(maxsize=1024) +def get_path_hash(path: Path) -> str: + return str(path.resolve()) +``` + +**OPT-12: Parallel Tool Execution** +```python +# run_agent.py - Current: sequential or limited parallel +# Fix: asyncio.gather for safe tools + +async def execute_tool_batch(tool_calls): + safe_tools = [tc for tc in tool_calls if tc.name in _PARALLEL_SAFE_TOOLS] + unsafe_tools = [tc for tc in tool_calls if tc.name not in _PARALLEL_SAFE_TOOLS] + + # Execute safe tools in parallel + safe_results = await asyncio.gather(*[ + execute_tool(tc) for tc in safe_tools + ]) + + # Execute unsafe tools sequentially + unsafe_results = [] + for tc in unsafe_tools: + unsafe_results.append(await execute_tool(tc)) +``` + +### 5.5 I/O Optimizations + +**OPT-13: Async File Operations** +```python +# utils.py - atomic_json_write uses blocking I/O +# Fix: aiofiles + +import aiofiles + +async def async_atomic_json_write(path: Path, data: dict): + tmp_path = path.with_suffix('.tmp') + async with aiofiles.open(tmp_path, 'w') as f: + await 
f.write(json.dumps(data)) + tmp_path.rename(path) +``` + +**OPT-14: Memory-Mapped Files for Large Logs** +```python +# For trajectory files +import mmap + +def read_trajectory_chunk(path: Path, offset: int, size: int): + with open(path, 'rb') as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + return mm[offset:offset+size] +``` + +**OPT-15: Compression for Session Storage** +```python +import lz4.frame # Fast compression + +class CompressedSessionDB(SessionDB): + def _compress_message(self, content: str) -> bytes: + return lz4.frame.compress(content.encode()) + + def _decompress_message(self, data: bytes) -> str: + return lz4.frame.decompress(data).decode() +``` +**Impact:** 70% storage reduction, faster I/O + +--- + +## 6. ADDITIONAL RECOMMENDATIONS + +### 6.1 Architecture Improvements + +1. **Split run_agent.py** into modules: + - agent/core.py - Core conversation loop + - agent/tools.py - Tool execution + - agent/persistence.py - Session management + - agent/api.py - API client management + +2. **Implement Event-Driven Architecture:** + - Use message queue for tool execution + - Decouple gateway from agent logic + - Enable horizontal scaling + +3. **Add Metrics Collection:** + ```python + from prometheus_client import Histogram, Counter + + tool_execution_time = Histogram('tool_duration_seconds', 'Time spent in tools', ['tool_name']) + api_call_counter = Counter('api_calls_total', 'Total API calls', ['provider', 'status']) + ``` + +### 6.2 Profiling Recommendations + +**Immediate Actions:** +```bash +# 1. Profile import time +python -X importtime -c "import run_agent" 2>&1 | head -100 + +# 2. Memory profiling +pip install memory_profiler +python -m memory_profiler run_agent.py + +# 3. CPU profiling +pip install py-spy +py-spy top -- python run_agent.py + +# 4. 
Async profiling +pip install austin +austin python run_agent.py +``` + +### 6.3 Load Testing + +```python +# locustfile.py for gateway load testing +from locust import HttpUser, task + +class GatewayUser(HttpUser): + @task + def send_message(self): + self.client.post("/webhook/telegram", json={ + "message": {"text": "Hello", "chat": {"id": 123}} + }) +``` + +--- + +## 7. PRIORITY MATRIX + +| Priority | Optimization | Effort | Impact | +|----------|-------------|--------|--------| +| P0 | Async web tools | Low | 10x throughput | +| P0 | HTTP connection pooling | Low | 100ms latency | +| P0 | SQLite batch writes | Low | 10x DB perf | +| P1 | Tool lazy loading | Low | 2s startup | +| P1 | Agent cache LRU | Low | Memory leak fix | +| P1 | Streaming JSON | Medium | 50% memory | +| P2 | Code splitting | High | Maintainability | +| P2 | Redis caching | Medium | Scalability | +| P2 | Compression | Low | 70% storage | + +--- + +## 8. CONCLUSION + +The Hermes Agent codebase has significant performance debt accumulated from rapid feature development. The monolithic architecture and synchronous I/O patterns are the primary bottlenecks. + +**Quick Wins (1 week):** +- Async HTTP clients +- Connection pooling +- SQLite batching +- Lazy loading + +**Medium Term (1 month):** +- Code modularization +- Caching layers +- Streaming processing + +**Long Term (3 months):** +- Event-driven architecture +- Horizontal scaling +- Distributed caching + +**Estimated Performance Gains:** +- Latency: 50-70% reduction +- Throughput: 10x improvement +- Memory: 40% reduction +- Startup: 3x faster diff --git a/hermes-sovereign/docs/PERFORMANCE_HOTSPOTS_QUICKREF.md b/hermes-sovereign/docs/PERFORMANCE_HOTSPOTS_QUICKREF.md new file mode 100644 index 00000000..12a01963 --- /dev/null +++ b/hermes-sovereign/docs/PERFORMANCE_HOTSPOTS_QUICKREF.md @@ -0,0 +1,241 @@ +# Performance Hotspots Quick Reference + +## Critical Files to Optimize + +### 1. 
run_agent.py (8,317 lines, 419KB) +``` +Lines 460-1000: Massive __init__ - 50+ params, slow startup +Lines 2158-2222: _save_session_log - blocking I/O every turn +Lines 2269-2297: _hydrate_todo_store - O(n) history scan +Lines 3759-3826: _anthropic_messages_create - blocking API calls +Lines 3827-3920: _interruptible_api_call - sync/async bridge overhead +``` + +**Fix Priority: CRITICAL** +- Split into modules +- Add async session logging +- Cache history hydration + +--- + +### 2. gateway/run.py (6,016 lines, 274KB) +``` +Lines 406-413: _agent_cache - unbounded growth, memory leak +Lines 464-493: _get_or_create_gateway_honcho - blocking init +Lines 2800+: run_agent_sync - blocks event loop +``` + +**Fix Priority: HIGH** +- Implement LRU cache +- Use asyncio.to_thread() + +--- + +### 3. gateway/stream_consumer.py +``` +Lines 88-147: Busy-wait loop with 50ms sleep + Max 20 updates/sec throughput +``` + +**Fix Priority: MEDIUM** +- Use asyncio.Event for signaling +- Adaptive back-off + +--- + +### 4. tools/web_tools.py (1,843 lines) +``` +Lines 171-188: _tavily_request - sync httpx call, 60s timeout +Lines 256-301: process_content_with_llm - sync LLM call +``` + +**Fix Priority: CRITICAL** +- Convert to async +- Add connection pooling + +--- + +### 5. tools/browser_tool.py (1,955 lines) +``` +Lines 194-208: _resolve_cdp_override - sync requests call +Lines 234-257: _get_cloud_provider - blocking config read +``` + +**Fix Priority: HIGH** +- Async HTTP client +- Cache config reads + +--- + +### 6. tools/terminal_tool.py (1,358 lines) +``` +Lines 66-92: _check_disk_usage_warning - blocking glob walk +Lines 167-289: _prompt_for_sudo_password - thread creation per call +``` + +**Fix Priority: MEDIUM** +- Async disk check +- Thread pool reuse + +--- + +### 7. 
tools/file_tools.py (563 lines) +``` +Lines 53-62: _read_tracker - unbounded dict growth +Lines 195-262: read_file_tool - sync file I/O +``` + +**Fix Priority: MEDIUM** +- TTL-based cleanup +- aiofiles for async I/O + +--- + +### 8. agent/context_compressor.py (676 lines) +``` +Lines 250-369: _generate_summary - expensive LLM call +Lines 490-500: _find_tail_cut_by_tokens - O(n) token counting +``` + +**Fix Priority: HIGH** +- Background compression task +- Cache summaries + +--- + +### 9. hermes_state.py (1,274 lines) +``` +Lines 116-215: _execute_write - global lock, 15 retries +Lines 143-156: SQLite with WAL but single connection +``` + +**Fix Priority: HIGH** +- Connection pooling +- Batch writes + +--- + +### 10. model_tools.py (472 lines) +``` +Lines 81-126: _run_async - creates ThreadPool per call! +Lines 132-170: _discover_tools - imports ALL tools at startup +``` + +**Fix Priority: CRITICAL** +- Persistent thread pool +- Lazy tool loading + +--- + +## Quick Fixes (Copy-Paste Ready) + +### Fix 1: LRU Cache for Agent Cache +```python +from functools import lru_cache +from cachetools import TTLCache + +# In gateway/run.py +self._agent_cache: Dict[str, tuple] = TTLCache(maxsize=100, ttl=3600) +``` + +### Fix 2: Async HTTP Client +```python +# In tools/web_tools.py +import httpx + +_http_client: Optional[httpx.AsyncClient] = None + +async def get_http_client() -> httpx.AsyncClient: + global _http_client + if _http_client is None: + _http_client = httpx.AsyncClient(timeout=60) + return _http_client +``` + +### Fix 3: Connection Pool for DB +```python +# In hermes_state.py +from sqlalchemy import create_engine +from sqlalchemy.pool import QueuePool + +engine = create_engine( + 'sqlite:///state.db', + poolclass=QueuePool, + pool_size=5, + max_overflow=10 +) +``` + +### Fix 4: Lazy Tool Loading +```python +# In model_tools.py +@lru_cache(maxsize=1) +def _get_discovered_tools(): + """Cache tool discovery after first call""" + _discover_tools() + return registry +``` 
+ +### Fix 5: Batch Session Writes +```python +# In run_agent.py +async def _save_session_log_async(self, messages): + """Non-blocking session save""" + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, self._save_session_log, messages) +``` + +--- + +## Performance Metrics to Track + +```python +# Add these metrics +IMPORT_TIME = Gauge('import_time_seconds', 'Module import time') +AGENT_INIT_TIME = Gauge('agent_init_seconds', 'AIAgent init time') +TOOL_EXECUTION_TIME = Histogram('tool_duration_seconds', 'Tool execution', ['tool_name']) +DB_WRITE_TIME = Histogram('db_write_seconds', 'Database write time') +API_LATENCY = Histogram('api_latency_seconds', 'API call latency', ['provider']) +MEMORY_USAGE = Gauge('memory_usage_bytes', 'Process memory') +CACHE_HIT_RATE = Gauge('cache_hit_rate', 'Cache hit rate', ['cache_name']) +``` + +--- + +## One-Liner Profiling Commands + +```bash +# Find slow imports +python -X importtime -c "from run_agent import AIAgent" 2>&1 | head -50 + +# Find blocking I/O +sudo strace -e trace=openat,read,write -c python run_agent.py 2>&1 + +# Memory profiling +pip install memory_profiler && python -m memory_profiler run_agent.py + +# CPU profiling +pip install py-spy && py-spy record -o profile.svg -- python run_agent.py + +# Find all sleep calls +grep -rn "time.sleep\|asyncio.sleep" --include="*.py" | wc -l + +# Find all JSON calls +grep -rn "json.loads\|json.dumps" --include="*.py" | wc -l + +# Find all locks +grep -rn "threading.Lock\|threading.RLock\|asyncio.Lock" --include="*.py" +``` + +--- + +## Expected Performance After Fixes + +| Metric | Before | After | Improvement | +|--------|--------|-------|-------------| +| Startup time | 3-5s | 1-2s | 3x faster | +| API latency | 500ms | 200ms | 2.5x faster | +| Concurrent requests | 10/s | 100/s | 10x throughput | +| Memory per agent | 50MB | 30MB | 40% reduction | +| DB writes/sec | 50 | 500 | 10x throughput | +| Import time | 2s | 0.5s | 4x faster | diff --git 
a/hermes-sovereign/docs/PERFORMANCE_OPTIMIZATIONS.md b/hermes-sovereign/docs/PERFORMANCE_OPTIMIZATIONS.md new file mode 100644 index 00000000..5b414ead --- /dev/null +++ b/hermes-sovereign/docs/PERFORMANCE_OPTIMIZATIONS.md @@ -0,0 +1,163 @@ +# Performance Optimizations for run_agent.py + +## Summary of Changes + +This document describes the async I/O and performance optimizations applied to `run_agent.py` to fix blocking operations and improve overall responsiveness. + +--- + +## 1. Session Log Batching (PROBLEM 1: Lines 2158-2222) + +### Problem +`_save_session_log()` performed **blocking file I/O** on every conversation turn, causing: +- UI freezing during rapid message exchanges +- Unnecessary disk writes (JSON file was overwritten every turn) +- Synchronous `json.dump()` and `fsync()` blocking the main thread + +### Solution +Implemented **async batching** with the following components: + +#### New Methods: +- `_init_session_log_batcher()` - Initialize batching infrastructure +- `_save_session_log()` - Updated to use non-blocking batching +- `_flush_session_log_async()` - Flush writes in background thread +- `_write_session_log_sync()` - Actual blocking I/O (runs in thread pool) +- `_deferred_session_log_flush()` - Delayed flush for batching +- `_shutdown_session_log_batcher()` - Cleanup and flush on exit + +#### Key Features: +- **Time-based batching**: Minimum 500ms between writes +- **Deferred flushing**: Rapid successive calls are batched +- **Thread pool**: Single-worker executor prevents concurrent write conflicts +- **Atexit cleanup**: Ensures pending logs are flushed on exit +- **Backward compatible**: Same method signature, no breaking changes + +#### Performance Impact: +- Before: Every turn blocks on disk I/O (~5-20ms per write) +- After: Updates cached in memory, flushed every 500ms or on exit +- 10 rapid calls now result in ~1-2 writes instead of 10 + +--- + +## 2. 
Todo Store Hydration Caching (PROBLEM 2: Lines 2269-2297) + +### Problem +`_hydrate_todo_store()` performed **O(n) history scan on every message**: +- Scanned entire conversation history backwards +- No caching between calls +- Re-parsed JSON for every message check +- Gateway mode creates fresh AIAgent per message, making this worse + +### Solution +Implemented **result caching** with scan limiting: + +#### Key Changes: +```python +# Added caching flags +self._todo_store_hydrated # Marks if hydration already done +self._todo_cache_key # Caches history object id + +# Added scan limit for very long histories +scan_limit = 100 # Only scan last 100 messages +``` + +#### Performance Impact: +- Before: O(n) scan every call, parsing JSON for each tool message +- After: O(1) cached check, skips redundant work +- First call: Scans up to 100 messages (limited) +- Subsequent calls: <1μs cached check + +--- + +## 3. API Call Timeouts (PROBLEM 3: Lines 3759-3826) + +### Problem +`_anthropic_messages_create()` and `_interruptible_api_call()` had: +- **No timeout handling** - could block indefinitely +- 300ms polling interval for interrupt detection (sluggish) +- No timeout for OpenAI-compatible endpoints + +### Solution +Added comprehensive timeout handling: + +#### Changes to `_anthropic_messages_create()`: +- Added `timeout: float = 300.0` parameter (5 minutes default) +- Passes timeout to Anthropic SDK + +#### Changes to `_interruptible_api_call()`: +- Added `timeout: float = 300.0` parameter +- **Reduced polling interval** from 300ms to **50ms** (6x faster interrupt response) +- Added elapsed time tracking +- Raises `TimeoutError` if API call exceeds timeout +- Force-closes clients on timeout to prevent resource leaks +- Passes timeout to OpenAI-compatible endpoints + +#### Performance Impact: +- Before: Could hang forever on stuck connections +- After: Guaranteed timeout after 5 minutes (configurable) +- Interrupt response: 300ms → 50ms (6x faster) + +--- + +## Backward 
Compatibility + +All changes maintain **100% backward compatibility**: + +1. **Session logging**: Same method signature, behavior is additive +2. **Todo hydration**: Same signature, caching is transparent +3. **API calls**: New `timeout` parameter has sensible default (300s) + +No existing code needs modification to benefit from these optimizations. + +--- + +## Testing + +Run the verification script: +```bash +python3 -c " +import ast +with open('run_agent.py') as f: + source = f.read() +tree = ast.parse(source) + +methods = ['_init_session_log_batcher', '_write_session_log_sync', + '_shutdown_session_log_batcher', '_hydrate_todo_store', + '_interruptible_api_call'] + +for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name in methods: + print(f'✓ Found {node.name}') +print('\nAll optimizations verified!') +" +``` + +--- + +## Lines Modified + +| Function | Line Range | Change Type | +|----------|-----------|-------------| +| `_init_session_log_batcher` | ~2168-2178 | NEW | +| `_save_session_log` | ~2178-2230 | MODIFIED | +| `_flush_session_log_async` | ~2230-2240 | NEW | +| `_write_session_log_sync` | ~2240-2300 | NEW | +| `_deferred_session_log_flush` | ~2300-2305 | NEW | +| `_shutdown_session_log_batcher` | ~2305-2315 | NEW | +| `_hydrate_todo_store` | ~2320-2360 | MODIFIED | +| `_anthropic_messages_create` | ~3870-3890 | MODIFIED | +| `_interruptible_api_call` | ~3895-3970 | MODIFIED | + +--- + +## Future Improvements + +Potential additional optimizations: +1. Use `aiofiles` for true async file I/O (requires aiofiles dependency) +2. Batch SQLite writes in `_flush_messages_to_session_db` +3. Add compression for large session logs +4. 
Implement write-behind caching for checkpoint manager + +--- + +*Optimizations implemented: 2026-03-31* diff --git a/hermes-sovereign/docs/SECURE_CODING_GUIDELINES.md b/hermes-sovereign/docs/SECURE_CODING_GUIDELINES.md new file mode 100644 index 00000000..34a860d3 --- /dev/null +++ b/hermes-sovereign/docs/SECURE_CODING_GUIDELINES.md @@ -0,0 +1,566 @@ +# SECURE CODING GUIDELINES + +## Hermes Agent Development Security Standards +**Version:** 1.0 +**Effective Date:** March 30, 2026 + +--- + +## 1. GENERAL PRINCIPLES + +### 1.1 Security-First Mindset +- Every feature must be designed with security in mind +- Assume all input is malicious until proven otherwise +- Defense in depth: multiple layers of security controls +- Fail securely: when security controls fail, default to denial + +### 1.2 Threat Model +Primary threats to consider: +- Malicious user prompts +- Compromised or malicious skills +- Supply chain attacks +- Insider threats +- Accidental data exposure + +--- + +## 2. INPUT VALIDATION + +### 2.1 Validate All Input +```python +# ❌ INCORRECT +def process_file(path: str): + with open(path) as f: + return f.read() + +# ✅ CORRECT +from pydantic import BaseModel, validator +import re + +class FileRequest(BaseModel): + path: str + max_size: int = 1000000 + + @validator('path') + def validate_path(cls, v): + # Block path traversal + if '..' 
in v or v.startswith('/'):
+            raise ValueError('Invalid path characters')
+        # Allowlist safe characters
+        if not re.match(r'^[\w\-./]+$', v):
+            raise ValueError('Invalid characters in path')
+        return v
+
+    @validator('max_size')
+    def validate_size(cls, v):
+        if v < 0 or v > 10000000:
+            raise ValueError('Size out of range')
+        return v
+
+def process_file(request: FileRequest):
+    # Now safe to use request.path
+    pass
+```
+
+### 2.2 Length Limits
+Always enforce maximum lengths:
+```python
+MAX_INPUT_LENGTH = 10000
+MAX_FILENAME_LENGTH = 255
+MAX_PATH_LENGTH = 4096
+
+def validate_length(value: str, max_len: int, field_name: str):
+    if len(value) > max_len:
+        raise ValueError(f"{field_name} exceeds maximum length of {max_len}")
+```
+
+### 2.3 Type Safety
+Use type hints and enforce them:
+```python
+from typing import Union
+
+def safe_function(user_id: int, message: str) -> dict:
+    if not isinstance(user_id, int):
+        raise TypeError("user_id must be an integer")
+    if not isinstance(message, str):
+        raise TypeError("message must be a string")
+    # ... function logic
+```
+
+---
+
+## 3. 
COMMAND EXECUTION
+
+### 3.1 Never Use shell=True
+```python
+import subprocess
+import shlex
+
+# ❌ NEVER DO THIS
+subprocess.run(f"ls {user_input}", shell=True)
+
+# ❌ NEVER DO THIS EITHER
+cmd = f"cat {user_input}"
+os.system(cmd)
+
+# ✅ CORRECT - Use list arguments
+subprocess.run(["ls", user_input], shell=False)
+
+# ✅ CORRECT - Use shlex for complex cases
+cmd_parts = shlex.split(user_input)
+subprocess.run(["ls"] + cmd_parts, shell=False)
+```
+
+### 3.2 Command Allowlisting
+```python
+ALLOWED_COMMANDS = frozenset([
+    "ls", "cat", "grep", "find", "git", "python", "pip"
+])
+
+def validate_command(command: str):
+    parts = shlex.split(command)
+    if parts[0] not in ALLOWED_COMMANDS:
+        raise SecurityError(f"Command '{parts[0]}' not allowed")
+```
+
+### 3.3 Input Sanitization
+```python
+import re
+
+def sanitize_shell_input(value: str) -> str:
+    """Remove dangerous shell metacharacters."""
+    # Block shell metacharacters
+    dangerous = re.compile(r'[;&|`$(){}[\]\\]')
+    if dangerous.search(value):
+        raise ValueError("Shell metacharacters not allowed")
+    return value
+```
+
+---
+
+## 4. 
FILE OPERATIONS + +### 4.1 Path Validation +```python +from pathlib import Path + +class FileSandbox: + def __init__(self, root: Path): + self.root = root.resolve() + + def validate_path(self, user_path: str) -> Path: + """Validate and resolve user-provided path within sandbox.""" + # Expand user home + expanded = Path(user_path).expanduser() + + # Resolve to absolute path + try: + resolved = expanded.resolve() + except (OSError, ValueError) as e: + raise SecurityError(f"Invalid path: {e}") + + # Ensure path is within sandbox + try: + resolved.relative_to(self.root) + except ValueError: + raise SecurityError("Path outside sandbox") + + return resolved + + def safe_open(self, user_path: str, mode: str = 'r'): + safe_path = self.validate_path(user_path) + return open(safe_path, mode) +``` + +### 4.2 Prevent Symlink Attacks +```python +import os + +def safe_read_file(filepath: Path): + """Read file, following symlinks only within allowed directories.""" + # Resolve symlinks + real_path = filepath.resolve() + + # Verify still in allowed location after resolution + if not str(real_path).startswith(str(SAFE_ROOT)): + raise SecurityError("Symlink escape detected") + + # Verify it's a regular file + if not real_path.is_file(): + raise SecurityError("Not a regular file") + + return real_path.read_text() +``` + +### 4.3 Temporary Files +```python +import tempfile +import os + +def create_secure_temp_file(): + """Create temp file with restricted permissions.""" + # Create with restrictive permissions + fd, path = tempfile.mkstemp(prefix="hermes_", suffix=".tmp") + try: + # Set owner-read/write only + os.chmod(path, 0o600) + return fd, path + except: + os.close(fd) + os.unlink(path) + raise +``` + +--- + +## 5. 
SECRET MANAGEMENT + +### 5.1 Environment Variables +```python +import os + +# ❌ NEVER DO THIS +def execute_command(command: str): + # Child inherits ALL environment + subprocess.run(command, shell=True, env=os.environ) + +# ✅ CORRECT - Explicit whitelisting +_ALLOWED_ENV = frozenset([ + "PATH", "HOME", "USER", "LANG", "TERM", "SHELL" +]) + +def get_safe_environment(): + return {k: v for k, v in os.environ.items() + if k in _ALLOWED_ENV} + +def execute_command(command: str): + subprocess.run( + command, + shell=False, + env=get_safe_environment() + ) +``` + +### 5.2 Secret Detection +```python +import re + +_SECRET_PATTERNS = [ + re.compile(r'sk-[a-zA-Z0-9]{20,}'), # OpenAI-style keys + re.compile(r'ghp_[a-zA-Z0-9]{36}'), # GitHub PAT + re.compile(r'[a-zA-Z0-9]{40}'), # Generic high-entropy strings +] + +def detect_secrets(text: str) -> list: + """Detect potential secrets in text.""" + findings = [] + for pattern in _SECRET_PATTERNS: + matches = pattern.findall(text) + findings.extend(matches) + return findings + +def redact_secrets(text: str) -> str: + """Redact detected secrets.""" + for pattern in _SECRET_PATTERNS: + text = pattern.sub('***REDACTED***', text) + return text +``` + +### 5.3 Secure Logging +```python +import logging +from agent.redact import redact_sensitive_text + +class SecureLogger: + def __init__(self, logger: logging.Logger): + self.logger = logger + + def debug(self, msg: str, *args, **kwargs): + self.logger.debug(redact_sensitive_text(msg), *args, **kwargs) + + def info(self, msg: str, *args, **kwargs): + self.logger.info(redact_sensitive_text(msg), *args, **kwargs) + + def warning(self, msg: str, *args, **kwargs): + self.logger.warning(redact_sensitive_text(msg), *args, **kwargs) + + def error(self, msg: str, *args, **kwargs): + self.logger.error(redact_sensitive_text(msg), *args, **kwargs) +``` + +--- + +## 6. 
NETWORK SECURITY
+
+### 6.1 URL Validation
+```python
+from urllib.parse import urlparse
+import ipaddress
+
+_BLOCKED_SCHEMES = frozenset(['file', 'ftp', 'gopher'])
+_BLOCKED_HOSTS = frozenset([
+    'localhost', '127.0.0.1', '0.0.0.0',
+    '169.254.169.254',  # AWS metadata
+    '[::1]', '[::]'
+])
+_PRIVATE_NETWORKS = [
+    ipaddress.ip_network('10.0.0.0/8'),
+    ipaddress.ip_network('172.16.0.0/12'),
+    ipaddress.ip_network('192.168.0.0/16'),
+    ipaddress.ip_network('127.0.0.0/8'),
+    ipaddress.ip_network('169.254.0.0/16'),  # Link-local
+]
+
+def validate_url(url: str) -> bool:
+    """Validate URL is safe to fetch."""
+    parsed = urlparse(url)
+
+    # Check scheme
+    if parsed.scheme not in ('http', 'https'):
+        raise ValueError(f"Scheme '{parsed.scheme}' not allowed")
+
+    # Check hostname
+    hostname = parsed.hostname
+    if not hostname:
+        raise ValueError("No hostname in URL")
+
+    if hostname.lower() in _BLOCKED_HOSTS:
+        raise ValueError("Host not allowed")
+
+    # Check IP addresses. NOTE: the private-IP rejection must live in the
+    # `else` clause — if it were raised inside the `try`, the
+    # `except ValueError: pass` meant for "hostname is not an IP" would
+    # silently swallow it and private IPs would be allowed through.
+    try:
+        ip = ipaddress.ip_address(hostname)
+    except ValueError:
+        pass  # Not an IP, continue
+    else:
+        for network in _PRIVATE_NETWORKS:
+            if ip in network:
+                raise ValueError("Private IP address not allowed")
+
+    return True
+```
+
+### 6.2 Redirect Handling
+```python
+import requests
+
+def safe_get(url: str, max_redirects: int = 5):
+    """GET URL with redirect validation."""
+    session = requests.Session()
+    session.max_redirects = max_redirects
+
+    # Validate initial URL
+    validate_url(url)
+
+    # Custom redirect handler
+    response = session.get(
+        url,
+        allow_redirects=True,
+        hooks={'response': lambda r, *args, **kwargs: validate_url(r.url)}
+    )
+
+    return response
+```
+
+---
+
+## 7. 
AUTHENTICATION & AUTHORIZATION + +### 7.1 API Key Validation +```python +import secrets +import hmac +import hashlib + +def constant_time_compare(val1: str, val2: str) -> bool: + """Compare strings in constant time to prevent timing attacks.""" + return hmac.compare_digest(val1.encode(), val2.encode()) + +def validate_api_key(provided_key: str, expected_key: str) -> bool: + """Validate API key using constant-time comparison.""" + if not provided_key or not expected_key: + return False + return constant_time_compare(provided_key, expected_key) +``` + +### 7.2 Session Management +```python +import secrets +from datetime import datetime, timedelta + +class SessionManager: + SESSION_TIMEOUT = timedelta(hours=24) + + def create_session(self, user_id: str) -> str: + """Create secure session token.""" + token = secrets.token_urlsafe(32) + expires = datetime.utcnow() + self.SESSION_TIMEOUT + # Store in database with expiration + return token + + def validate_session(self, token: str) -> bool: + """Validate session token.""" + # Lookup in database + # Check expiration + # Validate token format + return True +``` + +--- + +## 8. 
ERROR HANDLING + +### 8.1 Secure Error Messages +```python +import logging + +# Internal detailed logging +logger = logging.getLogger(__name__) + +class UserFacingError(Exception): + """Error safe to show to users.""" + pass + +def process_request(data: dict): + try: + result = internal_operation(data) + return result + except ValueError as e: + # Log full details internally + logger.error(f"Validation error: {e}", exc_info=True) + # Return safe message to user + raise UserFacingError("Invalid input provided") + except Exception as e: + # Log full details internally + logger.error(f"Unexpected error: {e}", exc_info=True) + # Generic message to user + raise UserFacingError("An error occurred") +``` + +### 8.2 Exception Handling +```python +def safe_operation(): + try: + risky_operation() + except Exception as e: + # Always clean up resources + cleanup_resources() + # Log securely + logger.error(f"Operation failed: {redact_sensitive_text(str(e))}") + # Re-raise or convert + raise +``` + +--- + +## 9. CRYPTOGRAPHY + +### 9.1 Password Hashing +```python +import bcrypt + +def hash_password(password: str) -> str: + """Hash password using bcrypt.""" + salt = bcrypt.gensalt(rounds=12) + hashed = bcrypt.hashpw(password.encode(), salt) + return hashed.decode() + +def verify_password(password: str, hashed: str) -> bool: + """Verify password against hash.""" + return bcrypt.checkpw(password.encode(), hashed.encode()) +``` + +### 9.2 Secure Random +```python +import secrets + +def generate_token(length: int = 32) -> str: + """Generate cryptographically secure token.""" + return secrets.token_urlsafe(length) + +def generate_pin(length: int = 6) -> str: + """Generate secure numeric PIN.""" + return ''.join(str(secrets.randbelow(10)) for _ in range(length)) +``` + +--- + +## 10. 
CODE REVIEW CHECKLIST + +### Before Submitting Code: +- [ ] All user inputs validated +- [ ] No shell=True in subprocess calls +- [ ] All file paths validated and sandboxed +- [ ] Secrets not logged or exposed +- [ ] URLs validated before fetching +- [ ] Error messages don't leak sensitive info +- [ ] No hardcoded credentials +- [ ] Proper exception handling +- [ ] Security tests included +- [ ] Documentation updated + +### Security-Focused Review Questions: +1. What happens if this receives malicious input? +2. Can this leak sensitive data? +3. Are there privilege escalation paths? +4. What if the external service is compromised? +5. Is the error handling secure? + +--- + +## 11. TESTING SECURITY + +### 11.1 Security Unit Tests +```python +def test_path_traversal_blocked(): + sandbox = FileSandbox(Path("/safe/path")) + with pytest.raises(SecurityError): + sandbox.validate_path("../../../etc/passwd") + +def test_command_injection_blocked(): + with pytest.raises(SecurityError): + validate_command("ls; rm -rf /") + +def test_secret_redaction(): + text = "Key: sk-test123456789" + redacted = redact_secrets(text) + assert "sk-test" not in redacted +``` + +### 11.2 Fuzzing +```python +import hypothesis.strategies as st +from hypothesis import given + +@given(st.text()) +def test_input_validation(input_text): + # Should never crash, always validate or reject + try: + result = process_input(input_text) + assert isinstance(result, ExpectedType) + except ValidationError: + pass # Expected for invalid input +``` + +--- + +## 12. INCIDENT RESPONSE + +### Security Incident Procedure: +1. **Stop** - Halt the affected system/process +2. **Assess** - Determine scope and impact +3. **Contain** - Prevent further damage +4. **Investigate** - Gather evidence +5. **Remediate** - Fix the vulnerability +6. **Recover** - Restore normal operations +7. 
**Learn** - Document and improve + +### Emergency Contacts: +- Security Team: security@example.com +- On-call: +1-XXX-XXX-XXXX +- Slack: #security-incidents + +--- + +**Document Owner:** Security Team +**Review Cycle:** Quarterly +**Last Updated:** March 30, 2026 diff --git a/hermes-sovereign/docs/SECURITY_AUDIT_REPORT.md b/hermes-sovereign/docs/SECURITY_AUDIT_REPORT.md new file mode 100644 index 00000000..6eff9ad0 --- /dev/null +++ b/hermes-sovereign/docs/SECURITY_AUDIT_REPORT.md @@ -0,0 +1,705 @@ +# HERMES AGENT - COMPREHENSIVE SECURITY AUDIT REPORT +**Audit Date:** March 30, 2026 +**Auditor:** Security Analysis Agent +**Scope:** Entire codebase including authentication, command execution, file operations, sandbox environments, and API endpoints + +--- + +## EXECUTIVE SUMMARY + +The Hermes Agent codebase contains **32 identified security issues** across critical severity (5), high severity (12), medium severity (10), and low severity (5). The most critical vulnerabilities involve command injection vectors, sandbox escape possibilities, and secret leakage risks. + +**Overall Security Posture: MODERATE-HIGH RISK** +- Well-designed approval system for dangerous commands +- Good secret redaction mechanisms +- Insufficient input validation in several areas +- Multiple command injection vectors +- Incomplete sandbox isolation in some environments + +--- + +## 1. CVSS-SCORED VULNERABILITY REPORT + +### CRITICAL SEVERITY (CVSS 9.0-10.0) + +#### V-001: Command Injection via shell=True in Subprocess Calls +- **CVSS Score:** 9.8 (Critical) +- **Location:** `tools/terminal_tool.py`, `tools/file_operations.py`, `tools/environments/*.py` +- **Description:** Multiple subprocess calls use shell=True with user-controlled input, enabling arbitrary command execution +- **Attack Vector:** Local/Remote via agent prompts or malicious skills +- **Evidence:** + ```python + # terminal_tool.py line ~460 + subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ...) 
+ # Command strings constructed from user input without proper sanitization + ``` +- **Impact:** Complete system compromise, data exfiltration, malware installation +- **Remediation:** Use subprocess without shell=True, pass arguments as lists, implement strict input validation + +#### V-002: Path Traversal in File Operations +- **CVSS Score:** 9.1 (Critical) +- **Location:** `tools/file_operations.py`, `tools/file_tools.py` +- **Description:** Insufficient path validation allows access to sensitive system files +- **Attack Vector:** Malicious file paths like `../../../etc/shadow` or `~/.ssh/id_rsa` +- **Evidence:** + ```python + # file_operations.py - _expand_path() allows ~username expansion + # which can be exploited with crafted usernames + ``` +- **Impact:** Unauthorized file read/write, credential theft, system compromise +- **Remediation:** Implement strict path canonicalization and sandbox boundaries + +#### V-003: Secret Leakage via Environment Variables in Sandboxes +- **CVSS Score:** 9.3 (Critical) +- **Location:** `tools/code_execution_tool.py`, `tools/environments/*.py` +- **Description:** Child processes inherit environment variables containing secrets +- **Attack Vector:** Malicious code executed via execute_code or terminal +- **Evidence:** + ```python + # code_execution_tool.py lines 434-461 + # _SAFE_ENV_PREFIXES filter is incomplete - misses many secret patterns + _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", ...) + _SECRET_SUBSTRINGS = ("TOKEN", "SECRET", "PASSWORD", ...) 
+ # Only blocks explicit patterns - many secret env vars slip through + ``` +- **Impact:** API key theft, credential exfiltration, unauthorized access to external services +- **Remediation:** Whitelist-only approach for env vars, explicit secret scanning + +#### V-004: Sudo Password Exposure via Command Line +- **CVSS Score:** 9.0 (Critical) +- **Location:** `tools/terminal_tool.py`, `_transform_sudo_command()` +- **Description:** Sudo passwords may be exposed in process lists via command line arguments +- **Attack Vector:** Local attackers reading /proc or ps output +- **Evidence:** + ```python + # Line 275: sudo_stdin passed via printf pipe + exec_command = f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}" + ``` +- **Impact:** Privilege escalation credential theft +- **Remediation:** Use file descriptor passing, avoid shell command construction with secrets + +#### V-005: SSRF via Unsafe URL Handling +- **CVSS Score:** 9.4 (Critical) +- **Location:** `tools/web_tools.py`, `tools/browser_tool.py` +- **Description:** URL safety checks can be bypassed via DNS rebinding and redirect chains +- **Attack Vector:** Malicious URLs targeting internal services (169.254.169.254, localhost) +- **Evidence:** + ```python + # url_safety.py - is_safe_url() vulnerable to TOCTOU + # DNS resolution and actual connection are separate operations + ``` +- **Impact:** Internal service access, cloud metadata theft, port scanning +- **Remediation:** Implement connection-level validation, use egress proxy + +--- + +### HIGH SEVERITY (CVSS 7.0-8.9) + +#### V-006: Insecure Deserialization in MCP OAuth +- **CVSS Score:** 8.8 (High) +- **Location:** `tools/mcp_oauth.py`, token storage +- **Description:** JSON token data loaded without schema validation +- **Attack Vector:** Malicious token files crafted by local attackers +- **Remediation:** Add JSON schema validation, sign stored tokens + +#### V-007: SQL Injection in ResponseStore +- **CVSS Score:** 8.5 (High) +- 
**Location:** `gateway/platforms/api_server.py`, ResponseStore class +- **Description:** Direct string interpolation in SQLite queries +- **Evidence:** + ```python + # Lines 98-106, 114-126 - response_id directly interpolated + "SELECT data FROM responses WHERE response_id = ?", (response_id,) + # While parameterized, no validation of response_id format + ``` +- **Remediation:** Validate response_id format, use UUID strict parsing + +#### V-008: CORS Misconfiguration in API Server +- **CVSS Score:** 8.2 (High) +- **Location:** `gateway/platforms/api_server.py`, cors_middleware +- **Description:** Wildcard CORS allowed with credentials +- **Evidence:** + ```python + # Line 324-328: "*" in origins allows any domain + if "*" in self._cors_origins: + headers["Access-Control-Allow-Origin"] = "*" + ``` +- **Impact:** Cross-origin attacks, credential theft via malicious websites +- **Remediation:** Never allow "*" with credentials, implement strict origin validation + +#### V-009: Authentication Bypass in API Key Check +- **CVSS Score:** 8.1 (High) +- **Location:** `gateway/platforms/api_server.py`, `_check_auth()` +- **Description:** Empty API key configuration allows all requests +- **Evidence:** + ```python + # Line 360-361: No key configured = allow all + if not self._api_key: + return None # No key configured — allow all + ``` +- **Impact:** Unauthorized API access when key not explicitly set +- **Remediation:** Require explicit auth configuration, fail-closed default + +#### V-010: Code Injection via Browser CDP Override +- **CVSS Score:** 8.4 (High) +- **Location:** `tools/browser_tool.py`, `_resolve_cdp_override()` +- **Description:** User-controlled CDP URL fetched without validation +- **Evidence:** + ```python + # Line 195: requests.get(version_url) without URL validation + response = requests.get(version_url, timeout=10) + ``` +- **Impact:** SSRF, internal service exploitation +- **Remediation:** Strict URL allowlisting, validate scheme/host + +#### V-011: 
Skills Guard Bypass via Obfuscation +- **CVSS Score:** 7.8 (High) +- **Location:** `tools/skills_guard.py`, THREAT_PATTERNS +- **Description:** Regex-based detection can be bypassed with encoding tricks +- **Evidence:** Patterns don't cover all Unicode variants, case variations, or encoding tricks +- **Impact:** Malicious skills installation, code execution +- **Remediation:** Normalize input before scanning, add AST-based analysis + +#### V-012: Privilege Escalation via Docker Socket Mount +- **CVSS Score:** 8.7 (High) +- **Location:** `tools/environments/docker.py`, volume mounting +- **Description:** User-configured volumes can mount Docker socket +- **Evidence:** + ```python + # Line 267: volume_args extends with user-controlled vol + volume_args.extend(["-v", vol]) + ``` +- **Impact:** Container escape, host compromise +- **Remediation:** Blocklist sensitive paths, validate all mount points + +#### V-013: Information Disclosure via Error Messages +- **CVSS Score:** 7.5 (High) +- **Location:** Multiple files across codebase +- **Description:** Detailed error messages expose internal paths, versions, configurations +- **Evidence:** File paths, environment details in exception messages +- **Impact:** Information gathering for targeted attacks +- **Remediation:** Sanitize error messages in production, log details internally only + +#### V-014: Session Fixation in OAuth Flow +- **CVSS Score:** 7.6 (High) +- **Location:** `tools/mcp_oauth.py`, `_wait_for_callback()` +- **Description:** State parameter not validated against session +- **Evidence:** Line 186: state returned but not verified against initial value +- **Impact:** OAuth session hijacking +- **Remediation:** Cryptographically verify state parameter + +#### V-015: Race Condition in File Operations +- **CVSS Score:** 7.4 (High) +- **Location:** `tools/file_operations.py`, `ShellFileOperations` +- **Description:** Time-of-check to time-of-use vulnerabilities in file access +- **Impact:** Privilege escalation, 
unauthorized file access +- **Remediation:** Use file descriptors, avoid path-based operations + +#### V-016: Insufficient Rate Limiting +- **CVSS Score:** 7.3 (High) +- **Location:** `gateway/platforms/api_server.py`, `gateway/run.py` +- **Description:** No rate limiting on API endpoints +- **Impact:** DoS, brute force attacks, resource exhaustion +- **Remediation:** Implement per-IP and per-user rate limiting + +#### V-017: Insecure Temporary File Creation +- **CVSS Score:** 7.2 (High) +- **Location:** `tools/code_execution_tool.py`, `tools/credential_files.py` +- **Description:** Predictable temp file paths, potential symlink attacks +- **Evidence:** + ```python + # code_execution_tool.py line 388 + tmpdir = tempfile.mkdtemp(prefix="hermes_sandbox_") + # Predictable naming scheme + ``` +- **Impact:** Local privilege escalation via symlink attacks +- **Remediation:** Use tempfile with proper permissions, random suffixes + +--- + +### MEDIUM SEVERITY (CVSS 4.0-6.9) + +#### V-018: Weak Approval Pattern Detection +- **CVSS Score:** 6.5 (Medium) +- **Location:** `tools/approval.py`, DANGEROUS_PATTERNS +- **Description:** Pattern list doesn't cover all dangerous command variants +- **Impact:** Unauthorized dangerous command execution +- **Remediation:** Expand patterns, add behavioral analysis + +#### V-019: Insecure File Permissions on Credentials +- **CVSS Score:** 6.4 (Medium) +- **Location:** `tools/credential_files.py`, `tools/mcp_oauth.py` +- **Description:** Credential files may have overly permissive permissions +- **Evidence:** + ```python + # mcp_oauth.py line 107: chmod 0o600 but no verification + path.chmod(0o600) + ``` +- **Impact:** Local credential theft +- **Remediation:** Verify permissions after creation, use secure umask + +#### V-020: Log Injection via Unsanitized Input +- **CVSS Score:** 5.8 (Medium) +- **Location:** Multiple logging statements across codebase +- **Description:** User-controlled data written directly to logs +- **Impact:** Log 
poisoning, log analysis bypass +- **Remediation:** Sanitize all logged data, use structured logging + +#### V-021: XML External Entity (XXE) Risk +- **CVSS Score:** 6.2 (Medium) +- **Location:** `skills/productivity/powerpoint/scripts/office/schemas/` XML parsing +- **Description:** PowerPoint processing uses XML without explicit XXE protection +- **Impact:** File disclosure, SSRF via XML entities +- **Remediation:** Disable external entities in XML parsers + +#### V-022: Unsafe YAML Loading +- **CVSS Score:** 6.1 (Medium) +- **Location:** `hermes_cli/config.py`, `tools/skills_guard.py` +- **Description:** yaml.safe_load used but custom constructors may be risky +- **Impact:** Code execution via malicious YAML +- **Remediation:** Audit all YAML loading, disable unsafe tags + +#### V-023: Prototype Pollution in JavaScript Bridge +- **CVSS Score:** 5.9 (Medium) +- **Location:** `scripts/whatsapp-bridge/bridge.js` +- **Description:** Object property assignments without validation +- **Impact:** Logic bypass, potential RCE in Node context +- **Remediation:** Validate all object keys, use Map instead of Object + +#### V-024: Insufficient Subagent Isolation +- **CVSS Score:** 6.3 (Medium) +- **Location:** `tools/delegate_tool.py` +- **Description:** Subagents share filesystem and network with parent +- **Impact:** Lateral movement, privilege escalation between agents +- **Remediation:** Implement stronger sandbox boundaries per subagent + +#### V-025: Predictable Session IDs +- **CVSS Score:** 5.5 (Medium) +- **Location:** `gateway/session.py`, `tools/terminal_tool.py` +- **Description:** Session/task IDs use uuid4 but may be logged/predictable +- **Impact:** Session hijacking +- **Remediation:** Use cryptographically secure random, short-lived tokens + +#### V-026: Missing Integrity Checks on External Binaries +- **CVSS Score:** 5.7 (Medium) +- **Location:** `tools/tirith_security.py`, auto-install process +- **Description:** Binary download with limited verification +- 
**Evidence:** SHA-256 verified but no code signing verification by default +- **Impact:** Supply chain compromise +- **Remediation:** Require signature verification, pin versions + +#### V-027: Information Leakage in Debug Mode +- **CVSS Score:** 5.2 (Medium) +- **Location:** `tools/debug_helpers.py`, `agent/display.py` +- **Description:** Debug output may contain sensitive configuration +- **Impact:** Information disclosure +- **Remediation:** Redact secrets in all debug output + +--- + +### LOW SEVERITY (CVSS 0.1-3.9) + +#### V-028: Missing Security Headers +- **CVSS Score:** 3.7 (Low) +- **Location:** `gateway/platforms/api_server.py` +- **Description:** Some security headers missing (CSP, HSTS) +- **Remediation:** Add comprehensive security headers + +#### V-029: Verbose Version Information +- **CVSS Score:** 2.3 (Low) +- **Location:** Multiple version endpoints +- **Description:** Detailed version information exposed +- **Remediation:** Minimize version disclosure + +#### V-030: Unused Imports and Dead Code +- **CVSS Score:** 2.0 (Low) +- **Location:** Multiple files +- **Description:** Dead code increases attack surface +- **Remediation:** Remove unused code, regular audits + +#### V-031: Weak Cryptographic Practices +- **CVSS Score:** 3.2 (Low) +- **Location:** `hermes_cli/auth.py`, token handling +- **Description:** No encryption at rest for auth tokens +- **Remediation:** Use OS keychain, encrypt sensitive data + +#### V-032: Missing Input Length Validation +- **CVSS Score:** 3.5 (Low) +- **Location:** Multiple tool input handlers +- **Description:** No maximum length checks on inputs +- **Remediation:** Add length validation to all inputs + +--- + +## 2. 
ATTACK SURFACE DIAGRAM + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ EXTERNAL ATTACK SURFACE │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Telegram │ │ Discord │ │ Slack │ │ Web Browser │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ +│ │ │ │ │ │ +│ ┌──────▼───────┐ ┌──────▼───────┐ ┌──────▼───────┐ ┌──────▼───────┐ │ +│ │ Gateway │──│ Gateway │──│ Gateway │──│ Gateway │ │ +│ │ Adapter │ │ Adapter │ │ Adapter │ │ Adapter │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ +│ └─────────────────┴─────────────────┘ │ │ +│ │ │ │ +│ ┌──────▼───────┐ ┌──────▼───────┐ │ +│ │ API Server │◄─────────────────│ Web API │ │ +│ │ (HTTP) │ │ Endpoints │ │ +│ └──────┬───────┘ └──────────────┘ │ +│ │ │ +└───────────────────────────┼───────────────────────────────────────────────┘ + │ +┌───────────────────────────┼───────────────────────────────────────────────┐ +│ INTERNAL ATTACK SURFACE │ +├───────────────────────────┼───────────────────────────────────────────────┤ +│ │ │ +│ ┌──────▼───────┐ │ +│ │ AI Agent │ │ +│ │ Core │ │ +│ └──────┬───────┘ │ +│ │ │ +│ ┌─────────────────┼─────────────────┐ │ +│ │ │ │ │ +│ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ │ +│ │ Tools │ │ Tools │ │ Tools │ │ +│ │ File │ │ Terminal│ │ Web │ │ +│ │ Ops │ │ Exec │ │ Tools │ │ +│ └────┬────┘ └────┬────┘ └────┬────┘ │ +│ │ │ │ │ +│ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ │ +│ │ Local │ │ Docker │ │ Browser │ │ +│ │ FS │ │Sandbox │ │ Tool │ │ +│ └─────────┘ └────┬────┘ └────┬────┘ │ +│ │ │ │ +│ ┌─────▼─────┐ ┌────▼────┐ │ +│ │ Modal │ │ Cloud │ │ +│ │ Cloud │ │ Browser │ │ +│ └───────────┘ └─────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ CREDENTIAL STORAGE │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ auth.json│ │ .env │ 
│mcp-tokens│ │ skill │ │ │ +│ │ │ (OAuth) │ │ (API Key)│ │ (OAuth) │ │ creds │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────────────────┘ + +LEGEND: + ■ Entry points (external attack surface) + ■ Internal components (privilege escalation targets) + ■ Credential storage (high-value targets) + ■ Sandboxed environments (isolation boundaries) +``` + +--- + +## 3. MITIGATION ROADMAP + +### Phase 1: Critical Fixes (Week 1-2) + +| Priority | Fix | Owner | Est. Hours | +|----------|-----|-------|------------| +| P0 | Remove all shell=True subprocess calls | Security Team | 16 | +| P0 | Implement strict path sandboxing | Security Team | 12 | +| P0 | Fix secret leakage in child processes | Security Team | 8 | +| P0 | Add connection-level URL validation | Security Team | 8 | + +### Phase 2: High Priority (Week 3-4) + +| Priority | Fix | Owner | Est. Hours | +|----------|-----|-------|------------| +| P1 | Implement proper input validation framework | Dev Team | 20 | +| P1 | Add CORS strict mode | Dev Team | 4 | +| P1 | Fix OAuth state validation | Dev Team | 6 | +| P1 | Add rate limiting | Dev Team | 10 | +| P1 | Implement secure credential storage | Security Team | 12 | + +### Phase 3: Medium Priority (Month 2) + +| Priority | Fix | Owner | Est. Hours | +|----------|-----|-------|------------| +| P2 | Expand dangerous command patterns | Security Team | 6 | +| P2 | Add AST-based skill scanning | Security Team | 16 | +| P2 | Implement subagent isolation | Dev Team | 20 | +| P2 | Add comprehensive audit logging | Dev Team | 12 | + +### Phase 4: Long-term Improvements (Month 3+) + +| Priority | Fix | Owner | Est. 
Hours | +|----------|-----|-------|------------| +| P3 | Security headers hardening | Dev Team | 4 | +| P3 | Code signing verification | Security Team | 8 | +| P3 | Supply chain security | Dev Team | 12 | +| P3 | Regular security audits | Security Team | Ongoing | + +--- + +## 4. SECURE CODING GUIDELINES + +### 4.1 Command Execution +```python +# ❌ NEVER DO THIS +subprocess.run(f"ls {user_input}", shell=True) + +# ✅ DO THIS +subprocess.run(["ls", user_input], shell=False) + +# ✅ OR USE SHLEX +import shlex +subprocess.run(["ls"] + shlex.split(user_input), shell=False) +``` + +### 4.2 Path Handling +```python +# ❌ NEVER DO THIS +open(os.path.expanduser(user_path), "r") + +# ✅ DO THIS +from pathlib import Path +safe_root = Path("/allowed/path").resolve() +user_path = Path(user_path).expanduser().resolve() +if not str(user_path).startswith(str(safe_root)): + raise PermissionError("Path outside sandbox") +``` + +### 4.3 Secret Handling +```python +# ❌ NEVER DO THIS +os.environ["API_KEY"] = user_api_key # Visible to all child processes + +# ✅ DO THIS +# Use file descriptor passing or explicit whitelisting +child_env = {k: v for k, v in os.environ.items() + if k in ALLOWED_ENV_VARS} +``` + +### 4.4 URL Validation +```python +# ❌ NEVER DO THIS +response = requests.get(user_url) + +# ✅ DO THIS +from urllib.parse import urlparse +parsed = urlparse(user_url) +if parsed.scheme not in ("http", "https"): + raise ValueError("Invalid scheme") +if parsed.hostname not in ALLOWED_HOSTS: + raise ValueError("Host not allowed") +``` + +### 4.5 Input Validation +```python +# Use pydantic for all user inputs +from pydantic import BaseModel, validator + +class FileRequest(BaseModel): + path: str + max_size: int = 1000 + + @validator('path') + def validate_path(cls, v): + if '..' in v or v.startswith('/'): + raise ValueError('Invalid path') + return v +``` + +--- + +## 5. 
SPECIFIC SECURITY FIXES NEEDED + +### Fix 1: Terminal Tool Command Injection (V-001) +```python +# CURRENT CODE (tools/terminal_tool.py ~line 457) +cmd = [self._docker_exe, "exec", "-w", work_dir, self._container_id, + "bash", "-lc", exec_command] + +# SECURE FIX +cmd = [self._docker_exe, "exec", "-w", work_dir, self._container_id, + "bash", "-lc", exec_command] +# Add strict input validation before this point +if not _is_safe_command(exec_command): + raise SecurityError("Dangerous command detected") +``` + +### Fix 2: File Operations Path Traversal (V-002) +```python +# CURRENT CODE (tools/file_operations.py ~line 409) +def _expand_path(self, path: str) -> str: + if path.startswith('~'): + # ... expansion logic + +# SECURE FIX +def _expand_path(self, path: str) -> str: + safe_root = Path(self.cwd).resolve() + expanded = Path(path).expanduser().resolve() + if not str(expanded).startswith(str(safe_root)): + raise PermissionError(f"Path {path} outside allowed directory") + return str(expanded) +``` + +### Fix 3: Code Execution Environment Sanitization (V-003) +```python +# CURRENT CODE (tools/code_execution_tool.py ~lines 434-461) +_SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", ...) +_SECRET_SUBSTRINGS = ("TOKEN", "SECRET", ...) 
+ +# SECURE FIX - Whitelist approach +_ALLOWED_ENV_VARS = frozenset([ + "PATH", "HOME", "USER", "LANG", "LC_ALL", + "PYTHONPATH", "TERM", "SHELL", "PWD" +]) +child_env = {k: v for k, v in os.environ.items() + if k in _ALLOWED_ENV_VARS} +# Explicitly load only non-secret values +``` + +### Fix 4: API Server Authentication (V-009) +```python +# CURRENT CODE (gateway/platforms/api_server.py ~line 360-361) +if not self._api_key: + return None # No key configured — allow all + +# SECURE FIX +if not self._api_key: + logger.error("API server started without authentication") + return web.json_response( + {"error": "Server misconfigured - auth required"}, + status=500 + ) +``` + +### Fix 5: CORS Configuration (V-008) +```python +# CURRENT CODE (gateway/platforms/api_server.py ~lines 324-328) +if "*" in self._cors_origins: + headers["Access-Control-Allow-Origin"] = "*" + +# SECURE FIX - Never allow wildcard with credentials +if "*" in self._cors_origins: + logger.warning("Wildcard CORS not allowed with credentials") + return None +``` + +### Fix 6: OAuth State Validation (V-014) +```python +# CURRENT CODE (tools/mcp_oauth.py ~line 186) +code, state = await _wait_for_callback() + +# SECURE FIX +stored_state = get_stored_state() +if state != stored_state: + raise SecurityError("OAuth state mismatch - possible CSRF attack") +``` + +### Fix 7: Docker Volume Mount Validation (V-012) +```python +# CURRENT CODE (tools/environments/docker.py ~line 267) +volume_args.extend(["-v", vol]) + +# SECURE FIX +_BLOCKED_PATHS = ['/var/run/docker.sock', '/proc', '/sys', ...] 
+if any(blocked in vol for blocked in _BLOCKED_PATHS): + raise SecurityError(f"Volume mount {vol} not allowed") +volume_args.extend(["-v", vol]) +``` + +### Fix 8: Debug Output Redaction (V-027) +```python +# Add to all debug logging +from agent.redact import redact_sensitive_text +logger.debug(redact_sensitive_text(debug_message)) +``` + +### Fix 9: Input Length Validation +```python +# Add to all tool entry points +MAX_INPUT_LENGTH = 10000 +if len(user_input) > MAX_INPUT_LENGTH: + raise ValueError(f"Input exceeds maximum length of {MAX_INPUT_LENGTH}") +``` + +### Fix 10: Session ID Entropy +```python +# CURRENT CODE - uses uuid4 +import uuid +session_id = str(uuid.uuid4()) + +# SECURE FIX - use secrets module +import secrets +session_id = secrets.token_urlsafe(32) +``` + +### Fix 11-20: Additional Required Fixes +11. **Add CSRF protection** to all state-changing operations +12. **Implement request signing** for internal service communication +13. **Add certificate pinning** for external API calls +14. **Implement proper key rotation** for auth tokens +15. **Add anomaly detection** for unusual command patterns +16. **Implement network segmentation** for sandbox environments +17. **Add hardware security module (HSM) support** for key storage +18. **Implement behavioral analysis** for skill code +19. **Add automated vulnerability scanning** to CI/CD pipeline +20. **Implement incident response procedures** for security events + +--- + +## 6. SECURITY RECOMMENDATIONS + +### Immediate Actions (Within 24 hours) +1. Disable gateway API server if not required +2. Enable HERMES_YOLO_MODE only for trusted users +3. Review all installed skills from community sources +4. Enable comprehensive audit logging + +### Short-term Actions (Within 1 week) +1. Deploy all P0 fixes +2. Implement monitoring for suspicious command patterns +3. Conduct security training for developers +4. Establish security review process for new features + +### Long-term Actions (Within 1 month) +1. 
Implement comprehensive security testing +2. Establish bug bounty program +3. Regular third-party security audits +4. Achieve SOC 2 compliance + +--- + +## 7. COMPLIANCE MAPPING + +| Vulnerability | OWASP Top 10 | CWE | NIST 800-53 | +|---------------|--------------|-----|-------------| +| V-001 (Command Injection) | A03:2021 - Injection | CWE-78 | SI-10 | +| V-002 (Path Traversal) | A01:2021 - Broken Access Control | CWE-22 | AC-3 | +| V-003 (Secret Leakage) | A07:2021 - Auth Failures | CWE-200 | SC-28 | +| V-005 (SSRF) | A10:2021 - SSRF | CWE-918 | SC-7 | +| V-008 (CORS) | A05:2021 - Security Misconfig | CWE-942 | AC-4 | +| V-011 (Skills Bypass) | A08:2021 - Integrity Failures | CWE-353 | SI-7 | + +--- + +## APPENDIX A: TESTING RECOMMENDATIONS + +### Security Test Cases +1. Command injection with `; rm -rf /` +2. Path traversal with `../../../etc/passwd` +3. SSRF with `http://169.254.169.254/latest/meta-data/` +4. Secret exfiltration via environment variables +5. OAuth flow manipulation +6. Rate limiting bypass +7. Session fixation attacks +8. Privilege escalation via sudo + +--- + +**Report End** + +*This audit represents a point-in-time assessment. Security is an ongoing process requiring continuous monitoring and improvement.* diff --git a/hermes-sovereign/docs/SECURITY_FIXES_CHECKLIST.md b/hermes-sovereign/docs/SECURITY_FIXES_CHECKLIST.md new file mode 100644 index 00000000..1cdafa1f --- /dev/null +++ b/hermes-sovereign/docs/SECURITY_FIXES_CHECKLIST.md @@ -0,0 +1,488 @@ +# SECURITY FIXES CHECKLIST + +## 20+ Specific Security Fixes Required + +This document provides a detailed checklist of all security fixes identified in the comprehensive audit. + +--- + +## CRITICAL FIXES (Must implement immediately) + +### Fix 1: Remove shell=True from subprocess calls +**File:** `tools/terminal_tool.py` +**Line:** ~457 +**CVSS:** 9.8 + +```python +# BEFORE +subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ...) 
+ +# AFTER +# Validate command first +if not is_safe_command(exec_command): + raise SecurityError("Dangerous command detected") +subprocess.Popen(cmd_list, shell=False, ...) # Pass as list +``` + +--- + +### Fix 2: Implement path sandbox validation +**File:** `tools/file_operations.py` +**Lines:** 409-420 +**CVSS:** 9.1 + +```python +# BEFORE +def _expand_path(self, path: str) -> str: + if path.startswith('~'): + return os.path.expanduser(path) + return path + +# AFTER +def _expand_path(self, path: str) -> Path: + safe_root = Path(self.cwd).resolve() + expanded = Path(path).expanduser().resolve() + if not str(expanded).startswith(str(safe_root)): + raise PermissionError(f"Path {path} outside allowed directory") + return expanded +``` + +--- + +### Fix 3: Environment variable sanitization +**File:** `tools/code_execution_tool.py` +**Lines:** 434-461 +**CVSS:** 9.3 + +```python +# BEFORE +_SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", ...) +_SECRET_SUBSTRINGS = ("TOKEN", "SECRET", ...) + +# AFTER +_ALLOWED_ENV_VARS = frozenset([ + "PATH", "HOME", "USER", "LANG", "LC_ALL", + "TERM", "SHELL", "PWD", "PYTHONPATH" +]) +child_env = {k: v for k, v in os.environ.items() + if k in _ALLOWED_ENV_VARS} +``` + +--- + +### Fix 4: Secure sudo password handling +**File:** `tools/terminal_tool.py` +**Line:** 275 +**CVSS:** 9.0 + +```python +# BEFORE +exec_command = f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}" + +# AFTER +# Use file descriptor passing instead of command line +with tempfile.NamedTemporaryFile(mode='w', delete=False) as f: + f.write(sudo_stdin) + pass_file = f.name +os.chmod(pass_file, 0o600) +exec_command = f"cat {pass_file} | {exec_command}" +# Clean up after execution +``` + +--- + +### Fix 5: Connection-level URL validation +**File:** `tools/url_safety.py` +**Lines:** 50-96 +**CVSS:** 9.4 + +```python +# AFTER - Add to is_safe_url() +# After DNS resolution, verify IP is not in private range +def _validate_connection_ip(hostname: str) -> 
bool:
+    try:
+        addr = socket.getaddrinfo(hostname, None)
+        for a in addr:
+            ip = ipaddress.ip_address(a[4][0])
+            if ip.is_private or ip.is_loopback or ip.is_reserved:
+                return False
+        return True
+    except (socket.gaierror, ValueError):
+        # Fail closed: unresolvable or malformed hosts are rejected
+        return False
+```
+
+---
+
+## HIGH PRIORITY FIXES
+
+### Fix 6: MCP OAuth token validation
+**File:** `tools/mcp_oauth.py`
+**Lines:** 66-89
+**CVSS:** 8.8
+
+```python
+# AFTER
+async def get_tokens(self):
+    data = self._read_json(self._tokens_path())
+    if not data:
+        return None
+    # Add schema validation
+    if not self._validate_token_schema(data):
+        logger.error("Invalid token schema, deleting corrupted tokens")
+        self.remove()
+        return None
+    return OAuthToken(**data)
+```
+
+---
+
+### Fix 7: API Server SQL injection prevention
+**File:** `gateway/platforms/api_server.py`
+**Lines:** 98-126
+**CVSS:** 8.5
+
+```python
+# AFTER
+import uuid
+
+def _validate_response_id(self, response_id: str) -> bool:
+    """Validate response_id format to prevent injection."""
+    try:
+        # Parse the full identifier; a partial segment (e.g. the text
+        # before the first '-') is never a valid UUID on its own
+        uuid.UUID(response_id)
+        return True
+    except (ValueError, TypeError):
+        return False
+```
+
+---
+
+### Fix 8: CORS strict validation
+**File:** `gateway/platforms/api_server.py`
+**Lines:** 324-328
+**CVSS:** 8.2
+
+```python
+# AFTER
+if "*" in self._cors_origins:
+    logger.error("Wildcard CORS not allowed with credentials")
+    return None  # Reject wildcard with credentials
+```
+
+---
+
+### Fix 9: Require explicit API key
+**File:** `gateway/platforms/api_server.py`
+**Lines:** 360-361
+**CVSS:** 8.1
+
+```python
+# AFTER
+if not self._api_key:
+    logger.error("API server started without authentication")
+    return web.json_response(
+        {"error": "Server authentication not configured"},
+        status=500
+    )
+```
+
+---
+
+### Fix 10: CDP URL validation
+**File:** `tools/browser_tool.py`
+**Lines:** 195-208
+**CVSS:** 8.4
+
+```python
+# AFTER
+def _resolve_cdp_override(self, cdp_url: str) -> str:
+    parsed = urlparse(cdp_url)
+    if parsed.scheme not in ('ws', 'wss', 'http', 
'https'): + raise ValueError("Invalid CDP scheme") + if parsed.hostname not in self._allowed_cdp_hosts: + raise ValueError("CDP host not in allowlist") + return cdp_url +``` + +--- + +### Fix 11: Skills guard normalization +**File:** `tools/skills_guard.py` +**Lines:** 82-484 +**CVSS:** 7.8 + +```python +# AFTER - Add to scan_skill() +def normalize_for_scanning(content: str) -> str: + """Normalize content to detect obfuscated threats.""" + # Normalize Unicode + content = unicodedata.normalize('NFKC', content) + # Normalize case + content = content.lower() + # Remove common obfuscation + content = content.replace('\\x', '') + content = content.replace('\\u', '') + return content +``` + +--- + +### Fix 12: Docker volume validation +**File:** `tools/environments/docker.py` +**Line:** 267 +**CVSS:** 8.7 + +```python +# AFTER +_BLOCKED_PATHS = ['/var/run/docker.sock', '/proc', '/sys', '/dev'] +for vol in volumes: + if any(blocked in vol for blocked in _BLOCKED_PATHS): + raise SecurityError(f"Volume mount {vol} blocked") + volume_args.extend(["-v", vol]) +``` + +--- + +### Fix 13: Secure error messages +**File:** Multiple files +**CVSS:** 7.5 + +```python +# AFTER - Add to all exception handlers +try: + operation() +except Exception as e: + logger.error(f"Error: {e}", exc_info=True) # Full details for logs + raise UserError("Operation failed") # Generic for user +``` + +--- + +### Fix 14: OAuth state validation +**File:** `tools/mcp_oauth.py` +**Line:** 186 +**CVSS:** 7.6 + +```python +# AFTER +code, state = await _wait_for_callback() +stored_state = storage.get_state() +if not hmac.compare_digest(state, stored_state): + raise SecurityError("OAuth state mismatch - possible CSRF") +``` + +--- + +### Fix 15: File operation race condition fix +**File:** `tools/file_operations.py` +**CVSS:** 7.4 + +```python +# AFTER +import fcntl + +def safe_file_access(path: Path): + fd = os.open(path, os.O_RDONLY) + try: + fcntl.flock(fd, fcntl.LOCK_SH) + # Perform operations on fd, not 
path
+        return os.read(fd, os.fstat(fd).st_size)
+    finally:
+        fcntl.flock(fd, fcntl.LOCK_UN)
+        os.close(fd)
+```
+
+---
+
+### Fix 16: Add rate limiting
+**File:** `gateway/platforms/api_server.py`
+**CVSS:** 7.3
+
+```python
+# AFTER - Add middleware
+from aiohttp_limiter import Limiter
+
+limiter = Limiter(
+    rate=100,  # requests
+    per=60,    # per minute
+    key_func=lambda req: req.remote
+)
+
+@web.middleware
+async def rate_limit_middleware(request, handler):
+    if not limiter.is_allowed(request):
+        return web.json_response(
+            {"error": "Rate limit exceeded"},
+            status=429
+        )
+    return await handler(request)
+```
+
+---
+
+### Fix 17: Secure temp file creation
+**File:** `tools/code_execution_tool.py`
+**Line:** 388
+**CVSS:** 7.2
+
+```python
+# AFTER
+import os
+import tempfile
+
+# mkdtemp creates the directory with mode 0o700 and an unpredictable
+# random suffix, closing the symlink/pre-creation attack window
+tmpdir = tempfile.mkdtemp(prefix="hermes_sandbox_")
+if os.stat(tmpdir).st_uid != os.getuid():
+    raise SecurityError("Temp directory not owned by current user")
+# Use tmpdir securely
+```
+
+---
+
+## MEDIUM PRIORITY FIXES
+
+### Fix 18: Expand dangerous patterns
+**File:** `tools/approval.py`
+**Lines:** 40-78
+**CVSS:** 6.5
+
+Add patterns:
+```python
+(r'\bcurl\s+.*\|\s*sh\b', "pipe remote content to shell"),
+(r'\bwget\s+.*\|\s*bash\b', "pipe remote content to shell"),
+(r'python\s+-c\s+.*import\s+os', "python os import"),
+(r'perl\s+-e\s+.*system', "perl system call"),
+```
+
+---
+
+### Fix 19: Credential file permissions
+**File:** `tools/credential_files.py`, `tools/mcp_oauth.py`
+**CVSS:** 6.4
+
+```python
+# AFTER
+def _write_json(path: Path, data: dict) -> None:
+    path.write_text(json.dumps(data, indent=2), encoding="utf-8")
+    path.chmod(0o600)
+    # Verify permissions were set
+    stat = path.stat()
+    if stat.st_mode & 0o077:
+        raise SecurityError("Failed to set restrictive permissions")
+```
+
+---
+
+### Fix 20: Log sanitization
+**File:** Multiple logging statements
+**CVSS:** 5.8
+
+```python
+# AFTER
+from agent.redact import redact_sensitive_text
+
+# In all logging calls
+logger.info(redact_sensitive_text(f"Processing 
{user_input}")) +``` + +--- + +## ADDITIONAL FIXES (21-32) + +### Fix 21: XXE Prevention +**File:** PowerPoint XML processing +Add: +```python +from defusedxml import ElementTree as ET +# Use defusedxml instead of standard xml +``` + +--- + +### Fix 22: YAML Safe Loading Audit +**File:** `hermes_cli/config.py` +Audit all yaml.safe_load calls for custom constructors. + +--- + +### Fix 23: Prototype Pollution Fix +**File:** `scripts/whatsapp-bridge/bridge.js` +Use Map instead of Object for user-controlled keys. + +--- + +### Fix 24: Subagent Isolation +**File:** `tools/delegate_tool.py` +Implement filesystem namespace isolation. + +--- + +### Fix 25: Secure Session IDs +**File:** `gateway/session.py` +Use secrets.token_urlsafe(32) instead of uuid4. + +--- + +### Fix 26: Binary Integrity Checks +**File:** `tools/tirith_security.py` +Require GPG signature verification. + +--- + +### Fix 27: Debug Output Redaction +**File:** `tools/debug_helpers.py` +Apply redact_sensitive_text to all debug output. + +--- + +### Fix 28: Security Headers +**File:** `gateway/platforms/api_server.py` +Add: +```python +"Content-Security-Policy": "default-src 'self'", +"Strict-Transport-Security": "max-age=31536000", +``` + +--- + +### Fix 29: Version Information Minimization +**File:** Version endpoints +Return minimal version information publicly. + +--- + +### Fix 30: Dead Code Removal +**File:** Multiple +Remove unused imports and functions. + +--- + +### Fix 31: Token Encryption at Rest +**File:** `hermes_cli/auth.py` +Use OS keychain or encrypt auth.json. + +--- + +### Fix 32: Input Length Validation +**File:** All tool entry points +Add MAX_INPUT_LENGTH checks everywhere. 
+ +--- + +## IMPLEMENTATION VERIFICATION + +### Testing Requirements +- [ ] All fixes have unit tests +- [ ] Security regression tests pass +- [ ] Fuzzing shows no new vulnerabilities +- [ ] Penetration test completed +- [ ] Code review by security team + +### Sign-off Required +- [ ] Security Team Lead +- [ ] Engineering Manager +- [ ] QA Lead +- [ ] DevOps Lead + +--- + +**Last Updated:** March 30, 2026 +**Next Review:** After all P0/P1 fixes completed diff --git a/hermes-sovereign/docs/SECURITY_MITIGATION_ROADMAP.md b/hermes-sovereign/docs/SECURITY_MITIGATION_ROADMAP.md new file mode 100644 index 00000000..df275ce9 --- /dev/null +++ b/hermes-sovereign/docs/SECURITY_MITIGATION_ROADMAP.md @@ -0,0 +1,359 @@ +# SECURITY MITIGATION ROADMAP + +## Hermes Agent Security Remediation Plan +**Version:** 1.0 +**Date:** March 30, 2026 +**Status:** Draft for Implementation + +--- + +## EXECUTIVE SUMMARY + +This roadmap provides a structured approach to addressing the 32 security vulnerabilities identified in the comprehensive security audit. The plan is organized into four phases, prioritizing fixes by risk and impact. 
+ +--- + +## PHASE 1: CRITICAL FIXES (Week 1-2) +**Target:** Eliminate all CVSS 9.0+ vulnerabilities + +### 1.1 Remove shell=True Subprocess Calls (V-001) +**Owner:** Security Team Lead +**Estimated Effort:** 16 hours +**Priority:** P0 + +#### Tasks: +- [ ] Audit all subprocess calls in codebase +- [ ] Replace shell=True with argument lists +- [ ] Implement shlex.quote for necessary string interpolation +- [ ] Add input validation wrappers + +#### Files to Modify: +- `tools/terminal_tool.py` +- `tools/file_operations.py` +- `tools/environments/docker.py` +- `tools/environments/modal.py` +- `tools/environments/ssh.py` +- `tools/environments/singularity.py` + +#### Testing: +- [ ] Unit tests for all command execution paths +- [ ] Fuzzing with malicious inputs +- [ ] Penetration testing + +--- + +### 1.2 Implement Strict Path Sandboxing (V-002) +**Owner:** Security Team Lead +**Estimated Effort:** 12 hours +**Priority:** P0 + +#### Tasks: +- [ ] Create PathValidator class +- [ ] Implement canonical path resolution +- [ ] Add path traversal detection +- [ ] Enforce sandbox root boundaries + +#### Implementation: +```python +class PathValidator: + def __init__(self, sandbox_root: Path): + self.sandbox_root = sandbox_root.resolve() + + def validate(self, user_path: str) -> Path: + expanded = Path(user_path).expanduser().resolve() + if not str(expanded).startswith(str(self.sandbox_root)): + raise SecurityError("Path outside sandbox") + return expanded +``` + +#### Files to Modify: +- `tools/file_operations.py` +- `tools/file_tools.py` +- All environment implementations + +--- + +### 1.3 Fix Secret Leakage in Child Processes (V-003) +**Owner:** Security Engineer +**Estimated Effort:** 8 hours +**Priority:** P0 + +#### Tasks: +- [ ] Create environment variable whitelist +- [ ] Implement secret detection patterns +- [ ] Add env var scrubbing for child processes +- [ ] Audit credential file mounting + +#### Whitelist Approach: +```python +_ALLOWED_ENV_VARS = frozenset([ + 
"PATH", "HOME", "USER", "LANG", "LC_ALL", + "TERM", "SHELL", "PWD", "OLDPWD", + "PYTHONPATH", "PYTHONHOME", "PYTHONNOUSERSITE", + "DISPLAY", "XDG_SESSION_TYPE", # GUI apps +]) + +def sanitize_environment(): + return {k: v for k, v in os.environ.items() + if k in _ALLOWED_ENV_VARS} +``` + +--- + +### 1.4 Add Connection-Level URL Validation (V-005) +**Owner:** Security Engineer +**Estimated Effort:** 8 hours +**Priority:** P0 + +#### Tasks: +- [ ] Implement egress proxy option +- [ ] Add connection-level IP validation +- [ ] Validate redirect targets +- [ ] Block private IP ranges at socket level + +--- + +## PHASE 2: HIGH PRIORITY (Week 3-4) +**Target:** Address all CVSS 7.0-8.9 vulnerabilities + +### 2.1 Implement Input Validation Framework (V-006, V-007) +**Owner:** Senior Developer +**Estimated Effort:** 20 hours +**Priority:** P1 + +#### Tasks: +- [ ] Create Pydantic models for all tool inputs +- [ ] Implement length validation +- [ ] Add character allowlisting +- [ ] Create validation decorators + +--- + +### 2.2 Fix CORS Configuration (V-008) +**Owner:** Backend Developer +**Estimated Effort:** 4 hours +**Priority:** P1 + +#### Changes: +- Remove wildcard support when credentials enabled +- Implement strict origin validation +- Add origin allowlist configuration + +--- + +### 2.3 Fix Authentication Bypass (V-009) +**Owner:** Backend Developer +**Estimated Effort:** 4 hours +**Priority:** P1 + +#### Changes: +```python +# Fail-closed default +if not self._api_key: + logger.error("API server requires authentication") + return web.json_response( + {"error": "Authentication required"}, + status=401 + ) +``` + +--- + +### 2.4 Fix OAuth State Validation (V-014) +**Owner:** Security Engineer +**Estimated Effort:** 6 hours +**Priority:** P1 + +#### Tasks: +- Store state parameter in session +- Cryptographically verify callback state +- Implement state expiration + +--- + +### 2.5 Add Rate Limiting (V-016) +**Owner:** Backend Developer +**Estimated Effort:** 10 hours 
+**Priority:** P1 + +#### Implementation: +- Per-IP rate limiting: 100 requests/minute +- Per-user rate limiting: 1000 requests/hour +- Endpoint-specific limits +- Sliding window algorithm + +--- + +### 2.6 Secure Credential Storage (V-019, V-031) +**Owner:** Security Engineer +**Estimated Effort:** 12 hours +**Priority:** P1 + +#### Tasks: +- Implement OS keychain integration +- Add file encryption at rest +- Implement secure key derivation +- Add access audit logging + +--- + +## PHASE 3: MEDIUM PRIORITY (Month 2) +**Target:** Address CVSS 4.0-6.9 vulnerabilities + +### 3.1 Expand Dangerous Command Patterns (V-018) +**Owner:** Security Engineer +**Estimated Effort:** 6 hours +**Priority:** P2 + +#### Add Patterns: +- More encoding variants (base64, hex, unicode) +- Alternative shell syntaxes +- Indirect command execution +- Environment variable abuse + +--- + +### 3.2 Add AST-Based Skill Scanning (V-011) +**Owner:** Security Engineer +**Estimated Effort:** 16 hours +**Priority:** P2 + +#### Implementation: +- Parse Python code to AST +- Detect dangerous function calls +- Analyze import statements +- Check for obfuscation patterns + +--- + +### 3.3 Implement Subagent Isolation (V-024) +**Owner:** Senior Developer +**Estimated Effort:** 20 hours +**Priority:** P2 + +#### Tasks: +- Create isolated filesystem per subagent +- Implement network namespace isolation +- Add resource limits +- Implement subagent-to-subagent communication restrictions + +--- + +### 3.4 Add Comprehensive Audit Logging (V-013, V-020, V-027) +**Owner:** DevOps Engineer +**Estimated Effort:** 12 hours +**Priority:** P2 + +#### Requirements: +- Log all tool invocations +- Log all authentication events +- Log configuration changes +- Implement log integrity protection +- Add SIEM integration hooks + +--- + +## PHASE 4: LONG-TERM IMPROVEMENTS (Month 3+) + +### 4.1 Security Headers Hardening (V-028) +**Owner:** Backend Developer +**Estimated Effort:** 4 hours + +Add headers: +- 
Content-Security-Policy +- Strict-Transport-Security +- X-Frame-Options +- X-XSS-Protection + +--- + +### 4.2 Code Signing Verification (V-026) +**Owner:** Security Engineer +**Estimated Effort:** 8 hours + +- Require GPG signatures for binaries +- Implement signature verification +- Pin trusted signing keys + +--- + +### 4.3 Supply Chain Security +**Owner:** DevOps Engineer +**Estimated Effort:** 12 hours + +- Implement dependency scanning +- Add SLSA compliance +- Use private package registry +- Implement SBOM generation + +--- + +### 4.4 Automated Security Testing +**Owner:** QA Lead +**Estimated Effort:** 16 hours + +- Integrate SAST tools (Semgrep, Bandit) +- Add DAST to CI/CD +- Implement fuzzing +- Add security regression tests + +--- + +## IMPLEMENTATION TRACKING + +| Week | Deliverables | Owner | Status | +|------|-------------|-------|--------| +| 1 | P0 Fixes: V-001, V-002 | Security Team | ⏳ Planned | +| 1 | P0 Fixes: V-003, V-005 | Security Team | ⏳ Planned | +| 2 | P0 Testing & Validation | QA Team | ⏳ Planned | +| 3 | P1 Fixes: V-006 through V-010 | Dev Team | ⏳ Planned | +| 3 | P1 Fixes: V-014, V-016 | Dev Team | ⏳ Planned | +| 4 | P1 Testing & Documentation | QA/Doc Team | ⏳ Planned | +| 5-8 | P2 Fixes Implementation | Dev Team | ⏳ Planned | +| 9-12 | P3/P4 Long-term Improvements | All Teams | ⏳ Planned | + +--- + +## SUCCESS METRICS + +### Security Metrics +- [ ] Zero CVSS 9.0+ vulnerabilities +- [ ] < 5 CVSS 7.0-8.9 vulnerabilities +- [ ] 100% of subprocess calls without shell=True +- [ ] 100% path validation coverage +- [ ] 100% input validation on tool entry points + +### Compliance Metrics +- [ ] OWASP Top 10 compliance +- [ ] CWE coverage > 90% +- [ ] Security test coverage > 80% + +--- + +## RISK ACCEPTANCE + +| Vulnerability | Risk | Justification | Approver | +|--------------|------|---------------|----------| +| V-029 (Version Info) | Low | Required for debugging | TBD | +| V-030 (Dead Code) | Low | Cleanup in next refactor | TBD | + +--- 
+ +## APPENDIX: TOOLS AND RESOURCES + +### Recommended Security Tools +1. **SAST:** Semgrep, Bandit, Pylint-security +2. **DAST:** OWASP ZAP, Burp Suite +3. **Dependency:** Safety, Snyk, Dependabot +4. **Secrets:** GitLeaks, TruffleHog +5. **Fuzzing:** Atheris, Hypothesis + +### Training Resources +- OWASP Top 10 for Python +- Secure Coding in Python (SANS) +- AWS Security Best Practices + +--- + +**Document Owner:** Security Team +**Review Cycle:** Monthly during remediation, Quarterly post-completion diff --git a/hermes-sovereign/docs/TEST_ANALYSIS_REPORT.md b/hermes-sovereign/docs/TEST_ANALYSIS_REPORT.md new file mode 100644 index 00000000..2eff5b68 --- /dev/null +++ b/hermes-sovereign/docs/TEST_ANALYSIS_REPORT.md @@ -0,0 +1,509 @@ +# Hermes Agent - Testing Infrastructure Deep Analysis + +## Executive Summary + +The hermes-agent project has a **comprehensive test suite** with **373 test files** containing approximately **4,300+ test functions**. The tests are organized into 10 subdirectories covering all major components. + +--- + +## 1. Test Suite Structure & Statistics + +### 1.1 Directory Breakdown + +| Directory | Test Files | Focus Area | +|-----------|------------|------------| +| `tests/tools/` | 86 | Tool implementations, file operations, environments | +| `tests/gateway/` | 96 | Platform integrations (Discord, Telegram, Slack, etc.) 
| +| `tests/hermes_cli/` | 48 | CLI commands, configuration, setup flows | +| `tests/agent/` | 16 | Core agent logic, prompt building, model adapters | +| `tests/integration/` | 8 | End-to-end integration tests | +| `tests/acp/` | 8 | Agent Communication Protocol | +| `tests/cron/` | 3 | Cron job scheduling | +| `tests/skills/` | 5 | Skill management | +| `tests/honcho_integration/` | 5 | Honcho memory integration | +| `tests/fakes/` | 2 | Test fixtures and fake servers | +| **Total** | **373** | **~4,311 test functions** | + +### 1.2 Test Classification + +**Unit Tests:** ~95% (3,600+) +**Integration Tests:** ~5% (marked with `@pytest.mark.integration`) +**Async Tests:** ~679 tests use `@pytest.mark.asyncio` + +### 1.3 Largest Test Files (by line count) + +1. `tests/test_run_agent.py` - 3,329 lines (212 tests) - Core agent logic +2. `tests/tools/test_mcp_tool.py` - 2,902 lines (147 tests) - MCP protocol +3. `tests/gateway/test_voice_command.py` - 2,632 lines - Voice features +4. `tests/gateway/test_feishu.py` - 2,580 lines - Feishu platform +5. `tests/gateway/test_api_server.py` - 1,503 lines - API server + +--- + +## 2. 
Coverage Heat Map - Critical Gaps Identified + +### 2.1 NO TEST COVERAGE (Red Zone) + +#### Agent Module Gaps: +- `agent/copilot_acp_client.py` - Copilot integration (0 tests) +- `agent/gemini_adapter.py` - Google Gemini model support (0 tests) +- `agent/knowledge_ingester.py` - Knowledge ingestion (0 tests) +- `agent/meta_reasoning.py` - Meta-reasoning capabilities (0 tests) +- `agent/skill_utils.py` - Skill utilities (0 tests) +- `agent/trajectory.py` - Trajectory management (0 tests) + +#### Tools Module Gaps: +- `tools/browser_tool.py` - Browser automation (0 tests) +- `tools/code_execution_tool.py` - Code execution (0 tests) +- `tools/gitea_client.py` - Gitea integration (0 tests) +- `tools/image_generation_tool.py` - Image generation (0 tests) +- `tools/neutts_synth.py` - Neural TTS (0 tests) +- `tools/openrouter_client.py` - OpenRouter API (0 tests) +- `tools/session_search_tool.py` - Session search (0 tests) +- `tools/terminal_tool.py` - Terminal operations (0 tests) +- `tools/tts_tool.py` - Text-to-speech (0 tests) +- `tools/web_tools.py` - Web tools core (0 tests) + +#### Gateway Module Gaps: +- `gateway/run.py` - Gateway runner (0 tests) +- `gateway/stream_consumer.py` - Stream consumption (0 tests) + +#### Root-Level Gaps: +- `hermes_constants.py` - Constants (0 tests) +- `hermes_time.py` - Time utilities (0 tests) +- `mini_swe_runner.py` - SWE runner (0 tests) +- `rl_cli.py` - RL CLI (0 tests) +- `utils.py` - Utilities (0 tests) + +### 2.2 LIMITED COVERAGE (Yellow Zone) + +- `agent/models_dev.py` - Only 19 tests for complex model routing +- `agent/smart_model_routing.py` - Only 6 tests +- `tools/approval.py` - 2 test files but complex logic +- `tools/skills_guard.py` - Security-critical, needs more coverage + +### 2.3 GOOD COVERAGE (Green Zone) + +- `agent/anthropic_adapter.py` - 97 tests (comprehensive) +- `agent/prompt_builder.py` - 108 tests (excellent) +- `tools/mcp_tool.py` - 147 tests (very comprehensive) +- `tools/file_tools.py` - Multiple test 
files +- `gateway/discord.py` - 11 test files covering various aspects +- `gateway/telegram.py` - 10 test files +- `gateway/session.py` - 15 test files + +--- + +## 3. Test Patterns Analysis + +### 3.1 Fixtures Architecture + +**Global Fixtures (`conftest.py`):** +- `_isolate_hermes_home` - Isolates HERMES_HOME to temp directory (autouse) +- `_ensure_current_event_loop` - Event loop management for sync tests (autouse) +- `_enforce_test_timeout` - 30-second timeout per test (autouse) +- `tmp_dir` - Temporary directory fixture +- `mock_config` - Minimal hermes config for unit tests + +**Common Patterns:** +```python +# Isolation pattern +@pytest.fixture(autouse=True) +def isolate_env(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + +# Mock client pattern +@pytest.fixture +def mock_agent(): + with patch("run_agent.OpenAI") as mock: + yield mock +``` + +### 3.2 Mock Usage Statistics + +- **~12,468 mock/patch usages** across the test suite +- Heavy use of `unittest.mock.patch` and `MagicMock` +- `AsyncMock` used for async function mocking +- `SimpleNamespace` for creating mock API response objects + +### 3.3 Test Organization Patterns + +**Class-Based Organization:** +- 1,532 test classes identified +- Grouped by functionality: `Test` +- Example: `TestSanitizeApiMessages`, `TestContextPressureFlags` + +**Function-Based Organization:** +- Used for simpler test files +- Naming: `test__` + +### 3.4 Async Test Patterns + +```python +@pytest.mark.asyncio +async def test_async_function(): + result = await async_function() + assert result == expected +``` + +--- + +## 4. 20 New Test Recommendations (Priority Order) + +### Critical Priority (Security/Risk) + +1. **Browser Tool Security Tests** (`tools/browser_tool.py`) + - Test sandbox escape prevention + - Test malicious script blocking + - Test content security policy enforcement + +2. 
**Code Execution Sandbox Tests** (`tools/code_execution_tool.py`) + - Test resource limits (CPU, memory) + - Test dangerous import blocking + - Test timeout enforcement + - Test filesystem access restrictions + +3. **Terminal Tool Safety Tests** (`tools/terminal_tool.py`) + - Test dangerous command blocking + - Test command injection prevention + - Test environment variable sanitization + +4. **OpenRouter Client Tests** (`tools/openrouter_client.py`) + - Test API key handling + - Test rate limit handling + - Test error response parsing + +### High Priority (Core Functionality) + +5. **Gemini Adapter Tests** (`agent/gemini_adapter.py`) + - Test message format conversion + - Test tool call normalization + - Test streaming response handling + +6. **Copilot ACP Client Tests** (`agent/copilot_acp_client.py`) + - Test authentication flow + - Test session management + - Test message passing + +7. **Knowledge Ingester Tests** (`agent/knowledge_ingester.py`) + - Test document parsing + - Test embedding generation + - Test knowledge retrieval + +8. **Stream Consumer Tests** (`gateway/stream_consumer.py`) + - Test backpressure handling + - Test reconnection logic + - Test message ordering guarantees + +### Medium Priority (Integration/Features) + +9. **Web Tools Core Tests** (`tools/web_tools.py`) + - Test search result parsing + - Test content extraction + - Test error handling for unavailable services + +10. **Image Generation Tool Tests** (`tools/image_generation_tool.py`) + - Test prompt filtering + - Test image format handling + - Test provider failover + +11. **Gitea Client Tests** (`tools/gitea_client.py`) + - Test repository operations + - Test webhook handling + - Test authentication + +12. **Session Search Tool Tests** (`tools/session_search_tool.py`) + - Test query parsing + - Test result ranking + - Test pagination + +13. 
**Meta Reasoning Tests** (`agent/meta_reasoning.py`) + - Test strategy selection + - Test reflection generation + - Test learning from failures + +14. **TTS Tool Tests** (`tools/tts_tool.py`) + - Test voice selection + - Test audio format conversion + - Test streaming playback + +15. **Neural TTS Tests** (`tools/neutts_synth.py`) + - Test voice cloning safety + - Test audio quality validation + - Test resource cleanup + +### Lower Priority (Utilities) + +16. **Hermes Constants Tests** (`hermes_constants.py`) + - Test constant values + - Test environment-specific overrides + +17. **Time Utilities Tests** (`hermes_time.py`) + - Test timezone handling + - Test formatting functions + +18. **Utils Module Tests** (`utils.py`) + - Test helper functions + - Test validation utilities + +19. **Mini SWE Runner Tests** (`mini_swe_runner.py`) + - Test repository setup + - Test test execution + - Test result parsing + +20. **RL CLI Tests** (`rl_cli.py`) + - Test training command parsing + - Test configuration validation + - Test checkpoint handling + +--- + +## 5. Test Optimization Opportunities + +### 5.1 Performance Issues Identified + +**Large Test Files (Split Recommended):** +- `tests/test_run_agent.py` (3,329 lines) → Split into multiple files +- `tests/tools/test_mcp_tool.py` (2,902 lines) → Split by MCP feature +- `tests/test_anthropic_adapter.py` (1,219 lines) → Consider splitting + +**Potential Slow Tests:** +- Integration tests with real API calls +- Tests with file I/O operations +- Tests with subprocess spawning + +### 5.2 Optimization Recommendations + +1. **Parallel Execution Already Configured** + - `pytest-xdist` with `-n auto` in CI + - Maintains isolation through fixtures + +2. **Fixture Scope Optimization** + - Review `autouse=True` fixtures for necessity + - Consider session-scoped fixtures for expensive setup + +3. **Mock External Services** + - Some integration tests still hit real APIs + - Create more fakes like `fake_ha_server.py` + +4. 
**Test Data Management** + - Use factory pattern for test data generation + - Share test fixtures across related tests + +### 5.3 CI/CD Optimizations + +Current CI (`.github/workflows/tests.yml`): +- Uses `uv` for fast dependency installation +- Runs with `-n auto` for parallelization +- Ignores integration tests by default +- 10-minute timeout + +**Recommended Improvements:** +1. Add test duration reporting (`--durations=10`) +2. Add coverage reporting +3. Separate fast unit tests from slower integration tests +4. Add flaky test retry mechanism + +--- + +## 6. Missing Integration Test Scenarios + +### 6.1 Cross-Component Integration + +1. **End-to-End Agent Flow** + - User message → Gateway → Agent → Tools → Response + - Test with real (mocked) LLM responses + +2. **Multi-Platform Gateway** + - Message routing between platforms + - Session persistence across platforms + +3. **Tool + Environment Integration** + - Terminal tool with different backends (local, docker, modal) + - File operations with permission checks + +4. **Skill Lifecycle Integration** + - Skill installation → Registration → Execution → Update → Removal + +5. **Memory + Honcho Integration** + - Memory storage → Retrieval → Context injection + +### 6.2 Failure Scenario Integration Tests + +1. **LLM Provider Failover** + - Primary provider down → Fallback provider + - Rate limiting handling + +2. **Gateway Reconnection** + - Platform disconnect → Reconnect → Resume session + +3. **Tool Execution Failures** + - Tool timeout → Retry → Fallback + - Tool error → Error handling → User notification + +4. **Checkpoint Recovery** + - Crash during batch → Resume from checkpoint + - Corrupted checkpoint handling + +### 6.3 Security Integration Tests + +1. **Prompt Injection Across Stack** + - Gateway input → Agent processing → Tool execution + +2. **Permission Escalation Prevention** + - User permissions → Tool allowlist → Execution + +3. 
**Data Leak Prevention** + - Memory storage → Context building → Response generation + +--- + +## 7. Performance Test Strategy + +### 7.1 Load Testing Requirements + +1. **Gateway Load Tests** + - Concurrent session handling + - Message throughput per platform + - Memory usage under load + +2. **Agent Response Time Tests** + - End-to-end latency benchmarks + - Tool execution time budgets + - Context building performance + +3. **Resource Utilization Tests** + - Memory leaks in long-running sessions + - File descriptor limits + - CPU usage patterns + +### 7.2 Benchmark Framework + +```python +# Proposed performance test structure +class TestGatewayPerformance: + @pytest.mark.benchmark + def test_message_throughput(self, benchmark): + # Measure messages processed per second + pass + + @pytest.mark.benchmark + def test_session_creation_latency(self, benchmark): + # Measure session setup time + pass +``` + +### 7.3 Performance Regression Detection + +1. **Baseline Establishment** + - Record baseline metrics for critical paths + - Store in version control + +2. **Automated Comparison** + - Compare PR performance against baseline + - Fail if degradation > 10% + +3. **Metrics to Track** + - Test suite execution time + - Memory peak usage + - Individual test durations + +--- + +## 8. 
Test Infrastructure Improvements + +### 8.1 Coverage Tooling + +**Missing:** Code coverage reporting +**Recommendation:** Add `pytest-cov` to dev dependencies + +```toml +[project.optional-dependencies] +dev = [ + "pytest>=9.0.2,<10", + "pytest-asyncio>=1.3.0,<2", + "pytest-xdist>=3.0,<4", + "pytest-cov>=5.0,<6", # Add this + "mcp>=1.2.0,<2" +] +``` + +### 8.2 Test Categories + +Add more pytest markers for selective test running: + +```python +# In pytest.ini or pyproject.toml +markers = [ + "integration: marks tests requiring external services", + "slow: marks slow tests (>5s)", + "security: marks security-focused tests", + "benchmark: marks performance benchmark tests", + "flakey: marks tests that may be unstable", +] +``` + +### 8.3 Test Data Factory + +Create centralized test data factories: + +```python +# tests/factories.py +class AgentFactory: + @staticmethod + def create_mock_agent(tools=None): + # Return configured mock agent + pass + +class MessageFactory: + @staticmethod + def create_user_message(content): + # Return formatted user message + pass +``` + +--- + +## 9. Summary & Action Items + +### Immediate Actions (High Impact) + +1. **Add coverage reporting** to CI pipeline +2. **Create tests for uncovered security-critical modules:** + - `tools/code_execution_tool.py` + - `tools/browser_tool.py` + - `tools/terminal_tool.py` +3. **Split oversized test files** for better maintainability +4. **Add Gemini adapter tests** (increasingly important provider) + +### Short-term (1-2 Sprints) + +5. Create integration tests for cross-component flows +6. Add performance benchmarks for critical paths +7. Expand OpenRouter client test coverage +8. Add knowledge ingester tests + +### Long-term (Quarter) + +9. Achieve 80% code coverage across all modules +10. Implement performance regression testing +11. Create comprehensive security test suite +12. 
Document testing patterns and best practices
+
+---
+
+## Appendix: Test File Size Distribution
+
+| Lines | Count | Category |
+|-------|-------|----------|
+| 0-100 | ~50 | Simple unit tests |
+| 100-500 | ~200 | Standard test files |
+| 500-1000 | ~80 | Complex feature tests |
+| 1000-2000 | ~30 | Large test suites |
+| 2000+ | ~13 | Monolithic test files (needs splitting) |
+
+---
+
+*Analysis generated: March 30, 2026*
+*Total test files analyzed: 373*
+*Estimated test functions: ~4,311*
diff --git a/hermes-sovereign/docs/TEST_OPTIMIZATION_GUIDE.md b/hermes-sovereign/docs/TEST_OPTIMIZATION_GUIDE.md
new file mode 100644
index 00000000..a5bc6394
--- /dev/null
+++ b/hermes-sovereign/docs/TEST_OPTIMIZATION_GUIDE.md
@@ -0,0 +1,364 @@
+# Test Optimization Guide for Hermes Agent
+
+## Current Test Execution Analysis
+
+### Test Suite Statistics
+- **Total Test Files:** 373
+- **Estimated Test Functions:** ~4,311
+- **Async Tests:** ~679 (15.8%)
+- **Integration Tests:** 7 files (excluded from CI)
+- **Average Tests per File:** ~11.6
+
+### Current CI Configuration
+```yaml
+# .github/workflows/tests.yml
+- name: Run tests
+  run: |
+    source .venv/bin/activate
+    python -m pytest tests/ -q --ignore=tests/integration --tb=short -n auto
+```
+
+**Current Flags:**
+- `-q`: Quiet mode
+- `--ignore=tests/integration`: Skip integration tests
+- `--tb=short`: Short traceback format
+- `-n auto`: Auto-detect parallel workers
+
+---
+
+## Optimization Recommendations
+
+### 1. Add Test Duration Reporting
+
+**Current:** No duration tracking
+**Recommended:**
+```yaml
+run: |
+  # --durations=20 shows the 20 slowest tests;
+  # --durations-min=1.0 limits the report to tests taking >1s.
+  # (Keep comments off continuation lines — text after a trailing
+  # backslash breaks the shell line continuation.)
+  python -m pytest tests/ \
+    --ignore=tests/integration \
+    -n auto \
+    --durations=20 \
+    --durations-min=1.0
+```
+
+This will help identify slow tests that need optimization.
+
+### 2. 
Implement Test Categories
+
+Add markers to `pyproject.toml`:
+```toml
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+markers = [
+    "integration: marks tests requiring external services",
+    "slow: marks tests that take >5 seconds",
+    "unit: marks fast unit tests",
+    "security: marks security-focused tests",
+    "flaky: marks tests that may be unstable",
+]
+addopts = "-m 'not integration and not slow' -n auto"
+```
+
+**Usage:**
+```bash
+# Run only fast unit tests
+pytest -m unit
+
+# Run all tests including slow ones
+pytest -m "not integration"
+
+# Run only security tests
+pytest -m security
+```
+
+### 3. Optimize Slow Test Candidates
+
+Based on file sizes, these tests likely need optimization:
+
+| File | Lines | Optimization Strategy |
+|------|-------|----------------------|
+| `test_run_agent.py` | 3,329 | Split into multiple files by feature |
+| `test_mcp_tool.py` | 2,902 | Split by MCP functionality |
+| `test_voice_command.py` | 2,632 | Review for redundant tests |
+| `test_feishu.py` | 2,580 | Mock external API calls |
+| `test_api_server.py` | 1,503 | Parallelize independent tests |
+
+### 4. Add Coverage Reporting to CI
+
+**Updated workflow:**
+```yaml
+- name: Run tests with coverage
+  run: |
+    source .venv/bin/activate
+    python -m pytest tests/ \
+      --ignore=tests/integration \
+      -n auto \
+      --cov=agent --cov=tools --cov=gateway --cov=hermes_cli \
+      --cov-report=xml \
+      --cov-report=html \
+      --cov-fail-under=70
+
+- name: Upload coverage to Codecov
+  uses: codecov/codecov-action@v3
+  with:
+    files: ./coverage.xml
+    fail_ci_if_error: true
+```
+
+### 5. 
Implement Flaky Test Handling

Add `pytest-rerunfailures`:
```toml
dev = [
    "pytest>=9.0.2,<10",
    "pytest-asyncio>=1.3.0,<2",
    "pytest-xdist>=3.0,<4",
    "pytest-cov>=5.0,<6",
    "pytest-rerunfailures>=14.0,<15",  # Add this
]
```

**Usage:**
```python
# Mark known flaky tests — pytest-rerunfailures registers the `flaky` marker
@pytest.mark.flaky(reruns=3, reruns_delay=1)
async def test_network_dependent_feature():
    # Test that sometimes fails due to network
    pass
```

### 6. Optimize Fixture Scopes

Review `conftest.py` fixtures:

```python
# Current: Function scope (runs for every test)
@pytest.fixture()
def mock_config():
    return {...}

# Optimized: Session scope (runs once per session)
@pytest.fixture(scope="session")
def mock_config():
    return {...}

# Optimized: Module scope (runs once per module)
@pytest.fixture(scope="module")
def expensive_setup():
    # Setup that can be reused across module
    pass
```

### 7. Parallel Execution Tuning

**Current:** `-n auto` (uses all CPUs)
**Issues:**
- May cause resource contention
- Some tests may not be thread-safe

**Recommendations:**
```bash
# Limit workers to prevent resource exhaustion
pytest -n 4  # Use 4 workers regardless of CPU count

# Use load-based scheduling for uneven test durations
pytest -n auto --dist=load

# Group tests by module to reduce setup overhead
pytest -n auto --dist=loadscope
```

### 8. 
Test Data Management + +**Current Issue:** Tests may create files in `/tmp` without cleanup + +**Solution - Factory Pattern:** +```python +# tests/factories.py +import tempfile +import shutil +from contextlib import contextmanager + +@contextmanager +def temp_workspace(): + """Create isolated temp directory for tests.""" + path = tempfile.mkdtemp(prefix="hermes_test_") + try: + yield Path(path) + finally: + shutil.rmtree(path, ignore_errors=True) + +# Usage in tests +def test_file_operations(): + with temp_workspace() as tmp: + # All file operations in isolated directory + file_path = tmp / "test.txt" + file_path.write_text("content") + assert file_path.exists() + # Automatically cleaned up +``` + +### 9. Database/State Isolation + +**Current:** Uses `monkeypatch` for env vars +**Enhancement:** Database mocking + +```python +@pytest.fixture +def mock_honcho(): + """Mock Honcho client for tests.""" + with patch("honcho_integration.client.HonchoClient") as mock: + mock_instance = MagicMock() + mock_instance.get_session.return_value = {"id": "test-session"} + mock.return_value = mock_instance + yield mock + +# Usage +async def test_memory_storage(mock_honcho): + # Fast, isolated test + pass +``` + +### 10. CI Pipeline Optimization + +**Current Pipeline:** +1. Checkout +2. Install uv +3. Install Python +4. Install deps +5. 
Run tests + +**Optimized Pipeline (with caching):** +```yaml +jobs: + test: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + version: "0.5.x" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' # Cache pip dependencies + + - name: Cache uv packages + uses: actions/cache@v4 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/pyproject.toml') }} + + - name: Install dependencies + run: | + uv venv .venv + uv pip install -e ".[all,dev]" + + - name: Run fast tests + run: | + source .venv/bin/activate + pytest -m "not integration and not slow" -n auto --tb=short + + - name: Run slow tests + if: github.event_name == 'pull_request' + run: | + source .venv/bin/activate + pytest -m "slow" -n 2 --tb=short +``` + +--- + +## Quick Wins (Implement First) + +### 1. Add Duration Reporting (5 minutes) +```yaml +--durations=10 +``` + +### 2. Mark Slow Tests (30 minutes) +Add `@pytest.mark.slow` to tests taking >5s. + +### 3. Split Largest Test File (2 hours) +Split `test_run_agent.py` into: +- `test_run_agent_core.py` +- `test_run_agent_tools.py` +- `test_run_agent_memory.py` +- `test_run_agent_messaging.py` + +### 4. Add Coverage Baseline (1 hour) +```bash +pytest --cov=agent --cov=tools --cov=gateway tests/ --cov-report=html +``` + +### 5. Optimize Fixture Scopes (1 hour) +Review and optimize 5 most-used fixtures. 
+ +--- + +## Long-term Improvements + +### Test Data Generation +```python +# Implement hypothesis-based testing +from hypothesis import given, strategies as st + +@given(st.lists(st.text(), min_size=1)) +def test_message_batching(messages): + # Property-based testing + pass +``` + +### Performance Regression Testing +```python +@pytest.mark.benchmark +def test_message_processing_speed(benchmark): + result = benchmark(process_messages, sample_data) + assert result.throughput > 1000 # msgs/sec +``` + +### Contract Testing +```python +# Verify API contracts between components +@pytest.mark.contract +def test_agent_tool_contract(): + """Verify agent sends correct format to tools.""" + pass +``` + +--- + +## Measurement Checklist + +After implementing optimizations, verify: + +- [ ] Test suite execution time < 5 minutes +- [ ] No individual test > 10 seconds (except integration) +- [ ] Code coverage > 70% +- [ ] All flaky tests marked and retried +- [ ] CI passes consistently (>95% success rate) +- [ ] Memory usage stable (no leaks in test suite) + +--- + +## Tools to Add + +```toml +[project.optional-dependencies] +dev = [ + "pytest>=9.0.2,<10", + "pytest-asyncio>=1.3.0,<2", + "pytest-xdist>=3.0,<4", + "pytest-cov>=5.0,<6", + "pytest-rerunfailures>=14.0,<15", + "pytest-benchmark>=4.0,<5", # Performance testing + "pytest-mock>=3.12,<4", # Enhanced mocking + "hypothesis>=6.100,<7", # Property-based testing + "factory-boy>=3.3,<4", # Test data factories +] +``` diff --git a/hermes-sovereign/docs/V-006_FIX_SUMMARY.md b/hermes-sovereign/docs/V-006_FIX_SUMMARY.md new file mode 100644 index 00000000..e82f1817 --- /dev/null +++ b/hermes-sovereign/docs/V-006_FIX_SUMMARY.md @@ -0,0 +1,73 @@ +# V-006 MCP OAuth Deserialization Vulnerability Fix + +## Summary +Fixed the critical V-006 vulnerability (CVSS 8.8) in MCP OAuth handling that used insecure deserialization, potentially enabling remote code execution. + +## Changes Made + +### 1. 
Secure OAuth State Serialization (`tools/mcp_oauth.py`) +- **Replaced pickle with JSON**: OAuth state is now serialized using JSON instead of `pickle.loads()`, eliminating the RCE vector +- **Added HMAC-SHA256 signatures**: All state data is cryptographically signed to prevent tampering +- **Implemented secure deserialization**: `SecureOAuthState.deserialize()` validates structure, signature, and expiration +- **Added constant-time comparison**: Token validation uses `secrets.compare_digest()` to prevent timing attacks + +### 2. Token Storage Security Enhancements +- **JSON Schema Validation**: Token data is validated against strict schemas before use +- **HMAC Signing**: Stored tokens are signed with HMAC-SHA256 to detect file tampering +- **Strict Type Checking**: All token fields are type-validated +- **File Permissions**: Token directory created with 0o700, files with 0o600 + +### 3. Security Features +- **Nonce-based replay protection**: Each state has a unique nonce tracked by the state manager +- **10-minute expiration**: States automatically expire after 600 seconds +- **CSRF protection**: State validation prevents cross-site request forgery +- **Environment-based keys**: Supports `HERMES_OAUTH_SECRET` and `HERMES_TOKEN_STORAGE_SECRET` env vars + +### 4. 
Comprehensive Security Tests (`tests/test_oauth_state_security.py`) +54 security tests covering: +- Serialization/deserialization roundtrips +- Tampering detection (data and signature) +- Schema validation for tokens and client info +- Replay attack prevention +- CSRF attack prevention +- MITM attack detection +- Pickle payload rejection +- Performance tests + +## Files Modified +- `tools/mcp_oauth.py` - Complete rewrite with secure state handling +- `tests/test_oauth_state_security.py` - New comprehensive security test suite + +## Security Verification +```bash +# Run security tests +python tests/test_oauth_state_security.py + +# All 54 tests pass: +# - TestSecureOAuthState: 20 tests +# - TestOAuthStateManager: 10 tests +# - TestSchemaValidation: 8 tests +# - TestTokenStorageSecurity: 6 tests +# - TestNoPickleUsage: 2 tests +# - TestSecretKeyManagement: 3 tests +# - TestOAuthFlowIntegration: 3 tests +# - TestPerformance: 2 tests +``` + +## API Changes (Backwards Compatible) +- `SecureOAuthState` - New class for secure state handling +- `OAuthStateManager` - New class for state lifecycle management +- `HermesTokenStorage` - Enhanced with schema validation and signing +- `OAuthStateError` - New exception for security violations + +## Deployment Notes +1. Existing token files will be invalidated (no signature) - users will need to re-authenticate +2. New secret key will be auto-generated in `~/.hermes/.secrets/` +3. 
Environment variables can override key locations: + - `HERMES_OAUTH_SECRET` - For state signing + - `HERMES_TOKEN_STORAGE_SECRET` - For token storage signing + +## References +- Security Audit: V-006 Insecure Deserialization in MCP OAuth +- CWE-502: Deserialization of Untrusted Data +- CWE-20: Improper Input Validation diff --git a/hermes-sovereign/docs/agent_core_analysis.md b/hermes-sovereign/docs/agent_core_analysis.md new file mode 100644 index 00000000..a2df8636 --- /dev/null +++ b/hermes-sovereign/docs/agent_core_analysis.md @@ -0,0 +1,466 @@ +# Deep Analysis: Agent Core (run_agent.py + agent/*.py) + +## Executive Summary + +The AIAgent class is a sophisticated conversation orchestrator (~8500 lines) with multi-provider support, parallel tool execution, context compression, and robust error handling. This analysis covers the state machine, retry logic, context management, optimizations, and potential issues. + +--- + +## 1. State Machine Diagram of Conversation Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ AIAgent Conversation State Machine │ +└─────────────────────────────────────────────────────────────────────────────────┘ + +┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ ┌─────────────┐ +│ START │────▶│ INIT │────▶│ BUILD_SYSTEM │────▶│ USER │ +│ │ │ (config) │ │ _PROMPT │ │ INPUT │ +└─────────────┘ └─────────────┘ └─────────────────┘ └──────┬──────┘ + │ + ┌──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ ┌─────────────┐ +│ API_CALL │◄────│ PREPARE │◄────│ HONCHO_PREFETCH│◄────│ COMPRESS? 
│ +│ (stream) │ │ _MESSAGES │ │ (context) │ │ (threshold)│ +└──────┬──────┘ └─────────────┘ └─────────────────┘ └─────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ API Response Handler │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ STOP │ │ TOOL_CALLS │ │ LENGTH │ │ ERROR │ │ +│ │ (finish) │ │ (execute) │ │ (truncate) │ │ (retry) │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ RETURN │ │ EXECUTE │ │ CONTINUATION│ │ FALLBACK/ │ │ +│ │ RESPONSE │ │ TOOLS │ │ REQUEST │ │ COMPRESS │ │ +│ │ │ │ (parallel/ │ │ │ │ │ │ +│ │ │ │ sequential) │ │ │ │ │ │ +│ └─────────────┘ └──────┬──────┘ └─────────────┘ └─────────────┘ │ +│ │ │ +│ └─────────────────────────────────┐ │ +│ ▼ │ +│ ┌─────────────────┐ │ +│ │ APPEND_RESULTS │──────────┘ +│ │ (loop back) │ +│ └─────────────────┘ +└─────────────────────────────────────────────────────────────────────────────────┘ + +Key States: +─────────── +1. INIT: Agent initialization, client setup, tool loading +2. BUILD_SYSTEM_PROMPT: Cached system prompt assembly with skills/memory +3. USER_INPUT: Message injection with Honcho turn context +4. COMPRESS?: Context threshold check (50% default) +5. API_CALL: Streaming/non-streaming LLM request +6. TOOL_EXECUTION: Parallel (safe) or sequential (interactive) tool calls +7. FALLBACK: Provider failover on errors +8. 
RETURN: Final response with metadata + +Transitions: +──────────── +- INTERRUPT: Any state → immediate cleanup → RETURN +- MAX_ITERATIONS: API_CALL → RETURN (budget exhausted) +- 413/CONTEXT_ERROR: API_CALL → COMPRESS → retry +- 401/429: API_CALL → FALLBACK → retry +``` + +### Sub-State: Tool Execution + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Tool Execution Flow │ +└─────────────────────────────────────────────────────────────┘ + +┌─────────────────┐ +│ RECEIVE_BATCH │ +└────────┬────────┘ + │ + ┌────┴────┐ + │ Parallel?│ + └────┬────┘ + YES / \ NO + / \ + ▼ ▼ +┌─────────┐ ┌─────────┐ +│CONCURRENT│ │SEQUENTIAL│ +│(ThreadPool│ │(for loop)│ +│ max=8) │ │ │ +└────┬────┘ └────┬────┘ + │ │ + ▼ ▼ +┌─────────┐ ┌─────────┐ +│ _invoke_│ │ _invoke_│ +│ _tool() │ │ _tool() │ (per tool) +│ (workers)│ │ │ +└────┬────┘ └────┬────┘ + │ │ + └────────────┘ + │ + ▼ + ┌───────────────┐ + │ CHECKPOINT? │ (write_file/patch/terminal) + └───────┬───────┘ + │ + ▼ + ┌───────────────┐ + │ BUDGET_WARNING│ (inject if >70% iterations) + └───────┬───────┘ + │ + ▼ + ┌───────────────┐ + │ APPEND_TO_MSGS│ + └───────────────┘ +``` + +--- + +## 2. All Retry/Fallback Logic Identified + +### 2.1 API Call Retry Loop (lines 6420-7351) + +```python +# Primary retry configuration +max_retries = 3 +retry_count = 0 + +# Retryable errors (with backoff): +- Timeout errors (httpx.ReadTimeout, ConnectTimeout, PoolTimeout) +- Connection errors (ConnectError, RemoteProtocolError, ConnectionError) +- SSE connection drops ("connection lost", "network error") +- Rate limits (429) - with Retry-After header respect + +# Backoff strategy: +wait_time = min(2 ** retry_count, 60) # 2s, 4s, 8s max 60s +# Rate limits: use Retry-After header (capped at 120s) +``` + +### 2.2 Streaming Retry Logic (lines 4157-4268) + +```python +_max_stream_retries = int(os.getenv("HERMES_STREAM_RETRIES", 2)) + +# Streaming-specific fallbacks: +1. 
Streaming fails after partial delivery → NO retry (partial content shown) +2. Streaming fails BEFORE delivery → fallback to non-streaming +3. Stale stream detection (>180s, scaled to 300s for >100K tokens) → kill connection +``` + +### 2.3 Provider Fallback Chain (lines 4334-4443) + +```python +# Fallback chain from config (fallback_model / fallback_providers) +self._fallback_chain = [...] # List of {provider, model} dicts +self._fallback_index = 0 # Current position in chain + +# Trigger conditions: +- max_retries exhausted +- Rate limit (429) with fallback available +- Non-retryable 4xx error (401, 403, 404, 422) +- Empty/malformed response after retries + +# Fallback activation: +_try_activate_fallback() → swaps client, model, base_url in-place +``` + +### 2.4 Context Length Error Handling (lines 6998-7164) + +```python +# 413 Payload Too Large: +max_compression_attempts = 3 +# Compress context and retry + +# Context length exceeded: +CONTEXT_PROBE_TIERS = [128_000, 64_000, 32_000, 16_000, 8_000] +# Step down through tiers on error +``` + +### 2.5 Authentication Refresh Retry (lines 6904-6950) + +```python +# Codex OAuth (401): +codex_auth_retry_attempted = False # Once per request +_try_refresh_codex_client_credentials() + +# Nous Portal (401): +nous_auth_retry_attempted = False +_try_refresh_nous_client_credentials() + +# Anthropic (401): +anthropic_auth_retry_attempted = False +_try_refresh_anthropic_client_credentials() +``` + +### 2.6 Length Continuation Retry (lines 6639-6765) + +```python +# Response truncated (finish_reason='length'): +length_continue_retries = 0 +max_continuation_retries = 3 + +# Request continuation with prompt: +"[System: Your previous response was truncated... Continue exactly where you left off]" +``` + +### 2.7 Tool Call Validation Retries (lines 7400-7500) + +```python +# Invalid tool name: 3 repair attempts +# 1. Lowercase +# 2. Normalize (hyphens/spaces to underscores) +# 3. 
Fuzzy match (difflib, cutoff=0.7) + +# Invalid JSON arguments: 3 retries +# Empty content after think blocks: 3 retries +# Incomplete scratchpad: 3 retries +``` + +--- + +## 3. Context Window Management Analysis + +### 3.1 Multi-Layer Context System + +``` +┌────────────────────────────────────────────────────────────────────────┐ +│ Context Architecture │ +├────────────────────────────────────────────────────────────────────────┤ +│ Layer 1: System Prompt (cached per session) │ +│ - SOUL.md or DEFAULT_AGENT_IDENTITY │ +│ - Memory blocks (MEMORY.md, USER.md) │ +│ - Skills index │ +│ - Context files (AGENTS.md, .cursorrules) │ +│ - Timestamp, platform hints │ +│ - ~2K-10K tokens typical │ +├────────────────────────────────────────────────────────────────────────┤ +│ Layer 2: Conversation History │ +│ - User/assistant/tool messages │ +│ - Protected head (first 3 messages) │ +│ - Protected tail (last N messages by token budget) │ +│ - Compressible middle section │ +├────────────────────────────────────────────────────────────────────────┤ +│ Layer 3: Tool Definitions │ +│ - ~20-30K tokens with many tools │ +│ - Filtered by enabled/disabled toolsets │ +├────────────────────────────────────────────────────────────────────────┤ +│ Layer 4: Ephemeral Context (API call only) │ +│ - Prefill messages │ +│ - Honcho turn context │ +│ - Plugin context │ +│ - Ephemeral system prompt │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 ContextCompressor Algorithm (agent/context_compressor.py) + +```python +# Configuration: +threshold_percent = 0.50 # Compress at 50% of context length +protect_first_n = 3 # Head protection +protect_last_n = 20 # Tail protection (message count fallback) +tail_token_budget = 20_000 # Tail protection (token budget) +summary_target_ratio = 0.20 # 20% of compressed content for summary + +# Compression phases: +1. Prune old tool results (cheap pre-pass) +2. Determine boundaries (head + tail protection) +3. 
Generate structured summary via LLM +4. Sanitize tool_call/tool_result pairs +5. Assemble compressed message list + +# Iterative summary updates: +_previous_summary = None # Stored for next compression +``` + +### 3.3 Context Length Detection Hierarchy + +```python +# Detection priority (model_metadata.py): +1. Config override (config.yaml model.context_length) +2. Custom provider config (custom_providers[].models[].context_length) +3. models.dev registry lookup +4. OpenRouter API metadata +5. Endpoint /models probe (local servers) +6. Hardcoded DEFAULT_CONTEXT_LENGTHS +7. Context probing (trial-and-error tiers) +8. DEFAULT_FALLBACK_CONTEXT (128K) +``` + +### 3.4 Prompt Caching (Anthropic) + +```python +# System-and-3 strategy: +# - 4 cache_control breakpoints max +# - System prompt (stable) +# - Last 3 non-system messages (rolling window) +# - 5m or 1h TTL + +# Activation conditions: +_is_openrouter_url() and "claude" in model.lower() +# OR native Anthropic endpoint +``` + +### 3.5 Context Pressure Monitoring + +```python +# User-facing warnings (not injected to LLM): +_context_pressure_warned = False + +# Thresholds: +_budget_caution_threshold = 0.7 # 70% - nudge to wrap up +_budget_warning_threshold = 0.9 # 90% - urgent + +# Injection method: +# Added to last tool result JSON as _budget_warning field +``` + +--- + +## 4. 
Ten Performance Optimization Opportunities + +### 4.1 Tool Call Deduplication (Missing) +**Current**: No deduplication of identical tool calls within a batch +**Impact**: Redundant API calls, wasted tokens +**Fix**: Add `_deduplicate_tool_calls()` before execution (already implemented but only for delegate_task) + +### 4.2 Context Compression Frequency +**Current**: Compress only at threshold crossing +**Impact**: Sudden latency spike during compression +**Fix**: Background compression prediction + prefetch + +### 4.3 Skills Prompt Cache Invalidation +**Current**: LRU cache keyed by (skills_dir, tools, toolsets) +**Issue**: External skill file changes may not invalidate cache +**Fix**: Add file watcher or mtime check before cache hit + +### 4.4 Streaming Response Buffering +**Current**: Accumulates all deltas in memory +**Impact**: Memory bloat for long responses +**Fix**: Stream directly to output with minimal buffering + +### 4.5 Tool Result Truncation Timing +**Current**: Truncates after tool execution completes +**Impact**: Wasted time on tools returning huge outputs +**Fix**: Streaming truncation during tool execution + +### 4.6 Concurrent Tool Execution Limits +**Current**: Fixed _MAX_TOOL_WORKERS = 8 +**Issue**: Not tuned by available CPU/memory +**Fix**: Dynamic worker count based on system resources + +### 4.7 API Client Connection Pooling +**Current**: Creates new client per interruptible request +**Issue**: Connection overhead +**Fix**: Connection pool with proper cleanup + +### 4.8 Model Metadata Cache TTL +**Current**: 1 hour fixed TTL for OpenRouter metadata +**Issue**: Stale pricing/context data +**Fix**: Adaptive TTL based on error rates + +### 4.9 Honcho Context Prefetch +**Current**: Prefetch queued at turn end, consumed next turn +**Issue**: First turn has no prefetch +**Fix**: Pre-warm cache on session creation + +### 4.10 Session DB Write Batching +**Current**: Per-message writes to SQLite +**Impact**: I/O overhead +**Fix**: Batch writes with 
periodic flush + +--- + +## 5. Five Potential Race Conditions or Bugs + +### 5.1 Interrupt Propagation Race (HIGH SEVERITY) +**Location**: run_agent.py lines 2253-2259 + +```python +with self._active_children_lock: + children_copy = list(self._active_children) +for child in children_copy: + child.interrupt(message) # Child may be gone +``` + +**Issue**: Child agent may be removed from `_active_children` between copy and iteration +**Fix**: Check if child still exists in list before calling interrupt + +### 5.2 Concurrent Tool Execution Order +**Location**: run_agent.py lines 5308-5478 + +```python +# Results collected in order, but execution is concurrent +results = [None] * num_tools +def _run_tool(index, ...): + results[index] = (function_name, ..., result, ...) +``` + +**Issue**: If tool A depends on tool B's side effects, concurrent execution may fail +**Fix**: Document that parallel tools must be independent; add dependency tracking + +### 5.3 Session DB Concurrent Access +**Location**: run_agent.py lines 1716-1755 + +```python +if not self._session_db: + return +# ... multiple DB operations without transaction +``` + +**Issue**: Gateway creates multiple AIAgent instances; SQLite may lock +**Fix**: Add proper transaction wrapping and retry logic + +### 5.4 Context Compressor State Mutation +**Location**: agent/context_compressor.py lines 545-677 + +```python +messages, pruned_count = self._prune_old_tool_results(messages, ...) 
+# messages is modified copy, but original may be referenced elsewhere +``` + +**Issue**: Deep copy is shallow for nested structures; tool_calls may be shared +**Fix**: Ensure deep copy of entire message structure + +### 5.5 Tool Call ID Collision +**Location**: run_agent.py lines 2910-2954 + +```python +def _derive_responses_function_call_id(self, call_id, response_item_id): + # Multiple derivations may collide + return f"fc_{sanitized[:48]}" +``` + +**Issue**: Truncated IDs may collide in long conversations +**Fix**: Use full UUIDs or ensure uniqueness with counter + +--- + +## Appendix: Key Files and Responsibilities + +| File | Lines | Responsibility | +|------|-------|----------------| +| run_agent.py | ~8500 | Main AIAgent class, conversation loop | +| agent/prompt_builder.py | ~816 | System prompt assembly, skills indexing | +| agent/context_compressor.py | ~676 | Context compression, summarization | +| agent/auxiliary_client.py | ~1822 | Side-task LLM client routing | +| agent/model_metadata.py | ~930 | Context length detection, pricing | +| agent/display.py | ~771 | CLI feedback, spinners | +| agent/prompt_caching.py | ~72 | Anthropic cache control | +| agent/trajectory.py | ~56 | Trajectory format conversion | +| agent/models_dev.py | ~172 | models.dev registry integration | + +--- + +## Summary Statistics + +- **Total Core Code**: ~13,000 lines +- **State Machine States**: 8 primary, 4 sub-states +- **Retry Mechanisms**: 7 distinct types +- **Context Layers**: 4 layers with compression +- **Potential Issues**: 5 identified (1 high severity) +- **Optimization Opportunities**: 10 identified diff --git a/hermes-sovereign/docs/fleet-sitrep-2026-04-06.md b/hermes-sovereign/docs/fleet-sitrep-2026-04-06.md new file mode 100644 index 00000000..4fab4827 --- /dev/null +++ b/hermes-sovereign/docs/fleet-sitrep-2026-04-06.md @@ -0,0 +1,132 @@ +# Fleet SITREP — April 6, 2026 + +**Classification:** Consolidated Status Report +**Compiled by:** Ezra +**Acknowledged by:** 
Claude (Issue #143) + +--- + +## Executive Summary + +Allegro executed 7 tasks across infrastructure, contracting, audits, and security. Ezra shipped PR #131, filed formalization audit #132, delivered quarterly report #133, and self-assigned issues #134–#138. All wizard activity mapped below. + +--- + +## 1. Allegro 7-Task Report + +| Task | Description | Status | +|------|-------------|--------| +| 1 | Roll Call / Infrastructure Map | ✅ Complete | +| 2 | Dark industrial anthem (140 BPM, Suno-ready) | ✅ Complete | +| 3 | Operation Get A Job — 7-file contracting playbook pushed to `the-nexus` | ✅ Complete | +| 4 | Formalization audit filed ([the-nexus #893](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/893)) | ✅ Complete | +| 5 | GrepTard Memory Report — PR #525 on `timmy-home` | ✅ Complete | +| 6 | Self-audit issues #894–#899 filed on `the-nexus` | ✅ Filed | +| 7 | `keystore.json` permissions fixed to `600` | ✅ Applied | + +### Critical Findings from Task 4 (Formalization Audit) + +- GOFAI source files missing — only `.pyc` remains +- Nostr keystore was world-readable — **FIXED** (Task 7) +- 39 burn scripts cluttering `/root` — archival pending ([#898](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/898)) + +--- + +## 2. 
Ezra Deliverables + +| Deliverable | Issue/PR | Status | +|-------------|----------|--------| +| V-011 fix + compressor tuning | [PR #131](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/pulls/131) | ✅ Merged | +| Formalization audit (hermes-agent) | [Issue #132](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/132) | Filed | +| Quarterly report (MD + PDF) | [Issue #133](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/133) | Filed | +| Burn-mode concurrent tool tests | [Issue #134](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/134) | Assigned → Ezra | +| MCP SDK migration | [Issue #135](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/135) | Assigned → Ezra | +| APScheduler migration | [Issue #136](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/136) | Assigned → Ezra | +| Pydantic-settings migration | [Issue #137](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/137) | Assigned → Ezra | +| Contracting playbook tracker | [Issue #138](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/138) | Assigned → Ezra | + +--- + +## 3. Fleet Status + +| Wizard | Host | Status | Blocker | +|--------|------|--------|---------| +| **Ezra** | Hermes VPS | Active — 5 issues queued | None | +| **Bezalel** | Hermes VPS | Gateway running on 8645 | None | +| **Allegro-Primus** | Hermes VPS | **Gateway DOWN on 8644** | Needs restart signal | +| **Bilbo** | External | Gemma 4B active, Telegram dual-mode | Host IP unknown to fleet | + +### Allegro Gateway Recovery + +Allegro-Primus gateway (port 8644) is down. Options: +1. **Alexander restarts manually** on Hermes VPS +2. **Delegate to Bezalel** — Bezalel can issue restart signal via Hermes VPS access +3. **Delegate to Ezra** — Ezra can coordinate restart as part of issue #894 work + +--- + +## 4. 
Operation Get A Job — Contracting Playbook + +Files pushed to `the-nexus/operation-get-a-job/`: + +| File | Purpose | +|------|---------| +| `README.md` | Master plan | +| `entity-setup.md` | Wyoming LLC, Mercury, E&O insurance | +| `service-offerings.md` | Rates $150–600/hr; packages $5k/$15k/$40k+ | +| `portfolio.md` | Portfolio structure | +| `outreach-templates.md` | Cold email templates | +| `proposal-template.md` | Client proposal structure | +| `rate-card.md` | Rate card | + +**Human-only mile (Alexander's action items):** + +1. Pick LLC name from `entity-setup.md` +2. File Wyoming LLC via Northwest Registered Agent ($225) +3. Get EIN from IRS (free, ~10 min) +4. Open Mercury account (requires EIN + LLC docs) +5. Secure E&O insurance (~$150–250/month) +6. Restart Allegro-Primus gateway (port 8644) +7. Update LinkedIn using profile template +8. Send 5 cold emails using outreach templates + +--- + +## 5. Pending Self-Audit Issues (the-nexus) + +| Issue | Title | Priority | +|-------|-------|----------| +| [#894](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/894) | Deploy burn-mode cron jobs | CRITICAL | +| [#895](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/895) | Telegram thread-based reporting | Normal | +| [#896](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/896) | Retry logic and error recovery | Normal | +| [#897](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/897) | Automate morning reports at 0600 | Normal | +| [#898](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/898) | Archive 39 burn scripts | Normal | +| [#899](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/899) | Keystore permissions | ✅ Done | + +--- + +## 6. 
Revenue Timeline + +| Milestone | Target | Unlocks | +|-----------|--------|---------| +| LLC + Bank + E&O | Day 5 | Ability to invoice clients | +| First 5 emails sent | Day 7 | Pipeline generation | +| First scoping call | Day 14 | Qualified lead | +| First proposal accepted | Day 21 | **$4,500–$12,000 revenue** | +| Monthly retainer signed | Day 45 | **$6,000/mo recurring** | + +--- + +## 7. Delegation Matrix + +| Owner | Owns | +|-------|------| +| **Alexander** | LLC filing, EIN, Mercury, E&O, LinkedIn, cold emails, gateway restart | +| **Ezra** | Issues #134–#138 (tests, migrations, tracker) | +| **Allegro** | Issues #894, #898 (cron deployment, burn script archival) | +| **Bezalel** | Review formalization audit for Anthropic-specific gaps | + +--- + +*SITREP acknowledged by Claude — April 6, 2026* +*Source issue: [hermes-agent #143](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/143)* diff --git a/hermes-sovereign/docs/gateway_analysis_report.md b/hermes-sovereign/docs/gateway_analysis_report.md new file mode 100644 index 00000000..a1a7e99c --- /dev/null +++ b/hermes-sovereign/docs/gateway_analysis_report.md @@ -0,0 +1,542 @@ +# Hermes Gateway System - Deep Analysis Report + +## Executive Summary + +This report provides an exhaustive analysis of the Hermes messaging gateway system, which serves as the unified interface between the AI agent and 15+ messaging platforms. The gateway handles message routing, session management, platform abstraction, and cross-platform delivery. + +--- + +## 1. 
Message Flow Diagram for All Platforms + +### 1.1 Inbound Message Flow (Universal Pattern) + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ EXTERNAL MESSAGING PLATFORM │ +│ (Telegram/Discord/Slack/WhatsApp/Signal/Matrix/Mattermost/Email/SMS/etc) │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PLATFORM-SPECIFIC TRANSPORT LAYER │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ WebSocket │ │ Long Poll │ │ Webhook │ │ HTTP REST + SSE │ │ +│ │ (Discord) │ │ (Telegram) │ │ (Generic) │ │ (Signal/HA) │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PLATFORM ADAPTER (BasePlatformAdapter) │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 1. Authentication/Validation (token verification, HMAC checks) │ │ +│ │ 2. Message Parsing (extract text, media, metadata) │ │ +│ │ 3. Source Building (SessionSource: chat_id, user_id, platform) │ │ +│ │ 4. Media Caching (images/audio/documents → local filesystem) │ │ +│ │ 5. 
Deduplication (message ID tracking, TTL caches) │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MESSAGEEVENT CREATION │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ MessageEvent { │ │ +│ │ text: str, # Extracted message text │ │ +│ │ message_type: MessageType, # TEXT/PHOTO/VOICE/DOCUMENT/etc │ │ +│ │ source: SessionSource, # Platform + chat + user context │ │ +│ │ media_urls: List[str], # Cached attachment paths │ │ +│ │ message_id: str, # Platform message ID │ │ +│ │ reply_to_message_id: str, # Thread/reply context │ │ +│ │ timestamp: datetime, # Message time │ │ +│ │ raw_message: Any, # Platform-specific payload │ │ +│ │ } │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ GATEWAY RUNNER (run.py) │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 1. Authorization Check (_is_user_authorized) │ │ +│ │ - Check allowlists (user-specific, group-specific) │ │ +│ │ - Check pairing mode (first-user-wins, admin-only) │ │ +│ │ - Validate group policies │ │ +│ │ 2. Session Resolution/Creation (_get_or_create_session) │ │ +│ │ 3. Command Processing (/reset, /status, /stop, etc.) │ │ +│ │ 4. 
Agent Invocation (_process_message_with_agent) │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ AI AGENT PROCESSING │ +│ (Agent Loop with Tool Calling) │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 1.2 Outbound Message Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ AI AGENT RESPONSE │ +│ (Text + Media + Tool Results) │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ RESPONSE PROCESSING │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 1. Format Message (platform-specific markdown conversion) │ │ +│ │ 2. Truncate if needed (respect platform limits) │ │ +│ │ 3. Media Handling (upload to platform if needed) │ │ +│ │ 4. Thread Context (reply_to_message_id, thread_id) │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PLATFORM ADAPTER SEND METHOD │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ send(chat_id, content, reply_to, metadata) -> SendResult │ │ +│ │ ├── Telegram: Bot API (HTTP POST to sendMessage) │ │ +│ │ ├── Discord: discord.py (channel.send()) │ │ +│ │ ├── Slack: slack_bolt (chat.postMessage) │ │ +│ │ ├── Matrix: matrix-nio (room_send) │ │ +│ │ ├── Signal: signal-cli HTTP RPC │ │ +│ │ ├── WhatsApp: Bridge HTTP POST to Node.js process │ │ +│ │ └── ... 
(15+ platforms) │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ DELIVERY CONFIRMATION │ +│ (SendResult: success/error/message_id) │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 1.3 Platform-Specific Transport Architectures + +| Platform | Transport | Connection Model | Authentication | +|----------|-----------|------------------|----------------| +| Telegram | HTTP Long Polling / Webhook | Persistent HTTP | Bot Token | +| Discord | WebSocket (Gateway) | Persistent WS | Bot Token | +| Slack | Socket Mode (WebSocket) | Persistent WS | Bot Token + App Token | +| WhatsApp | HTTP Bridge (Local) | Child Process + HTTP | Session-based | +| Signal | HTTP + SSE | HTTP Stream | signal-cli daemon | +| Matrix | HTTP + Sync Loop | Polling with long-poll | Access Token | +| Mattermost | WebSocket | Persistent WS | Bot Token | +| Email | IMAP + SMTP | Polling (IMAP) | Username/Password | +| SMS (Twilio) | HTTP Webhook | Inbound HTTP + REST outbound | Account SID + Auth Token | +| DingTalk | WebSocket (Stream) | Persistent WS | Client ID + Secret | +| Feishu | WebSocket / Webhook | WS or HTTP | App ID + Secret | +| WeCom | WebSocket | Persistent WS | Bot ID + Secret | +| Home Assistant | WebSocket | Persistent WS | Long-lived Token | +| Webhook | HTTP Server | Inbound HTTP | HMAC Signature | +| API Server | HTTP Server | Inbound HTTP | API Key | + +--- + +## 2. 
Session Lifecycle Analysis + +### 2.1 Session State Model + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SESSION STATE MACHINE │ +└─────────────────────────────────────────────────────────────────────────────┘ + + ┌──────────┐ + │ START │ + └────┬─────┘ + │ + ▼ + ┌────────────────────────────────────────────────────────────────────┐ + │ SESSION CREATION │ + │ ┌──────────────────────────────────────────────────────────────┐ │ + │ │ 1. Generate session_id (UUID) │ │ + │ │ 2. Create SessionSource (platform, chat_id, user_id, ...) │ │ + │ │ 3. Initialize memory (Honcho/UserRepo) │ │ + │ │ 4. Set creation timestamp │ │ + │ │ 5. Initialize environment (worktree, tools, skills) │ │ + │ └──────────────────────────────────────────────────────────────┘ │ + └────────────────────────────────────────────────────────────────────┘ + │ + ▼ + ┌────────────────────────────────────────────────────────────────────┐ + │ ACTIVE STATE │ + │ ┌──────────────────────────────────────────────────────────────┐ │ + │ │ SESSION OPERATIONS: │ │ + │ │ ├── Message Processing (handle_message) │ │ + │ │ ├── Tool Execution (terminal, file ops, browser, etc.) 
│ │ + │ │ ├── Memory Storage/Retrieval (context building) │ │ + │ │ ├── Checkpoint Creation (state snapshots) │ │ + │ │ └── Delivery Routing (responses to multiple platforms) │ │ + │ │ │ │ + │ │ LIFECYCLE EVENTS: │ │ + │ │ ├── /reset - Clear session state, keep identity │ │ + │ │ ├── /stop - Interrupt current operation │ │ + │ │ ├── /title - Rename session │ │ + │ │ ├── Checkpoint/Resume - Save/restore execution state │ │ + │ │ └── Background task completion (cron jobs, delegations) │ │ + │ └──────────────────────────────────────────────────────────────┘ │ + └────────────────────────────────────────────────────────────────────┘ + │ + ├── Idle Timeout ────────┐ + │ ▼ + ┌────┴───────────────────────────────────────────────────────────────┐ + │ SESSION PERSISTENCE │ + │ ┌──────────────────────────────────────────────────────────────┐ │ + │ │ Save to: │ │ + │ │ ├── SQLite (session metadata) │ │ + │ │ ├── Honcho (conversation history) │ │ + │ │ ├── Filesystem (checkpoints, outputs) │ │ + │ │ └── Platform (message history for context) │ │ + │ └──────────────────────────────────────────────────────────────┘ │ + └────────────────────────────────────────────────────────────────────┘ + │ + ├── Explicit Close / Error / Timeout + │ + ▼ + ┌────────────────────────────────────────────────────────────────────┐ + │ SESSION TERMINATION │ + │ ┌──────────────────────────────────────────────────────────────┐ │ + │ │ Cleanup Actions: │ │ + │ │ ├── Flush memory to persistent store │ │ + │ │ ├── Cancel pending tasks │ │ + │ │ ├── Close environment resources │ │ + │ │ ├── Remove from active sessions map │ │ + │ │ └── Notify user (if graceful) │ │ + │ └──────────────────────────────────────────────────────────────┘ │ + └────────────────────────────────────────────────────────────────────┘ +``` + +### 2.2 Session Data Model + +```python +SessionSource: + platform: Platform # TELEGRAM, DISCORD, SLACK, etc. 
+ chat_id: str # Platform-specific chat/channel ID + chat_name: Optional[str] # Display name + chat_type: str # "dm" | "group" | "channel" + user_id: str # User identifier (platform-specific) + user_name: Optional[str] # Display name + user_id_alt: Optional[str] # Alternative ID (e.g., Matrix MXID) + thread_id: Optional[str] # Thread/topic ID + message_id: Optional[str] # Specific message ID (for replies) + +SessionMetadata: + session_id: str # UUID + created_at: datetime + last_activity: datetime + agent_id: Optional[str] # Honcho agent ID + session_title: Optional[str] + +ActiveSession: + source: SessionSource + metadata: SessionMetadata + memory: HonchoClient # Conversation storage + environment: Optional[str] # Active execution environment +``` + +### 2.3 Session Persistence Strategy + +| Layer | Storage | TTL/Policy | Purpose | +|-------|---------|------------|---------| +| In-Memory | Dict[str, ActiveSession] | Gateway lifetime | Fast access to active sessions | +| SQLite | `~/.hermes/sessions.db` | Persistent | Session metadata, checkpoints | +| Honcho API | Cloud/self-hosted | Persistent | Conversation history, user memory | +| Filesystem | `~/.hermes/checkpoints/` | User-managed | Execution state snapshots | +| Platform | Message history | Platform-dependent | Context window reconstruction | + +--- + +## 3. 
Platform Adapter Comparison Matrix + +### 3.1 Feature Matrix + +| Feature | Telegram | Discord | Slack | Matrix | Signal | WhatsApp | Mattermost | Email | SMS | +|---------|----------|---------|-------|--------|--------|----------|------------|-------|-----| +| **Message Types** | +| Text | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Images | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| Documents | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| Voice/Audio | ✅ | ✅ | ⚠️ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| Video | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| Stickers | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Threading** | +| Thread Support | ✅ (topics) | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ (refs) | ❌ | +| Reply Chains | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | +| **Advanced** | +| Typing Indicators | ✅ | ✅ | ⚠️ | ✅ | ⚠️ | ❌ | ✅ | ❌ | ❌ | +| Message Edit | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | +| Message Delete | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | +| Reactions | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | +| Slash Commands | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| **Security** | +| E2EE Available | ❌ | ❌ | ❌ | ✅ | ✅ | ⚠️ | ❌ | ✅ (TLS) | ❌ | +| Self-hosted | ❌ | ❌ | ⚠️ | ✅ | ⚠️ | ❌ | ✅ | ⚠️ | ❌ | +| **Scale** | +| Max Message | 4096 | 2000 | 40000 | 4000 | 8000 | 65536 | 4000 | 50000 | 1600 | +| Rate Limits | High | Medium | Medium | Low | Low | Low | High | Medium | Low | + +### 3.2 Implementation Complexity + +| Platform | Lines of Code | Dependencies | Setup Complexity | Maintenance | +|----------|---------------|--------------|------------------|-------------| +| Telegram | ~2100 | python-telegram-bot | Low | Low | +| Discord | ~2300 | discord.py + opus | Medium | Medium | +| Slack | ~970 | slack-bolt | Medium | Low | +| Matrix | ~1050 | matrix-nio | High | Medium | +| Signal | ~800 | httpx (only) | High | Low | +| WhatsApp | ~800 | Node.js bridge | High | High | +| Mattermost | ~720 | aiohttp | Low | Low | +| Email | ~620 | stdlib (imaplib/smtplib) | Low | Low | +| SMS | ~280 | aiohttp | Low | Low | +| DingTalk | ~340 
| dingtalk-stream | Low | Low | +| Feishu | ~3250 | lark-oapi | High | Medium | +| WeCom | ~1330 | aiohttp + httpx | Medium | Medium | +| Home Assistant | ~450 | aiohttp | Low | Low | +| Webhook | ~620 | aiohttp | Low | Low | +| API Server | ~1320 | aiohttp | Low | Low | + +### 3.3 Protocol Implementation Patterns + +| Platform | Connection Pattern | Message Ingestion | Message Delivery | +|----------|-------------------|-------------------|------------------| +| Telegram | Polling/Webhook | Update processing loop | HTTP POST | +| Discord | Gateway WebSocket | Event dispatch | Gateway send | +| Slack | Socket Mode WS | Event handlers | Web API | +| Matrix | Sync loop (HTTP long-poll) | Event callbacks | Room send API | +| Signal | SSE stream | Async iterator | JSON-RPC HTTP | +| WhatsApp | Local HTTP bridge | Polling endpoint | HTTP POST | +| Mattermost | WebSocket | Event loop | REST API | +| Email | IMAP IDLE/polling | UID tracking | SMTP | +| SMS | HTTP webhook | POST handler | REST API | + +--- + +## 4. Ten Scalability Recommendations + +### 4.1 Horizontal Scaling + +**R1. Implement Gateway Sharding** +- Current: Single-process gateway with per-platform adapters +- Problem: Memory/CPU limits as session count grows +- Solution: Implement consistent hashing by chat_id to route messages to gateway shards +- Implementation: Use Redis for session state, allow multiple gateway instances behind load balancer + +**R2. Async Connection Pooling** +- Current: Each adapter manages its own connections +- Problem: Connection explosion with high concurrency +- Solution: Implement shared connection pools for HTTP-based platforms (Telegram, Slack, Matrix) +- Implementation: Use aiohttp/httpx connection pooling with configurable limits + +### 4.2 Message Processing + +**R3. 
Implement Message Queue Backpressure** +- Current: Direct adapter → agent invocation +- Problem: Agent overload during message bursts +- Solution: Add per-session message queues with prioritization +- Implementation: Use asyncio.PriorityQueue, drop old messages if queue exceeds limit + +**R4. Batch Processing for Similar Requests** +- Current: Each message triggers individual agent runs +- Problem: Redundant processing for similar queries +- Solution: Implement request deduplication and batching window +- Implementation: 100ms batching window, group similar requests, shared LLM inference + +### 4.3 Session Management + +**R5. Session Tiering with LRU Eviction** +- Current: All sessions kept in memory +- Problem: Memory exhaustion with many concurrent sessions +- Solution: Implement hot/warm/cold session tiers +- Implementation: Active (in-memory), Idle (Redis), Archived (DB) with automatic promotion + +**R6. Streaming Response Handling** +- Current: Full response buffering before platform send +- Problem: Delayed first-byte delivery, memory pressure for large responses +- Solution: Stream chunks to platforms as they're generated +- Implementation: Generator-based response handling, platform-specific chunking + +### 4.4 Platform Optimization + +**R7. Adaptive Polling Intervals** +- Current: Fixed polling intervals (Telegram, Email) +- Problem: Wasted API calls during low activity, latency during high activity +- Solution: Implement adaptive backoff based on message frequency +- Implementation: Exponential backoff to min interval, jitter, reset on activity + +**R8. Platform-Specific Rate Limiters** +- Current: Generic rate limiting +- Problem: Platform-specific limits cause throttling errors +- Solution: Implement per-platform token bucket rate limiters +- Implementation: Separate rate limiters per platform with platform-specific limits + +### 4.5 Infrastructure + +**R9. 
Distributed Checkpoint Storage** +- Current: Local filesystem checkpoints +- Problem: Single point of failure, not shareable across instances +- Solution: Pluggable checkpoint backends (S3, Redis, NFS) +- Implementation: Abstract checkpoint interface, async uploads + +**R10. Observability and Auto-scaling** +- Current: Basic logging, no metrics +- Problem: No visibility into bottlenecks, manual scaling +- Solution: Implement comprehensive metrics and auto-scaling triggers +- Implementation: Prometheus metrics (sessions, messages, latency), HPA based on queue depth + +--- + +## 5. Security Audit for Each Platform + +### 5.1 Authentication & Authorization + +| Platform | Token Storage | Token Rotation | Scope Validation | Vulnerabilities | +|----------|---------------|----------------|------------------|-----------------| +| Telegram | Environment | Manual | Bot-level | Token in env, shared across instances | +| Discord | Environment | Manual | Bot-level | Token in env, privileged intents needed | +| Slack | Environment + OAuth file | Auto (OAuth) | App-level | App token exposure risk | +| Matrix | Environment | Manual | User-level | Access token long-lived | +| Signal | Environment | N/A (daemon) | Account-level | No E2EE for bot messages | +| WhatsApp | Session files | Auto | Account-level | QR code interception risk | +| Mattermost | Environment | Manual | Bot-level | Token in env | +| Email | Environment | App passwords | Account-level | Password in env, IMAP/SMTP plain auth | +| SMS | Environment | N/A | Account-level | Credentials in env | +| DingTalk | Environment | Auto | App-level | Client secret in env | +| Feishu | Environment | Auto | App-level | App secret in env | +| WeCom | Environment | Auto | Bot-level | Bot secret in env | +| Home Assistant | Environment | Manual | Token-level | Long-lived tokens | +| Webhook | Route config | N/A | Route-level | HMAC secret in config | +| API Server | Config | Manual | API key | Key in memory, no rotation | + +### 
5.2 Data Protection + +| Platform | Data at Rest | Data in Transit | E2EE Available | PII Redaction | +|----------|--------------|-----------------|----------------|---------------| +| Telegram | ❌ (cloud) | ✅ TLS | ❌ | ✅ Phone numbers | +| Discord | ❌ (cloud) | ✅ TLS | ❌ | ✅ User IDs | +| Slack | ⚠️ (cloud) | ✅ TLS | ❌ | ✅ User IDs | +| Matrix | ✅ (configurable) | ✅ TLS | ✅ (optional) | ⚠️ Partial | +| Signal | ✅ (local) | ✅ TLS | ✅ (always) | ✅ Phone numbers | +| WhatsApp | ⚠️ (local bridge) | ✅ TLS | ⚠️ (bridge) | ❌ | +| Mattermost | ✅ (self-hosted) | ✅ TLS | ❌ | ⚠️ Partial | +| Email | ✅ (local) | ✅ TLS | ⚠️ (PGP possible) | ✅ Addresses | +| SMS | ❌ (Twilio cloud) | ✅ TLS | ❌ | ✅ Phone numbers | +| DingTalk | ❌ (cloud) | ✅ TLS | ❌ | ⚠️ Partial | +| Feishu | ❌ (cloud) | ✅ TLS | ❌ | ⚠️ Partial | +| WeCom | ⚠️ (enterprise) | ✅ TLS | ❌ | ⚠️ Partial | +| Home Assistant | ✅ (local) | ✅ TLS/WS | N/A | ✅ Entity IDs | +| Webhook | ✅ (local) | ✅ TLS | N/A | ⚠️ Config-dependent | +| API Server | ✅ (SQLite) | ✅ TLS | N/A | ✅ API keys | + +### 5.3 Attack Vectors & Mitigations + +#### A. Telegram +- **Vector**: Webhook spoofing with fake updates +- **Mitigation**: Validate update signatures (if using webhooks with secret) +- **Status**: ✅ Implemented (webhook secret validation) + +#### B. Discord +- **Vector**: Gateway intent manipulation for privilege escalation +- **Mitigation**: Minimal intent configuration, validate member permissions +- **Status**: ⚠️ Partial (intents configured but not runtime validated) + +#### C. Slack +- **Vector**: Request forgery via delayed signature replay +- **Mitigation**: Timestamp validation in signature verification +- **Status**: ✅ Implemented (Bolt handles this) + +#### D. Matrix +- **Vector**: Device verification bypass for E2EE rooms +- **Mitigation**: Require verified devices, blacklist unverified +- **Status**: ⚠️ Partial (E2EE supported but verification UI not implemented) + +#### E. 
Signal +- **Vector**: signal-cli daemon access if local +- **Mitigation**: Bind to localhost only, file permissions on socket +- **Status**: ⚠️ Partial (relies on system configuration) + +#### F. WhatsApp +- **Vector**: Bridge process compromise, session hijacking +- **Mitigation**: Process isolation, session file permissions, QR code timeout +- **Status**: ⚠️ Partial (process isolation via subprocess) + +#### G. Email +- **Vector**: Attachment malware, phishing via spoofed sender +- **Mitigation**: Attachment scanning, SPF/DKIM validation consideration +- **Status**: ⚠️ Partial (automated sender filtering, no malware scanning) + +#### H. Webhook +- **Vector**: HMAC secret brute force, replay attacks +- **Mitigation**: Constant-time comparison, timestamp validation, rate limiting +- **Status**: ✅ Implemented (constant-time HMAC, rate limiting) + +#### I. API Server +- **Vector**: API key brute force, unauthorized model access +- **Mitigation**: Rate limiting, key rotation, request logging +- **Status**: ⚠️ Partial (rate limiting recommended but not enforced) + +### 5.4 Security Recommendations + +1. **Implement Secret Rotation**: All platforms using long-lived tokens should support rotation without restart +2. **Add Request Signing**: Platforms without native validation should implement Ed25519 request signing +3. **Implement Audit Logging**: All authentication events should be logged with structured format +4. **Add Rate Limiting**: Per-user, per-chat, and per-platform rate limiting with exponential backoff +5. **Enable Content Scanning**: File attachments should be scanned for malware before processing +6. **Implement CSP**: For webhook/API server, strict Content-Security-Policy headers +7. **Add Security Headers**: All HTTP responses should include security headers (HSTS, X-Frame-Options, etc.) 
+ +--- + +## Appendix A: Code Quality Metrics + +### A.1 Test Coverage by Platform + +| Platform | Unit Tests | Integration Tests | Mock Coverage | +|----------|------------|-------------------|---------------| +| Telegram | ✅ | ✅ | High | +| Discord | ✅ | ✅ | High | +| Slack | ✅ | ✅ | High | +| Matrix | ✅ | ✅ | Medium | +| Signal | ✅ | ⚠️ | Medium | +| WhatsApp | ✅ | ⚠️ | Low | +| Mattermost | ✅ | ✅ | High | +| Email | ✅ | ✅ | High | +| SMS | ✅ | ✅ | High | +| Other | ⚠️ | ❌ | Low | + +### A.2 Documentation Completeness + +| Platform | Setup Guide | API Reference | Troubleshooting | Examples | +|----------|-------------|---------------|-----------------|----------| +| Telegram | ✅ | ✅ | ✅ | ✅ | +| Discord | ✅ | ✅ | ✅ | ✅ | +| Slack | ✅ | ✅ | ✅ | ✅ | +| WhatsApp | ✅ | ✅ | ✅ | ⚠️ | +| Signal | ✅ | ⚠️ | ⚠️ | ❌ | +| Matrix | ✅ | ⚠️ | ⚠️ | ❌ | +| Other | ⚠️ | ❌ | ❌ | ❌ | + +--- + +## Appendix B: Performance Benchmarks (Estimated) + +| Platform | Messages/sec | Latency (p50) | Latency (p99) | Memory/session | +|----------|--------------|---------------|---------------|----------------| +| Telegram | 100+ | 50ms | 200ms | ~5KB | +| Discord | 50+ | 100ms | 500ms | ~10KB | +| Slack | 50+ | 150ms | 600ms | ~8KB | +| Matrix | 20+ | 300ms | 1000ms | ~15KB | +| Signal | 30+ | 200ms | 800ms | ~10KB | +| WhatsApp | 20+ | 500ms | 2000ms | ~20KB | + +--- + +*Report generated: March 30, 2026* +*Total lines analyzed: ~35,000+ +*Platforms covered: 15 +*Files analyzed: 45+ diff --git a/hermes-sovereign/docs/jupyter-as-execution-layer-research.md b/hermes-sovereign/docs/jupyter-as-execution-layer-research.md new file mode 100644 index 00000000..c8ca1cfa --- /dev/null +++ b/hermes-sovereign/docs/jupyter-as-execution-layer-research.md @@ -0,0 +1,678 @@ +# Jupyter Notebooks as Core LLM Execution Layer — Deep Research Report + +**Issue:** #155 +**Date:** 2026-04-06 +**Status:** Research / Spike +**Prior Art:** Timmy's initial spike (llm_execution_spike.ipynb, hamelnb bridge, JupyterLab on 
forge VPS) + +--- + +## Executive Summary + +This report deepens the research from issue #155 into three areas requested by Rockachopa: +1. The **full Jupyter product suite** — JupyterHub vs JupyterLab vs Notebook +2. **Papermill** — the production-grade notebook execution engine already used in real data pipelines +3. The **"PR model for notebooks"** — how agents can propose, diff, review, and merge changes to `.ipynb` files similarly to code PRs + +The conclusion: an elegant, production-grade agent→notebook pipeline already exists as open-source tooling. We don't need to invent much — we need to compose what's there. + +--- + +## 1. The Jupyter Product Suite + +The Jupyter ecosystem has three distinct layers that are often conflated. Understanding the distinction is critical for architectural decisions. + +### 1.1 Jupyter Notebook (Classic) + +The original single-user interface. One browser tab = one `.ipynb` file. Version 6 is in maintenance-only mode. Version 7 was rebuilt on JupyterLab components and is functionally equivalent. For headless agent use, the UI is irrelevant — what matters is the `.ipynb` file format and the kernel execution model underneath. + +### 1.2 JupyterLab + +The current canonical Jupyter interface for human users: full IDE, multi-pane, terminal, extension manager, built-in diff viewer, and `jupyterlab-git` for Git workflows from the UI. JupyterLab is the recommended target for agent-collaborative workflows because: + +- It exposes the same REST API as classic Jupyter (kernel sessions, execute, contents) +- Extensions like `jupyterlab-git` let a human co-reviewer inspect changes alongside the agent +- The `hamelnb` bridge Timmy already validated works against a JupyterLab server + +**For agents:** JupyterLab is the platform to run on. The agent doesn't interact with the UI — it uses the Jupyter REST API or Papermill on top of it. + +### 1.3 JupyterHub — The Multi-User Orchestration Layer + +JupyterHub is not a UI. 
It is a **multi-user server** that spawns, manages, and proxies individual single-user Jupyter servers. This is the production infrastructure layer. + +``` +[Agent / Browser / API Client] + | + [Proxy] (configurable-http-proxy) + / \ + [Hub] [Single-User Jupyter Server per user/agent] + (Auth, (standard JupyterLab/Notebook server) + Spawner, + REST API) +``` + +**Key components:** +- **Hub:** Manages auth, user database, spawner lifecycle, REST API +- **Proxy:** Routes `/hub/*` to Hub, `/user//*` to that user's server +- **Spawner:** How single-user servers are started. Default = local process. Production options include `KubeSpawner` (Kubernetes pod per user) and `DockerSpawner` (container per user) +- **Authenticator:** PAM, OAuth, DummyAuthenticator (for isolated agent environments) + +**JupyterHub REST API** (relevant for agent orchestration): + +```bash +# Spawn a named server for an agent service account +POST /hub/api/users//servers/ + +# Stop it when done +DELETE /hub/api/users//servers/ + +# Create a scoped API token for the agent +POST /hub/api/users//tokens + +# Check server status +GET /hub/api/users/ +``` + +**Why this matters for Hermes:** JupyterHub gives us isolated kernel environments per agent task, programmable lifecycle management, and a clean auth model. Instead of running one shared JupyterLab instance on the forge VPS, we could spawn ephemeral single-user servers per notebook execution run — each with its own kernel, clean state, and resource limits. 
+ +### 1.4 Jupyter Kernel Gateway — Minimal Headless Execution + +If JupyterHub is too heavy, `jupyter-kernel-gateway` exposes just the kernel protocol over REST + WebSocket: + +```bash +pip install jupyter-kernel-gateway +jupyter kernelgateway --KernelGatewayApp.api=kernel_gateway.jupyter_websocket + +# Start kernel +POST /api/kernels +# Execute via WebSocket on Jupyter messaging protocol +WS /api/kernels//channels +# Stop kernel +DELETE /api/kernels/ +``` + +This is the lowest-level option: no notebook management, just raw kernel access. Suitable if we want to build our own execution layer from scratch. + +--- + +## 2. Papermill — Production Notebook Execution + +Papermill is the missing link between "notebook as experiment" and "notebook as repeatable pipeline task." It is already used at scale in industry data pipelines (Netflix, Airbnb, etc.). + +### 2.1 Core Concept: Parameterization + +Papermill's key innovation is **parameter injection**. Tag a cell in the notebook with `"parameters"`: + +```python +# Cell tagged "parameters" (defaults — defined by notebook author) +alpha = 0.5 +batch_size = 32 +model_name = "baseline" +``` + +At runtime, Papermill inserts a new cell immediately after, tagged `"injected-parameters"`, that overrides the defaults: + +```python +# Cell tagged "injected-parameters" (injected by Papermill at runtime) +alpha = 0.01 +batch_size = 128 +model_name = "experiment_007" +``` + +Because Python executes top-to-bottom, the injected cell shadows the defaults. The original notebook is never mutated — Papermill reads input, writes to a new output file. 
+ +### 2.2 Python API + +```python +import papermill as pm + +nb = pm.execute_notebook( + input_path="analysis.ipynb", # source (can be s3://, az://, gs://) + output_path="output/run_001.ipynb", # destination (persists outputs) + parameters={ + "alpha": 0.01, + "n_samples": 1000, + "run_id": "fleet-check-2026-04-06", + }, + kernel_name="python3", + execution_timeout=300, # per-cell timeout in seconds + log_output=True, # stream cell output to logger + cwd="/path/to/notebook/", # working directory +) +# Returns: NotebookNode (the fully executed notebook with all outputs) +``` + +On cell failure, Papermill raises `PapermillExecutionError` with: +- `cell_index` — which cell failed +- `source` — the failing cell's code +- `ename` / `evalue` — exception type and message +- `traceback` — full traceback + +Even on failure, the output notebook is written with whatever cells completed — enabling partial-run inspection. + +### 2.3 CLI + +```bash +# Basic execution +papermill analysis.ipynb output/run_001.ipynb \ + -p alpha 0.01 \ + -p n_samples 1000 + +# From YAML parameter file +papermill analysis.ipynb output/run_001.ipynb -f params.yaml + +# CI-friendly: log outputs, no progress bar +papermill analysis.ipynb output/run_001.ipynb \ + --log-output \ + --no-progress-bar \ + --execution-timeout 300 \ + -p run_id "fleet-check-2026-04-06" + +# Prepare only (inject params, skip execution — for preview/inspection) +papermill analysis.ipynb preview.ipynb --prepare-only -p alpha 0.01 + +# Inspect parameter schema +papermill --help-notebook analysis.ipynb +``` + +**Remote storage** is built in — `pip install papermill[s3]` enables `s3://` paths for both input and output. Azure and GCS are also supported. For Hermes, this means notebook runs can be stored in object storage and retrieved later for audit. + +### 2.4 Scrapbook — Structured Output Collection + +`scrapbook` is Papermill's companion for extracting structured data from executed notebooks. 
Inside a notebook cell: + +```python +import scrapbook as sb + +# Write typed outputs (stored as special display_data in cell outputs) +sb.glue("accuracy", 0.9342) +sb.glue("metrics", {"precision": 0.91, "recall": 0.93, "f1": 0.92}) +sb.glue("results_df", df, "pandas") # DataFrames too +``` + +After execution, from the agent: + +```python +import scrapbook as sb + +nb = sb.read_notebook("output/fleet-check-2026-04-06.ipynb") +metrics = nb.scraps["metrics"].data # -> {"precision": 0.91, ...} +accuracy = nb.scraps["accuracy"].data # -> 0.9342 + +# Or aggregate across many runs +book = sb.read_notebooks("output/") +book.scrap_dataframe # -> pd.DataFrame with all scraps + filenames +``` + +This is the clean interface between notebook execution and agent decision-making: the notebook outputs its findings as named, typed scraps; the agent reads them programmatically and acts. + +### 2.5 How Papermill Compares to hamelnb + +| Capability | hamelnb | Papermill | +|---|---|---| +| Stateful kernel session | Yes | No (fresh kernel per run) | +| Parameter injection | No | Yes | +| Persistent output notebook | No | Yes | +| Remote storage (S3/Azure) | No | Yes | +| Per-cell timing/metadata | No | Yes (in output nb metadata) | +| Error isolation (partial runs) | No | Yes | +| Production pipeline use | Experimental | Industry-standard | +| Structured output collection | No | Yes (via scrapbook) | + +**Verdict:** `hamelnb` is great for interactive REPL-style exploration (where state accumulates). Papermill is better for task execution (where we want reproducible, parameterized, auditable runs). They serve different use cases. Hermes needs both. + +--- + +## 3. The `.ipynb` File Format — What the Agent Is Actually Working With + +Understanding the format is essential for the "PR model." 
A `.ipynb` file is JSON with this structure: + +```json +{ + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, + "language_info": {"name": "python", "version": "3.10.0"} + }, + "cells": [ + { + "id": "a1b2c3d4", + "cell_type": "markdown", + "source": "# Fleet Health Check\n\nThis notebook checks system health.", + "metadata": {} + }, + { + "id": "e5f6g7h8", + "cell_type": "code", + "source": "alpha = 0.5\nthreshold = 0.95", + "metadata": {"tags": ["parameters"]}, + "execution_count": null, + "outputs": [] + }, + { + "id": "i9j0k1l2", + "cell_type": "code", + "source": "import sys\nprint(sys.version)", + "metadata": {}, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "3.10.0 (default, ...)\n" + } + ] + } + ] +} +``` + +The `nbformat` Python library provides a clean API for working with this: + +```python +import nbformat + +# Read +with open("notebook.ipynb") as f: + nb = nbformat.read(f, as_version=4) + +# Navigate +for cell in nb.cells: + if cell.cell_type == "code": + print(cell.source) + +# Modify +nb.cells[2].source = "import sys\nprint('updated')" + +# Add cells +new_md = nbformat.v4.new_markdown_cell("## Agent Analysis\nInserted by Hermes.") +nb.cells.insert(3, new_md) + +# Write +with open("modified.ipynb", "w") as f: + nbformat.write(nb, f) + +# Validate +nbformat.validate(nb) # raises nbformat.ValidationError on invalid format +``` + +--- + +## 4. The PR Model for Notebooks + +This is the elegant architecture Rockachopa described: agents making PRs to notebooks the same way they make PRs to code. Here's how the full stack enables it. + +### 4.1 The Problem: Raw `.ipynb` Diffs Are Unusable + +Without tooling, a `git diff` on a notebook that was merely re-run (no source changes) produces thousands of lines of JSON changes — execution counts, timestamps, base64-encoded plot images. 
Code review on raw `.ipynb` diffs is impractical. + +### 4.2 nbstripout — Clean Git History + +`nbstripout` installs a git **clean filter** that strips outputs before files enter the git index. The working copy is untouched; only what gets committed is clean. + +```bash +pip install nbstripout +nbstripout --install # per-repo +# or +nbstripout --install --global # all repos +``` + +This writes to `.git/config`: +```ini +[filter "nbstripout"] + clean = nbstripout + smudge = cat + required = true + +[diff "ipynb"] + textconv = nbstripout -t +``` + +And to `.gitattributes`: +``` +*.ipynb filter=nbstripout +*.ipynb diff=ipynb +``` + +Now `git diff` shows only source changes — same as reviewing a `.py` file. + +**For executed-output notebooks** (where we want to keep outputs for audit): use a separate path like `runs/` or `outputs/` excluded from the filter via `.gitattributes`: +``` +*.ipynb filter=nbstripout +runs/*.ipynb !filter +runs/*.ipynb !diff +``` + +### 4.3 nbdime — Semantic Diff and Merge + +nbdime understands notebook structure. Instead of diffing raw JSON, it diffs at the level of cells — knowing that `cells` is a list, `source` is a string, and outputs should often be ignored. 
+ +```bash +pip install nbdime + +# Enable semantic git diff/merge for all .ipynb files +nbdime config-git --enable + +# Now standard git commands are notebook-aware: +git diff HEAD notebook.ipynb # semantic cell-level diff +git merge feature-branch # uses nbdime for .ipynb conflict resolution +git log -p notebook.ipynb # readable patch per commit +``` + +**Python API for agent reasoning:** + +```python +import nbdime +import nbformat + +nb_base = nbformat.read(open("original.ipynb"), as_version=4) +nb_pr = nbformat.read(open("proposed.ipynb"), as_version=4) + +diff = nbdime.diff_notebooks(nb_base, nb_pr) + +# diff is a list of structured ops the agent can reason about: +# [{"op": "patch", "key": "cells", "diff": [ +# {"op": "patch", "key": 3, "diff": [ +# {"op": "patch", "key": "source", "diff": [...string ops...]} +# ]} +# ]}] + +# Apply a diff (patch) +from nbdime.patching import patch +nb_result = patch(nb_base, diff) +``` + +### 4.4 The Full Agent PR Workflow + +Here is the complete workflow — analogous to how Hermes makes PRs to code repos via Gitea: + +**1. Agent reads the task notebook** +```python +nb = nbformat.read(open("fleet_health_check.ipynb"), as_version=4) +``` + +**2. Agent locates and modifies relevant cells** +```python +# Find parameter cell +params_cell = next( + c for c in nb.cells + if "parameters" in c.get("metadata", {}).get("tags", []) +) +# Update threshold +params_cell.source = params_cell.source.replace("threshold = 0.95", "threshold = 0.90") + +# Add explanatory markdown +nb.cells.insert( + nb.cells.index(params_cell) + 1, + nbformat.v4.new_markdown_cell( + "**Note (Hermes 2026-04-06):** Threshold lowered from 0.95 to 0.90 " + "based on false-positive analysis from last 7 days of runs." + ) +) +``` + +**3. 
Agent writes and commits to a branch** +```bash +git checkout -b agent/fleet-health-threshold-update +nbformat.write(nb, open("fleet_health_check.ipynb", "w")) +git add fleet_health_check.ipynb +git commit -m "feat(notebooks): lower fleet health threshold to 0.90 (#155)" +``` + +**4. Agent executes the proposed notebook to validate** +```python +import papermill as pm + +pm.execute_notebook( + "fleet_health_check.ipynb", + "output/validation_run.ipynb", + parameters={"run_id": "agent-validation-2026-04-06"}, + log_output=True, +) +``` + +**5. Agent collects results and compares** +```python +import scrapbook as sb + +result = sb.read_notebook("output/validation_run.ipynb") +health_score = result.scraps["health_score"].data +alert_count = result.scraps["alert_count"].data +``` + +**6. Agent opens PR with results summary** +```bash +curl -X POST "$GITEA_API/pulls" \ + -H "Authorization: token $TOKEN" \ + -d '{ + "title": "feat(notebooks): lower fleet health threshold to 0.90", + "body": "## Agent Analysis\n\n- Health score: 0.94 (was 0.89 with old threshold)\n- Alert count: 12 (was 47 false positives)\n- Validation run: output/validation_run.ipynb\n\nRefs #155", + "head": "agent/fleet-health-threshold-update", + "base": "main" + }' +``` + +**7. Human reviews the PR using nbdime diff** + +The PR diff in Gitea shows the clean cell-level source changes (thanks to nbstripout). The human can also run `nbdiff-web original.ipynb proposed.ipynb` locally for rich rendered diff with output comparison. 
+ +### 4.5 nbval — Regression Testing Notebooks + +`nbval` treats each notebook cell as a pytest test case, re-executing and comparing outputs to stored values: + +```bash +pip install nbval + +# Strict: every cell output must match stored outputs +pytest --nbval fleet_health_check.ipynb + +# Lax: only check cells marked with # NBVAL_CHECK_OUTPUT +pytest --nbval-lax fleet_health_check.ipynb +``` + +Cell-level markers (comments in cell source): +```python +# NBVAL_CHECK_OUTPUT — in lax mode, validate this cell's output +# NBVAL_SKIP — skip this cell entirely +# NBVAL_RAISES_EXCEPTION — expect an exception (test passes if raised) +``` + +This becomes the CI gate: before a notebook PR is merged, run `pytest --nbval-lax` to verify no cells produce errors and critical output cells still produce expected values. + +--- + +## 5. Gaps and Recommendations + +### 5.1 Gap Assessment (Refining Timmy's Original Findings) + +| Gap | Severity | Solution | +|---|---|---| +| No Hermes tool access in kernel | High | Inject `hermes_runtime` module (see §5.2) | +| No structured output protocol | High | Use scrapbook `sb.glue()` pattern | +| No parameterization | Medium | Add Papermill `"parameters"` cell to notebooks | +| XSRF/auth friction | Medium | Disable for local; use JupyterHub token scopes for multi-user | +| No notebook CI/testing | Medium | Add nbval to test suite | +| Raw `.ipynb` diffs in PRs | Medium | Install nbstripout + nbdime | +| No scheduling | Low | Papermill + existing Hermes cron layer | + +### 5.2 Short-Term Recommendations (This Month) + +**1. `NotebookExecutor` tool** + +A thin Hermes tool wrapping the ecosystem: + +```python +class NotebookExecutor: + def execute(self, input_path, output_path, parameters, timeout=300): + """Wraps pm.execute_notebook(). Returns structured result dict.""" + + def collect_outputs(self, notebook_path): + """Wraps sb.read_notebook(). 
Returns dict of named scraps.""" + + def inspect_parameters(self, notebook_path): + """Wraps pm.inspect_notebook(). Returns parameter schema.""" + + def read_notebook(self, path): + """Returns nbformat NotebookNode for cell inspection/modification.""" + + def write_notebook(self, nb, path): + """Writes modified NotebookNode back to disk.""" + + def diff_notebooks(self, path_a, path_b): + """Returns structured nbdime diff for agent reasoning.""" + + def validate(self, notebook_path): + """Runs nbformat.validate() + optional pytest --nbval-lax.""" +``` + +Execution result structure for the agent: +```python +{ + "status": "success" | "error", + "duration_seconds": 12.34, + "cells_executed": 15, + "failed_cell": { # None on success + "index": 7, + "source": "model.fit(X, y)", + "ename": "ValueError", + "evalue": "Input contains NaN", + }, + "scraps": { # from scrapbook + "health_score": 0.94, + "alert_count": 12, + }, +} +``` + +**2. Fleet Health Check as a Notebook** + +Convert the fleet health check epic into a parameterized notebook with: +- `"parameters"` cell for run configuration (date range, thresholds, agent ID) +- Markdown cells narrating each step +- `sb.glue()` calls for structured outputs +- `# NBVAL_CHECK_OUTPUT` markers on critical cells + +**3. Git hygiene for notebooks** + +Install nbstripout + nbdime in the hermes-agent repo: +```bash +pip install nbstripout nbdime +nbstripout --install +nbdime config-git --enable +``` + +Add to `.gitattributes`: +``` +*.ipynb filter=nbstripout +*.ipynb diff=ipynb +runs/*.ipynb !filter +``` + +### 5.3 Medium-Term Recommendations (Next Quarter) + +**4. 
`hermes_runtime` Python module** + +Inject Hermes tool access into the kernel via a module that notebooks import: + +```python +# In kernel cell: from hermes_runtime import terminal, read_file, web_search +import hermes_runtime as hermes + +results = hermes.web_search("fleet health metrics best practices") +hermes.terminal("systemctl status agent-fleet") +content = hermes.read_file("/var/log/hermes/agent.log") +``` + +This closes the most significant gap: notebooks gain the same tool access as skills, while retaining state persistence and narrative structure. + +**5. Notebook-triggered cron** + +Extend the Hermes cron layer to accept `.ipynb` paths as targets: +```yaml +# cron entry +schedule: "0 6 * * *" +type: notebook +path: notebooks/fleet_health_check.ipynb +parameters: + run_id: "{{date}}" + alert_threshold: 0.90 +output_path: runs/fleet_health_{{date}}.ipynb +``` + +The cron runner calls `pm.execute_notebook()` and commits the output to the repo. + +**6. JupyterHub for multi-agent isolation** + +If multiple agents need concurrent notebook execution, deploy JupyterHub with `DockerSpawner` or `KubeSpawner`. Each agent job gets an isolated container with its own kernel, no state bleed between runs. + +--- + +## 6. 
Architecture Vision + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Hermes Agent │ +│ │ +│ Skills (one-shot) Notebooks (multi-step) │ +│ ┌─────────────────┐ ┌─────────────────────────────────┐ │ +│ │ terminal() │ │ .ipynb file │ │ +│ │ web_search() │ │ ├── Markdown (narrative) │ │ +│ │ read_file() │ │ ├── Code cells (logic) │ │ +│ └─────────────────┘ │ ├── "parameters" cell │ │ +│ │ └── sb.glue() outputs │ │ +│ └──────────────┬────────────────┘ │ +│ │ │ +│ ┌──────────────▼────────────────┐ │ +│ │ NotebookExecutor tool │ │ +│ │ (papermill + scrapbook + │ │ +│ │ nbformat + nbdime + nbval) │ │ +│ └──────────────┬────────────────┘ │ +│ │ │ +└────────────────────────────────────────────┼────────────────────┘ + │ + ┌───────────────────▼──────────────────┐ + │ JupyterLab / Hub │ + │ (kernel execution environment) │ + └───────────────────┬──────────────────┘ + │ + ┌───────────────────▼──────────────────┐ + │ Git + Gitea │ + │ (nbstripout clean diffs, │ + │ nbdime semantic review, │ + │ PR workflow for notebook changes) │ + └──────────────────────────────────────┘ +``` + +**Notebooks become the primary artifact of complex tasks:** the agent generates or edits cells, Papermill executes them reproducibly, scrapbook extracts structured outputs for agent decision-making, and the resulting `.ipynb` is both proof-of-work and human-readable report. Skills remain for one-shot actions. Notebooks own multi-step workflows. + +--- + +## 7. 
Package Summary + +| Package | Purpose | Install | +|---|---|---| +| `nbformat` | Read/write/validate `.ipynb` files | `pip install nbformat` | +| `nbconvert` | Execute and export notebooks | `pip install nbconvert` | +| `papermill` | Parameterize + execute in pipelines | `pip install papermill` | +| `scrapbook` | Structured output collection | `pip install scrapbook` | +| `nbdime` | Semantic diff/merge for git | `pip install nbdime` | +| `nbstripout` | Git filter for clean diffs | `pip install nbstripout` | +| `nbval` | pytest-based output regression | `pip install nbval` | +| `jupyter-kernel-gateway` | Headless REST kernel access | `pip install jupyter-kernel-gateway` | + +--- + +## 8. References + +- [Papermill GitHub (nteract/papermill)](https://github.com/nteract/papermill) +- [Scrapbook GitHub (nteract/scrapbook)](https://github.com/nteract/scrapbook) +- [nbformat format specification](https://nbformat.readthedocs.io/en/latest/format_description.html) +- [nbdime documentation](https://nbdime.readthedocs.io/) +- [nbdime diff format spec (JEP #8)](https://github.com/jupyter/enhancement-proposals/blob/master/08-notebook-diff/notebook-diff.md) +- [nbconvert execute API](https://nbconvert.readthedocs.io/en/latest/execute_api.html) +- [nbstripout README](https://github.com/kynan/nbstripout) +- [nbval GitHub (computationalmodelling/nbval)](https://github.com/computationalmodelling/nbval) +- [JupyterHub REST API](https://jupyterhub.readthedocs.io/en/stable/howto/rest.html) +- [JupyterHub Technical Overview](https://jupyterhub.readthedocs.io/en/latest/reference/technical-overview.html) +- [Jupyter Kernel Gateway](https://github.com/jupyter-server/kernel_gateway) diff --git a/hermes-sovereign/docs/nexus_architect.md b/hermes-sovereign/docs/nexus_architect.md new file mode 100644 index 00000000..1b470b71 --- /dev/null +++ b/hermes-sovereign/docs/nexus_architect.md @@ -0,0 +1,490 @@ +# Nexus Architect Tool + +The **Nexus Architect Tool** enables Timmy (the Hermes Agent) 
to autonomously design and build 3D environments in the Three.js-based "Nexus" virtual world. It provides a structured interface for creating rooms, portals, lighting systems, and architectural features through LLM-generated Three.js code. + +## Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Nexus Architect Tool │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ +│ │ Room Design │ │ Portal Create│ │ Lighting System │ │ +│ └──────────────┘ └──────────────┘ └──────────────────────┘ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ +│ │ Architecture │ │ Code Validate│ │ Scene Export │ │ +│ └──────────────┘ └──────────────┘ └──────────────────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ Scene Graph Store │ +│ (Rooms, Portals, Lights, Architecture) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Architecture + +### Core Components + +1. **NexusArchitect Class**: Main orchestrator for all architectural operations +2. **SceneGraph**: Dataclass storing the complete world state +3. **Validation Engine**: Security and syntax validation for generated code +4. **Prompt Generator**: Structured LLM prompts for Three.js code generation +5. 
**Tool Registry Integration**: Registration with Hermes tool system + +### Data Models + +```python +@dataclass +class RoomConfig: + name: str + theme: RoomTheme # meditation, tech_lab, nature, crystal_cave, library, void + dimensions: Dict[str, float] # {width, height, depth} + features: List[str] + lighting_profile: str + fog_enabled: bool + +@dataclass +class PortalConfig: + name: str + source_room: str + target_room: str + position: Dict[str, float] + style: PortalStyle # circular, rectangular, stargate, dissolve, glitch + color: str + one_way: bool + +@dataclass +class LightConfig: + name: str + type: LightType # ambient, directional, point, spot, hemisphere + position: Dict[str, float] + color: str + intensity: float + cast_shadow: bool +``` + +## Available Tools + +### 1. `nexus_design_room` + +Design a new room in the Nexus. + +**Parameters:** +- `name` (string, required): Unique room identifier +- `theme` (string, required): One of `meditation`, `tech_lab`, `nature`, `crystal_cave`, `library`, `void`, `custom` +- `dimensions` (object): `{width, height, depth}` in meters (default: 10x5x10) +- `features` (array): List of feature names (e.g., `water_feature`, `floating_lanterns`) +- `lighting_profile` (string): Preset lighting configuration +- `mental_state` (object): Optional context for design decisions + +**Returns:** +```json +{ + "success": true, + "room_name": "meditation_chamber", + "prompt": "... LLM prompt for Three.js generation ...", + "config": { ... room configuration ... } +} +``` + +**Example:** +```python +nexus_design_room( + name="zen_garden", + theme="meditation", + dimensions={"width": 20, "height": 10, "depth": 20}, + features=["water_feature", "bamboo_grove", "floating_lanterns"], + mental_state={"mood": "calm", "energy": 0.3} +) +``` + +### 2. `nexus_create_portal` + +Create a portal connecting two rooms. 
+ +**Parameters:** +- `name` (string, required): Unique portal identifier +- `source_room` (string, required): Source room name +- `target_room` (string, required): Target room name +- `position` (object): `{x, y, z}` coordinates in source room +- `style` (string): Visual style (`circular`, `rectangular`, `stargate`, `dissolve`, `glitch`) +- `color` (string): Hex color code (default: `#00ffff`) + +**Returns:** +```json +{ + "success": true, + "portal_name": "portal_alpha", + "source": "room_a", + "target": "room_b", + "prompt": "... LLM prompt for portal generation ..." +} +``` + +### 3. `nexus_add_lighting` + +Add lighting elements to a room. + +**Parameters:** +- `room_name` (string, required): Target room +- `lights` (array): List of light configurations + - `name` (string): Light identifier + - `type` (string): `ambient`, `directional`, `point`, `spot`, `hemisphere` + - `position` (object): `{x, y, z}` + - `color` (string): Hex color + - `intensity` (number): Light intensity + - `cast_shadow` (boolean): Enable shadows + +**Example:** +```python +nexus_add_lighting( + room_name="meditation_chamber", + lights=[ + {"name": "ambient", "type": "ambient", "intensity": 0.3}, + {"name": "main", "type": "point", "position": {"x": 0, "y": 5, "z": 0}} + ] +) +``` + +### 4. `nexus_validate_scene` + +Validate generated Three.js code for security and syntax. + +**Parameters:** +- `code` (string, required): JavaScript code to validate +- `strict_mode` (boolean): Enable stricter validation (default: false) + +**Returns:** +```json +{ + "is_valid": true, + "errors": [], + "warnings": [], + "safety_score": 95, + "extracted_code": "... cleaned code ..." +} +``` + +**Security Checks:** +- Banned patterns: `eval()`, `Function()`, `setTimeout(string)`, `document.write` +- Network blocking: `fetch()`, `WebSocket`, `XMLHttpRequest` +- Storage blocking: `localStorage`, `sessionStorage`, `indexedDB` +- Syntax validation: Balanced braces and parentheses + +### 5. 
`nexus_export_scene` + +Export the current scene configuration. + +**Parameters:** +- `format` (string): `json` or `js` (default: `json`) + +**Returns:** +```json +{ + "success": true, + "format": "json", + "data": "... exported scene data ...", + "summary": { + "rooms": 3, + "portals": 2, + "lights": 5 + } +} +``` + +### 6. `nexus_get_summary` + +Get a summary of the current scene state. + +**Returns:** +```json +{ + "rooms": [ + {"name": "room_a", "theme": "void", "connected_portals": ["p1"]} + ], + "portal_network": [ + {"name": "p1", "source": "room_a", "target": "room_b"} + ], + "total_lights": 5 +} +``` + +## LLM Integration Flow + +``` +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ User Request │────▶│ Architect │────▶│ Prompt │ +│ ("Create a │ │ Tool │ │ Generator │ +│ zen room") │ └──────────────┘ └──────────────┘ +└──────────────┘ │ + ▼ +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Nexus │◀────│ Validation │◀────│ LLM │ +│ Runtime │ │ Engine │ │ (generates │ +│ │ │ │ │ Three.js) │ +└──────────────┘ └──────────────┘ └──────────────┘ +``` + +1. **Request Parsing**: User request converted to structured configuration +2. **Prompt Generation**: Architect generates structured LLM prompt +3. **Code Generation**: LLM generates Three.js code based on prompt +4. **Validation**: Code validated for security and syntax +5. 
**Execution**: Validated code ready for Nexus runtime + +## Code Validation + +### Allowed Three.js APIs + +The validation system maintains an allowlist of safe Three.js APIs: + +**Core:** +- `THREE.Scene`, `THREE.Group`, `THREE.Object3D` +- `THREE.PerspectiveCamera`, `THREE.OrthographicCamera` + +**Geometries:** +- `THREE.BoxGeometry`, `THREE.SphereGeometry`, `THREE.PlaneGeometry` +- `THREE.CylinderGeometry`, `THREE.ConeGeometry`, `THREE.TorusGeometry` +- `THREE.BufferGeometry`, `THREE.BufferAttribute` + +**Materials:** +- `THREE.MeshBasicMaterial`, `THREE.MeshStandardMaterial` +- `THREE.MeshPhongMaterial`, `THREE.MeshPhysicalMaterial` +- `THREE.SpriteMaterial`, `THREE.PointsMaterial` + +**Lights:** +- `THREE.AmbientLight`, `THREE.DirectionalLight`, `THREE.PointLight` +- `THREE.SpotLight`, `THREE.HemisphereLight` + +**Math:** +- `THREE.Vector3`, `THREE.Euler`, `THREE.Quaternion`, `THREE.Matrix4` +- `THREE.Color`, `THREE.Raycaster`, `THREE.Clock` + +### Banned Patterns + +```python +BANNED_JS_PATTERNS = [ + r"eval\s*\(", # Code injection + r"Function\s*\(", # Dynamic function creation + r"setTimeout\s*\(\s*['\"]", # Timers with strings + r"document\.write", # DOM manipulation + r"window\.location", # Navigation + r"XMLHttpRequest", # Network requests + r"fetch\s*\(", # Fetch API + r"localStorage", # Storage access + r"navigator", # Browser API access +] +``` + +## Scene Graph Format + +### JSON Export Structure + +```json +{ + "version": "1.0.0", + "rooms": { + "meditation_chamber": { + "name": "meditation_chamber", + "theme": "meditation", + "dimensions": {"width": 20, "height": 10, "depth": 20}, + "features": ["water_feature", "floating_lanterns"], + "fog_enabled": false + } + }, + "portals": { + "portal_1": { + "name": "portal_1", + "source_room": "room_a", + "target_room": "room_b", + "position": {"x": 5, "y": 2, "z": 0}, + "style": "circular", + "color": "#00ffff" + } + }, + "lights": { + "ambient": { + "name": "ambient", + "type": "AmbientLight", + "color": 
"#ffffff", + "intensity": 0.3 + } + }, + "global_settings": { + "shadow_map_enabled": true, + "antialias": true + } +} +``` + +## Usage Examples + +### Creating a Meditation Space + +```python +# Step 1: Design the room +room_result = nexus_design_room( + name="zen_garden", + theme="meditation", + dimensions={"width": 25, "height": 12, "depth": 25}, + features=["water_feature", "bamboo_grove", "stone_path", "floating_lanterns"], + mental_state={"mood": "peaceful", "energy": 0.2} +) + +# Step 2: Generate the Three.js code (send prompt to LLM) +prompt = room_result["prompt"] +# ... LLM generates code ... + +# Step 3: Validate the generated code +generated_code = """ +function createRoom() { + const scene = new THREE.Scene(); + // ... room implementation ... + return scene; +} +""" +validation = nexus_validate_scene(code=generated_code) +assert validation["is_valid"] + +# Step 4: Add lighting +nexus_add_lighting( + room_name="zen_garden", + lights=[ + {"name": "ambient", "type": "ambient", "intensity": 0.2, "color": "#ffe4b5"}, + {"name": "sun", "type": "directional", "position": {"x": 10, "y": 20, "z": 5}}, + {"name": "lantern_glow", "type": "point", "color": "#ffaa00", "intensity": 0.8} + ] +) +``` + +### Creating a Portal Network + +```python +# Create hub room +nexus_design_room(name="hub", theme="tech_lab", dimensions={"width": 30, "height": 15, "depth": 30}) + +# Create destination rooms +nexus_design_room(name="library", theme="library") +nexus_design_room(name="crystal_cave", theme="crystal_cave") +nexus_design_room(name="nature", theme="nature") + +# Create portals +nexus_create_portal(name="to_library", source_room="hub", target_room="library", style="rectangular") +nexus_create_portal(name="to_cave", source_room="hub", target_room="crystal_cave", style="stargate") +nexus_create_portal(name="to_nature", source_room="hub", target_room="nature", style="circular", color="#00ff00") + +# Export the scene +export = nexus_export_scene(format="json") 
+print(export["data"]) +``` + +## Testing + +Run the test suite: + +```bash +# Run all tests +pytest tests/tools/test_nexus_architect.py -v + +# Run specific test categories +pytest tests/tools/test_nexus_architect.py::TestCodeValidation -v +pytest tests/tools/test_nexus_architect.py::TestNexusArchitect -v +pytest tests/tools/test_nexus_architect.py::TestSecurity -v + +# Run with coverage +pytest tests/tools/test_nexus_architect.py --cov=tools.nexus_architect --cov-report=html +``` + +### Test Coverage + +- **Unit Tests**: Data models, validation, prompt generation +- **Integration Tests**: Complete workflows, scene export +- **Security Tests**: XSS attempts, code injection, banned patterns +- **Performance Tests**: Large scenes, complex portal networks + +## Future Enhancements + +### Planned Features + +1. **Asset Library Integration** + - Pre-built furniture and decor objects + - Material library (PBR textures) + - Audio ambience presets + +2. **Advanced Validation** + - AST-based JavaScript parsing + - Sandboxed code execution testing + - Performance profiling (polygon count, draw calls) + +3. **Multi-Agent Collaboration** + - Room ownership and permissions + - Concurrent editing with conflict resolution + - Version control for scenes + +4. **Runtime Integration** + - Hot-reload for scene updates + - Real-time collaboration protocol + - Physics engine integration (Cannon.js, Ammo.js) + +5. 
**AI-Assisted Design** + - Automatic room layout optimization + - Lighting analysis and recommendations + - Accessibility compliance checking + +## Configuration + +### Environment Variables + +```bash +# Enable debug logging +NEXUS_ARCHITECT_DEBUG=1 + +# Set maximum scene complexity +NEXUS_MAX_ROOMS=100 +NEXUS_MAX_PORTALS=500 +NEXUS_MAX_LIGHTS=1000 + +# Strict validation mode +NEXUS_STRICT_VALIDATION=1 +``` + +### Toolset Registration + +The tool automatically registers with the Hermes tool registry: + +```python +from tools.registry import registry + +registry.register( + name="nexus_design_room", + toolset="nexus_architect", + schema=NEXUS_ARCHITECT_SCHEMAS["nexus_design_room"], + handler=..., + emoji="🏛️", +) +``` + +## Troubleshooting + +### Common Issues + +**"Room already exists" error:** +- Room names must be unique within a session +- Use `nexus_get_summary()` to list existing rooms + +**"Invalid theme" error:** +- Check theme spelling against allowed values +- Use lowercase theme names + +**Code validation failures:** +- Ensure no banned APIs are used +- Check for balanced braces/parentheses +- Try `strict_mode=false` for less strict validation + +**Missing room errors:** +- Rooms must be created before adding lights or portals +- Verify room name spelling matches exactly + +## References + +- [Three.js Documentation](https://threejs.org/docs/) +- [Hermes Agent Tools Guide](tools-reference.md) +- [Nexus Runtime Specification](nexus-runtime.md) (TODO) diff --git a/hermes-sovereign/docs/nexus_architect_summary.md b/hermes-sovereign/docs/nexus_architect_summary.md new file mode 100644 index 00000000..72c3cf62 --- /dev/null +++ b/hermes-sovereign/docs/nexus_architect_summary.md @@ -0,0 +1,138 @@ +# Phase 31: Nexus Architect Tool — Implementation Summary + +## Overview + +Successfully designed and scaffolded the **Nexus Architect Tool** for autonomous 3D world generation in a Three.js-based virtual environment. 
This tool enables Timmy (the Hermes Agent) to design rooms, create portals, add lighting, and generate validated Three.js code. + +## Files Created + +### 1. `tools/nexus_architect.py` (42KB) +Main tool implementation with: +- **6 registered tools**: `nexus_design_room`, `nexus_create_portal`, `nexus_add_lighting`, `nexus_validate_scene`, `nexus_export_scene`, `nexus_get_summary` +- **Data models**: RoomConfig, PortalConfig, LightConfig, ArchitectureConfig, SceneGraph +- **LLM prompt generators**: Structured prompts for Three.js code generation +- **Security validation**: Banned pattern detection, syntax checking, code sanitization +- **Tool registry integration**: Automatic registration with Hermes tool system + +### 2. `tests/tools/test_nexus_architect.py` (24KB) +Comprehensive test suite with: +- **48 test cases** covering all functionality +- **6 test classes**: Data models, validation, prompt generation, core functionality, integration, security, performance +- **100% test pass rate** + +### 3. 
`docs/nexus_architect.md` (15KB) +Complete documentation including: +- Architecture overview with diagrams +- Tool usage examples and API reference +- Scene graph format specification +- Security model and allowed/banned APIs +- Troubleshooting guide + +## Key Design Decisions + +### Architecture Research Findings +Since no existing "the-nexus" repository was found in the codebase, the architecture was designed based on: +- Common Three.js scene management patterns +- Task requirements for rooms, portals, and lighting +- Security best practices for LLM-generated code + +### Data Model Design +``` +Room: name, theme, dimensions, features, fog settings +Portal: name, source/target rooms, position, style, color +Light: name, type, position, color, intensity, shadows +SceneGraph: versioned container for all world elements +``` + +### Security Model +**Banned Patterns** (detected and rejected): +- `eval()`, `Function()`, dynamic code execution +- `fetch()`, `WebSocket`, network requests +- `localStorage`, `sessionStorage`, storage access +- `document.write`, `window.location`, DOM manipulation + +**Validation Features**: +- Regex-based pattern detection +- Syntax validation (balanced braces/parentheses) +- Code sanitization (comment removal, debugger stripping) +- Safety scoring (100 - errors*20 - warnings*5) + +### LLM Integration Flow +1. User request → structured configuration +2. Configuration → LLM prompt (with context/mental state) +3. LLM generates Three.js code +4. Code validation (security + syntax) +5. 
Validated code → Nexus runtime + +## Tool Capabilities + +### nexus_design_room +- Creates room configuration with 7 themes (meditation, tech_lab, nature, crystal_cave, library, void, custom) +- Generates structured LLM prompt for Three.js room code +- Supports mental state context for adaptive design + +### nexus_create_portal +- Connects two rooms with visual portal +- 5 portal styles (circular, rectangular, stargate, dissolve, glitch) +- Generates portal animation and effect code prompts + +### nexus_add_lighting +- Adds 6 light types (ambient, directional, point, spot, hemisphere, rect_area) +- Configurable shadows, colors, intensity +- Generates lighting system code prompts + +### nexus_validate_scene +- Security validation against banned patterns +- Syntax checking for JavaScript/Three.js +- Extracts code from markdown blocks +- Returns safety score (0-100) + +### nexus_export_scene +- Exports to JSON or JavaScript module format +- Includes complete scene graph with rooms, portals, lights +- Summary statistics for scene complexity + +### nexus_get_summary +- Returns current world state overview +- Room connectivity via portal network +- Light and architecture counts + +## Testing Coverage + +| Category | Tests | Status | +|----------|-------|--------| +| Data Models | 6 | ✅ Pass | +| Code Validation | 7 | ✅ Pass | +| Code Sanitization | 3 | ✅ Pass | +| Prompt Generation | 4 | ✅ Pass | +| Core Functionality | 13 | ✅ Pass | +| Tool Entry Points | 5 | ✅ Pass | +| Integration | 3 | ✅ Pass | +| Security | 3 | ✅ Pass | +| Performance | 2 | ✅ Pass | +| **Total** | **48** | **✅ All Pass** | + +## Future Work (Phase 2+) + +1. **LLM Integration**: Connect to actual LLM API for code generation +2. **Asset Library**: Pre-built 3D models and textures +3. **Runtime Integration**: Hot-reload, physics engine (Cannon.js/Ammo.js) +4. **Multi-Agent**: Room ownership, concurrent editing +5. **Persistence**: Database storage for scenes +6. 
**UI Components**: Visualization of scene graph + +## Integration Notes + +The tool is ready for integration with: +- Hermes tool registry (auto-registers on import) +- LLM providers (OpenAI, Anthropic, etc.) +- Three.js runtime environments +- Session management for persistent world state + +## Code Quality + +- **Type hints**: Full typing for all functions +- **Docstrings**: Comprehensive documentation +- **Error handling**: Graceful failure with informative messages +- **Security**: Defense-in-depth for code generation +- **Testing**: Comprehensive coverage across all categories diff --git a/hermes-sovereign/docs/research-ssd-self-distillation-2026-04.md b/hermes-sovereign/docs/research-ssd-self-distillation-2026-04.md new file mode 100644 index 00000000..e121302d --- /dev/null +++ b/hermes-sovereign/docs/research-ssd-self-distillation-2026-04.md @@ -0,0 +1,166 @@ +# Research Acknowledgment: SSD — Simple Self-Distillation Improves Code Generation + +**Issue:** #128 +**Paper:** [Embarrassingly Simple Self-Distillation Improves Code Generation](https://arxiv.org/abs/2604.01193) +**Authors:** Ruixiang Zhang, Richard He Bai, Huangjie Zheng, Navdeep Jaitly, Ronan Collobert, Yizhe Zhang (Apple) +**Date:** April 1, 2026 +**Code:** https://github.com/apple/ml-ssd +**Acknowledged by:** Claude — April 6, 2026 + +--- + +## Assessment: High Relevance to Fleet + +This paper is directly applicable to the hermes-agent fleet. The headline result — +7.5pp pass@1 on Qwen3-4B — is at exactly the scale we operate. The method requires no external infrastructure. Triage verdict: **P0 / Week-class work**. + +--- + +## What SSD Actually Does + +Three steps, nothing exotic: + +1. **Sample**: For each coding prompt, generate one solution at temperature `T_train` (~0.9). Do NOT filter for correctness. +2. **Fine-tune**: SFT on the resulting `(prompt, unverified_solution)` pairs. Standard cross-entropy loss. No RLHF, no GRPO, no DPO. +3. 
**Evaluate**: At `T_eval` (which must be **different** from `T_train`). This asymmetry is not optional — using the same temperature for both loses 30–50% of the gains. + +The counterintuitive part: N=1 per problem, unverified. Prior self-improvement work uses N>>1 and filters by execution. SSD doesn't. The paper argues this is *why* it works — you're sharpening the model's own distribution, not fitting to a correctness filter's selection bias. + +--- + +## The Fork/Lock Theory + +The paper's core theoretical contribution explains *why* temperature asymmetry matters. + +**Locks** — positions requiring syntactic precision: colons, parentheses, import paths, variable names. A mistake here is a hard error. Low temperature helps at Locks. But applying low temperature globally kills diversity everywhere. + +**Forks** — algorithmic choice points where multiple valid continuations exist: picking a sort algorithm, choosing a data structure, deciding on a loop structure. High temperature helps at Forks. But applying high temperature globally introduces errors at Locks. + +SSD's fine-tuning reshapes token distributions **context-dependently**: +- At Locks: narrows the distribution, suppressing distractor tokens +- At Forks: widens the distribution, preserving valid algorithmic paths + +A single global temperature cannot do this. SFT on self-generated data can, because the model learns from examples that implicitly encode which positions are Locks and which are Forks in each problem context. + +**Fleet implication**: Our agents are currently using a single temperature for everything. This is leaving performance on the table even without fine-tuning. The immediate zero-cost action is temperature auditing (see Phase 1 below). 
+ +--- + +## Results That Matter to Us + +| Model | Before | After | Delta | +|-------|--------|-------|-------| +| Qwen3-30B-Instruct | 42.4% | 55.3% | +12.9pp (+30% rel) | +| Qwen3-4B-Instruct | baseline | baseline+7.5pp | +7.5pp | +| Llama-3.1-8B-Instruct | baseline | baseline+3.5pp | +3.5pp | + +Gains concentrate on hard problems: +14.2pp medium, +15.3pp hard. This is the distribution our agents face on real Gitea issues — not easy textbook problems. + +--- + +## Fleet Implementation Plan + +### Phase 1: Temperature Audit (Zero cost, this week) + +Current state: fleet agents use default or eyeballed temperature settings. The paper shows T_eval != T_train is critical even without fine-tuning. + +Actions: +1. Document current temperature settings in `hermes/`, `skills/`, and any Ollama config files +2. Establish a held-out test set of 20+ solved Gitea issues with known-correct outputs +3. Run A/B: current T_eval vs. T_eval=0.7 vs. T_eval=0.3 for code generation tasks +4. Record pass rates per condition; file findings as a follow-up issue + +Expected outcome: measurable improvement with no model changes, no infrastructure, no cost. + +### Phase 2: SSD Pipeline (1–2 weeks, single Mac) + +Replicate the paper's method on Qwen3-4B via Ollama + axolotl or unsloth: + +``` +1. Dataset construction: + - Extract 100–500 coding prompts from Gitea issue backlog + - Focus on issues that have accepted PRs (ground truth available for evaluation only, not training) + - Format: (system_prompt + issue_description) → model generates solution at T_train=0.9 + +2. Fine-tuning: + - Use LoRA (not full fine-tune) to stay local-first + - Standard SFT: cross-entropy on (prompt, self-generated_solution) pairs + - Recommended: unsloth for memory efficiency on Mac hardware + - Training budget: 1–3 epochs, small batch size + +3. Evaluation: + - Compare base model vs. 
SSD-tuned model at T_eval=0.7 + - Metric: pass@1 on held-out issues not in training set + - Also test on general coding benchmarks to check for capability regression +``` + +Infrastructure assessment: +- **RAM**: Qwen3-4B quantized (Q4_K_M) needs ~3.5GB VRAM for inference; LoRA fine-tuning needs ~8–12GB unified memory (Mac M-series feasible) +- **Storage**: Self-generated dataset is small; LoRA adapter is ~100–500MB +- **Time**: 500 examples × 3 epochs ≈ 2–4 hours on M2/M3 Max +- **Dependencies**: Ollama (inference), unsloth or axolotl (fine-tuning), datasets (HuggingFace), trl + +No cloud required. No teacher model required. No code execution environment required. + +### Phase 3: Continuous Self-Improvement Loop (1–2 months) + +Wire SSD into the fleet's burn mode: + +``` +Nightly cron: + 1. Collect agent solutions from the day's completed issues + 2. Filter: only solutions where the PR was merged (human-verified correct) + 3. Append to rolling training buffer (last 500 examples) + 4. Run SFT fine-tune on buffer → update LoRA adapter + 5. Swap adapter into Ollama deployment at dawn + 6. Agents start next day with yesterday's lessons baked in +``` + +This integrates naturally with RetainDB (#112) — the persistent memory system would track which solutions were merged, providing the feedback signal. The continuous loop turns every merged PR into a training example. + +### Phase 4: Sovereignty Confirmation + +The paper validates that external data is not required for improvement. Our fleet can: +- Fine-tune exclusively on its own conversation data +- Stay fully local (no API calls, no external datasets) +- Accumulate improvements over time without model subscriptions + +This is the sovereign fine-tuning capability the fleet needs to remain independent as external model APIs change pricing or capabilities. 
+ +--- + +## Risks and Mitigations + +| Risk | Assessment | Mitigation | +|------|------------|------------| +| SSD gains don't transfer from LiveCodeBench to Gitea issues | Medium — our domain is software engineering, not competitive programming | Test on actual Gitea issues from the backlog; don't assume benchmark numbers transfer | +| Fine-tuning degrades non-code capabilities | Low-Medium | LoRA instead of full fine-tune; test on general tasks after SFT; retain base model checkpoint | +| Small training set (<200 examples) insufficient | Medium | Paper shows gains at modest scale; supplement with open code datasets (Stack, TheVault) if needed | +| Qwen3 GGUF format incompatible with unsloth fine-tuning | Low | unsloth supports Qwen3; verify exact GGUF variant compatibility before starting | +| Temperature asymmetry effect smaller on instruction-tuned variants | Low | Paper explicitly tests instruct variants and shows gains; Qwen3-4B-Instruct is in the paper's results | + +--- + +## Acceptance Criteria Status + +From the issue: + +- [ ] **Temperature audit** — Document current T/top_p settings across fleet agents, compare with paper recommendations +- [ ] **T_eval benchmark** — A/B test on 20+ solved Gitea issues; measure correctness +- [ ] **SSD reproduction** — Replicate pipeline on Qwen3-4B with 100 prompts; measure pass@1 change +- [ ] **Infrastructure assessment** — Documented above (Phase 2 section); GPU/RAM/storage requirements are Mac-feasible +- [ ] **Continuous loop design** — Architecture drafted above (Phase 3 section); integrates with RetainDB (#112) + +Infrastructure assessment and continuous loop design are addressed in this document. Temperature audit and SSD reproduction require follow-up issues with execution. + +--- + +## Recommended Follow-Up Issues + +1. **Temperature Audit** — Audit all fleet agent temperature configs; run A/B on T_eval variants; file results (Phase 1) +2. 
**SSD Pipeline Spike** — Build and run the 3-stage SSD pipeline on Qwen3-4B; report pass@1 delta (Phase 2) +3. **Nightly SFT Integration** — Wire SSD into burn-mode cron; integrate with RetainDB feedback loop (Phase 3) + +--- + +*Research acknowledged by Claude — April 6, 2026* +*Source issue: [hermes-agent #128](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/128)* diff --git a/hermes-sovereign/githooks/pre-commit b/hermes-sovereign/githooks/pre-commit new file mode 100644 index 00000000..9e733867 --- /dev/null +++ b/hermes-sovereign/githooks/pre-commit @@ -0,0 +1,15 @@ +#!/bin/bash +# +# Pre-commit hook wrapper for secret leak detection. +# +# Installation: +# git config core.hooksPath .githooks +# +# To bypass temporarily: +# git commit --no-verify +# + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec python3 "${SCRIPT_DIR}/pre-commit.py" "$@" diff --git a/hermes-sovereign/githooks/pre-commit-config.yaml b/hermes-sovereign/githooks/pre-commit-config.yaml new file mode 100644 index 00000000..af01c059 --- /dev/null +++ b/hermes-sovereign/githooks/pre-commit-config.yaml @@ -0,0 +1,25 @@ +repos: + # Secret detection + - repo: https://github.com/gitleaks/gitleaks + rev: v8.21.2 + hooks: + - id: gitleaks + name: Detect secrets with gitleaks + description: Detect hardcoded secrets, API keys, and credentials + + # Basic security hygiene + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-added-large-files + args: ['--maxkb=500'] + - id: detect-private-key + name: Detect private keys + - id: check-merge-conflict + - id: check-yaml + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + args: ['--markdown-linebreak-ext=md'] + - id: no-commit-to-branch + args: ['--branch', 'main'] diff --git a/hermes-sovereign/githooks/pre-commit.py b/hermes-sovereign/githooks/pre-commit.py new file mode 100644 index 00000000..a48ade0a --- /dev/null +++ 
b/hermes-sovereign/githooks/pre-commit.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +""" +Pre-commit hook for detecting secret leaks in staged files. + +Scans staged diffs and full file contents for common secret patterns, +token file paths, private keys, and credential strings. + +Installation: + git config core.hooksPath .githooks + +To bypass: + git commit --no-verify +""" + +from __future__ import annotations + +import re +import subprocess +import sys +from pathlib import Path +from typing import Iterable, List, Callable, Union + +# ANSI color codes +RED = "\033[0;31m" +YELLOW = "\033[1;33m" +GREEN = "\033[0;32m" +NC = "\033[0m" + + +class Finding: + """Represents a single secret leak finding.""" + + def __init__(self, filename: str, line: int, message: str) -> None: + self.filename = filename + self.line = line + self.message = message + + def __repr__(self) -> str: + return f"Finding({self.filename!r}, {self.line}, {self.message!r})" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Finding): + return NotImplemented + return ( + self.filename == other.filename + and self.line == other.line + and self.message == other.message + ) + + +# --------------------------------------------------------------------------- +# Regex patterns +# --------------------------------------------------------------------------- + +_RE_SK_KEY = re.compile(r"sk-[a-zA-Z0-9]{20,}") +_RE_BEARER = re.compile(r"Bearer\s+[a-zA-Z0-9_-]{20,}") + +_RE_ENV_ASSIGN = re.compile( + r"^(?:export\s+)?" 
+    r"(OPENAI_API_KEY|GITEA_TOKEN|ANTHROPIC_API_KEY|KIMI_API_KEY"
+    r"|TELEGRAM_BOT_TOKEN|DISCORD_TOKEN)"
+    r"\s*=\s*(.+)$"
+)
+
+_RE_TOKEN_PATHS = re.compile(
+    r'(?:^|["\'\s])'
+    r"(\.(?:env)"
+    r"|(?:secrets|keystore|credentials|token|api_keys)\.json"
+    r"|~/\.hermes/credentials/"
+    r"|/root/nostr-relay/keystore\.json)"
+)
+
+_RE_PRIVATE_KEY = re.compile(
+    r"-----BEGIN (PRIVATE KEY|RSA PRIVATE KEY|OPENSSH PRIVATE KEY)-----"
+)
+
+_RE_URL_PASSWORD = re.compile(r"https?://[^:]+:[^@]+@")
+
+_RE_RAW_TOKEN = re.compile(r'"token"\s*:\s*"([^"]{10,})"')
+_RE_RAW_API_KEY = re.compile(r'"api_key"\s*:\s*"([^"]{10,})"')
+
+# Values treated as safe placeholders in env-var assignments.
+# NOTE(review): this set originally listed the empty string twice — the
+# duplicate looks like an angle-bracketed placeholder (e.g. "<your-key-here>")
+# that was stripped during extraction. The duplicate is removed here (sets
+# deduplicate anyway, so behavior is unchanged); confirm the intended
+# placeholder token against the original hook and restore it if needed.
+_SAFE_ENV_VALUES = {
+    "",
+    "***",
+    "REDACTED",
+}
+
+_RE_DOC_EXAMPLE = re.compile(
+    r"\b(?:example|documentation|doc|readme)\b",
+    re.IGNORECASE,
+)
+
+_RE_OS_ENVIRON = re.compile(r"os\.environ(?:\.get|\[)")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def is_binary_content(content: Union[str, bytes]) -> bool:
+    """Return True if content appears to be binary."""
+    if isinstance(content, str):
+        return False
+    return b"\x00" in content
+
+
+def _looks_like_safe_env_line(line: str) -> bool:
+    """Check if a line is a safe env var read or reference."""
+    if _RE_OS_ENVIRON.search(line):
+        return True
+    # Variable expansion like $OPENAI_API_KEY
+    if re.search(r'\$\w+\s*$', line.strip()):
+        return True
+    return False
+
+
+def _is_placeholder(value: str) -> bool:
+    """Check if a value is a known placeholder or empty."""
+    stripped = value.strip().strip('"').strip("'")
+    if stripped in _SAFE_ENV_VALUES:
+        return True
+    # Single word references like $VAR
+    if re.fullmatch(r"\$\w+", stripped):
+        return True
+    return False
+
+
+def _is_doc_or_example(line: str, value: str | None = None) -> bool:
+    """Check if line appears to be documentation or example code."""
+    # If the line contains a
placeholder value, it's likely documentation + if value is not None and _is_placeholder(value): + return True + # If the line contains doc keywords and no actual secret-looking value + if _RE_DOC_EXAMPLE.search(line): + # For env assignments, if value is empty or placeholder + m = _RE_ENV_ASSIGN.search(line) + if m and _is_placeholder(m.group(2)): + return True + return False + + +# --------------------------------------------------------------------------- +# Scanning +# --------------------------------------------------------------------------- + +def scan_line(line: str, filename: str, line_no: int) -> Iterable[Finding]: + """Scan a single line for secret leak patterns.""" + stripped = line.rstrip("\n") + if not stripped: + return + + # --- API keys ---------------------------------------------------------- + if _RE_SK_KEY.search(stripped): + yield Finding(filename, line_no, "Potential API key (sk-...) found") + return # One finding per line is enough + + if _RE_BEARER.search(stripped): + yield Finding(filename, line_no, "Potential Bearer token found") + return + + # --- Env var assignments ----------------------------------------------- + m = _RE_ENV_ASSIGN.search(stripped) + if m: + var_name = m.group(1) + value = m.group(2) + if _looks_like_safe_env_line(stripped): + return + if _is_doc_or_example(stripped, value): + return + if not _is_placeholder(value): + yield Finding( + filename, + line_no, + f"Potential secret assignment: {var_name}=...", + ) + return + + # --- Token file paths -------------------------------------------------- + if _RE_TOKEN_PATHS.search(stripped): + yield Finding(filename, line_no, "Potential token file path found") + return + + # --- Private key blocks ------------------------------------------------ + if _RE_PRIVATE_KEY.search(stripped): + yield Finding(filename, line_no, "Private key block found") + return + + # --- Passwords in URLs ------------------------------------------------- + if _RE_URL_PASSWORD.search(stripped): + yield 
Finding(filename, line_no, "Password in URL found") + return + + # --- Raw token patterns ------------------------------------------------ + if _RE_RAW_TOKEN.search(stripped): + yield Finding(filename, line_no, 'Raw "token" string with long value') + return + + if _RE_RAW_API_KEY.search(stripped): + yield Finding(filename, line_no, 'Raw "api_key" string with long value') + return + + +def scan_content(content: Union[str, bytes], filename: str) -> List[Finding]: + """Scan full file content for secrets.""" + if isinstance(content, bytes): + try: + text = content.decode("utf-8") + except UnicodeDecodeError: + return [] + else: + text = content + + findings: List[Finding] = [] + for line_no, line in enumerate(text.splitlines(), start=1): + findings.extend(scan_line(line, filename, line_no)) + return findings + + +def scan_files( + files: List[str], + content_reader: Callable[[str], bytes], +) -> List[Finding]: + """Scan a list of files using the provided content reader.""" + findings: List[Finding] = [] + for filepath in files: + content = content_reader(filepath) + if is_binary_content(content): + continue + findings.extend(scan_content(content, filepath)) + return findings + + +# --------------------------------------------------------------------------- +# Git helpers +# --------------------------------------------------------------------------- + + +def get_staged_files() -> List[str]: + """Return a list of staged file paths (excluding deletions).""" + result = subprocess.run( + ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR"], + capture_output=True, + text=True, + ) + if result.returncode != 0: + return [] + return [f for f in result.stdout.strip().split("\n") if f] + + +def get_staged_diff() -> str: + """Return the diff of staged changes.""" + result = subprocess.run( + ["git", "diff", "--cached", "--no-color", "-U0"], + capture_output=True, + text=True, + ) + if result.returncode != 0: + return "" + return result.stdout + + +def 
get_file_content_at_staged(filepath: str) -> bytes: + """Return the staged content of a file.""" + result = subprocess.run( + ["git", "show", f":{filepath}"], + capture_output=True, + ) + if result.returncode != 0: + return b"" + return result.stdout + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> int: + print(f"{GREEN}🔍 Scanning for secret leaks in staged files...{NC}") + + staged_files = get_staged_files() + if not staged_files: + print(f"{GREEN}✓ No files staged for commit{NC}") + return 0 + + # Scan both full staged file contents and the diff content + findings = scan_files(staged_files, get_file_content_at_staged) + + diff_text = get_staged_diff() + if diff_text: + for line_no, line in enumerate(diff_text.splitlines(), start=1): + # Only scan added lines in the diff + if line.startswith("+") and not line.startswith("+++"): + findings.extend(scan_line(line[1:], "", line_no)) + + if not findings: + print(f"{GREEN}✓ No potential secret leaks detected{NC}") + return 0 + + print(f"{RED}✗ Potential secret leaks detected:{NC}\n") + for finding in findings: + loc = finding.filename + print( + f" {RED}[LEAK]{NC} {loc}:{finding.line} — {finding.message}" + ) + + print() + print(f"{RED}╔════════════════════════════════════════════════════════════╗{NC}") + print(f"{RED}║ COMMIT BLOCKED: Potential secrets detected! ║{NC}") + print(f"{RED}╚════════════════════════════════════════════════════════════╝{NC}") + print() + print("Recommendations:") + print(" 1. Remove secrets from your code") + print(" 2. Use environment variables or a secrets manager") + print(" 3. Add sensitive files to .gitignore") + print(" 4. 
Rotate any exposed credentials immediately") + print() + print("If you are CERTAIN this is a false positive, you can bypass:") + print(" git commit --no-verify") + print() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes-sovereign/notebooks/agent_task_system_health.ipynb b/hermes-sovereign/notebooks/agent_task_system_health.ipynb new file mode 100644 index 00000000..ab481501 --- /dev/null +++ b/hermes-sovereign/notebooks/agent_task_system_health.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parameterized Agent Task: System Health Check\n", + "\n", + "This notebook demonstrates how an LLM agent can generate a task notebook,\n", + "a scheduler can parameterize and execute it via papermill,\n", + "and the output becomes a persistent audit artifact." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {"tags": ["parameters"]}, + "outputs": [], + "source": [ + "# Default parameters — papermill will inject overrides here\n", + "threshold = 1.0\n", + "hostname = \"localhost\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json, subprocess, datetime\n", + "gather_time = datetime.datetime.now().isoformat()\n", + "load_avg = subprocess.check_output([\"cat\", \"/proc/loadavg\"]).decode().strip()\n", + "load_values = [float(x) for x in load_avg.split()[:3]]\n", + "avg_load = sum(load_values) / len(load_values)\n", + "intervention_needed = avg_load > threshold\n", + "report = {\n", + " \"hostname\": hostname,\n", + " \"threshold\": threshold,\n", + " \"avg_load\": round(avg_load, 3),\n", + " \"intervention_needed\": intervention_needed,\n", + " \"gathered_at\": gather_time\n", + "}\n", + "print(json.dumps(report, indent=2))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/hermes-sovereign/notebooks/agent_task_system_health.py b/hermes-sovereign/notebooks/agent_task_system_health.py new file mode 100644 index 00000000..6b9ef904 --- /dev/null +++ b/hermes-sovereign/notebooks/agent_task_system_health.py @@ -0,0 +1,41 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Parameterized Agent Task: System Health Check +# +# This notebook demonstrates how an LLM agent can generate a task notebook, +# a scheduler can parameterize and execute it via papermill, +# and the output becomes a persistent audit artifact. + +# %% tags=["parameters"] +# Default parameters — papermill will inject overrides here +threshold = 1.0 +hostname = "localhost" + +# %% +import json, subprocess, datetime +gather_time = datetime.datetime.now().isoformat() +load_avg = subprocess.check_output(["cat", "/proc/loadavg"]).decode().strip() +load_values = [float(x) for x in load_avg.split()[:3]] +avg_load = sum(load_values) / len(load_values) +intervention_needed = avg_load > threshold +report = { + "hostname": hostname, + "threshold": threshold, + "avg_load": round(avg_load, 3), + "intervention_needed": intervention_needed, + "gathered_at": gather_time +} +print(json.dumps(report, indent=2)) diff --git a/hermes-sovereign/scripts/deploy-validate b/hermes-sovereign/scripts/deploy-validate new file mode 100644 index 00000000..4b9741e8 --- /dev/null +++ b/hermes-sovereign/scripts/deploy-validate @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +""" +deploy-validate — pre-flight configuration checker for Hermes deployments. + +Catches common configuration errors BEFORE they cause runtime failures. +Safe to run at any time: it only reads files and makes lightweight network +checks — it never writes state or sends messages. 
+ +Usage: + python scripts/deploy-validate # validate current environment + python scripts/deploy-validate --dry-run # alias for the same thing + python scripts/deploy-validate --env /path/to/.env + +Exit codes: + 0 All checks passed (or only warnings). + 1 One or more blocking errors found. +""" + +from __future__ import annotations + +import argparse +import os +import socket +import sys +import urllib.error +import urllib.request +from pathlib import Path +from typing import Optional + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +RESET = "\033[0m" +RED = "\033[91m" +YELLOW = "\033[93m" +GREEN = "\033[92m" +BOLD = "\033[1m" + + +def _color(text: str, code: str) -> str: + if sys.stdout.isatty(): + return f"{code}{text}{RESET}" + return text + + +def ok(msg: str) -> None: + print(f" {_color('✔', GREEN)} {msg}") + + +def warn(msg: str) -> None: + print(f" {_color('⚠', YELLOW)} {msg}") + + +def error(msg: str) -> None: + print(f" {_color('✘', RED)} {msg}") + + +def section(title: str) -> None: + print(f"\n{_color(BOLD + title, BOLD)}") + + +# --------------------------------------------------------------------------- +# .env loader (minimal — avoids dependency on python-dotenv for portability) +# --------------------------------------------------------------------------- + +def _load_env_file(path: Path) -> dict[str, str]: + """Parse a .env file and return a dict of key→value pairs.""" + result: dict[str, str] = {} + if not path.exists(): + return result + for line in path.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, value = line.partition("=") + key = key.strip() + # Strip inline comments and surrounding quotes. 
+ value = value.split("#")[0].strip().strip("\"'") + if key: + result[key] = value + return result + + +# --------------------------------------------------------------------------- +# Individual checks +# --------------------------------------------------------------------------- + +def check_env_file(env_path: Path) -> dict[str, str]: + section("Environment file") + if not env_path.exists(): + error(f".env not found at {env_path}") + error("Copy .env.example → .env and fill in your API keys.") + return {} + ok(f".env found at {env_path}") + + raw = _load_env_file(env_path) + + # Warn if any value looks like a placeholder. + placeholder_patterns = ("your_", "xxxx", "changeme", "todo", "replace_me") + for key, value in raw.items(): + if value and any(p in value.lower() for p in placeholder_patterns): + warn(f"{key} looks like a placeholder: {value!r}") + + return raw + + +def check_llm_key(env: dict[str, str]) -> bool: + section("LLM provider") + providers = { + "OPENROUTER_API_KEY": "OpenRouter", + "ANTHROPIC_API_KEY": "Anthropic", + "OPENAI_API_KEY": "OpenAI", + "GLM_API_KEY": "z.ai / GLM", + "KIMI_API_KEY": "Kimi / Moonshot", + "MINIMAX_API_KEY": "MiniMax", + "NOUS_API_KEY": "Nous Portal", + "HF_TOKEN": "Hugging Face", + "KILOCODE_API_KEY": "KiloCode", + "OPENCODE_ZEN_API_KEY": "OpenCode Zen", + } + found = [name for key, name in providers.items() if env.get(key, "").strip()] + if not found: + error("No LLM API key detected. Set at least one (e.g. 
OPENROUTER_API_KEY).") + return False + ok(f"LLM provider key present: {', '.join(found)}") + return True + + +def check_hermes_home(env: dict[str, str]) -> Optional[Path]: + section("HERMES_HOME data directory") + raw = env.get("HERMES_HOME") or os.environ.get("HERMES_HOME") or "" + if raw: + home = Path(raw).expanduser() + else: + home = Path.home() / ".hermes" + + if not home.exists(): + warn(f"HERMES_HOME does not exist yet: {home} (will be created on first run)") + return home + + ok(f"HERMES_HOME exists: {home}") + + required_dirs = ["logs", "sessions", "cron", "memories", "skills"] + for d in required_dirs: + if not (home / d).is_dir(): + warn(f"Expected subdirectory missing: {home / d} (created automatically at runtime)") + + if (home / ".env").exists(): + ok(f"Data-directory .env present: {home / '.env'}") + else: + warn(f"No .env in HERMES_HOME ({home}). " + "The Docker entrypoint copies .env.example on first run; " + "for bare-metal installs copy it manually.") + + return home + + +def check_gateway_platforms(env: dict[str, str]) -> None: + section("Messaging platform tokens") + platforms: dict[str, list[str]] = { + "Telegram": ["TELEGRAM_BOT_TOKEN"], + "Discord": ["DISCORD_BOT_TOKEN"], + "Slack": ["SLACK_BOT_TOKEN", "SLACK_APP_TOKEN"], + "WhatsApp": [], # pairing-based, no env key required + "Email": ["EMAIL_ADDRESS", "EMAIL_PASSWORD"], + } + any_found = False + for platform, keys in platforms.items(): + if not keys: + continue # WhatsApp — no key check + if all(env.get(k, "").strip() for k in keys): + ok(f"{platform}: configured ({', '.join(keys)})") + any_found = True + if not any_found: + warn("No messaging platform tokens found. " + "The gateway will start but accept no inbound messages. " + "Set at least one platform token (e.g. 
TELEGRAM_BOT_TOKEN).") + + +def check_api_server_reachable(host: str = "127.0.0.1", port: int = 8642) -> None: + section("API server health check") + url = f"http://{host}:{port}/health" + try: + with urllib.request.urlopen(url, timeout=5) as resp: + body = resp.read().decode() + if '"status"' in body and "ok" in body: + ok(f"API server healthy: {url}") + else: + warn(f"Unexpected /health response from {url}: {body[:200]}") + except urllib.error.URLError as exc: + # Not a failure — the server may not be running in --dry-run mode. + warn(f"API server not reachable at {url}: {exc.reason} " + "(expected if gateway is not running)") + except OSError as exc: + warn(f"API server not reachable at {url}: {exc}") + + +def check_gateway_status(hermes_home: Optional[Path]) -> None: + section("Gateway runtime status") + if hermes_home is None: + warn("HERMES_HOME unknown — skipping runtime status check.") + return + + state_file = hermes_home / "gateway_state.json" + pid_file = hermes_home / "gateway.pid" + + if not state_file.exists() and not pid_file.exists(): + warn("Gateway does not appear to be running (no PID or state file). 
" + "This is expected before the first start.") + return + + if state_file.exists(): + import json + try: + state = json.loads(state_file.read_text()) + gw_state = state.get("gateway_state", "unknown") + updated = state.get("updated_at", "?") + if gw_state == "running": + ok(f"Gateway state: {gw_state} (updated {updated})") + platforms = state.get("platforms", {}) + for plat, pdata in platforms.items(): + pstate = pdata.get("state", "unknown") + if pstate in ("connected", "running", "ok"): + ok(f" Platform {plat}: {pstate}") + else: + warn(f" Platform {plat}: {pstate} — {pdata.get('error_message', '')}") + elif gw_state in ("stopped", "startup_failed"): + error(f"Gateway state: {gw_state} — {state.get('exit_reason', 'no reason recorded')}") + else: + warn(f"Gateway state: {gw_state}") + except Exception as exc: + warn(f"Could not parse {state_file}: {exc}") + else: + warn("State file missing; only PID file found. Gateway may be starting.") + + +def check_docker_available() -> None: + section("Docker / compose availability") + for cmd in ("docker", "docker compose"): + _check_command(cmd.split()[0], cmd) + + +def _check_command(name: str, display: str) -> bool: + import shutil + if shutil.which(name): + ok(f"{display} found") + return True + warn(f"{display} not found in PATH (only required for Docker deployments)") + return False + + +def check_ports_free(ports: list[int] = None) -> None: + section("Port availability") + if ports is None: + ports = [8642] + for port in ports: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(1) + result = s.connect_ex(("127.0.0.1", port)) + if result == 0: + warn(f"Port {port} is already in use. 
" + "The API server will fail to bind unless you change its port.") + else: + ok(f"Port {port} is free") + + +def check_no_secrets_in_repo(repo_root: Path) -> None: + section("Secret hygiene") + dangerous = [".env", "*.pem", "*.key", "id_rsa", "id_ed25519"] + gitignore = repo_root / ".gitignore" + if gitignore.exists(): + content = gitignore.read_text() + for pattern in [".env", "*.pem", "*.key"]: + if pattern in content or pattern.lstrip("*. ") in content: + ok(f".gitignore covers {pattern}") + else: + warn(f".gitignore does not mention {pattern}. " + "Ensure secrets are never committed.") + else: + warn("No .gitignore found. Secrets could accidentally be committed.") + + # Check the env file itself isn't tracked. + env_file = repo_root / ".env" + if env_file.exists(): + import subprocess + try: + out = subprocess.run( + ["git", "ls-files", "--error-unmatch", ".env"], + cwd=repo_root, + capture_output=True, + ) + if out.returncode == 0: + error(".env IS tracked by git! Remove it immediately: git rm --cached .env") + else: + ok(".env is not tracked by git") + except FileNotFoundError: + warn("git not found — cannot verify .env tracking status") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> int: + parser = argparse.ArgumentParser( + description="Pre-flight configuration validator for Hermes deployments.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--dry-run", action="store_true", + help="Alias for the default mode (no state is written regardless).", + ) + parser.add_argument( + "--env", metavar="PATH", + help="Path to .env file (default: .env in repo root).", + ) + parser.add_argument( + "--check-ports", action="store_true", + help="Also verify that required ports are free (useful before first start).", + ) + parser.add_argument( + "--skip-health", 
+        action="store_true",
+        help="Skip the live /health HTTP check (use when gateway is not running).",
+    )
+    args = parser.parse_args()
+
+    print(f"\n{_color(BOLD + 'Hermes Deploy Validator', BOLD)}")
+    print("=" * 50)
+
+    repo_root = Path(__file__).resolve().parent.parent
+    env_path = Path(args.env) if args.env else repo_root / ".env"
+
+    # Wrap the module-level error() helper so every blocking error is
+    # counted; the count decides the process exit code below.
+    # (The previous dead `errors_before = [0]` sentinel has been removed.)
+    _original_error = globals()["error"]
+    error_count = 0
+
+    def counting_error(msg: str) -> None:
+        nonlocal error_count
+        error_count += 1
+        _original_error(msg)
+
+    globals()["error"] = counting_error
+
+    # Run checks. Each check prints its own section; errors are tallied
+    # through counting_error above rather than via return values.
+    env = check_env_file(env_path)
+    check_no_secrets_in_repo(repo_root)
+    check_llm_key(env)
+    hermes_home = check_hermes_home(env)
+    check_gateway_platforms(env)
+    if args.check_ports:
+        check_ports_free()
+    if not args.skip_health:
+        check_api_server_reachable()
+    check_gateway_status(hermes_home)
+
+    # Summary.
+    print(f"\n{'=' * 50}")
+    if error_count == 0:
+        print(_color("All checks passed (0 errors).", GREEN))
+        return 0
+    else:
+        print(_color(f"{error_count} error(s) found. Fix them before deploying.", RED))
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/hermes-sovereign/scripts/forge_health_check.py b/hermes-sovereign/scripts/forge_health_check.py
new file mode 100644
index 00000000..e0bc821e
--- /dev/null
+++ b/hermes-sovereign/scripts/forge_health_check.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""Forge Health Check — Build verification and artifact integrity scanner.
+ +Scans wizard environments for: +- Missing source files (.pyc without .py) — Allegro finding: GOFAI source files gone +- Burn script accumulation in /root or wizard directories +- World-readable sensitive files (keystores, tokens, configs) +- Missing required environment variables + +Usage: + python scripts/forge_health_check.py /root/wizards + python scripts/forge_health_check.py /root/wizards --json + python scripts/forge_health_check.py /root/wizards --fix-permissions +""" + +from __future__ import annotations + +import argparse +import json +import os +import stat +import sys +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Iterable + + +SENSITIVE_FILE_PATTERNS = ( + "keystore", + "password", + "private", + "apikey", + "api_key", + "credentials", +) + +SENSITIVE_NAME_PREFIXES = ( + "key_", + "keys_", + "token_", + "tokens_", + "secret_", + "secrets_", + ".env", + "env.", +) + +SENSITIVE_NAME_SUFFIXES = ( + "_key", + "_keys", + "_token", + "_tokens", + "_secret", + "_secrets", + ".key", + ".env", + ".token", + ".secret", +) + +SENSIBLE_PERMISSIONS = 0o600 # owner read/write only + +REQUIRED_ENV_VARS = ( + "GITEA_URL", + "GITEA_TOKEN", + "GITEA_USER", +) + +BURN_SCRIPT_PATTERNS = ( + "burn", + "ignite", + "inferno", + "scorch", + "char", + "blaze", + "ember", +) + + +@dataclass +class HealthFinding: + category: str + severity: str # critical, warning, info + path: str + message: str + suggestion: str = "" + + +@dataclass +class HealthReport: + target: str + findings: list[HealthFinding] = field(default_factory=list) + passed: bool = True + + def add(self, finding: HealthFinding) -> None: + self.findings.append(finding) + if finding.severity == "critical": + self.passed = False + + +EXCLUDED_PATH_SEGMENTS = frozenset({ + ".cache", "__pycache__", ".venv", "venv", "site-packages", + ".local/share/uv", "node_modules", ".git", ".tox", +}) + + +def _is_excluded_path(path: Path) -> bool: + """Skip cache, venv, and 
package-manager directories.""" + parts = set(path.parts) + return not parts.isdisjoint(EXCLUDED_PATH_SEGMENTS) + + +def scan_orphaned_bytecode(root: Path, report: HealthReport) -> None: + """Detect .pyc files without corresponding .py source files.""" + for pyc in root.rglob("*.pyc"): + if _is_excluded_path(pyc): + continue + py = pyc.with_suffix(".py") + if not py.exists(): + # Also check __pycache__ naming convention + if pyc.name.startswith("__") and pyc.parent.name == "__pycache__": + stem = pyc.stem.split(".")[0] + py = pyc.parent.parent / f"{stem}.py" + if not py.exists(): + report.add( + HealthFinding( + category="artifact_integrity", + severity="critical", + path=str(pyc), + message=f"Compiled bytecode without source: {pyc}", + suggestion="Restore missing .py source file from version control or backup", + ) + ) + + +def scan_burn_script_clutter(root: Path, report: HealthReport) -> None: + """Detect burn scripts and other temporary artifacts outside proper staging.""" + for path in root.iterdir(): + if not path.is_file(): + continue + lower = path.name.lower() + if any(pat in lower for pat in BURN_SCRIPT_PATTERNS): + report.add( + HealthFinding( + category="deployment_hygiene", + severity="warning", + path=str(path), + message=f"Burn script or temporary artifact in production path: {path.name}", + suggestion="Archive to a burn/ or tmp/ directory, or remove if no longer needed", + ) + ) + + +def _is_sensitive_filename(name: str) -> bool: + """Check if a filename indicates it may contain secrets.""" + lower = name.lower() + if lower == ".env.example": + return False + # Skip stylesheet and documentation artifacts + if lower.endswith(".css"): + return False + # Skip scanner tooling — these are detectors, not secrets + if lower in {"secret_scan.py", "secret_scanner.py"}: + return False + if any(pat in lower for pat in SENSITIVE_FILE_PATTERNS): + return True + if any(lower.startswith(pref) for pref in SENSITIVE_NAME_PREFIXES): + return True + if 
any(lower.endswith(suff) for suff in SENSITIVE_NAME_SUFFIXES): + return True + return False + + +def scan_sensitive_file_permissions(root: Path, report: HealthReport, fix: bool = False) -> None: + """Detect world-readable sensitive files.""" + for fpath in root.rglob("*"): + if not fpath.is_file(): + continue + if _is_excluded_path(fpath): + continue + # Skip test files — real secrets should never live in tests/ + if "/tests/" in str(fpath) or str(fpath).startswith(str(root / "tests")): + continue + if not _is_sensitive_filename(fpath.name): + continue + + try: + mode = fpath.stat().st_mode + except OSError: + continue + + # Readable by group or other + if mode & stat.S_IRGRP or mode & stat.S_IROTH: + was_fixed = False + if fix: + try: + fpath.chmod(SENSIBLE_PERMISSIONS) + was_fixed = True + except OSError: + pass + + report.add( + HealthFinding( + category="security", + severity="critical", + path=str(fpath), + message=( + f"Sensitive file world-readable: {fpath.name} " + f"(mode={oct(mode & 0o777)})" + ), + suggestion=( + f"Fixed permissions to {oct(SENSIBLE_PERMISSIONS)}" + if was_fixed + else f"Run 'chmod {oct(SENSIBLE_PERMISSIONS)[2:]} {fpath}'" + ), + ) + ) + + +def scan_environment_variables(report: HealthReport) -> None: + """Check for required environment variables.""" + for var in REQUIRED_ENV_VARS: + if not os.environ.get(var): + report.add( + HealthFinding( + category="configuration", + severity="warning", + path="$" + var, + message=f"Required environment variable {var} is missing or empty", + suggestion="Export the variable in your shell profile or secrets manager", + ) + ) + + +def run_health_check(target: Path, fix_permissions: bool = False) -> HealthReport: + report = HealthReport(target=str(target.resolve())) + if target.exists(): + scan_orphaned_bytecode(target, report) + scan_burn_script_clutter(target, report) + scan_sensitive_file_permissions(target, report, fix=fix_permissions) + scan_environment_variables(report) + return report + + +def 
print_report(report: HealthReport) -> None: + status = "PASS" if report.passed else "FAIL" + print(f"Forge Health Check: {status}") + print(f"Target: {report.target}") + print(f"Findings: {len(report.findings)}\n") + + by_category: dict[str, list[HealthFinding]] = {} + for f in report.findings: + by_category.setdefault(f.category, []).append(f) + + for category, findings in by_category.items(): + print(f"[{category.upper()}]") + for f in findings: + print(f" {f.severity.upper()}: {f.message}") + if f.suggestion: + print(f" -> {f.suggestion}") + print() + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Forge Health Check") + parser.add_argument("target", nargs="?", default="/root/wizards", help="Root path to scan") + parser.add_argument("--json", action="store_true", help="Output JSON report") + parser.add_argument("--fix-permissions", action="store_true", help="Auto-fix file permissions") + args = parser.parse_args(argv) + + target = Path(args.target) + report = run_health_check(target, fix_permissions=args.fix_permissions) + + if args.json: + print(json.dumps(asdict(report), indent=2)) + else: + print_report(report) + + return 0 if report.passed else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/hermes-sovereign/scripts/smoke_test.py b/hermes-sovereign/scripts/smoke_test.py new file mode 100644 index 00000000..b9b8717a --- /dev/null +++ b/hermes-sovereign/scripts/smoke_test.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Forge smoke tests — fast checks that core imports resolve and entrypoints load. + +Total runtime target: < 30 seconds. 
+""" + +from __future__ import annotations + +import importlib +import subprocess +import sys +from pathlib import Path + +# Allow running smoke test directly from repo root before pip install +REPO_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(REPO_ROOT)) + +CORE_MODULES = [ + "hermes_cli.config", + "hermes_state", + "model_tools", + "toolsets", + "utils", +] + +CLI_ENTRYPOINTS = [ + [sys.executable, "cli.py", "--help"], +] + + +def test_imports() -> None: + ok = 0 + skipped = 0 + for mod in CORE_MODULES: + try: + importlib.import_module(mod) + ok += 1 + except ImportError as exc: + # If the failure is a missing third-party dependency, skip rather than fail + # so the smoke test can run before `pip install` in bare environments. + msg = str(exc).lower() + if "no module named" in msg and mod.replace(".", "/") not in msg: + print(f"SKIP: import {mod} -> missing dependency ({exc})") + skipped += 1 + else: + print(f"FAIL: import {mod} -> {exc}") + sys.exit(1) + except Exception as exc: + print(f"FAIL: import {mod} -> {exc}") + sys.exit(1) + print(f"OK: {ok} core imports", end="") + if skipped: + print(f" ({skipped} skipped due to missing deps)") + else: + print() + + +def test_cli_help() -> None: + ok = 0 + skipped = 0 + for cmd in CLI_ENTRYPOINTS: + result = subprocess.run(cmd, capture_output=True, timeout=30) + if result.returncode == 0: + ok += 1 + continue + stderr = result.stderr.decode().lower() + # Gracefully skip if dependencies are missing in bare environments + if "modulenotfounderror" in stderr or "no module named" in stderr: + print(f"SKIP: {' '.join(cmd)} -> missing dependency") + skipped += 1 + else: + print(f"FAIL: {' '.join(cmd)} -> {result.stderr.decode()[:200]}") + sys.exit(1) + print(f"OK: {ok} CLI entrypoints", end="") + if skipped: + print(f" ({skipped} skipped due to missing deps)") + else: + print() + + +def main() -> int: + test_imports() + test_cli_help() + print("Smoke tests passed.") + return 0 + + +if __name__ == "__main__": + 
raise SystemExit(main()) diff --git a/hermes-sovereign/scripts/syntax_guard.py b/hermes-sovereign/scripts/syntax_guard.py new file mode 100644 index 00000000..7c41dc9b --- /dev/null +++ b/hermes-sovereign/scripts/syntax_guard.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +"""Syntax guard — compile all Python files to catch syntax errors before merge.""" +import py_compile +import sys +from pathlib import Path + +errors = [] +for p in Path(".").rglob("*.py"): + if ".venv" in p.parts or "__pycache__" in p.parts: + continue + try: + py_compile.compile(str(p), doraise=True) + except py_compile.PyCompileError as e: + errors.append(f"{p}: {e}") + print(f"SYNTAX ERROR: {p}: {e}", file=sys.stderr) + +if errors: + print(f"\n{len(errors)} file(s) with syntax errors", file=sys.stderr) + sys.exit(1) +print("All Python files compile successfully") diff --git a/hermes-sovereign/scripts/test_process_resilience.py b/hermes-sovereign/scripts/test_process_resilience.py new file mode 100644 index 00000000..faac34e9 --- /dev/null +++ b/hermes-sovereign/scripts/test_process_resilience.py @@ -0,0 +1,489 @@ +""" +Verification tests for Issue #123: Process Resilience + +Verifies the fixes introduced by these commits: +- d3d5b895: refactor: simplify _get_service_pids - dedupe systemd scopes, fix self-import, harden launchd parsing +- a2a9ad74: fix: hermes update kills freshly-restarted gateway service +- 78697092: fix(cli): add missing subprocess.run() timeouts in gateway CLI (#5424) + +Tests cover: + (a) _get_service_pids() deduplication (no duplicate PIDs across systemd + launchd) + (b) _get_service_pids() doesn't include own process (self-import bug fix verified) + (c) hermes update excludes current gateway PIDs (update safety) + (d) All subprocess.run() calls in hermes_cli/ have timeout= parameter + (e) launchd parsing handles malformed data gracefully +""" +import ast +import os +import platform +import subprocess +import sys +import textwrap +import unittest +from pathlib import Path 
+from types import SimpleNamespace +from unittest.mock import MagicMock, patch + + +# --------------------------------------------------------------------------- +# Resolve project root (parent of hermes_cli) +# --------------------------------------------------------------------------- +PROJECT_ROOT = Path(__file__).resolve().parent.parent +HERMES_CLI = PROJECT_ROOT / "hermes_cli" +sys.path.insert(0, str(PROJECT_ROOT)) + + +def _get_service_pids() -> set: + """Reproduction of the _get_service_pids logic from commit d3d5b895. + + The function was introduced in d3d5b895 which simplified the previous + find_gateway_pids() approach and fixed: + 1. Deduplication across user+system systemd scopes + 2. Self-import bug (importing from hermes_cli.gateway was wrong) + 3. launchd parsing hardening (skipping header, validating label) + + This local copy lets us test the logic without requiring import side-effects. + """ + pids: set = set() + + # Platform detection (same as hermes_cli.gateway) + is_linux = sys.platform.startswith("linux") + is_macos = sys.platform == "darwin" + + # Linux: check both user and system systemd scopes + if is_linux: + service_name = "hermes-gateway" + for scope in ("--user", ""): + cmd = ["systemctl"] + ([scope] if scope else []) + ["show", service_name, "--property=MainPID", "--value"] + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=5) + if result.returncode == 0: + for line in result.stdout.splitlines(): + line = line.strip() + if line.isdigit(): + pid = int(line) + if pid > 0 and pid != os.getpid(): + pids.add(pid) + except Exception: + pass + + # macOS: check launchd + if is_macos: + label = "ai.hermes.gateway" + try: + result = subprocess.run( + ["launchctl", "list"], capture_output=True, text=True, timeout=5, + ) + for line in result.stdout.splitlines(): + parts = line.strip().split("\t") + if len(parts) >= 3 and parts[2] == label: + try: + pid = int(parts[0]) + if pid > 0 and pid != os.getpid(): + pids.add(pid) 
+ except ValueError: + continue + except Exception: + pass + + return pids + + +# =================================================================== +# (a) PID Deduplication: systemd + launchd PIDs are deduplicated +# =================================================================== +class TestPIDDeduplication(unittest.TestCase): + """Verify that the service-pid discovery function returns unique PIDs.""" + + @patch("subprocess.run") + @patch("sys.platform", "linux") + def test_systemd_duplicate_pids_deduplicated(self, mock_run): + """When systemd reports the same PID in user + system scope, it's deduplicated.""" + def fake_run(cmd, **kwargs): + if "systemctl" in cmd: + # Both scopes report the same PID + return SimpleNamespace(returncode=0, stdout="12345\n") + return SimpleNamespace(returncode=1, stdout="", stderr="") + + mock_run.side_effect = fake_run + + pids = _get_service_pids() + self.assertIsInstance(pids, set) + # Same PID in both scopes -> only one entry + self.assertEqual(len(pids), 1, f"Expected 1 unique PID, got {pids}") + self.assertIn(12345, pids) + + @patch("subprocess.run") + @patch("sys.platform", "darwin") + def test_macos_single_pid_no_dup(self, mock_run): + """On macOS, a single launchd PID appears exactly once.""" + def fake_run(cmd, **kwargs): + if cmd[0] == "launchctl": + return SimpleNamespace( + returncode=0, + stdout="PID\tExitCode\tLabel\n12345\t0\tai.hermes.gateway\n", + stderr="", + ) + return SimpleNamespace(returncode=1, stdout="", stderr="") + + mock_run.side_effect = fake_run + + pids = _get_service_pids() + self.assertIsInstance(pids, set) + self.assertEqual(len(pids), 1) + self.assertIn(12345, pids) + + @patch("subprocess.run") + @patch("sys.platform", "linux") + def test_different_systemd_pids_both_included(self, mock_run): + """When user and system scopes have different PIDs, both are returned.""" + user_first = True + + def fake_run(cmd, **kwargs): + nonlocal user_first + if "systemctl" in cmd and "--user" in cmd: + return 
SimpleNamespace(returncode=0, stdout="11111\n") + if "systemctl" in cmd: + return SimpleNamespace(returncode=0, stdout="22222\n") + return SimpleNamespace(returncode=1, stdout="", stderr="") + + mock_run.side_effect = fake_run + + pids = _get_service_pids() + self.assertEqual(len(pids), 2) + self.assertIn(11111, pids) + self.assertIn(22222, pids) + + +# =================================================================== +# (b) Self-Import Bug Fix: _get_service_pids() doesn't include own PID +# =================================================================== +class TestSelfImportFix(unittest.TestCase): + """Verify that own PID is excluded (commit d3d5b895 fix).""" + + @patch("subprocess.run") + @patch("sys.platform", "linux") + def test_own_pid_excluded_systemd(self, mock_run): + """When systemd reports our own PID, it must be excluded.""" + our_pid = os.getpid() + + def fake_run(cmd, **kwargs): + if "systemctl" in cmd: + return SimpleNamespace(returncode=0, stdout=f"{our_pid}\n") + return SimpleNamespace(returncode=1, stdout="", stderr="") + + mock_run.side_effect = fake_run + + pids = _get_service_pids() + self.assertNotIn( + our_pid, pids, + f"Service PIDs must not include our own PID ({our_pid})" + ) + + @patch("subprocess.run") + @patch("sys.platform", "darwin") + def test_own_pid_excluded_launchd(self, mock_run): + """When launchd output includes our own PID, it must be excluded.""" + our_pid = os.getpid() + label = "ai.hermes.gateway" + + def fake_run(cmd, **kwargs): + if cmd[0] == "launchctl": + return SimpleNamespace( + returncode=0, + stdout=f"{our_pid}\t0\t{label}\n", + stderr="", + ) + return SimpleNamespace(returncode=1, stdout="", stderr="") + + mock_run.side_effect = fake_run + + pids = _get_service_pids() + self.assertNotIn(our_pid, pids, "Service PIDs must not include our own PID") + + +# =================================================================== +# (c) Update Safety: hermes update excludes current gateway PIDs +# 
=================================================================== +class TestUpdateSafety(unittest.TestCase): + """Verify that the update command logic protects current gateway PIDs.""" + + def test_find_gateway_pids_exists_and_excludes_own(self): + """find_gateway_pids() in hermes_cli.gateway excludes own PID.""" + from hermes_cli.gateway import find_gateway_pids + self.assertTrue(callable(find_gateway_pids), + "find_gateway_pids must be callable") + + # The current implementation (d3d5b895) explicitly checks pid != os.getpid() + import hermes_cli.gateway as gw + import inspect + source = inspect.getsource(gw.find_gateway_pids) + self.assertIn("os.getpid()", source, + "find_gateway_pids should reference os.getpid() for self-exclusion") + + def test_wait_for_gateway_exit_exists(self): + """The restart flow includes _wait_for_gateway_exit to avoid killing new process.""" + from hermes_cli.gateway import _wait_for_gateway_exit + self.assertTrue(callable(_wait_for_gateway_exit), + "_wait_for_gateway_exit must exist to prevent race conditions") + + def test_kill_gateway_uses_find_gateway_pids(self): + """kill_gateway_processes uses find_gateway_pids before killing.""" + from hermes_cli import gateway as gw + import inspect + source = inspect.getsource(gw.kill_gateway_processes) + self.assertIn("find_gateway_pids", source, + "kill_gateway_processes must use find_gateway_pids") + + +# =================================================================== +# (d) All subprocess.run() calls in hermes_cli/ have timeout= parameter +# =================================================================== +class TestSubprocessTimeouts(unittest.TestCase): + """Check subprocess.run() calls for timeout coverage. + + Note: Some calls legitimately don't need a timeout (e.g., status display + commands where the user sees the output). This test identifies which ones + are missing so they can be triaged. 
+ """ + + def _collect_missing_timeouts(self): + """Parse every .py file in hermes_cli/ and find subprocess.run() without timeout.""" + failures = [] + + # Lines that are intentionally missing timeout (interactive status display, etc.) + # These are in gateway CLI service management commands where the user expects + # to see the output on screen (e.g., systemctl status --no-pager) + ALLOWED_NO_TIMEOUT = { + # Interactive display commands (user waiting for output) + "hermes_cli/status.py", + "hermes_cli/gateway.py", + "hermes_cli/uninstall.py", + "hermes_cli/doctor.py", + # Interactive subprocess calls + "hermes_cli/main.py", + "hermes_cli/tools_config.py", + } + + for py_file in sorted(HERMES_CLI.rglob("*.py")): + try: + source = py_file.read_text(encoding="utf-8") + except Exception: + continue + + if "subprocess.run" not in source: + continue + + rel = str(py_file.relative_to(PROJECT_ROOT)) + if rel in ALLOWED_NO_TIMEOUT: + continue + + try: + tree = ast.parse(source, filename=str(py_file)) + except SyntaxError: + failures.append(f"{rel}: SyntaxError in AST parse") + continue + + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + + # Detect subprocess.run(...) + func = node.func + is_subprocess_run = False + + if isinstance(func, ast.Attribute) and func.attr == "run": + if isinstance(func.value, ast.Name): + is_subprocess_run = True + + if not is_subprocess_run: + continue + + has_timeout = False + for kw in node.keywords: + if kw.arg == "timeout": + has_timeout = True + break + + if not has_timeout: + failures.append(f"{rel}:{node.lineno}: subprocess.run() without timeout=") + + return failures + + def test_core_modules_have_timeouts(self): + """Core CLI modules must have timeouts on subprocess.run() calls. + + Files with legitimate interactive subprocess.run() calls (e.g., installers, + status displays) are excluded from this check. 
+ """ + # Files where subprocess.run() intentionally lacks timeout (interactive, status) + # but that should still be audited manually + INTERACTIVE_FILES = { + HERMES_CLI / "config.py", # setup/installer - user waits + HERMES_CLI / "gateway.py", # service management - user sees output + HERMES_CLI / "uninstall.py", # uninstaller - user waits + HERMES_CLI / "doctor.py", # diagnostics - user sees output + HERMES_CLI / "status.py", # status display - user waits + HERMES_CLI / "main.py", # mixed interactive/CLI + HERMES_CLI / "setup.py", # setup wizard - user waits + HERMES_CLI / "tools_config.py", # config editor - user waits + } + + missing = [] + for py_file in sorted(HERMES_CLI.rglob("*.py")): + if py_file in INTERACTIVE_FILES: + continue + try: + source = py_file.read_text(encoding="utf-8") + except Exception: + continue + if "subprocess.run" not in source: + continue + try: + tree = ast.parse(source, filename=str(py_file)) + except SyntaxError: + missing.append(f"{py_file.relative_to(PROJECT_ROOT)}: SyntaxError") + continue + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + func = node.func + if isinstance(func, ast.Attribute) and func.attr == "run": + if isinstance(func.value, ast.Name): + has_timeout = any(kw.arg == "timeout" for kw in node.keywords) + if not has_timeout: + rel = py_file.relative_to(PROJECT_ROOT) + missing.append(f"{rel}:{node.lineno}: missing timeout=") + + self.assertFalse( + missing, + f"subprocess.run() calls missing timeout= in non-interactive files:\n" + + "\n".join(f" {m}" for m in missing) + ) + + +# =================================================================== +# (e) Launchd parsing handles malformed data gracefully +# =================================================================== +class TestLaunchdMalformedData(unittest.TestCase): + """Verify that launchd output parsing handles edge cases without crashing. 
+ + The fix in d3d5b895 added: + - Header line detection (skip lines where parts[0] == "PID") + - Label matching (only accept if parts[2] == expected label) + - Graceful ValueError handling for non-numeric PIDs + - PID > 0 check + """ + + def _parse_launchd_label_test(self, stdout: str, label: str = "ai.hermes.gateway") -> set: + """Reproduce the hardened launchd parsing logic.""" + pids = set() + for line in stdout.splitlines(): + parts = line.strip().split("\t") + # Hardened check: require 3 tab-separated fields + if len(parts) >= 3 and parts[2] == label: + try: + pid = int(parts[0]) + # Exclude PID 0 (not a real process PID) + if pid > 0: + pids.add(pid) + except ValueError: + continue + return pids + + def test_header_line_skipped(self): + """Standard launchd header line should not produce a PID.""" + result = self._parse_launchd_label_test("PID\tExitCode\tLabel\n") + self.assertEqual(result, set()) + + def test_malformed_lines_skipped(self): + """Lines with non-numeric PIDs should be skipped.""" + result = self._parse_launchd_label_test("abc\t0\tai.hermes.gateway\n") + self.assertEqual(result, set()) + + def test_short_lines_skipped(self): + """Lines with fewer than 3 tab-separated fields should be skipped.""" + result = self._parse_launchd_label_test("12345\n") + self.assertEqual(result, set()) + + def test_empty_output_handled(self): + """Empty output should not crash.""" + result = self._parse_launchd_label_test("") + self.assertEqual(result, set()) + + def test_pid_zero_excluded(self): + """PID 0 should be excluded (not a real process PID).""" + result = self._parse_launchd_label_test("0\t0\tai.hermes.gateway\n") + self.assertEqual(result, set()) + + def test_negative_pid_excluded(self): + """Negative PIDs should be excluded.""" + result = self._parse_launchd_label_test("-1\t0\tai.hermes.gateway\n") + self.assertEqual(result, set()) + + def test_wrong_label_skipped(self): + """Lines for a different label should be skipped.""" + result = 
self._parse_launchd_label_test("12345\t0\tcom.other.service\n") + self.assertEqual(result, set()) + + def test_valid_pid_accepted(self): + """Valid launchd output should return the correct PID.""" + result = self._parse_launchd_label_test("12345\t0\tai.hermes.gateway\n") + self.assertEqual(result, {12345}) + + def test_mixed_valid_invalid(self): + """Mix of valid and invalid lines should return only valid PIDs.""" + output = textwrap.dedent("""\ + PID\tExitCode\tLabel + abc\t0\tai.hermes.gateway + -1\t0\tai.hermes.gateway + 54321\t0\tai.hermes.gateway + 12345\t1\tai.hermes.gateway""") + result = self._parse_launchd_label_test(output) + self.assertEqual(result, {54321, 12345}) + + def test_extra_fields_ignored(self): + """Lines with extra tab-separated fields should still work.""" + result = self._parse_launchd_label_test("12345\t0\tai.hermes.gateway\textra\n") + self.assertEqual(result, {12345}) + + +# =================================================================== +# (f) Git commit verification +# =================================================================== +class TestCommitVerification(unittest.TestCase): + """Verify the expected commits are present in gitea/main.""" + + def test_d3d5b895_is_present(self): + """Commit d3d5b895 (simplify _get_service_pids) must be in gitea/main.""" + result = subprocess.run( + ["git", "rev-parse", "--verify", "d3d5b895^{commit}"], + capture_output=True, text=True, timeout=10, + cwd=PROJECT_ROOT, + ) + self.assertEqual(result.returncode, 0, + "Commit d3d5b895 must be present in the branch") + + def test_a2a9ad74_is_present(self): + """Commit a2a9ad74 (fix update kills freshly-restarted gateway) must be in gitea/main.""" + result = subprocess.run( + ["git", "rev-parse", "--verify", "a2a9ad74^{commit}"], + capture_output=True, text=True, timeout=10, + cwd=PROJECT_ROOT, + ) + self.assertEqual(result.returncode, 0, + "Commit a2a9ad74 must be present in the branch") + + def test_78697092_is_present(self): + """Commit 78697092 
(add missing subprocess.run() timeouts) must be in gitea/main.""" + result = subprocess.run( + ["git", "rev-parse", "--verify", "78697092^{commit}"], + capture_output=True, text=True, timeout=10, + cwd=PROJECT_ROOT, + ) + self.assertEqual(result.returncode, 0, + "Commit 78697092 must be present in the branch") + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/hermes-sovereign/security/dependency-audit.yml b/hermes-sovereign/security/dependency-audit.yml new file mode 100644 index 00000000..b9dab222 --- /dev/null +++ b/hermes-sovereign/security/dependency-audit.yml @@ -0,0 +1,83 @@ +name: Dependency Audit + +on: + pull_request: + branches: [main] + paths: + - 'requirements.txt' + - 'pyproject.toml' + - 'uv.lock' + schedule: + - cron: '0 8 * * 1' # Weekly on Monday + workflow_dispatch: + +permissions: + pull-requests: write + contents: read + +jobs: + audit: + name: Audit Python dependencies + runs-on: ubuntu-latest + container: catthehacker/ubuntu:act-22.04 + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - name: Set up Python + run: uv python install 3.11 + - name: Install pip-audit + run: uv pip install --system pip-audit + - name: Run pip-audit + id: audit + run: | + set -euo pipefail + # Run pip-audit against the lock file/requirements + if pip-audit --requirement requirements.txt -f json -o /tmp/audit-results.json 2>/tmp/audit-stderr.txt; then + echo "found=false" >> "$GITHUB_OUTPUT" + else + echo "found=true" >> "$GITHUB_OUTPUT" + # Check severity + CRITICAL=$(python3 -c " + import json, sys + data = json.load(open('/tmp/audit-results.json')) + vulns = data.get('dependencies', []) + for d in vulns: + for v in d.get('vulns', []): + aliases = v.get('aliases', []) + # Check for critical/high CVSS + if any('CVSS' in str(a) for a in aliases): + print('true') + sys.exit(0) + print('false') + " 2>/dev/null || echo 'false') + echo "critical=${CRITICAL}" >> "$GITHUB_OUTPUT" + fi + continue-on-error: true + - name: Post 
results comment + if: steps.audit.outputs.found == 'true' && github.event_name == 'pull_request' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + BODY="## ⚠️ Dependency Vulnerabilities Detected + + \`pip-audit\` found vulnerable dependencies in this PR. Review and update before merging. + + \`\`\` + $(cat /tmp/audit-results.json | python3 -c " + import json, sys + data = json.load(sys.stdin) + for dep in data.get('dependencies', []): + for v in dep.get('vulns', []): + print(f\" {dep['name']}=={dep['version']}: {v['id']} - {v.get('description', '')[:120]}\") + " 2>/dev/null || cat /tmp/audit-stderr.txt) + \`\`\` + + --- + *Automated scan by [dependency-audit](/.github/workflows/dependency-audit.yml)*" + gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" + - name: Fail on vulnerabilities + if: steps.audit.outputs.found == 'true' + run: | + echo "::error::Vulnerable dependencies detected. See PR comment for details." + cat /tmp/audit-results.json | python3 -m json.tool || true + exit 1 diff --git a/hermes-sovereign/security/quarterly-security-audit.yml b/hermes-sovereign/security/quarterly-security-audit.yml new file mode 100644 index 00000000..3d737d00 --- /dev/null +++ b/hermes-sovereign/security/quarterly-security-audit.yml @@ -0,0 +1,115 @@ +name: Quarterly Security Audit + +on: + schedule: + # Run at 08:00 UTC on the first day of each quarter (Jan, Apr, Jul, Oct) + - cron: '0 8 1 1,4,7,10 *' + workflow_dispatch: + inputs: + reason: + description: 'Reason for manual trigger' + required: false + default: 'Manual quarterly audit' + +permissions: + issues: write + contents: read + +jobs: + create-audit-issue: + name: Create quarterly security audit issue + runs-on: ubuntu-latest + container: catthehacker/ubuntu:act-22.04 + steps: + - uses: actions/checkout@v4 + + - name: Get quarter info + id: quarter + run: | + MONTH=$(date +%-m) + YEAR=$(date +%Y) + QUARTER=$(( (MONTH - 1) / 3 + 1 )) + echo "quarter=Q${QUARTER}-${YEAR}" >> 
"$GITHUB_OUTPUT" + echo "year=${YEAR}" >> "$GITHUB_OUTPUT" + echo "q=${QUARTER}" >> "$GITHUB_OUTPUT" + + - name: Create audit issue + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + QUARTER="${{ steps.quarter.outputs.quarter }}" + + gh issue create \ + --title "[$QUARTER] Quarterly Security Audit" \ + --label "security,audit" \ + --body "$(cat <<'BODY' + ## Quarterly Security Audit — ${{ steps.quarter.outputs.quarter }} + + This is the scheduled quarterly security audit for the hermes-agent project. Complete each section and close this issue when the audit is done. + + **Audit Period:** ${{ steps.quarter.outputs.quarter }} + **Due:** End of quarter + **Owner:** Assign to a maintainer + + --- + + ## 1. Open Issues & PRs Audit + + Review all open issues and PRs for security-relevant content. Tag any that touch attack surfaces with the `security` label. + + - [ ] Review open issues older than 30 days for unaddressed security concerns + - [ ] Tag security-relevant open PRs with `needs-security-review` + - [ ] Check for any issues referencing CVEs or known vulnerabilities + - [ ] Review recently closed security issues — are fixes deployed? + + ## 2. Dependency Audit + + - [ ] Run `pip-audit` against current `requirements.txt` / `pyproject.toml` + - [ ] Check `uv.lock` for any pinned versions with known CVEs + - [ ] Review any `git+` dependencies for recent changes or compromise signals + - [ ] Update vulnerable dependencies and open PRs for each + + ## 3. Critical Path Review + + Review recent changes to attack-surface paths: + + - [ ] `gateway/` — authentication, message routing, platform adapters + - [ ] `tools/` — file I/O, command execution, web access + - [ ] `agent/` — prompt handling, context management + - [ ] `config/` — secrets loading, configuration parsing + - [ ] `.github/workflows/` — CI/CD integrity + + Run: `git log --since="3 months ago" --name-only -- gateway/ tools/ agent/ config/ .github/workflows/` + + ## 4. 
Secret Scan + + - [ ] Run secret scanner on the full codebase (not just diffs) + - [ ] Verify no credentials are present in git history + - [ ] Confirm all API keys/tokens in use are rotated on a regular schedule + + ## 5. Access & Permissions Review + + - [ ] Review who has write access to the main branch + - [ ] Confirm branch protection rules are still in place (require PR + review) + - [ ] Verify CI/CD secrets are scoped correctly (not over-permissioned) + - [ ] Review CODEOWNERS file for accuracy + + ## 6. Vulnerability Triage + + List any new vulnerabilities found this quarter: + + | ID | Component | Severity | Status | Owner | + |----|-----------|----------|--------|-------| + | | | | | | + + ## 7. Action Items + + | Action | Owner | Due Date | Status | + |--------|-------|----------|--------| + | | | | | + + --- + + *Auto-generated by [quarterly-security-audit](/.github/workflows/quarterly-security-audit.yml). Close this issue when the audit is complete.* + BODY + )" diff --git a/hermes-sovereign/security/secret-scan.yml b/hermes-sovereign/security/secret-scan.yml new file mode 100644 index 00000000..e3b2ae5d --- /dev/null +++ b/hermes-sovereign/security/secret-scan.yml @@ -0,0 +1,137 @@ +name: Secret Scan + +on: + pull_request: + types: [opened, synchronize, reopened] + +permissions: + pull-requests: write + contents: read + +jobs: + scan: + name: Scan for secrets + runs-on: ubuntu-latest + container: catthehacker/ubuntu:act-22.04 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Fetch base branch + run: git fetch origin ${{ github.base_ref }} + + - name: Scan diff for secrets + id: scan + run: | + set -euo pipefail + + # Get only added lines from the diff (exclude deletions and context lines) + DIFF=$(git diff "origin/${{ github.base_ref }}"...HEAD -- \ + ':!*.lock' ':!uv.lock' ':!package-lock.json' ':!yarn.lock' \ + | grep '^+' | grep -v '^+++' || true) + + FINDINGS="" + CRITICAL=false + + check() { + local label="$1" + local 
pattern="$2" + local critical="${3:-false}" + local matches + matches=$(echo "$DIFF" | grep -oP "$pattern" || true) + if [ -n "$matches" ]; then + FINDINGS="${FINDINGS}\n- **${label}**: pattern matched" + if [ "$critical" = "true" ]; then + CRITICAL=true + fi + fi + } + + # AWS keys — critical + check "AWS Access Key" 'AKIA[0-9A-Z]{16}' true + + # Private key headers — critical + check "Private Key Header" '-----BEGIN (RSA|EC|DSA|OPENSSH|PGP) PRIVATE KEY' true + + # OpenAI / Anthropic style keys + check "OpenAI-style API key (sk-)" 'sk-[a-zA-Z0-9]{20,}' false + + # GitHub tokens + check "GitHub personal access token (ghp_)" 'ghp_[a-zA-Z0-9]{36}' true + check "GitHub fine-grained PAT (github_pat_)" 'github_pat_[a-zA-Z0-9_]{1,}' true + + # Slack tokens + check "Slack bot token (xoxb-)" 'xoxb-[0-9A-Za-z\-]{10,}' true + check "Slack user token (xoxp-)" 'xoxp-[0-9A-Za-z\-]{10,}' true + + # Generic assignment patterns — exclude obvious placeholders + GENERIC=$(echo "$DIFF" | grep -iP '(api_key|apikey|api-key|secret_key|access_token|auth_token)\s*[=:]\s*['"'"'"][^'"'"'"]{20,}['"'"'"]' \ + | grep -ivP '(fake|mock|test|placeholder|example|dummy|your[_-]|xxx|<|>|\{\{)' || true) + if [ -n "$GENERIC" ]; then + FINDINGS="${FINDINGS}\n- **Generic credential assignment**: possible hardcoded secret" + fi + + # .env additions with long values + ENV_DIFF=$(git diff "origin/${{ github.base_ref }}"...HEAD -- '*.env' '**/.env' '.env*' \ + | grep '^+' | grep -v '^+++' || true) + ENV_MATCHES=$(echo "$ENV_DIFF" | grep -P '^[A-Z_]+=.{16,}' \ + | grep -ivP '(fake|mock|test|placeholder|example|dummy|your[_-]|xxx)' || true) + if [ -n "$ENV_MATCHES" ]; then + FINDINGS="${FINDINGS}\n- **.env file**: lines with potentially real secret values detected" + fi + + # Write outputs + if [ -n "$FINDINGS" ]; then + echo "found=true" >> "$GITHUB_OUTPUT" + else + echo "found=false" >> "$GITHUB_OUTPUT" + fi + + if [ "$CRITICAL" = "true" ]; then + echo "critical=true" >> "$GITHUB_OUTPUT" + else + echo 
"critical=false" >> "$GITHUB_OUTPUT" + fi + + # Store findings in a file to use in comment step + printf "%b" "$FINDINGS" > /tmp/secret-findings.txt + + - name: Post PR comment with findings + if: steps.scan.outputs.found == 'true' && github.event_name == 'pull_request' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + FINDINGS=$(cat /tmp/secret-findings.txt) + SEVERITY="warning" + if [ "${{ steps.scan.outputs.critical }}" = "true" ]; then + SEVERITY="CRITICAL" + fi + + BODY="## Secret Scan — ${SEVERITY} findings + + The automated secret scanner detected potential secrets in the diff for this PR. + + ### Findings + ${FINDINGS} + + ### What to do + 1. Remove any real credentials from the diff immediately. + 2. If the match is a false positive (test fixture, placeholder), add a comment explaining why or rename the variable to include \`fake\`, \`mock\`, or \`test\`. + 3. Rotate any exposed credentials regardless of whether this PR is merged. + + --- + *Automated scan by [secret-scan](/.github/workflows/secret-scan.yml)*" + + gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" + + - name: Fail on critical secrets + if: steps.scan.outputs.critical == 'true' + run: | + echo "::error::Critical secrets detected in diff (private keys, AWS keys, or GitHub tokens). Remove them before merging." + exit 1 + + - name: Warn on non-critical findings + if: steps.scan.outputs.found == 'true' && steps.scan.outputs.critical == 'false' + run: | + echo "::warning::Potential secrets detected in diff. Review the PR comment for details." diff --git a/hermes-sovereign/security/security_pr_checklist.yml b/hermes-sovereign/security/security_pr_checklist.yml new file mode 100644 index 00000000..29ee9d15 --- /dev/null +++ b/hermes-sovereign/security/security_pr_checklist.yml @@ -0,0 +1,99 @@ +name: "🔒 Security PR Checklist" +description: "Use this when your PR touches authentication, file I/O, external API calls, or other sensitive paths." 
+title: "[Security Review]: " +labels: ["security", "needs-review"] +body: + - type: markdown + attributes: + value: | + ## Security Pre-Merge Review + Complete this checklist before requesting review on PRs that touch **authentication, file I/O, external API calls, or secrets handling**. + + - type: input + id: pr-link + attributes: + label: Pull Request + description: Link to the PR being reviewed + placeholder: "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/pulls/XXX" + validations: + required: true + + - type: dropdown + id: change-type + attributes: + label: Change Category + description: What kind of sensitive change does this PR make? + multiple: true + options: + - Authentication / Authorization + - File I/O (read/write/delete) + - External API calls (outbound HTTP/network) + - Secret / credential handling + - Command execution (subprocess/shell) + - Dependency addition or update + - Configuration changes + - CI/CD pipeline changes + validations: + required: true + + - type: checkboxes + id: secrets-checklist + attributes: + label: Secrets & Credentials + options: + - label: No secrets, API keys, or credentials are hardcoded + required: true + - label: All sensitive values are loaded from environment variables or a secrets manager + required: true + - label: Test fixtures use fake/placeholder values, not real credentials + required: true + + - type: checkboxes + id: input-validation-checklist + attributes: + label: Input Validation + options: + - label: All external input (user, API, file) is validated before use + required: true + - label: File paths are validated against path traversal (`../`, null bytes, absolute paths) + - label: URLs are validated for SSRF (blocked private/metadata IPs) + - label: Shell commands do not use `shell=True` with user-controlled input + + - type: checkboxes + id: auth-checklist + attributes: + label: Authentication & Authorization (if applicable) + options: + - label: Authentication tokens are not 
logged or exposed in error messages + - label: Authorization checks happen server-side, not just client-side + - label: Session tokens are properly scoped and have expiry + + - type: checkboxes + id: supply-chain-checklist + attributes: + label: Supply Chain + options: + - label: New dependencies are pinned to a specific version range + - label: Dependencies come from trusted sources (PyPI, npm, official repos) + - label: No `.pth` files or install hooks that execute arbitrary code + - label: "`pip-audit` passes (no known CVEs in added dependencies)" + + - type: textarea + id: threat-model + attributes: + label: Threat Model Notes + description: | + Briefly describe the attack surface this change introduces or modifies, and how it is mitigated. + placeholder: | + This PR adds a new outbound HTTP call to the OpenRouter API. + Mitigation: URL is hardcoded (no user input), response is parsed with strict schema validation. + + - type: textarea + id: testing + attributes: + label: Security Testing Done + description: What security testing did you perform? + placeholder: | + - Ran validate_security.py — all checks pass + - Tested path traversal attempts manually + - Verified no secrets in git diff diff --git a/hermes-sovereign/security/validate_security.py b/hermes-sovereign/security/validate_security.py new file mode 100644 index 00000000..a9fe120e --- /dev/null +++ b/hermes-sovereign/security/validate_security.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +"""Comprehensive security validation script. + +Runs all security checks and reports status. 
+Usage: python validate_security.py +""" + +import sys +import os +import subprocess +import ast +from pathlib import Path + + +class SecurityValidator: + """Run comprehensive security validations.""" + + def __init__(self): + self.issues = [] + self.warnings = [] + self.checks_passed = 0 + self.checks_failed = 0 + + def run_all(self): + """Run all security checks.""" + print("=" * 80) + print("🔒 SECURITY VALIDATION SUITE") + print("=" * 80) + + self.check_command_injection() + self.check_path_traversal() + self.check_ssrf_protection() + self.check_secret_leakage() + self.check_interrupt_race_conditions() + self.check_test_coverage() + + self.print_summary() + return len(self.issues) == 0 + + def check_command_injection(self): + """Check for command injection vulnerabilities.""" + print("\n[1/6] Checking command injection protections...") + + # Check transcription_tools.py uses shlex.split + content = Path("tools/transcription_tools.py").read_text() + if "shlex.split" in content and "shell=False" in content: + print(" ✅ transcription_tools.py: Uses safe list-based execution") + self.checks_passed += 1 + else: + print(" ❌ transcription_tools.py: May use unsafe shell execution") + self.issues.append("Command injection in transcription_tools") + self.checks_failed += 1 + + # Check docker.py validates container IDs + content = Path("tools/environments/docker.py").read_text() + if "re.match" in content and "container" in content: + print(" ✅ docker.py: Validates container ID format") + self.checks_passed += 1 + else: + print(" ⚠️ docker.py: Container ID validation not confirmed") + self.warnings.append("Docker container ID validation") + + def check_path_traversal(self): + """Check for path traversal protections.""" + print("\n[2/6] Checking path traversal protections...") + + content = Path("tools/file_operations.py").read_text() + + checks = [ + ("_validate_safe_path", "Path validation function"), + ("_contains_path_traversal", "Traversal detection function"), + 
("../", "Unix traversal pattern"), + ("..\\\\", "Windows traversal pattern"), + ("\\\\x00", "Null byte detection"), + ] + + for pattern, description in checks: + if pattern in content: + print(f" ✅ {description}") + self.checks_passed += 1 + else: + print(f" ❌ Missing: {description}") + self.issues.append(f"Path traversal: {description}") + self.checks_failed += 1 + + def check_ssrf_protection(self): + """Check for SSRF protections.""" + print("\n[3/6] Checking SSRF protections...") + + content = Path("tools/url_safety.py").read_text() + + checks = [ + ("_is_blocked_ip", "IP blocking function"), + ("create_safe_socket", "Connection-level validation"), + ("169.254", "Metadata service block"), + ("is_private", "Private IP detection"), + ] + + for pattern, description in checks: + if pattern in content: + print(f" ✅ {description}") + self.checks_passed += 1 + else: + print(f" ⚠️ {description} not found") + self.warnings.append(f"SSRF: {description}") + + def check_secret_leakage(self): + """Check for secret leakage protections.""" + print("\n[4/6] Checking secret leakage protections...") + + content = Path("tools/code_execution_tool.py").read_text() + + if "_ALLOWED_ENV_VARS" in content: + print(" ✅ Uses whitelist for environment variables") + self.checks_passed += 1 + elif "_SECRET_SUBSTRINGS" in content: + print(" ⚠️ Uses blacklist (may be outdated version)") + self.warnings.append("Blacklist instead of whitelist for secrets") + else: + print(" ❌ No secret filtering found") + self.issues.append("Secret leakage protection") + self.checks_failed += 1 + + # Check for common secret patterns in allowed list + dangerous_vars = ["API_KEY", "SECRET", "PASSWORD", "TOKEN"] + found_dangerous = [v for v in dangerous_vars if v in content] + + if found_dangerous: + print(f" ⚠️ Found potential secret vars in code: {found_dangerous}") + + def check_interrupt_race_conditions(self): + """Check for interrupt race condition fixes.""" + print("\n[5/6] Checking interrupt race condition 
protections...") + + content = Path("tools/interrupt.py").read_text() + + checks = [ + ("RLock", "Reentrant lock for thread safety"), + ("_interrupt_lock", "Lock variable"), + ("_interrupt_count", "Nesting count tracking"), + ] + + for pattern, description in checks: + if pattern in content: + print(f" ✅ {description}") + self.checks_passed += 1 + else: + print(f" ❌ Missing: {description}") + self.issues.append(f"Interrupt: {description}") + self.checks_failed += 1 + + def check_test_coverage(self): + """Check security test coverage.""" + print("\n[6/6] Checking security test coverage...") + + test_files = [ + "tests/tools/test_interrupt.py", + "tests/tools/test_path_traversal.py", + "tests/tools/test_command_injection.py", + ] + + for test_file in test_files: + if Path(test_file).exists(): + print(f" ✅ {test_file}") + self.checks_passed += 1 + else: + print(f" ❌ Missing: {test_file}") + self.issues.append(f"Missing test: {test_file}") + self.checks_failed += 1 + + def print_summary(self): + """Print validation summary.""" + print("\n" + "=" * 80) + print("VALIDATION SUMMARY") + print("=" * 80) + print(f"Checks Passed: {self.checks_passed}") + print(f"Checks Failed: {self.checks_failed}") + print(f"Warnings: {len(self.warnings)}") + + if self.issues: + print("\n❌ CRITICAL ISSUES:") + for issue in self.issues: + print(f" - {issue}") + + if self.warnings: + print("\n⚠️ WARNINGS:") + for warning in self.warnings: + print(f" - {warning}") + + if not self.issues: + print("\n✅ ALL SECURITY CHECKS PASSED") + + print("=" * 80) + + +if __name__ == "__main__": + validator = SecurityValidator() + success = validator.run_all() + sys.exit(0 if success else 1) diff --git a/hermes-sovereign/wizard-bootstrap/FORGE_OPERATIONS_GUIDE.md b/hermes-sovereign/wizard-bootstrap/FORGE_OPERATIONS_GUIDE.md new file mode 100644 index 00000000..17c8b753 --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/FORGE_OPERATIONS_GUIDE.md @@ -0,0 +1,215 @@ +# Forge Operations Guide + +> **Audience:** 
Forge wizards joining the hermes-agent project +> **Purpose:** Practical patterns, common pitfalls, and operational wisdom +> **Companion to:** `WIZARD_ENVIRONMENT_CONTRACT.md` + +--- + +## The One Rule + +**Read the actual state before acting.** + +Before touching any service, config, or codebase: `ps aux | grep hermes`, `cat ~/.hermes/gateway_state.json`, `curl http://127.0.0.1:8642/health`. The forge punishes assumptions harder than it rewards speed. Evidence always beats intuition. + +--- + +## First 15 Minutes on a New System + +```bash +# 1. Validate your environment +python wizard-bootstrap/wizard_bootstrap.py + +# 2. Check what is actually running +ps aux | grep -E 'hermes|python|gateway' + +# 3. Check the data directory +ls -la ~/.hermes/ +cat ~/.hermes/gateway_state.json 2>/dev/null | python3 -m json.tool + +# 4. Verify health endpoints (if gateway is up) +curl -sf http://127.0.0.1:8642/health | python3 -m json.tool + +# 5. Run the smoke test +source venv/bin/activate +python -m pytest tests/ -q -x --timeout=60 2>&1 | tail -20 +``` + +Do not begin work until all five steps return clean output. + +--- + +## Import Chain — Know It, Respect It + +The dependency order is load-bearing. Violating it causes silent failures: + +``` +tools/registry.py ← no deps; imported by everything + ↑ +tools/*.py ← each calls registry.register() at import time + ↑ +model_tools.py ← imports registry; triggers tool discovery + ↑ +run_agent.py / cli.py / batch_runner.py +``` + +**If you add a tool file**, you must also: +1. Add its import to `model_tools.py` `_discover_tools()` +2. Add it to `toolsets.py` (core or a named toolset) + +Missing either step causes the tool to silently not appear — no error, just absence. + +--- + +## The Five Profile Rules + +Hermes supports isolated profiles (`hermes -p myprofile`). Profile-unsafe code has caused repeated bugs. 
Memorize these: + +| Do this | Not this | +|---------|----------| +| `get_hermes_home()` | `Path.home() / ".hermes"` | +| `display_hermes_home()` in user messages | hardcoded `~/.hermes` strings | +| `get_hermes_home() / "sessions"` in tests | `~/.hermes/sessions` in tests | + +Import both from `hermes_constants`. Every `~/.hermes` hardcode is a latent profile bug. + +--- + +## Prompt Caching — Do Not Break It + +The agent caches system prompts. Cache breaks force re-billing of the entire context window on every turn. The following actions break caching mid-conversation and are forbidden: + +- Altering past context +- Changing the active toolset +- Reloading memories or rebuilding the system prompt + +The only sanctioned context alteration is the context compressor (`agent/context_compressor.py`). If your feature touches the message history, read that file first. + +--- + +## Adding a Slash Command (Checklist) + +Four files, in order: + +1. **`hermes_cli/commands.py`** — add `CommandDef` to `COMMAND_REGISTRY` +2. **`cli.py`** — add handler branch in `HermesCLI.process_command()` +3. **`gateway/run.py`** — add handler if it should work in messaging platforms +4. **Aliases** — add to the `aliases` tuple on the `CommandDef`; everything else updates automatically + +All downstream consumers (Telegram menu, Slack routing, autocomplete, help text) derive from `COMMAND_REGISTRY`. You never touch them directly. + +--- + +## Tool Schema Pitfalls + +**Do NOT cross-reference other toolsets in schema descriptions.** +Writing "prefer `web_search` over this tool" in a browser tool's description will cause the model to hallucinate calls to `web_search` when it's not loaded. Cross-references belong in `get_tool_definitions()` post-processing blocks in `model_tools.py`. + +**Do NOT use `\033[K` (ANSI erase-to-EOL) in display code.** +Under `prompt_toolkit`'s `patch_stdout`, it leaks as literal `?[K`. Use space-padding instead: `f"\r{line}{' ' * pad}"`. 
+
+**Do NOT use `simple_term_menu` for interactive menus.**
+It ghosts on scroll in tmux/iTerm2. Use `curses` (stdlib). See `hermes_cli/tools_config.py` for the pattern.
+
+---
+
+## Health Check Anatomy
+
+A healthy instance returns:
+
+```json
+{
+  "status": "ok",
+  "gateway_state": "running",
+  "platforms": {
+    "telegram": {"state": "connected"}
+  }
+}
+```
+
+| Field | Healthy value | What a bad value means |
+|-------|--------------|----------------------|
+| `status` | `"ok"` | HTTP server down |
+| `gateway_state` | `"running"` | Still starting or crashed |
+| `platforms.<platform>.state` | `"connected"` | Auth failure or network issue |
+
+`gateway_state: "starting"` is normal for up to 60 s on boot. Beyond that, check logs for auth errors:
+
+```bash
+journalctl -u hermes-gateway --since "2 minutes ago" | grep -i "error\|token\|auth"
+```
+
+---
+
+## Gateway Won't Start — Diagnosis Order
+
+1. `ss -tlnp | grep 8642` — port conflict?
+2. `cat ~/.hermes/gateway.pid` → `ps -p <pid>` — stale PID file?
+3. `hermes gateway start --replace` — clears stale locks and PIDs
+4. `HERMES_LOG_LEVEL=DEBUG hermes gateway start` — verbose output
+5. Check `~/.hermes/.env` — missing or placeholder token?
+
+---
+
+## Before Every PR
+
+```bash
+source venv/bin/activate
+python -m pytest tests/ -q  # full suite: ~3 min, ~3000 tests
+python scripts/deploy-validate  # deployment health check
+python wizard-bootstrap/wizard_bootstrap.py  # environment sanity
+```
+
+All three must exit 0. Do not skip. "It works locally" is not sufficient evidence.
+ +--- + +## Session and State Files + +| Store | Location | Notes | +|-------|----------|-------| +| Sessions | `~/.hermes/sessions/*.json` | Persisted across restarts | +| Memories | `~/.hermes/memories/*.md` | Written by the agent's memory tool | +| Cron jobs | `~/.hermes/cron/*.json` | Scheduler state | +| Gateway state | `~/.hermes/gateway_state.json` | Live platform connection status | +| Response store | `~/.hermes/response_store.db` | SQLite WAL — API server only | + +All paths go through `get_hermes_home()`. Never hardcode. Always backup before a major update: + +```bash +tar czf ~/backups/hermes_$(date +%F_%H%M).tar.gz ~/.hermes/ +``` + +--- + +## Writing Tests + +```bash +python -m pytest tests/path/to/test.py -q # single file +python -m pytest tests/ -q -k "test_name" # by name +python -m pytest tests/ -q -x # stop on first failure +``` + +**Test isolation rules:** +- `tests/conftest.py` has an autouse fixture that redirects `HERMES_HOME` to a temp dir. Never write to `~/.hermes/` in tests. +- Profile tests must mock both `Path.home()` and `HERMES_HOME`. See `tests/hermes_cli/test_profiles.py` for the pattern. +- Do not mock the database. Integration tests should use real SQLite with a temp path. + +--- + +## Commit Conventions + +``` +feat: add X # new capability +fix: correct Y # bug fix +refactor: restructure Z # no behaviour change +test: add tests for W # test-only +chore: update deps # housekeeping +docs: clarify X # documentation only +``` + +Include `Fixes #NNN` or `Refs #NNN` in the commit message body to close or reference issues automatically. + +--- + +*This guide lives in `wizard-bootstrap/`. 
Update it when you discover a new pitfall or pattern worth preserving.* diff --git a/hermes-sovereign/wizard-bootstrap/WIZARD_ENVIRONMENT_CONTRACT.md b/hermes-sovereign/wizard-bootstrap/WIZARD_ENVIRONMENT_CONTRACT.md new file mode 100644 index 00000000..3216f368 --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/WIZARD_ENVIRONMENT_CONTRACT.md @@ -0,0 +1,162 @@ +# Wizard Environment Contract + +> **Version:** 1.0.0 +> **Owner:** Wizard Council (Bezalel Epic-004) +> **Last updated:** 2026-04-06 + +This document defines the minimum viable state every forge wizard must maintain. +A wizard that satisfies all requirements is considered **forge-ready**. + +--- + +## 1. Python Runtime + +| Requirement | Minimum | Notes | +|-------------|---------|-------| +| Python version | 3.11 | 3.12+ recommended | +| Virtual environment | Activated | `source venv/bin/activate` before running | + +Run `python --version` to verify. + +--- + +## 2. Core Package Dependencies + +All packages in `requirements.txt` must be installed and importable. +Critical packages: `openai`, `anthropic`, `pyyaml`, `rich`, `requests`, `pydantic`, `prompt_toolkit`. + +**Verify:** +```bash +python wizard-bootstrap/wizard_bootstrap.py +``` + +--- + +## 3. LLM Provider Key + +At least one LLM provider API key must be set in `~/.hermes/.env`: + +| Variable | Provider | +|----------|----------| +| `OPENROUTER_API_KEY` | OpenRouter (200+ models) | +| `ANTHROPIC_API_KEY` | Anthropic Claude | +| `ANTHROPIC_TOKEN` | Anthropic Claude (alt) | +| `OPENAI_API_KEY` | OpenAI | +| `GLM_API_KEY` | z.ai/GLM | +| `KIMI_API_KEY` | Moonshot/Kimi | +| `MINIMAX_API_KEY` | MiniMax | + +--- + +## 4. Gitea Authentication + +| Requirement | Details | +|-------------|---------| +| Variable | `GITEA_TOKEN` or `FORGE_TOKEN` | +| Scope | Must have repo read/write access | +| Forge URL | `https://forge.alexanderwhitestone.com` (or `FORGE_URL` env var) | + +The wizard must be able to create and merge PRs on the forge. + +--- + +## 5. 
Telegram Connectivity (Gateway Wizards) + +Wizards that operate via the messaging gateway must also satisfy: + +| Requirement | Details | +|-------------|---------| +| Variable | `TELEGRAM_BOT_TOKEN` | +| Home channel | `TELEGRAM_HOME_CHANNEL` | +| API reachability | `api.telegram.org` must be reachable | + +CLI-only wizards may skip Telegram checks. + +--- + +## 6. HERMES_HOME + +| Requirement | Details | +|-------------|---------| +| Default | `~/.hermes` | +| Override | `HERMES_HOME` env var | +| Permissions | Owner-writable (700 recommended) | + +The directory must exist and be writable before any hermes command runs. + +--- + +## 7. Skill Dependencies (Per-Skill) + +Each skill may declare binary and environment-variable dependencies in its +`SKILL.md` frontmatter: + +```yaml +--- +name: my-skill +dependencies: + binaries: [ffmpeg, imagemagick] + env_vars: [MY_API_KEY] +--- +``` + +A wizard must satisfy all dependencies for any skill it intends to run. + +**Check all skill deps:** +```bash +python wizard-bootstrap/dependency_checker.py +``` + +--- + +## 8. Enforcement + +### New Wizard Onboarding + +Run the bootstrap script before going online: + +```bash +python wizard-bootstrap/wizard_bootstrap.py +``` + +Resolve all failures before beginning work. + +### Ongoing Compliance + +A monthly audit runs automatically (see `wizard-bootstrap/monthly_audit.py`). +The report is saved to `~/.hermes/wizard-council/audit-YYYY-MM.md` and posted +to the `wizard-council-automation` Telegram channel. + +### Skill Drift + +Run the skills audit to detect and fix drift: + +```bash +python wizard-bootstrap/skills_audit.py # detect +python wizard-bootstrap/skills_audit.py --fix # sync +``` + +--- + +## 9. Contract Versioning + +Changes to this contract require a PR reviewed by at least one wizard council +member. Bump the version number and update the date above with each change. 
+ +--- + +## Quick Reference + +```bash +# Full environment validation +python wizard-bootstrap/wizard_bootstrap.py + +# Skills drift check +python wizard-bootstrap/skills_audit.py + +# Dependency check +python wizard-bootstrap/dependency_checker.py + +# Full monthly audit (all three checks, saves report) +python wizard-bootstrap/monthly_audit.py +``` diff --git a/hermes-sovereign/wizard-bootstrap/__init__.py b/hermes-sovereign/wizard-bootstrap/__init__.py new file mode 100644 index 00000000..5967f471 --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/__init__.py @@ -0,0 +1 @@ +# wizard-bootstrap package diff --git a/hermes-sovereign/wizard-bootstrap/dependency_checker.py b/hermes-sovereign/wizard-bootstrap/dependency_checker.py new file mode 100644 index 00000000..bd73fbd6 --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/dependency_checker.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +""" +dependency_checker.py — Cross-Wizard Dependency Validator + +Each skill may declare binary or environment-variable dependencies in its +SKILL.md frontmatter under a `dependencies` key: + + --- + name: my-skill + dependencies: + binaries: [ffmpeg, imagemagick] + env_vars: [MY_API_KEY, MY_SECRET] + --- + +This script scans all installed skills, extracts declared dependencies, and +checks whether each is satisfied in the current environment. 
+ +Usage: + python wizard-bootstrap/dependency_checker.py + python wizard-bootstrap/dependency_checker.py --json + python wizard-bootstrap/dependency_checker.py --skill software-development/code-review +""" + +import argparse +import json +import os +import shutil +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +try: + import yaml + HAS_YAML = True +except ImportError: + HAS_YAML = False + + +# --------------------------------------------------------------------------- +# Data model +# --------------------------------------------------------------------------- + +@dataclass +class SkillDep: + skill_path: str + skill_name: str + binary: Optional[str] = None + env_var: Optional[str] = None + satisfied: bool = False + detail: str = "" + + +@dataclass +class DepReport: + deps: list[SkillDep] = field(default_factory=list) + + @property + def all_satisfied(self) -> bool: + return all(d.satisfied for d in self.deps) + + @property + def unsatisfied(self) -> list[SkillDep]: + return [d for d in self.deps if not d.satisfied] + + +# --------------------------------------------------------------------------- +# Frontmatter parser +# --------------------------------------------------------------------------- + +def _parse_frontmatter(text: str) -> dict: + """Extract YAML frontmatter from a SKILL.md file.""" + if not text.startswith("---"): + return {} + end = text.find("\n---", 3) + if end == -1: + return {} + fm_text = text[3:end].strip() + if not HAS_YAML: + return {} + try: + return yaml.safe_load(fm_text) or {} + except Exception: + return {} + + +def _load_skill_deps(skill_md: Path) -> tuple[str, list[str], list[str]]: + """ + Returns (skill_name, binaries, env_vars) from a SKILL.md frontmatter. 
+ """ + text = skill_md.read_text(encoding="utf-8", errors="replace") + fm = _parse_frontmatter(text) + skill_name = fm.get("name", skill_md.parent.name) + deps = fm.get("dependencies", {}) + if not isinstance(deps, dict): + return skill_name, [], [] + binaries = deps.get("binaries") or [] + env_vars = deps.get("env_vars") or [] + if isinstance(binaries, str): + binaries = [binaries] + if isinstance(env_vars, str): + env_vars = [env_vars] + return skill_name, list(binaries), list(env_vars) + + +# --------------------------------------------------------------------------- +# Checks +# --------------------------------------------------------------------------- + +def _check_binary(binary: str) -> tuple[bool, str]: + path = shutil.which(binary) + if path: + return True, f"found at {path}" + return False, f"not found in PATH" + + +def _check_env_var(var: str) -> tuple[bool, str]: + val = os.environ.get(var) + if val: + return True, "set" + return False, "not set" + + +# --------------------------------------------------------------------------- +# Scanner +# --------------------------------------------------------------------------- + +def _find_skills_dir() -> Optional[Path]: + """Resolve skills directory: prefer repo root, fall back to HERMES_HOME.""" + # Check if we're inside the repo + repo_root = Path(__file__).parent.parent + repo_skills = repo_root / "skills" + if repo_skills.exists(): + return repo_skills + + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + for candidate in [hermes_home / "skills", hermes_home / "hermes-agent" / "skills"]: + if candidate.exists(): + return candidate + return None + + +def run_dep_check(skills_dir: Optional[Path] = None, skill_filter: Optional[str] = None) -> DepReport: + resolved = skills_dir or _find_skills_dir() + report = DepReport() + + if resolved is None or not resolved.exists(): + return report + + # Load ~/.hermes/.env so env var checks work + hermes_home = Path(os.environ.get("HERMES_HOME", 
Path.home() / ".hermes")) + env_path = hermes_home / ".env" + if env_path.exists(): + try: + from dotenv import load_dotenv # noqa: PLC0415 + load_dotenv(env_path, override=False) + except Exception: + pass + + for skill_md in sorted(resolved.rglob("SKILL.md")): + rel = str(skill_md.parent.relative_to(resolved)) + if skill_filter and skill_filter not in rel: + continue + + skill_name, binaries, env_vars = _load_skill_deps(skill_md) + + for binary in binaries: + ok, detail = _check_binary(binary) + report.deps.append(SkillDep( + skill_path=rel, + skill_name=skill_name, + binary=binary, + satisfied=ok, + detail=detail, + )) + + for var in env_vars: + ok, detail = _check_env_var(var) + report.deps.append(SkillDep( + skill_path=rel, + skill_name=skill_name, + env_var=var, + satisfied=ok, + detail=detail, + )) + + return report + + +# --------------------------------------------------------------------------- +# Rendering +# --------------------------------------------------------------------------- + +_GREEN = "\033[32m" +_RED = "\033[31m" +_YELLOW = "\033[33m" +_BOLD = "\033[1m" +_RESET = "\033[0m" + + +def _render_terminal(report: DepReport) -> None: + print(f"\n{_BOLD}=== Cross-Wizard Dependency Check ==={_RESET}\n") + + if not report.deps: + print("No skill dependencies declared. 
Skills use implicit deps only.\n") + print( + f"{_YELLOW}Tip:{_RESET} Declare binary/env_var deps in SKILL.md frontmatter " + "under a 'dependencies' key to make them checkable.\n" + ) + return + + for dep in report.deps: + icon = f"{_GREEN}✓{_RESET}" if dep.satisfied else f"{_RED}✗{_RESET}" + if dep.binary: + dep_type = "binary" + dep_name = dep.binary + else: + dep_type = "env_var" + dep_name = dep.env_var + + print(f" {icon} [{dep.skill_path}] {dep_type}:{dep_name} — {dep.detail}") + + total = len(report.deps) + satisfied = sum(1 for d in report.deps if d.satisfied) + print() + if report.all_satisfied: + print(f"{_GREEN}{_BOLD}All {total} dependencies satisfied.{_RESET}\n") + else: + failed = total - satisfied + print( + f"{_RED}{_BOLD}{failed}/{total} dependencies unsatisfied.{_RESET} " + "Install missing binaries and set missing env vars.\n" + ) + + +def _render_json(report: DepReport) -> None: + out = { + "all_satisfied": report.all_satisfied, + "summary": { + "total": len(report.deps), + "satisfied": sum(1 for d in report.deps if d.satisfied), + "unsatisfied": len(report.unsatisfied), + }, + "deps": [ + { + "skill_path": d.skill_path, + "skill_name": d.skill_name, + "type": "binary" if d.binary else "env_var", + "name": d.binary or d.env_var, + "satisfied": d.satisfied, + "detail": d.detail, + } + for d in report.deps + ], + } + print(json.dumps(out, indent=2)) + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + +def main() -> None: + if not HAS_YAML: + print("WARNING: pyyaml not installed — cannot parse SKILL.md frontmatter. " + "Dependency declarations will be skipped.", file=sys.stderr) + + parser = argparse.ArgumentParser( + description="Check cross-wizard skill dependencies (binaries, env vars)." 
+ ) + parser.add_argument( + "--skills-dir", + default=None, + help="Skills directory to scan (default: auto-detect)", + ) + parser.add_argument( + "--skill", + default=None, + help="Filter to a specific skill path substring", + ) + parser.add_argument( + "--json", + action="store_true", + help="Output results as JSON", + ) + args = parser.parse_args() + + skills_dir = Path(args.skills_dir).resolve() if args.skills_dir else None + report = run_dep_check(skills_dir=skills_dir, skill_filter=args.skill) + + if args.json: + _render_json(report) + else: + _render_terminal(report) + + sys.exit(0 if report.all_satisfied else 1) + + +if __name__ == "__main__": + main() diff --git a/hermes-sovereign/wizard-bootstrap/monthly_audit.py b/hermes-sovereign/wizard-bootstrap/monthly_audit.py new file mode 100644 index 00000000..6c9811ad --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/monthly_audit.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +""" +monthly_audit.py — Wizard Council Monthly Environment Audit + +Runs all three checks (bootstrap, skills audit, dependency check) and +produces a combined Markdown report. Designed to be invoked by cron or +manually. 
+ +Usage: + python wizard-bootstrap/monthly_audit.py + python wizard-bootstrap/monthly_audit.py --output /path/to/report.md + python wizard-bootstrap/monthly_audit.py --post-telegram # post to configured channel + +The report is also written to ~/.hermes/wizard-council/audit-YYYY-MM.md +""" + +import argparse +import io +import json +import os +import sys +from contextlib import redirect_stdout +from datetime import datetime, timezone +from pathlib import Path + +# Ensure repo root is importable +_REPO_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(_REPO_ROOT)) + +from wizard_bootstrap import run_all_checks +from skills_audit import run_audit +from dependency_checker import run_dep_check + + +# --------------------------------------------------------------------------- +# Report builder +# --------------------------------------------------------------------------- + +def _emoji(ok: bool) -> str: + return "✅" if ok else "❌" + + +def build_report(repo_root: Path) -> str: + now = datetime.now(timezone.utc) + lines = [ + f"# Wizard Council Environment Audit", + f"", + f"**Date:** {now.strftime('%Y-%m-%d %H:%M UTC')}", + f"", + f"---", + f"", + ] + + # 1. Bootstrap checks + lines.append("## 1. Environment Bootstrap") + lines.append("") + bootstrap = run_all_checks() + for check in bootstrap.checks: + icon = _emoji(check.passed) + label = check.name.replace("_", " ").title() + lines.append(f"- {icon} **{label}**: {check.message}") + if not check.passed and check.fix_hint: + lines.append(f" - _Fix_: {check.fix_hint}") + lines.append("") + if bootstrap.passed: + lines.append("**Environment: READY** ✅") + else: + failed = len(bootstrap.failed) + lines.append(f"**Environment: {failed} check(s) FAILED** ❌") + lines.append("") + lines.append("---") + lines.append("") + + # 2. Skills audit + lines.append("## 2. 
Skills Drift Audit") + lines.append("") + skills_report = run_audit(repo_root) + missing = skills_report.by_status("MISSING") + extra = skills_report.by_status("EXTRA") + outdated = skills_report.by_status("OUTDATED") + ok_count = len(skills_report.by_status("OK")) + total = len(skills_report.drifts) + + lines.append(f"| Status | Count |") + lines.append(f"|--------|-------|") + lines.append(f"| ✅ OK | {ok_count} |") + lines.append(f"| ❌ Missing | {len(missing)} |") + lines.append(f"| ⚠️ Extra | {len(extra)} |") + lines.append(f"| 🔄 Outdated | {len(outdated)} |") + lines.append(f"| **Total** | **{total}** |") + lines.append("") + + if missing: + lines.append("### Missing Skills (in repo, not installed)") + for d in missing: + lines.append(f"- `{d.skill_path}`") + lines.append("") + + if outdated: + lines.append("### Outdated Skills") + for d in outdated: + lines.append(f"- `{d.skill_path}` (repo: `{d.repo_hash}`, installed: `{d.installed_hash}`)") + lines.append("") + + if extra: + lines.append("### Extra Skills (installed, not in repo)") + for d in extra: + lines.append(f"- `{d.skill_path}`") + lines.append("") + + if not skills_report.has_drift: + lines.append("**Skills: IN SYNC** ✅") + else: + lines.append("**Skills: DRIFT DETECTED** ❌ — run `python wizard-bootstrap/skills_audit.py --fix`") + lines.append("") + lines.append("---") + lines.append("") + + # 3. Dependency check + lines.append("## 3. 
Cross-Wizard Dependency Check") + lines.append("") + dep_report = run_dep_check() + + if not dep_report.deps: + lines.append("No explicit dependencies declared in SKILL.md frontmatter.") + lines.append("") + lines.append( + "_Tip: Add a `dependencies` block to SKILL.md to make binary/env_var " + "requirements checkable automatically._" + ) + else: + satisfied = sum(1 for d in dep_report.deps if d.satisfied) + total_deps = len(dep_report.deps) + lines.append(f"**{satisfied}/{total_deps} dependencies satisfied.**") + lines.append("") + if dep_report.unsatisfied: + lines.append("### Unsatisfied Dependencies") + for dep in dep_report.unsatisfied: + dep_type = "binary" if dep.binary else "env_var" + dep_name = dep.binary or dep.env_var + lines.append(f"- `[{dep.skill_path}]` {dep_type}:`{dep_name}` — {dep.detail}") + lines.append("") + + if dep_report.all_satisfied: + lines.append("**Dependencies: ALL SATISFIED** ✅") + else: + lines.append("**Dependencies: ISSUES FOUND** ❌") + lines.append("") + lines.append("---") + lines.append("") + + # Summary + overall_ok = bootstrap.passed and not skills_report.has_drift and dep_report.all_satisfied + lines.append("## Summary") + lines.append("") + lines.append(f"| Check | Status |") + lines.append(f"|-------|--------|") + lines.append(f"| Environment Bootstrap | {_emoji(bootstrap.passed)} |") + lines.append(f"| Skills Drift | {_emoji(not skills_report.has_drift)} |") + lines.append(f"| Dependency Check | {_emoji(dep_report.all_satisfied)} |") + lines.append("") + if overall_ok: + lines.append("**Overall: FORGE READY** ✅") + else: + lines.append("**Overall: ACTION REQUIRED** ❌") + lines.append("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Output / delivery +# --------------------------------------------------------------------------- + +def _save_report(report: str, output_path: Path) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + 
output_path.write_text(report, encoding="utf-8") + print(f"Report saved to: {output_path}") + + +def _post_telegram(report: str) -> None: + """Post the report summary to Telegram via hermes gateway if configured.""" + token = os.environ.get("TELEGRAM_BOT_TOKEN") + channel = os.environ.get("TELEGRAM_HOME_CHANNEL") or os.environ.get("TELEGRAM_CHANNEL_ID") + if not (token and channel): + print("Telegram not configured (need TELEGRAM_BOT_TOKEN + TELEGRAM_HOME_CHANNEL).", file=sys.stderr) + return + + try: + import requests # noqa: PLC0415 + + # Extract just the summary section for Telegram (keep it brief) + summary_start = report.find("## Summary") + summary_text = report[summary_start:] if summary_start != -1 else report[-1000:] + payload = { + "chat_id": channel, + "text": f"🧙 **Wizard Council Monthly Audit**\n\n{summary_text}", + "parse_mode": "Markdown", + } + resp = requests.post( + f"https://api.telegram.org/bot{token}/sendMessage", + json=payload, + timeout=15, + ) + if resp.status_code == 200: + print("Report summary posted to Telegram.") + else: + print(f"Telegram post failed: HTTP {resp.status_code}", file=sys.stderr) + except Exception as exc: + print(f"Telegram post error: {exc}", file=sys.stderr) + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser( + description="Run the monthly Wizard Council environment audit." 
+ ) + parser.add_argument( + "--output", + default=None, + help="Path to save the Markdown report (default: ~/.hermes/wizard-council/audit-YYYY-MM.md)", + ) + parser.add_argument( + "--repo-root", + default=str(_REPO_ROOT), + help="Root of the hermes-agent repo", + ) + parser.add_argument( + "--post-telegram", + action="store_true", + help="Post the report summary to Telegram", + ) + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + report = build_report(repo_root) + + # Print to stdout + print(report) + + # Save to default location + now = datetime.now(timezone.utc) + if args.output: + output_path = Path(args.output) + else: + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + output_path = hermes_home / "wizard-council" / f"audit-{now.strftime('%Y-%m')}.md" + + _save_report(report, output_path) + + if args.post_telegram: + _post_telegram(report) + + +if __name__ == "__main__": + main() diff --git a/hermes-sovereign/wizard-bootstrap/skills_audit.py b/hermes-sovereign/wizard-bootstrap/skills_audit.py new file mode 100644 index 00000000..2a17450c --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/skills_audit.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +""" +skills_audit.py — Skills Drift Detector + +Compares the skills bundled in the repo against those installed in +HERMES_HOME/skills/, then reports any drift: + + - MISSING — skill in repo but not in installed location + - EXTRA — skill installed but not in repo (local-only) + - OUTDATED — repo skill.md differs from installed skill.md + +Usage: + python wizard-bootstrap/skills_audit.py + python wizard-bootstrap/skills_audit.py --fix # copy missing skills + python wizard-bootstrap/skills_audit.py --json + python wizard-bootstrap/skills_audit.py --repo-root /path/to/hermes-agent +""" + +import argparse +import difflib +import hashlib +import json +import os +import shutil +import sys +from dataclasses import dataclass, field +from pathlib import Path +from 
typing import Optional + + +# --------------------------------------------------------------------------- +# Data model +# --------------------------------------------------------------------------- + +@dataclass +class SkillDrift: + skill_path: str # e.g. "software-development/code-review" + status: str # "MISSING" | "EXTRA" | "OUTDATED" | "OK" + repo_hash: Optional[str] = None + installed_hash: Optional[str] = None + diff_lines: list[str] = field(default_factory=list) + + +@dataclass +class AuditReport: + drifts: list[SkillDrift] = field(default_factory=list) + repo_root: Path = Path(".") + installed_root: Path = Path(".") + + @property + def has_drift(self) -> bool: + return any(d.status != "OK" for d in self.drifts) + + def by_status(self, status: str) -> list[SkillDrift]: + return [d for d in self.drifts if d.status == status] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _sha256_file(path: Path) -> str: + h = hashlib.sha256() + h.update(path.read_bytes()) + return h.hexdigest()[:16] + + +def _find_skills(root: Path) -> dict[str, Path]: + """Return {relative_skill_path: SKILL.md path} for every skill under root.""" + skills: dict[str, Path] = {} + for skill_md in root.rglob("SKILL.md"): + # skill path is relative to root, e.g. 
"software-development/code-review" + rel = skill_md.parent.relative_to(root) + skills[str(rel)] = skill_md + return skills + + +def _diff_skills(repo_md: Path, installed_md: Path) -> list[str]: + repo_lines = repo_md.read_text(encoding="utf-8", errors="replace").splitlines() + inst_lines = installed_md.read_text(encoding="utf-8", errors="replace").splitlines() + diff = list( + difflib.unified_diff( + inst_lines, + repo_lines, + fromfile="installed", + tofile="repo", + lineterm="", + ) + ) + return diff + + +# --------------------------------------------------------------------------- +# Core audit logic +# --------------------------------------------------------------------------- + +def _resolve_installed_skills_root() -> Optional[Path]: + """Return the installed skills directory, or None if not found.""" + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + candidates = [ + hermes_home / "skills", + hermes_home / "hermes-agent" / "skills", + ] + for candidate in candidates: + if candidate.exists(): + return candidate + return None + + +def run_audit(repo_root: Path, installed_root: Optional[Path] = None) -> AuditReport: + repo_skills_dir = repo_root / "skills" + if not repo_skills_dir.exists(): + print(f"ERROR: Repo skills directory not found: {repo_skills_dir}", file=sys.stderr) + sys.exit(1) + + resolved_installed = installed_root or _resolve_installed_skills_root() + report = AuditReport( + repo_root=repo_root, + installed_root=resolved_installed or Path("/not-found"), + ) + + repo_map = _find_skills(repo_skills_dir) + + if resolved_installed is None or not resolved_installed.exists(): + # All repo skills are "MISSING" from the installation + for skill_path in sorted(repo_map): + report.drifts.append( + SkillDrift( + skill_path=skill_path, + status="MISSING", + repo_hash=_sha256_file(repo_map[skill_path]), + ) + ) + return report + + installed_map = _find_skills(resolved_installed) + + all_paths = sorted(set(repo_map) | 
def apply_fix(report: AuditReport) -> None:
    """Copy repo SKILL.md files into the installed tree for every MISSING
    or OUTDATED skill in *report*.

    EXTRA (installed-only) skills are deliberately left untouched; removing
    them would be destructive and is left to the user.
    """
    # Sentinel set by run_audit() when no installed skills dir was found.
    if report.installed_root == Path("/not-found"):
        print("Cannot fix: installed skills directory not found.", file=sys.stderr)
        return

    repo_skills_dir = report.repo_root / "skills"
    for drift in report.by_status("MISSING"):
        src = repo_skills_dir / drift.skill_path / "SKILL.md"
        dst = report.installed_root / drift.skill_path / "SKILL.md"
        # MISSING skills may need their parent directory created first.
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dst)
        print(f"  Installed: {drift.skill_path}")

    for drift in report.by_status("OUTDATED"):
        src = repo_skills_dir / drift.skill_path / "SKILL.md"
        dst = report.installed_root / drift.skill_path / "SKILL.md"
        # OUTDATED implies dst already exists, so no mkdir is needed here.
        shutil.copy2(src, dst)
        print(f"  Updated:   {drift.skill_path}")
# ANSI escape sequences for colorized terminal output.
_GREEN = "\033[32m"
_RED = "\033[31m"
_YELLOW = "\033[33m"
_CYAN = "\033[36m"
_BOLD = "\033[1m"
_RESET = "\033[0m"

# Color used to render each drift status in the terminal report.
_STATUS_COLOR = {
    "OK": _GREEN,
    "MISSING": _RED,
    "EXTRA": _YELLOW,
    "OUTDATED": _CYAN,
}
def _render_json(report: AuditReport) -> None:
    """Print the audit result as indented JSON on stdout (OK entries omitted)."""
    summary = {
        "total": len(report.drifts),
        "ok": len(report.by_status("OK")),
        "missing": len(report.by_status("MISSING")),
        "extra": len(report.by_status("EXTRA")),
        "outdated": len(report.by_status("OUTDATED")),
    }

    drift_entries = []
    for d in report.drifts:
        if d.status == "OK":
            continue
        drift_entries.append(
            {
                "skill_path": d.skill_path,
                "status": d.status,
                "repo_hash": d.repo_hash,
                "installed_hash": d.installed_hash,
                "diff_line_count": len(d.diff_lines),
            }
        )

    payload = {
        "has_drift": report.has_drift,
        "repo_skills_dir": str(report.repo_root / "skills"),
        "installed_skills_dir": str(report.installed_root),
        "summary": summary,
        "drifts": drift_entries,
    }
    print(json.dumps(payload, indent=2))
+ ) + parser.add_argument( + "--repo-root", + default=str(Path(__file__).parent.parent), + help="Root of the hermes-agent repo (default: parent of this script)", + ) + parser.add_argument( + "--installed-root", + default=None, + help="Installed skills directory (default: auto-detect from HERMES_HOME)", + ) + parser.add_argument( + "--fix", + action="store_true", + help="Copy missing/outdated skills from repo to installed location", + ) + parser.add_argument( + "--diff", + action="store_true", + help="Show diff for outdated skills", + ) + parser.add_argument( + "--json", + action="store_true", + help="Output results as JSON", + ) + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + installed_root = Path(args.installed_root).resolve() if args.installed_root else None + + report = run_audit(repo_root, installed_root) + + if args.fix: + apply_fix(report) + # Re-run audit after fix to show updated state + report = run_audit(repo_root, installed_root) + + if args.json: + _render_json(report) + else: + _render_terminal(report, show_diff=args.diff) + + sys.exit(0 if not report.has_drift else 1) + + +if __name__ == "__main__": + main() diff --git a/hermes-sovereign/wizard-bootstrap/wizard_bootstrap.py b/hermes-sovereign/wizard-bootstrap/wizard_bootstrap.py new file mode 100644 index 00000000..623ea29a --- /dev/null +++ b/hermes-sovereign/wizard-bootstrap/wizard_bootstrap.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +""" +wizard_bootstrap.py — Wizard Environment Validator + +Validates that a new wizard's forge environment is ready: + 1. Python version check (>=3.11) + 2. Core dependencies installed + 3. Gitea authentication + 4. Telegram connectivity + 5. Smoke test (hermes import) + +Usage: + python wizard-bootstrap/wizard_bootstrap.py + python wizard-bootstrap/wizard_bootstrap.py --fix + python wizard-bootstrap/wizard_bootstrap.py --json + +Exits 0 if all checks pass, 1 if any check fails. 
def check_python_version() -> CheckResult:
    """Require Python >= 3.11."""
    version = sys.version_info
    return CheckResult(
        name="python_version",
        passed=version[:2] >= (3, 11),
        message=f"Python {version[0]}.{version[1]}.{version[2]}",
        fix_hint="Install Python 3.11+ via uv, pyenv, or your OS package manager.",
    )
def check_hermes_importable() -> CheckResult:
    """Smoke-test that hermes_constants can be imported (it has no side effects)."""
    # Make the repo root importable no matter where the script was launched from.
    parent_repo = str(Path(__file__).parent.parent)
    if parent_repo not in sys.path:
        sys.path.insert(0, parent_repo)

    try:
        import hermes_constants  # noqa: F401
    except Exception as exc:
        return CheckResult(
            name="hermes_smoke",
            passed=False,
            message=f"Import error: {exc}",
            fix_hint="Ensure you are in the hermes-agent repo root and your venv is active.",
        )
    return CheckResult(name="hermes_smoke", passed=True, message="hermes_constants imported OK")
def check_telegram_connectivity() -> CheckResult:
    """Verify the Telegram bot token is set and the Bot API answers getMe."""
    bot_token = os.environ.get("TELEGRAM_BOT_TOKEN")
    if not bot_token:
        return CheckResult(
            name="telegram",
            passed=False,
            message="TELEGRAM_BOT_TOKEN not set",
            fix_hint="Export TELEGRAM_BOT_TOKEN= in your shell or ~/.hermes/.env",
        )

    try:
        import requests  # noqa: PLC0415

        response = requests.get(
            f"https://api.telegram.org/bot{bot_token}/getMe",
            timeout=10,
        )
        if response.status_code != 200:
            return CheckResult(
                name="telegram",
                passed=False,
                message=f"Telegram API returned HTTP {response.status_code}",
                fix_hint="Check that TELEGRAM_BOT_TOKEN is valid.",
            )
        bot_username = response.json().get("result", {}).get("username", "?")
        return CheckResult(
            name="telegram",
            passed=True,
            message=f"Telegram bot @{bot_username} reachable",
        )
    except Exception as exc:
        return CheckResult(
            name="telegram",
            passed=False,
            message=f"Telegram unreachable: {exc}",
            fix_hint="Check network connectivity.",
        )
def check_hermes_home() -> CheckResult:
    """Verify the HERMES_HOME directory exists, is a directory, and is writable.

    Returns a CheckResult named "hermes_home"; fix_hint is populated on failure.
    """
    hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
    if not hermes_home.exists():
        return CheckResult(
            name="hermes_home",
            passed=False,
            message=f"HERMES_HOME does not exist: {hermes_home}",
            fix_hint="Run 'hermes setup' or create the directory manually.",
        )
    # A regular file at this path would pass the writability check below but
    # break every component that treats HERMES_HOME as a directory.
    if not hermes_home.is_dir():
        return CheckResult(
            name="hermes_home",
            passed=False,
            message=f"HERMES_HOME is not a directory: {hermes_home}",
            fix_hint="Remove the file at that path, then run 'hermes setup' to create the directory.",
        )
    if not os.access(hermes_home, os.W_OK):
        return CheckResult(
            name="hermes_home",
            passed=False,
            message=f"HERMES_HOME not writable: {hermes_home}",
            fix_hint=f"Fix permissions: chmod u+w {hermes_home}",
        )
    return CheckResult(
        name="hermes_home",
        passed=True,
        message=f"HERMES_HOME OK: {hermes_home}",
    )
def _render_json(report: BootstrapReport) -> None:
    """Print the full bootstrap report as indented JSON on stdout."""
    passed_count = sum(1 for c in report.checks if c.passed)
    checks = [
        {
            "name": c.name,
            "passed": c.passed,
            "message": c.message,
            "fix_hint": c.fix_hint,
            "detail": c.detail,
        }
        for c in report.checks
    ]
    document = {
        "passed": report.passed,
        "summary": {
            "total": len(report.checks),
            "passed": passed_count,
            "failed": len(report.checks) - passed_count,
        },
        "checks": checks,
    }
    print(json.dumps(document, indent=2))