fix: resolve remote hermes binary dynamically, validate before dispatch (#350)

Root cause: vps-agent-dispatch hardcoded /root/wizards/*/hermes/venv/bin/hermes paths that go stale when wizards are reinstalled or moved.

- SSHEnvironment.resolve_remote_hermes_path(): dynamic resolution via `which hermes` -> ~/.local/bin/hermes -> /root/wizards/*/hermes/venv/bin/hermes
- SSHEnvironment.validate_remote_hermes(): structured check returning {available, path, version, error} — callers can gate dispatch on this
- topology_scan.py: replace hardcoded venv/bin paths with find-based resolution
- forge_health_check.py: default target from HERMES_WIZARDS_ROOT env var
- 6 new tests covering resolution and validation logic

Dispatch now fails loud instead of reporting OK on broken paths.
2026-04-13 18:23:53 -04:00
6 changed files with 288 additions and 167 deletions
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -544,57 +544,8 @@ def _run_job_script(script_path: str) -> tuple[bool, str]:
        return False, f"Script execution failed: {exc}"


-# Known provider aliases for mismatch detection
-_PROVIDER_ALIASES = {
-    "ollama": {"ollama", "local ollama", "localhost:11434"},
-    "anthropic": {"anthropic", "claude", "sonnet", "opus", "haiku"},
-    "nous": {"nous", "mimo", "nousresearch"},
-    "openrouter": {"openrouter"},
-    "kimi": {"kimi", "moonshot", "kimi-coding"},
-    "zai": {"zai", "glm", "zhipu"},
-    "openai": {"openai", "gpt", "codex"},
-    "gemini": {"gemini", "google"},
-}
-
-
-def _detect_provider_mismatch(prompt: str, active_provider: str) -> Optional[str]:
-    """Detect if the prompt references a provider different from the active one.
-    
-    Returns the mismatched provider name if found, else None.
-    """
-    if not active_provider or not prompt:
-        return None
-    prompt_lower = prompt.lower()
-    active_lower = active_provider.lower().strip()
-    # Find which alias group the active provider belongs to
-    active_group = None
-    for group, aliases in _PROVIDER_ALIASES.items():
-        if active_lower in aliases or active_lower.startswith(group):
-            active_group = group
-            break
-    if not active_group:
-        return None
-    # Check if the prompt references a different provider group
-    for group, aliases in _PROVIDER_ALIASES.items():
-        if group == active_group:
-            continue
-        for alias in aliases:
-            # Use word boundary-ish matching to avoid false positives
-            # (e.g. "model" shouldn't match "model: ollama")
-            if alias in prompt_lower:
-                return group
-    return None
-
-
-def _build_job_prompt(job: dict, runtime_info: Optional[dict] = None) -> str:
-    """Build the effective prompt for a cron job, optionally loading one or more skills first.
-    
-    Args:
-        job: The cron job dict.
-        runtime_info: Optional dict with 'model' and 'provider' keys from the
-                      resolved runtime, injected into the cron hint so the agent
-                      knows what provider/model it is actually running on.
-    """
+def _build_job_prompt(job: dict) -> str:
+    """Build the effective prompt for a cron job, optionally loading one or more skills first."""
    prompt = job.get("prompt", "")
    skills = job.get("skills")

@@ -626,21 +577,9 @@ def _build_job_prompt(job: dict, runtime_info: Optional[dict] = None) -> str:

    # Always prepend cron execution guidance so the agent knows how
    # delivery works and can suppress delivery when appropriate.
-    _runtime_model = runtime_info.get("model", "") if runtime_info else ""
-    _runtime_provider = runtime_info.get("provider", "") if runtime_info else ""
-    _runtime_hint = ""
-    if _runtime_model or _runtime_provider:
-        _runtime_hint = (
-            f"RUNTIME: You are running as model={_runtime_model!r}, "
-            f"provider={_runtime_provider!r}. "
-            "If your instructions reference a different provider or model, "
-            "adapt your behavior to the actual runtime above. "
-            "Do NOT attempt to reach providers/services that are not your current runtime. "
-        )
    cron_hint = (
        "[SYSTEM: You are running as a scheduled cron job. "
-        + _runtime_hint
-        + "DELIVERY: Your final response will be automatically delivered "
+        "DELIVERY: Your final response will be automatically delivered "
        "to the user — do NOT use send_message or try to deliver "
        "the output yourself. Just produce your report/output as your "
        "final response and the system handles the rest. "
@@ -727,10 +666,12 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
    
    job_id = job["id"]
    job_name = job["name"]
+    prompt = _build_job_prompt(job)
    origin = _resolve_origin(job)
    _cron_session_id = f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}"

    logger.info("Running job '%s' (ID: %s)", job_name, job_id)
+    logger.info("Prompt: %s", prompt[:100])

    try:
        # Inject origin context so the agent's send_message tool knows the chat.
@@ -821,24 +762,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            message = format_runtime_provider_error(exc)
            raise RuntimeError(message) from exc

-        # Build prompt now that we know the resolved provider/model.
-        # Inject runtime info so the agent knows what it's running on.
-        _resolved_provider = runtime.get("provider", "")
-        runtime_info = {"model": model, "provider": _resolved_provider}
-        
-        # Detect and log provider mismatches between prompt and active provider
-        _raw_prompt = job.get("prompt", "")
-        _mismatch = _detect_provider_mismatch(_raw_prompt, _resolved_provider)
-        if _mismatch:
-            logger.warning(
-                "Job '%s' prompt references provider '%s' but active provider is '%s' — "
-                "the agent will be told to adapt. Consider updating this job's prompt.",
-                job_name, _mismatch, _resolved_provider,
-            )
-        
-        prompt = _build_job_prompt(job, runtime_info=runtime_info)
-        logger.info("Prompt: %s", prompt[:100])
-
        from agent.smart_model_routing import resolve_turn_route
        turn_route = resolve_turn_route(
            prompt,
--- a/docs/bezalel/topology_scan.py
+++ b/docs/bezalel/topology_scan.py
@@ -75,7 +75,7 @@ for base in ["/root/wizards/bezalel/hermes", "/root/wizards/bezalel/evennia"]:
 write("06_git_repos", "\n".join(git_repos))

 # 8. Python Dependencies
-venv_pip = shell("/root/wizards/bezalel/hermes/venv/bin/pip freeze 2>/dev/null | head -80")
+venv_pip = shell("$(find /root/wizards -maxdepth 5 -name pip -path '*/venv/bin/pip' 2>/dev/null | head -1) freeze 2>/dev/null | head -80")
 write("07_dependencies", f"Hermes venv packages (top 80):\n{venv_pip}")

 # 9. External APIs & Endpoints
@@ -115,8 +115,8 @@ mempalace = f"""MEMPALACE CONFIGURATION
 - Identity: /root/.mempalace/identity.txt
 - Config: /root/wizards/bezalel/mempalace.yaml
 - Nightly re-mine: 03:00 UTC via /root/wizards/bezalel/mempalace_nightly.sh
- Miner binary: /root/wizards/bezalel/hermes/venv/bin/mempalace
- Current status: {shell('/root/wizards/bezalel/hermes/venv/bin/mempalace --palace /root/wizards/bezalel/.mempalace/palace status 2>/dev/null')}
+- Miner binary: {shell('find /root/wizards -maxdepth 5 -name mempalace -path "*/venv/bin/mempalace" 2>/dev/null | head -1')}
+- Current status: {shell('$(find /root/wizards -maxdepth 5 -name mempalace -path "*/venv/bin/mempalace" 2>/dev/null | head -1) --palace /root/wizards/bezalel/.mempalace/palace status 2>/dev/null')}
 """
 write("11_mempalace_topology", mempalace)

--- a/scripts/forge_health_check.py
+++ b/scripts/forge_health_check.py
@@ -9,8 +9,8 @@ Scans wizard environments for:

 Usage:
    python scripts/forge_health_check.py /root/wizards
-    python scripts/forge_health_check.py /root/wizards --json
-    python scripts/forge_health_check.py /root/wizards --fix-permissions
+    python scripts/forge_health_check.py $HERMES_WIZARDS_ROOT --json
+    python scripts/forge_health_check.py --fix-permissions
 """

 from __future__ import annotations
@@ -263,7 +263,7 @@ def print_report(report: HealthReport) -> None:

 def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(description="Forge Health Check")
-    parser.add_argument("target", nargs="?", default="/root/wizards", help="Root path to scan")
+    parser.add_argument("target", nargs="?", default=os.environ.get("HERMES_WIZARDS_ROOT", "/root/wizards"), help="Root path to scan")
    parser.add_argument("--json", action="store_true", help="Output JSON report")
    parser.add_argument("--fix-permissions", action="store_true", help="Auto-fix file permissions")
    args = parser.parse_args(argv)
--- a/tests/test_cron_provider_mismatch.py
+++ b/tests/test_cron_provider_mismatch.py
@@ -1,79 +0,0 @@
-"""Tests for cron scheduler provider mismatch detection and runtime-aware prompt building."""
-
-import sys
-from pathlib import Path
-
-# Ensure project root is importable
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-
-from cron.scheduler import _detect_provider_mismatch, _build_job_prompt, _PROVIDER_ALIASES
-
-
-class TestProviderMismatchDetection:
-    """Tests for _detect_provider_mismatch."""
-
-    def test_no_mismatch_when_provider_not_mentioned(self):
-        assert _detect_provider_mismatch("Check system health", "nous") is None
-
-    def test_detects_ollama_in_prompt_when_nous_active(self):
-        result = _detect_provider_mismatch("Check Ollama is responding", "nous")
-        assert result == "ollama"
-
-    def test_detects_anthropic_in_prompt_when_nous_active(self):
-        result = _detect_provider_mismatch("Use Claude to analyze", "nous")
-        assert result == "anthropic"
-
-    def test_no_mismatch_same_provider(self):
-        assert _detect_provider_mismatch("Check Ollama models", "ollama") is None
-
-    def test_no_mismatch_with_empty_prompt(self):
-        assert _detect_provider_mismatch("", "nous") is None
-
-    def test_no_mismatch_with_empty_provider(self):
-        assert _detect_provider_mismatch("Check Ollama", "") is None
-
-    def test_detects_kimi_in_prompt_when_openrouter_active(self):
-        result = _detect_provider_mismatch("Use Kimi for coding", "openrouter")
-        assert result == "kimi"
-
-    def test_detects_glm_in_prompt_when_nous_active(self):
-        result = _detect_provider_mismatch("Use GLM for analysis", "nous")
-        assert result == "zai"
-
-
-class TestBuildJobPrompt:
-    """Tests for _build_job_prompt with runtime_info."""
-
-    def test_basic_prompt_without_runtime(self):
-        job = {"prompt": "Do something", "skills": []}
-        result = _build_job_prompt(job)
-        assert "Do something" in result
-        assert "RUNTIME" not in result
-
-    def test_prompt_with_runtime_info(self):
-        job = {"prompt": "Do something", "skills": []}
-        runtime_info = {"model": "mimo-v2-pro", "provider": "nous"}
-        result = _build_job_prompt(job, runtime_info=runtime_info)
-        assert "Do something" in result
-        assert "model='mimo-v2-pro'" in result
-        assert "provider='nous'" in result
-
-    def test_prompt_with_empty_runtime_info(self):
-        job = {"prompt": "Do something", "skills": []}
-        runtime_info = {"model": "", "provider": ""}
-        result = _build_job_prompt(job, runtime_info=runtime_info)
-        assert "Do something" in result
-        assert "RUNTIME" not in result
-
-    def test_cron_hint_always_present(self):
-        job = {"prompt": "Test", "skills": []}
-        result = _build_job_prompt(job)
-        assert "scheduled cron job" in result
-        assert "[SYSTEM:" in result
-
-    def test_adapt_instruction_in_runtime_hint(self):
-        job = {"prompt": "Check Ollama health", "skills": []}
-        runtime_info = {"model": "mimo-v2-pro", "provider": "nous"}
-        result = _build_job_prompt(job, runtime_info=runtime_info)
-        assert "adapt your behavior" in result
-        assert "Do NOT attempt to reach providers" in result
--- a/tests/test_remote_hermes_path.py
+++ b/tests/test_remote_hermes_path.py
@@ -0,0 +1,175 @@
+"""Tests for #350: remote hermes path validation."""
+
+import subprocess
+from unittest.mock import MagicMock, patch, PropertyMock
+
+import pytest
+
+from tools.environments.ssh import SSHEnvironment
+
+
+class TestResolveRemoteHermesPath:
+    """Test that resolve_remote_hermes_path checks multiple locations."""
+
+    @patch("tools.environments.ssh.SSHEnvironment._establish_connection")
+    @patch("tools.environments.ssh.SSHEnvironment._detect_remote_home", return_value="/root")
+    @patch("tools.environments.ssh.SSHEnvironment._sync_skills_and_credentials")
+    def test_finds_hermes_via_which(self, mock_sync, mock_home, mock_conn):
+        """Should return path from `which hermes` when available."""
+        env = SSHEnvironment.__new__(SSHEnvironment)
+        env.host = "test-host"
+        env.user = "root"
+        env.port = 22
+        env.key_path = ""
+        env.control_socket = "/tmp/test.sock"
+
+        with patch("subprocess.run") as mock_run:
+            # First call: which hermes succeeds
+            mock_run.return_value = MagicMock(
+                stdout="/usr/local/bin/hermes\n",
+                returncode=0,
+            )
+            path = env.resolve_remote_hermes_path()
+            assert path == "/usr/local/bin/hermes"
+
+    @patch("tools.environments.ssh.SSHEnvironment._establish_connection")
+    @patch("tools.environments.ssh.SSHEnvironment._detect_remote_home", return_value="/root")
+    @patch("tools.environments.ssh.SSHEnvironment._sync_skills_and_credentials")
+    def test_falls_back_to_local_bin(self, mock_sync, mock_home, mock_conn):
+        """Should check ~/.local/bin/hermes when which fails."""
+        env = SSHEnvironment.__new__(SSHEnvironment)
+        env.host = "test-host"
+        env.user = "root"
+        env.port = 22
+        env.key_path = ""
+        env.control_socket = "/tmp/test.sock"
+
+        call_count = [0]
+
+        def mock_run_side_effect(cmd, **kwargs):
+            call_count[0] += 1
+            result = MagicMock()
+            if call_count[0] == 1:
+                # which hermes fails
+                result.stdout = ""
+                result.returncode = 1
+            elif call_count[0] == 2:
+                # ~/.local/bin/hermes exists
+                result.stdout = "/root/.local/bin/hermes\n"
+                result.returncode = 0
+            else:
+                result.stdout = ""
+                result.returncode = 1
+            return result
+
+        with patch("subprocess.run", side_effect=mock_run_side_effect):
+            path = env.resolve_remote_hermes_path()
+            assert path == "/root/.local/bin/hermes"
+
+    @patch("tools.environments.ssh.SSHEnvironment._establish_connection")
+    @patch("tools.environments.ssh.SSHEnvironment._detect_remote_home", return_value="/root")
+    @patch("tools.environments.ssh.SSHEnvironment._sync_skills_and_credentials")
+    def test_returns_empty_when_not_found(self, mock_sync, mock_home, mock_conn):
+        """Should return empty string when hermes is not found anywhere."""
+        env = SSHEnvironment.__new__(SSHEnvironment)
+        env.host = "test-host"
+        env.user = "root"
+        env.port = 22
+        env.key_path = ""
+        env.control_socket = "/tmp/test.sock"
+
+        with patch("subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(stdout="", returncode=1)
+            path = env.resolve_remote_hermes_path()
+            assert path == ""
+
+
+class TestValidateRemoteHermes:
+    """Test that validate_remote_hermes returns structured results."""
+
+    @patch("tools.environments.ssh.SSHEnvironment._establish_connection")
+    @patch("tools.environments.ssh.SSHEnvironment._detect_remote_home", return_value="/root")
+    @patch("tools.environments.ssh.SSHEnvironment._sync_skills_and_credentials")
+    def test_returns_available_when_found(self, mock_sync, mock_home, mock_conn):
+        env = SSHEnvironment.__new__(SSHEnvironment)
+        env.host = "test-host"
+        env.user = "root"
+        env.port = 22
+        env.key_path = ""
+        env.control_socket = "/tmp/test.sock"
+
+        call_count = [0]
+
+        def mock_run_side_effect(cmd, **kwargs):
+            call_count[0] += 1
+            result = MagicMock()
+            if call_count[0] == 1:
+                # which hermes
+                result.stdout = "/root/.local/bin/hermes\n"
+                result.returncode = 0
+            elif call_count[0] == 2:
+                # hermes --version
+                result.stdout = "hermes-agent 1.0.0\n"
+                result.returncode = 0
+            else:
+                result.stdout = ""
+                result.returncode = 1
+            return result
+
+        with patch("subprocess.run", side_effect=mock_run_side_effect):
+            result = env.validate_remote_hermes()
+            assert result["available"] is True
+            assert result["path"] == "/root/.local/bin/hermes"
+            assert "hermes-agent" in result["version"]
+            assert result["error"] == ""
+
+    @patch("tools.environments.ssh.SSHEnvironment._establish_connection")
+    @patch("tools.environments.ssh.SSHEnvironment._detect_remote_home", return_value="/root")
+    @patch("tools.environments.ssh.SSHEnvironment._sync_skills_and_credentials")
+    def test_returns_error_when_not_found(self, mock_sync, mock_home, mock_conn):
+        env = SSHEnvironment.__new__(SSHEnvironment)
+        env.host = "test-host"
+        env.user = "root"
+        env.port = 22
+        env.key_path = ""
+        env.control_socket = "/tmp/test.sock"
+
+        with patch("subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(stdout="", returncode=1)
+            result = env.validate_remote_hermes()
+            assert result["available"] is False
+            assert "not found" in result["error"]
+            assert result["path"] == ""
+
+    @patch("tools.environments.ssh.SSHEnvironment._establish_connection")
+    @patch("tools.environments.ssh.SSHEnvironment._detect_remote_home", return_value="/root")
+    @patch("tools.environments.ssh.SSHEnvironment._sync_skills_and_credentials")
+    def test_returns_error_when_version_fails(self, mock_sync, mock_home, mock_conn):
+        env = SSHEnvironment.__new__(SSHEnvironment)
+        env.host = "test-host"
+        env.user = "root"
+        env.port = 22
+        env.key_path = ""
+        env.control_socket = "/tmp/test.sock"
+
+        call_count = [0]
+
+        def mock_run_side_effect(cmd, **kwargs):
+            call_count[0] += 1
+            result = MagicMock()
+            if call_count[0] == 1:
+                result.stdout = "/root/.local/bin/hermes\n"
+                result.returncode = 0
+            elif call_count[0] == 2:
+                # hermes --version fails
+                result.stdout = ""
+                result.returncode = 127
+            else:
+                result.stdout = ""
+                result.returncode = 1
+            return result
+
+        with patch("subprocess.run", side_effect=mock_run_side_effect):
+            result = env.validate_remote_hermes()
+            assert result["available"] is False
+            assert "not executable" in result["error"]
--- a/tools/environments/ssh.py
+++ b/tools/environments/ssh.py
@@ -154,6 +154,108 @@ class SSHEnvironment(PersistentShellMixin, BaseEnvironment):
        except Exception as e:
            logger.debug("SSH: could not sync skills/credentials: %s", e)

+    def resolve_remote_hermes_path(self) -> str:
+        """Resolve the hermes binary path on the remote host.
+
+        Checks in order, returning the first hit:
+        1. `which hermes` (respects PATH, including ~/.local/bin)
+        2. ~/.local/bin/hermes (standard install location)
+        3. Common wizard paths: /root/wizards/*/hermes/venv/bin/hermes
+
+        Returns the resolved path, or empty string if no probe succeeded.
+        """
+        # 1. Try which (respects PATH including ~/.local/bin if on PATH)
+        cmd = self._build_ssh_command()
+        cmd.append("which hermes 2>/dev/null")
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+            path = result.stdout.strip()
+            if path and result.returncode == 0:
+                logger.debug("SSH: resolved hermes via which: %s", path)
+                return path
+        except Exception as exc:
+            logger.debug("SSH: `which hermes` probe failed: %s", exc)
+
+        # 2. Try ~/.local/bin/hermes (standard install symlink)
+        cmd = self._build_ssh_command()
+        cmd.append("test -x ~/.local/bin/hermes && echo ~/.local/bin/hermes")
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+            path = result.stdout.strip()
+            if path and result.returncode == 0:
+                logger.debug("SSH: resolved hermes at ~/.local/bin/hermes")
+                return path
+        except Exception as exc:
+            logger.debug("SSH: ~/.local/bin/hermes probe failed: %s", exc)
+
+        # 3. Search wizard paths; the binary sits 5 levels below /root/wizards
+        cmd = self._build_ssh_command()
+        cmd.append(
+            "find /root/wizards -maxdepth 5 -path '*/hermes/venv/bin/hermes' "
+            "-executable 2>/dev/null | head -1"
+        )
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
+            path = result.stdout.strip()
+            if path and result.returncode == 0:
+                logger.debug("SSH: resolved hermes at wizard path: %s", path)
+                return path
+        except Exception as exc:
+            logger.debug("SSH: wizard path search failed: %s", exc)
+
+        logger.warning("SSH: could not resolve hermes binary on %s@%s", self.user, self.host)
+        return ""
+
+    def validate_remote_hermes(self) -> dict:
+        """Validate that hermes is available on the remote host.
+
+        Returns dict with:
+            available: bool
+            path: str (resolved path if found)
+            version: str (first line of hermes --version output if available)
+            error: str (error message if not available)
+        """
+        path = self.resolve_remote_hermes_path()
+        if not path:
+            return {
+                "available": False,
+                "path": "",
+                "version": "",
+                "error": (
+                    f"Hermes binary not found on {self.user}@{self.host}. "
+                    f"Checked: which hermes, ~/.local/bin/hermes, /root/wizards/*/hermes/venv/bin/hermes. "
+                    f"Install hermes or fix the PATH."
+                ),
+            }
+
+        # No `| head -1` here: a pipeline would report head's exit status, not hermes'
+        cmd = self._build_ssh_command()
+        cmd.append(f"{path} --version 2>&1")
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
+            version = result.stdout.strip().partition("\n")[0]
+            if result.returncode != 0:
+                return {
+                    "available": False,
+                    "path": path,
+                    "version": "",
+                    "error": f"Hermes at {path} exists but is not executable (exit code {result.returncode}).",
+                }
+        except subprocess.TimeoutExpired:
+            return {
+                "available": False,
+                "path": path,
+                "version": "",
+                "error": f"Hermes at {path} timed out on --version check.",
+            }
+
+        return {
+            "available": True,
+            "path": path,
+            "version": version,
+            "error": "",
+        }
+
    def execute(self, command: str, cwd: str = "", *,
                timeout: int | None = None,
                stdin_data: str | None = None) -> dict: