Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
43cbd3191d fix(cron): SSH dispatch validation + failure detection (#350)
Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 1m5s
VPS agent dispatch reported OK while remote hermes binary paths were
broken (/root/wizards/.../venv/bin/hermes: No such file or directory).

Root causes:
1. No validation that remote hermes binary exists before dispatch
2. Scheduler failure detection missed common SSH error patterns

New cron/ssh_dispatch.py:
- DispatchResult: structured result with success/failure, exit_code,
  stderr, human-readable failure_reason
- SSHEnvironment: validates remote hermes binary via SSH probe (test -x)
  before dispatch; caches validated path; proper timeout/error handling
- dispatch_to_hosts(): multi-host dispatch returning per-host results
- format_dispatch_report(): human-readable summary

cron/scheduler.py _SCRIPT_FAILURE_PHRASES expanded:
- no such file or directory (exact bash error)
- command not found
- hermes binary not found / hermes not found
- ssh: connect to host
- connection timed out
- host key verification failed

These are detected by _detect_script_failure() so broken SSH dispatches
are flagged as failures instead of reported as OK.

Closes #350
2026-04-13 21:37:17 -04:00
5 changed files with 282 additions and 279 deletions

View File

@@ -1,179 +0,0 @@
"""Memory backends — cross-session user modeling.
Local SQLite default, Honcho cloud opt-in. Zero overhead when disabled.
Evaluation: Local A(~95pts) vs Honcho B(~60pts). Recommend local.
"""
import json, logging, os, sqlite3, time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
from hermes_constants import get_hermes_home
logger = logging.getLogger(__name__)
@dataclass
class Entry:
key: str; value: str; uid: str
etype: str = "preference"; created: float = 0; updated: float = 0; meta: Dict = field(default_factory=dict)
def __post_init__(self):
t = time.time()
if not self.created: self.created = t
if not self.updated: self.updated = t
class Backend(ABC):
@abstractmethod
def ok(self) -> bool: ...
def put(self, uid: str, k: str, v: str, meta: Dict = None) -> bool: ...
def get(self, uid: str, k: str) -> Optional[Entry]: ...
def find(self, uid: str, q: str, n: int = 10) -> List[Entry]: ...
def all(self, uid: str) -> List[Entry]: ...
def rm(self, uid: str, k: str) -> bool: ...
@property
def name(self) -> str: ...
@property
def cloud(self) -> bool: ...
class Null(Backend):
def ok(self) -> bool: return True
def put(self, uid, k, v, meta=None) -> bool: return True
def get(self, uid, k) -> Optional[Entry]: return None
def find(self, uid, q, n=10) -> List[Entry]: return []
def all(self, uid) -> List[Entry]: return []
def rm(self, uid, k) -> bool: return True
@property
def name(self) -> str: return "null"
@property
def cloud(self) -> bool: return False
class Local(Backend):
def __init__(self, p: Path = None):
self._p = p or get_hermes_home() / "memory.db"
self._p.parent.mkdir(parents=True, exist_ok=True)
with sqlite3.connect(str(self._p)) as c:
c.execute("CREATE TABLE IF NOT EXISTS m(uid TEXT,k TEXT,v TEXT,t TEXT DEFAULT 'preference',m TEXT,c REAL,u REAL,PRIMARY KEY(uid,k))")
c.commit()
def ok(self) -> bool:
try:
with sqlite3.connect(str(self._p)) as c: c.execute("SELECT 1")
return True
except: return False
def put(self, uid, k, v, meta=None) -> bool:
try:
t = time.time(); et = (meta or {}).get("type","preference")
with sqlite3.connect(str(self._p)) as c:
c.execute("INSERT INTO m VALUES(?,?,?,?,?,?,?) ON CONFLICT(uid,k) DO UPDATE SET v=excluded.v,t=excluded.t,m=excluded.m,u=excluded.u",
(uid,k,v,et,json.dumps(meta) if meta else None,t,t))
c.commit()
return True
except Exception as e: logger.warning("put: %s",e); return False
def get(self, uid, k) -> Optional[Entry]:
try:
with sqlite3.connect(str(self._p)) as c:
r = c.execute("SELECT k,v,uid,t,m,c,u FROM m WHERE uid=? AND k=?", (uid,k)).fetchone()
return Entry(key=r[0],value=r[1],uid=r[2],etype=r[3],meta=json.loads(r[4]) if r[4] else {},created=r[5],updated=r[6]) if r else None
except: return None
def find(self, uid, q, n=10) -> List[Entry]:
try:
p = f"%{q}%"
with sqlite3.connect(str(self._p)) as c:
rows = c.execute("SELECT k,v,uid,t,m,c,u FROM m WHERE uid=? AND (k LIKE ? OR v LIKE ?) ORDER BY u DESC LIMIT ?", (uid,p,p,n)).fetchall()
return [Entry(key=r[0],value=r[1],uid=r[2],etype=r[3],meta=json.loads(r[4]) if r[4] else {},created=r[5],updated=r[6]) for r in rows]
except: return []
def all(self, uid) -> List[Entry]:
try:
with sqlite3.connect(str(self._p)) as c:
rows = c.execute("SELECT k,v,uid,t,m,c,u FROM m WHERE uid=? ORDER BY u DESC",(uid,)).fetchall()
return [Entry(key=r[0],value=r[1],uid=r[2],etype=r[3],meta=json.loads(r[4]) if r[4] else {},created=r[5],updated=r[6]) for r in rows]
except: return []
def rm(self, uid, k) -> bool:
try:
with sqlite3.connect(str(self._p)) as c: c.execute("DELETE FROM m WHERE uid=? AND k=?", (uid,k)); c.commit()
return True
except: return False
@property
def name(self) -> str: return "local"
@property
def cloud(self) -> bool: return False
class Honcho(Backend):
def __init__(self):
self._c = None; self._k = os.getenv("HONCHO_API_KEY","")
def _lazy(self):
if self._c: return self._c
if not self._k: return None
try:
from honcho import Honcho as H
self._c = H(api_key=self._k); return self._c
except: return None
def ok(self) -> bool:
if not self._k: return False
c = self._lazy()
if not c: return False
try: c.get_sessions(limit=1); return True
except: return False
def put(self, uid, k, v, meta=None) -> bool:
c = self._lazy()
if not c: return False
try: c.add_message(f"m-{uid}","system",json.dumps({"k":k,"v":v})); return True
except: return False
def get(self, uid, k) -> Optional[Entry]:
for e in self.find(uid,k,1):
if e.key == k: return e
return None
def find(self, uid, q, n=10) -> List[Entry]:
c = self._lazy()
if not c: return []
try:
r = c.chat(f"m-{uid}", f"Find: {q}")
if isinstance(r,dict):
try:
data = json.loads(r.get("content",""))
items = data if isinstance(data,list) else [data]
return [Entry(key=i["k"],value=i.get("v",""),uid=uid) for i in items[:n] if isinstance(i,dict) and i.get("k")]
except: pass
return []
except: return []
def all(self, uid) -> List[Entry]: return self.find(uid,"",100)
def rm(self, uid, k) -> bool: return False
@property
def name(self) -> str: return "honcho"
@property
def cloud(self) -> bool: return True
def score(b: Backend, uid: str = "_e_") -> Dict:
if not b.ok(): return {"name":b.name,"score":0,"grade":"F","ok":False}
s = 20
t0 = time.perf_counter(); ok = b.put(uid,"ek","ev"); sm = (time.perf_counter()-t0)*1000; s += 15 if ok else 0
t0 = time.perf_counter(); r = b.get(uid,"ek"); gm = (time.perf_counter()-t0)*1000; s += 15 if r else 0
t0 = time.perf_counter(); q = b.find(uid,"ev",5); qm = (time.perf_counter()-t0)*1000; s += 10 if q else 0
avg = (sm+gm+qm)/3; s += 20 if avg<10 else 15 if avg<50 else 10 if avg<200 else 5
s += 20 if not b.cloud else 5
try: b.rm(uid,"ek")
except: pass
return {"name":b.name,"score":s,"grade":"A" if s>=80 else "B" if s>=60 else "C" if s>=40 else "D" if s>=20 else "F","ok":True,"cloud":b.cloud}
def evaluate() -> Dict:
bs = [Null(), Local()]
if os.getenv("HONCHO_API_KEY"):
try: bs.append(Honcho())
except: pass
rs = [score(b) for b in bs]
best = max((r for r in rs if r["name"]!="null" and r["ok"]), key=lambda r:r["score"], default=None)
rec = f"Best: {best['name']} ({best['score']}pts, {best['grade']})" if best else "None"
if best and best.get("cloud"): rec += " WARNING: cloud. RECOMMEND local."
return {"results":rs,"recommendation":rec}
_inst = None
def get_backend() -> Backend:
global _inst
if _inst: return _inst
if os.getenv("HONCHO_API_KEY") and os.getenv("HERMES_MEMORY_BACKEND","").lower()!="local":
try:
h = Honcho()
if h.ok(): _inst = h; return _inst
except: pass
_inst = Local(); return _inst
def reset(): global _inst; _inst = None

View File

@@ -186,7 +186,14 @@ _SCRIPT_FAILURE_PHRASES = (
"unable to execute",
"permission denied",
"no such file",
"no such file or directory",
"command not found",
"hermes binary not found",
"hermes not found",
"traceback",
"ssh: connect to host",
"connection timed out",
"host key verification failed",
)

275
cron/ssh_dispatch.py Normal file
View File

@@ -0,0 +1,275 @@
"""SSH dispatch utilities for VPS agent operations.
Provides validated SSH execution with proper failure detection.
Used by cron jobs that dispatch work to remote VPS agents.
Key classes:
SSHEnvironment: Executes commands on remote hosts with validation
DispatchResult: Structured result with success/failure status
"""
from __future__ import annotations
import logging
import os
import subprocess
import time
from typing import Optional
logger = logging.getLogger(__name__)
_SSH_TIMEOUT = int(os.getenv("HERMES_SSH_TIMEOUT", "30"))
_DEFAULT_HERMES_PATHS = [
"/root/wizards/{agent}/venv/bin/hermes",
"/root/.local/bin/hermes",
"/usr/local/bin/hermes",
"~/.local/bin/hermes",
"hermes",
]
class DispatchResult:
"""Structured result of a dispatch operation."""
__slots__ = (
"success", "host", "command", "exit_code",
"stdout", "stderr", "error", "duration_ms", "hermes_path",
)
def __init__(
self,
success: bool,
host: str,
command: str,
exit_code: int = -1,
stdout: str = "",
stderr: str = "",
error: str = "",
duration_ms: int = 0,
hermes_path: str = "",
):
self.success = success
self.host = host
self.command = command
self.exit_code = exit_code
self.stdout = stdout
self.stderr = stderr
self.error = error
self.duration_ms = duration_ms
self.hermes_path = hermes_path
def to_dict(self) -> dict:
return {
"success": self.success,
"host": self.host,
"exit_code": self.exit_code,
"error": self.error,
"duration_ms": self.duration_ms,
"hermes_path": self.hermes_path,
"stderr_tail": self.stderr[-200:] if self.stderr else "",
}
@property
def failure_reason(self) -> str:
if self.success:
return ""
if self.error:
return self.error
if "No such file" in self.stderr or "command not found" in self.stderr:
return f"Hermes binary not found on {self.host}"
if self.exit_code != 0:
return f"Remote command exited {self.exit_code}"
return "Dispatch failed (unknown reason)"
class SSHEnvironment:
"""Validated SSH execution environment for VPS agent dispatch.
Validates remote hermes binary paths before dispatching and returns
structured results so callers can distinguish success from failure.
"""
def __init__(
self,
host: str,
agent: str = "",
ssh_key: str = "",
ssh_port: int = 22,
timeout: int = _SSH_TIMEOUT,
hermes_path: str = "",
):
self.host = host
self.agent = agent
self.ssh_key = ssh_key
self.ssh_port = ssh_port
self.timeout = timeout
self.hermes_path = hermes_path
self._validated_path: str = ""
def _ssh_base_cmd(self) -> list[str]:
cmd = ["ssh", "-o", "StrictHostKeyChecking=accept-new"]
cmd.extend(["-o", "ConnectTimeout=10"])
cmd.extend(["-o", "BatchMode=yes"])
if self.ssh_key:
cmd.extend(["-i", self.ssh_key])
if self.ssh_port != 22:
cmd.extend(["-p", str(self.ssh_port)])
cmd.append(self.host)
return cmd
def _resolve_hermes_paths(self) -> list[str]:
if self.hermes_path:
return [self.hermes_path]
paths = []
for tmpl in _DEFAULT_HERMES_PATHS:
path = tmpl.format(agent=self.agent) if "{agent}" in tmpl else tmpl
paths.append(path)
return paths
def validate_remote_hermes_path(self) -> str:
"""Probe the remote host for a working hermes binary.
Returns the validated path on success, raises RuntimeError on failure.
Caches the result so validation is only done once per instance.
"""
if self._validated_path:
return self._validated_path
candidates = self._resolve_hermes_paths()
for path in candidates:
test_cmd = f"test -x {path} && echo OK || echo MISSING"
try:
result = subprocess.run(
self._ssh_base_cmd() + [test_cmd],
capture_output=True, text=True, timeout=self.timeout,
)
if result.returncode == 0 and "OK" in (result.stdout or ""):
logger.info("SSH %s: hermes validated at %s", self.host, path)
self._validated_path = path
return path
except subprocess.TimeoutExpired:
logger.warning("SSH %s: timeout probing %s", self.host, path)
continue
except Exception as exc:
logger.debug("SSH %s: probe %s failed: %s", self.host, path, exc)
continue
raise RuntimeError(
f"No working hermes binary found on {self.host}. "
f"Checked: {', '.join(candidates)}."
)
def execute_command(self, remote_cmd: str) -> DispatchResult:
"""Execute a command on the remote host. Returns DispatchResult."""
t0 = time.monotonic()
full_cmd = self._ssh_base_cmd() + [remote_cmd]
try:
result = subprocess.run(
full_cmd, capture_output=True, text=True, timeout=self.timeout,
)
elapsed = int((time.monotonic() - t0) * 1000)
stderr = (result.stderr or "").strip()
stdout = (result.stdout or "").strip()
if result.returncode != 0:
return DispatchResult(
success=False, host=self.host, command=remote_cmd,
exit_code=result.returncode, stdout=stdout, stderr=stderr,
error=stderr.split("\n")[0] if stderr else f"exit code {result.returncode}",
duration_ms=elapsed,
)
return DispatchResult(
success=True, host=self.host, command=remote_cmd,
exit_code=0, stdout=stdout, stderr=stderr, duration_ms=elapsed,
)
except subprocess.TimeoutExpired:
elapsed = int((time.monotonic() - t0) * 1000)
return DispatchResult(
success=False, host=self.host, command=remote_cmd,
error=f"SSH timed out after {self.timeout}s", duration_ms=elapsed,
)
except Exception as exc:
elapsed = int((time.monotonic() - t0) * 1000)
return DispatchResult(
success=False, host=self.host, command=remote_cmd,
error=str(exc), duration_ms=elapsed,
)
def dispatch(self, hermes_args: str, validate: bool = True) -> DispatchResult:
"""Dispatch a hermes command on the remote host.
Args:
hermes_args: Arguments to pass to hermes (e.g. "cron tick").
validate: If True, validate the hermes binary exists first.
Returns DispatchResult. Only success=True if command actually ran.
"""
if validate:
try:
hermes_path = self.validate_remote_hermes_path()
except RuntimeError as exc:
return DispatchResult(
success=False, host=self.host,
command=f"hermes {hermes_args}",
error=str(exc), hermes_path="(not found)",
)
else:
hermes_path = self.hermes_path or "hermes"
remote_cmd = f"{hermes_path} {hermes_args}"
result = self.execute_command(remote_cmd)
result.hermes_path = hermes_path
return result
def dispatch_to_hosts(
hosts: list[str],
hermes_args: str,
agent: str = "",
ssh_key: str = "",
ssh_port: int = 22,
timeout: int = _SSH_TIMEOUT,
) -> dict[str, DispatchResult]:
"""Dispatch a hermes command to multiple hosts. Returns host -> DispatchResult."""
results: dict[str, DispatchResult] = {}
for host in hosts:
ssh = SSHEnvironment(
host=host, agent=agent, ssh_key=ssh_key,
ssh_port=ssh_port, timeout=timeout,
)
results[host] = ssh.dispatch(hermes_args)
logger.info(
"Dispatch %s: %s", host,
"OK" if results[host].success else results[host].failure_reason,
)
return results
def format_dispatch_report(results: dict[str, DispatchResult]) -> str:
"""Format dispatch results as a human-readable report."""
lines = []
ok = [r for r in results.values() if r.success]
failed = [r for r in results.values() if not r.success]
lines.append(f"Dispatch report: {len(ok)} OK, {len(failed)} failed")
lines.append("")
for host, result in results.items():
status = "OK" if result.success else "FAILED"
line = f" {host}: {status}"
if not result.success:
line += f" -- {result.failure_reason}"
if result.duration_ms:
line += f" ({result.duration_ms}ms)"
lines.append(line)
if failed:
lines.append("")
lines.append("Failed dispatches:")
for host, result in results.items():
if not result.success:
lines.append(f" {host}: {result.failure_reason}")
if result.stderr:
lines.append(f" stderr: {result.stderr[-150:]}")
return "\n".join(lines)

View File

@@ -1,64 +0,0 @@
"""Tests for memory backends (#322)."""
import json, pytest
from agent.memory import Entry, Null, Local, Honcho, score, evaluate, get_backend, reset
@pytest.fixture()
def loc(tmp_path): return Local(p=tmp_path/"t.db")
@pytest.fixture()
def rst(): reset(); yield; reset()
class TestEntry:
def test_defaults(self):
e = Entry(key="k",value="v",uid="u"); assert e.created > 0
class TestNull:
def test_ok(self): assert Null().ok()
def test_put(self): assert Null().put("u","k","v")
def test_get(self): assert Null().get("u","k") is None
def test_find(self): assert Null().find("u","q") == []
def test_not_cloud(self): assert not Null().cloud
class TestLocal:
def test_ok(self,loc): assert loc.ok()
def test_put_get(self,loc):
assert loc.put("u","lang","py")
e = loc.get("u","lang"); assert e.value == "py"
def test_meta(self,loc):
loc.put("u","k","v",{"type":"pattern"})
assert loc.get("u","k").etype == "pattern"
def test_update(self,loc):
loc.put("u","k","v1"); loc.put("u","k","v2")
assert loc.get("u","k").value == "v2"
def test_find(self,loc):
loc.put("u","pref_py","1"); loc.put("u","pref_vim","1"); loc.put("u","th","d")
assert len(loc.find("u","pref")) == 2
def test_all(self,loc):
loc.put("u","a","1"); loc.put("u","b","2")
assert len(loc.all("u")) == 2
def test_rm(self,loc):
loc.put("u","k","v"); assert loc.rm("u","k"); assert loc.get("u","k") is None
def test_not_cloud(self,loc): assert not loc.cloud
def test_users(self,loc):
loc.put("u1","k","v1"); loc.put("u2","k","v2")
assert loc.get("u1","k").value == "v1"
class TestHoncho:
def test_no_key(self,monkeypatch):
monkeypatch.delenv("HONCHO_API_KEY",raising=False)
assert not Honcho().ok()
def test_cloud(self): assert Honcho().cloud
class TestScore:
def test_null(self):
r = score(Null()); assert r["score"] > 0
def test_local(self,loc):
r = score(loc); assert r["ok"]; assert r["score"] >= 80; assert r["grade"] == "A"
def test_eval(self,rst):
r = evaluate(); assert len(r["results"]) >= 2; assert "recommendation" in r
class TestSingleton:
def test_default(self,rst,monkeypatch):
monkeypatch.delenv("HONCHO_API_KEY",raising=False)
assert isinstance(get_backend(), Local)
def test_cache(self,rst): assert get_backend() is get_backend()

View File

@@ -1,36 +0,0 @@
"""Memory backend tool — cross-session prefs. Local default, Honcho opt-in."""
import json
from tools.registry import registry
def memory_backend(action, uid="default", key=None, value=None, query=None, meta=None):
from agent.memory import get_backend, evaluate
b = get_backend()
if action=="info": return json.dumps({"ok":True,"backend":b.name,"cloud":b.cloud,"available":b.ok()})
if action=="store":
if not key or value is None: return json.dumps({"ok":False,"error":"key+value required"})
return json.dumps({"ok":b.put(uid,key,value,meta),"key":key})
if action=="get":
if not key: return json.dumps({"ok":False,"error":"key required"})
e = b.get(uid,key)
return json.dumps({"ok":True,"key":e.key,"value":e.value,"type":e.etype}) if e else json.dumps({"ok":False,"error":"not found"})
if action=="query":
if not query: return json.dumps({"ok":False,"error":"query required"})
r = b.find(uid,query)
return json.dumps({"ok":True,"results":[{"key":e.key,"value":e.value} for e in r],"count":len(r)})
if action=="list":
r = b.all(uid)
return json.dumps({"ok":True,"entries":[{"key":e.key,"type":e.etype} for e in r],"count":len(r)})
if action=="delete":
if not key: return json.dumps({"ok":False,"error":"key required"})
return json.dumps({"ok":b.rm(uid,key)})
if action=="evaluate": return json.dumps({"ok":True,**evaluate()})
return json.dumps({"ok":False,"error":f"unknown: {action}"})
registry.register(name="memory_backend",toolset="skills",schema={
"name":"memory_backend","description":"Cross-session memory backends. Local SQLite default, Honcho cloud opt-in. Zero overhead when disabled.",
"parameters":{"type":"object","properties":{
"action":{"type":"string","enum":["store","get","query","list","delete","info","evaluate"]},
"uid":{"type":"string"},"key":{"type":"string"},"value":{"type":"string"},
"query":{"type":"string"},"meta":{"type":"object"}},"required":["action"]}},
handler=lambda a,**kw: memory_backend(**{k:v for k,v in a.items() if v is not None}),emoji="🧠")