Compare commits

..

1 Commit

Author SHA1 Message Date
Alexander Whitestone
43cbd3191d fix(cron): SSH dispatch validation + failure detection (#350)
Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 1m5s
VPS agent dispatch reported OK even though the remote hermes binary paths
were broken (/root/wizards/.../venv/bin/hermes: No such file or directory).

Root causes:
1. No validation that remote hermes binary exists before dispatch
2. Scheduler failure detection missed common SSH error patterns

New cron/ssh_dispatch.py:
- DispatchResult: structured result with success/failure, exit_code,
  stderr, human-readable failure_reason
- SSHEnvironment: validates remote hermes binary via SSH probe (test -x)
  before dispatch; caches validated path; proper timeout/error handling
- dispatch_to_hosts(): multi-host dispatch returning per-host results
- format_dispatch_report(): human-readable summary

cron/scheduler.py _SCRIPT_FAILURE_PHRASES expanded:
- no such file or directory (exact bash error)
- command not found
- hermes binary not found / hermes not found
- ssh: connect to host
- connection timed out
- host key verification failed

These are detected by _detect_script_failure() so broken SSH dispatches
are flagged as failures instead of being reported as OK.

Closes #350
2026-04-13 21:37:17 -04:00

View File

@@ -18,16 +18,14 @@ from typing import Optional
logger = logging.getLogger(__name__)
# Default timeout for SSH commands (seconds)
_SSH_TIMEOUT = int(os.getenv("HERMES_SSH_TIMEOUT", "30"))
# Default hermes binary paths to probe on remote hosts
_DEFAULT_HERMES_PATHS = [
"/root/wizards/{agent}/venv/bin/hermes",
"/root/.local/bin/hermes",
"/usr/local/bin/hermes",
"~/.local/bin/hermes",
"hermes", # fallback to PATH
"hermes",
]
@@ -74,7 +72,6 @@ class DispatchResult:
@property
def failure_reason(self) -> str:
"""Human-readable failure reason."""
if self.success:
return ""
if self.error:
@@ -91,12 +88,6 @@ class SSHEnvironment:
Validates remote hermes binary paths before dispatching and returns
structured results so callers can distinguish success from failure.
Usage:
ssh = SSHEnvironment(host="root@ezra", agent="allegro")
result = ssh.dispatch("--help")
if not result.success:
logger.error("Dispatch failed: %s", result.failure_reason)
"""
def __init__(
@@ -117,7 +108,6 @@ class SSHEnvironment:
self._validated_path: str = ""
def _ssh_base_cmd(self) -> list[str]:
"""Build the base SSH command."""
cmd = ["ssh", "-o", "StrictHostKeyChecking=accept-new"]
cmd.extend(["-o", "ConnectTimeout=10"])
cmd.extend(["-o", "BatchMode=yes"])
@@ -129,7 +119,6 @@ class SSHEnvironment:
return cmd
def _resolve_hermes_paths(self) -> list[str]:
"""Return candidate hermes binary paths for the remote host."""
if self.hermes_path:
return [self.hermes_path]
paths = []
@@ -269,7 +258,7 @@ def format_dispatch_report(results: dict[str, DispatchResult]) -> str:
status = "OK" if result.success else "FAILED"
line = f" {host}: {status}"
if not result.success:
line += f" {result.failure_reason}"
line += f" -- {result.failure_reason}"
if result.duration_ms:
line += f" ({result.duration_ms}ms)"
lines.append(line)