Files
hermes-agent/tools/environments/modal.py

425 lines
15 KiB
Python
Raw Normal View History

"""Modal cloud execution environment using the native Modal SDK directly.
2026-02-21 22:31:43 -08:00
Uses ``Sandbox.create()`` + ``Sandbox.exec()`` instead of the older runtime
wrapper, while preserving Hermes' persistent snapshot behavior across sessions.
"""
import asyncio
import json
import logging
import shlex
import threading
2026-02-21 22:31:43 -08:00
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
2026-02-21 22:31:43 -08:00
from hermes_constants import get_hermes_home
2026-02-21 22:31:43 -08:00
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
fix(cli): respect HERMES_HOME in all remaining hardcoded ~/.hermes paths Several files resolved paths via Path.home() / ".hermes" or os.path.expanduser("~/.hermes/..."), bypassing the HERMES_HOME environment variable. This broke isolation when running multiple Hermes instances with distinct HERMES_HOME directories. Replace all hardcoded paths with calls to get_hermes_home() from hermes_cli.config, consistent with the rest of the codebase. Files fixed: - tools/process_registry.py (processes.json) - gateway/pairing.py (pairing/) - gateway/sticker_cache.py (sticker_cache.json) - gateway/channel_directory.py (channel_directory.json, sessions.json) - gateway/config.py (gateway.json, config.yaml, sessions_dir) - gateway/mirror.py (sessions/) - gateway/hooks.py (hooks/) - gateway/platforms/base.py (image_cache/, audio_cache/, document_cache/) - gateway/platforms/whatsapp.py (whatsapp/session) - gateway/delivery.py (cron/output) - agent/auxiliary_client.py (auth.json) - agent/prompt_builder.py (SOUL.md) - cli.py (config.yaml, images/, pastes/, history) - run_agent.py (logs/) - tools/environments/base.py (sandboxes/) - tools/environments/modal.py (modal_snapshots.json) - tools/environments/singularity.py (singularity_snapshots.json) - tools/tts_tool.py (audio_cache) - hermes_cli/status.py (cron/jobs.json, sessions.json) - hermes_cli/gateway.py (logs/, whatsapp session) - hermes_cli/main.py (whatsapp/session) Tests updated to use HERMES_HOME env var instead of patching Path.home(). Closes #892 (cherry picked from commit 78ac1bba43b8b74a934c6172f2c29bb4d03164b9)
2026-03-11 07:31:41 +01:00
_SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
_DIRECT_SNAPSHOT_NAMESPACE = "direct"
def _load_snapshots() -> Dict[str, str]:
"""Load snapshot ID mapping from disk."""
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _save_snapshots(data: Dict[str, str]) -> None:
"""Persist snapshot ID mapping to disk."""
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
2026-02-21 22:31:43 -08:00
def _direct_snapshot_key(task_id: str) -> str:
return f"{_DIRECT_SNAPSHOT_NAMESPACE}:{task_id}"
2026-02-21 22:31:43 -08:00
def _get_snapshot_restore_candidate(task_id: str) -> tuple[str | None, bool]:
"""Return a snapshot id and whether it came from the legacy key format."""
snapshots = _load_snapshots()
namespaced_key = _direct_snapshot_key(task_id)
snapshot_id = snapshots.get(namespaced_key)
if isinstance(snapshot_id, str) and snapshot_id:
return snapshot_id, False
legacy_snapshot_id = snapshots.get(task_id)
if isinstance(legacy_snapshot_id, str) and legacy_snapshot_id:
return legacy_snapshot_id, True
return None, False
def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
"""Persist the direct Modal snapshot id under the direct namespace."""
snapshots = _load_snapshots()
snapshots[_direct_snapshot_key(task_id)] = snapshot_id
snapshots.pop(task_id, None)
_save_snapshots(snapshots)
def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> None:
"""Remove direct Modal snapshot entries for a task, including legacy keys."""
snapshots = _load_snapshots()
updated = False
for key in (_direct_snapshot_key(task_id), task_id):
value = snapshots.get(key)
if value is None:
continue
if snapshot_id is None or value == snapshot_id:
snapshots.pop(key, None)
updated = True
if updated:
_save_snapshots(snapshots)
def _resolve_modal_image(image_spec: Any) -> Any:
"""Convert registry references or snapshot ids into Modal image objects."""
import modal as _modal
if not isinstance(image_spec, str):
return image_spec
if image_spec.startswith("im-"):
return _modal.Image.from_id(image_spec)
return _modal.Image.from_registry(
image_spec,
setup_dockerfile_commands=[
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
],
)
class _AsyncWorker:
"""Background thread with its own event loop for async-safe Modal calls."""
2026-02-21 22:31:43 -08:00
def __init__(self):
self._loop: Optional[asyncio.AbstractEventLoop] = None
self._thread: Optional[threading.Thread] = None
self._started = threading.Event()
def start(self):
self._thread = threading.Thread(target=self._run_loop, daemon=True)
self._thread.start()
self._started.wait(timeout=30)
def _run_loop(self):
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
self._started.set()
self._loop.run_forever()
def run_coroutine(self, coro, timeout=600):
if self._loop is None or self._loop.is_closed():
raise RuntimeError("AsyncWorker loop is not running")
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
return future.result(timeout=timeout)
def stop(self):
if self._loop and self._loop.is_running():
self._loop.call_soon_threadsafe(self._loop.stop)
if self._thread:
self._thread.join(timeout=10)
class ModalEnvironment(BaseEnvironment):
"""Modal cloud execution via native Modal sandboxes."""
2026-02-21 22:31:43 -08:00
def __init__(
self,
image: str,
cwd: str = "/root",
timeout: int = 60,
modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
persistent_filesystem: bool = True,
task_id: str = "default",
):
2026-02-21 22:31:43 -08:00
super().__init__(cwd=cwd, timeout=timeout)
self._persistent = persistent_filesystem
self._task_id = task_id
self._base_image = image
self._sandbox = None
self._app = None
self._worker = _AsyncWorker()
self._synced_creds: Dict[str, tuple] = {}
sandbox_kwargs = dict(modal_sandbox_kwargs or {})
restored_snapshot_id = None
restored_from_legacy_key = False
if self._persistent:
restored_snapshot_id, restored_from_legacy_key = _get_snapshot_restore_candidate(
self._task_id
)
if restored_snapshot_id:
logger.info("Modal: restoring from snapshot %s", restored_snapshot_id[:20])
import modal as _modal
cred_mounts = []
try:
from tools.credential_files import get_credential_file_mounts
for mount_entry in get_credential_file_mounts():
cred_mounts.append(
_modal.Mount.from_local_file(
mount_entry["host_path"],
remote_path=mount_entry["container_path"],
)
)
logger.info(
"Modal: mounting credential %s -> %s",
mount_entry["host_path"],
mount_entry["container_path"],
)
except Exception as e:
logger.debug("Modal: could not load credential file mounts: %s", e)
self._worker.start()
async def _create_sandbox(image_spec: Any):
app = await _modal.App.lookup.aio("hermes-agent", create_if_missing=True)
create_kwargs = dict(sandbox_kwargs)
if cred_mounts:
existing_mounts = list(create_kwargs.pop("mounts", []))
existing_mounts.extend(cred_mounts)
create_kwargs["mounts"] = existing_mounts
sandbox = await _modal.Sandbox.create.aio(
"sleep",
"infinity",
image=image_spec,
app=app,
timeout=int(create_kwargs.pop("timeout", 3600)),
**create_kwargs,
)
return app, sandbox
try:
target_image_spec = restored_snapshot_id or image
try:
# _resolve_modal_image keeps the Modal bootstrap fix together:
# it applies ensurepip via setup_dockerfile_commands before
# Modal builds or restores the image.
effective_image = _resolve_modal_image(target_image_spec)
self._app, self._sandbox = self._worker.run_coroutine(
_create_sandbox(effective_image),
timeout=300,
)
except Exception as exc:
if not restored_snapshot_id:
raise
logger.warning(
"Modal: failed to restore snapshot %s, retrying with base image: %s",
restored_snapshot_id[:20],
exc,
)
_delete_direct_snapshot(self._task_id, restored_snapshot_id)
base_image = _resolve_modal_image(image)
self._app, self._sandbox = self._worker.run_coroutine(
_create_sandbox(base_image),
timeout=300,
)
else:
if restored_snapshot_id and restored_from_legacy_key:
_store_direct_snapshot(self._task_id, restored_snapshot_id)
logger.info(
"Modal: migrated legacy snapshot entry for task %s",
self._task_id,
)
except Exception:
self._worker.stop()
raise
2026-02-21 22:31:43 -08:00
logger.info("Modal: sandbox created (task=%s)", self._task_id)
2026-02-21 22:31:43 -08:00
def _sync_credential_files(self) -> None:
"""Push credential files into the running sandbox."""
try:
from tools.credential_files import get_credential_file_mounts
mounts = get_credential_file_mounts()
if not mounts:
return
for entry in mounts:
host_path = entry["host_path"]
container_path = entry["container_path"]
hp = Path(host_path)
try:
stat = hp.stat()
file_key = (stat.st_mtime, stat.st_size)
except OSError:
continue
if self._synced_creds.get(container_path) == file_key:
continue
try:
content = hp.read_text(encoding="utf-8")
except Exception:
continue
import base64
b64 = base64.b64encode(content.encode("utf-8")).decode("ascii")
container_dir = str(Path(container_path).parent)
cmd = (
f"mkdir -p {shlex.quote(container_dir)} && "
f"echo {shlex.quote(b64)} | base64 -d > {shlex.quote(container_path)}"
)
async def _write():
proc = await self._sandbox.exec.aio("bash", "-c", cmd)
await proc.wait.aio()
self._worker.run_coroutine(_write(), timeout=15)
self._synced_creds[container_path] = file_key
logger.debug(
"Modal: synced credential %s -> %s",
host_path,
container_path,
)
except Exception as e:
logger.debug("Modal: credential file sync failed: %s", e)
def execute(
self,
command: str,
cwd: str = "",
*,
timeout: int | None = None,
stdin_data: str | None = None,
) -> dict:
self._sync_credential_files()
2026-02-21 22:31:43 -08:00
if stdin_data is not None:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
while marker in stdin_data:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"
exec_command, sudo_stdin = self._prepare_command(command)
# Modal sandboxes execute commands via exec() and cannot pipe
# subprocess stdin directly. When a sudo password is present,
# use a shell-level pipe from printf.
if sudo_stdin is not None:
exec_command = (
f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}"
)
effective_cwd = cwd or self.cwd
effective_timeout = timeout or self.timeout
full_command = f"cd {shlex.quote(effective_cwd)} && {exec_command}"
result_holder = {"value": None, "error": None}
def _run():
try:
async def _do_execute():
process = await self._sandbox.exec.aio(
"bash",
"-c",
full_command,
timeout=effective_timeout,
)
stdout = await process.stdout.read.aio()
stderr = await process.stderr.read.aio()
exit_code = await process.wait.aio()
if isinstance(stdout, bytes):
stdout = stdout.decode("utf-8", errors="replace")
if isinstance(stderr, bytes):
stderr = stderr.decode("utf-8", errors="replace")
output = stdout
if stderr:
output = f"{stdout}\n{stderr}" if stdout else stderr
return output, exit_code
output, exit_code = self._worker.run_coroutine(
_do_execute(),
timeout=effective_timeout + 30,
)
result_holder["value"] = {
"output": output,
"returncode": exit_code,
}
except Exception as e:
result_holder["error"] = e
t = threading.Thread(target=_run, daemon=True)
t.start()
while t.is_alive():
t.join(timeout=0.2)
if is_interrupted():
try:
self._worker.run_coroutine(
self._sandbox.terminate.aio(),
timeout=15,
)
except Exception:
pass
return {
"output": "[Command interrupted - Modal sandbox terminated]",
"returncode": 130,
}
if result_holder["error"]:
return {
"output": f"Modal execution error: {result_holder['error']}",
"returncode": 1,
}
return result_holder["value"]
2026-02-21 22:31:43 -08:00
def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
if self._sandbox is None:
return
if self._persistent:
try:
async def _snapshot():
img = await self._sandbox.snapshot_filesystem.aio()
return img.object_id
try:
snapshot_id = self._worker.run_coroutine(_snapshot(), timeout=60)
except Exception:
snapshot_id = None
if snapshot_id:
_store_direct_snapshot(self._task_id, snapshot_id)
logger.info(
"Modal: saved filesystem snapshot %s for task %s",
snapshot_id[:20],
self._task_id,
)
except Exception as e:
logger.warning("Modal: filesystem snapshot failed: %s", e)
try:
self._worker.run_coroutine(
self._sandbox.terminate.aio(),
timeout=15,
)
except Exception:
pass
finally:
self._worker.stop()
self._sandbox = None
self._app = None