# hermes-agent/tools/environments/modal.py

"""Modal cloud execution environment using SWE-ReX directly.

Supports persistent filesystem snapshots: when enabled, the sandbox's filesystem
is snapshotted on cleanup and restored on next creation, so installed packages,
project files, and config changes survive across sessions.
"""

import asyncio
import json
import logging
import threading
import uuid
from pathlib import Path
from typing import Any, Dict, Optional

from hermes_cli.config import get_hermes_home
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# JSON file under the Hermes home directory that maps a task ID to the Modal
# snapshot (image) ID saved for it; read by _load_snapshots and written by
# _save_snapshots.  The path is resolved once, at import time.
_SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"


def _load_snapshots() -> Dict[str, str]:
    """Load the task-ID -> snapshot-ID mapping from disk.

    This is a best-effort read and never raises for a bad store file.

    Returns:
        The mapping stored in ``_SNAPSHOT_STORE``, or an empty dict when the
        file is missing, unreadable, not valid JSON, or does not contain a
        JSON object at the top level.
    """
    try:
        # EAFP: read directly instead of exists()-then-read, which can race
        # with another process removing or replacing the file.
        data = json.loads(_SNAPSHOT_STORE.read_text())
    except FileNotFoundError:
        # No snapshot has been saved yet; this is the normal first-run case.
        return {}
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        logger.warning(
            "Modal: could not read snapshot store %s: %s", _SNAPSHOT_STORE, e
        )
        return {}

    if not isinstance(data, dict):
        # Callers use .get() and item assignment, so anything but a JSON
        # object would crash them later; treat it as an empty store.
        logger.warning(
            "Modal: snapshot store %s does not contain a JSON object; ignoring it",
            _SNAPSHOT_STORE,
        )
        return {}
    return data


def _save_snapshots(data: Dict[str, str]) -> None:
    """Persist the task-ID -> snapshot-ID mapping to disk atomically.

    The JSON is written to a uniquely named temporary file next to
    ``_SNAPSHOT_STORE`` and then renamed over it, so an interrupted write
    cannot leave a truncated store behind (which the loader would discard,
    losing every saved snapshot).

    Args:
        data: Mapping to serialise as indented JSON.

    Raises:
        OSError: if the directory or file cannot be written.
        TypeError: if ``data`` is not JSON-serialisable.
    """
    # Serialise first so a bad payload fails before anything touches the disk.
    payload = json.dumps(data, indent=2)

    _SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = _SNAPSHOT_STORE.with_name(
        f"{_SNAPSHOT_STORE.name}.{uuid.uuid4().hex}.tmp"
    )
    try:
        tmp_path.write_text(payload)
        # Same-directory rename: replaces the old store in a single step.
        tmp_path.replace(_SNAPSHOT_STORE)
    except BaseException:
        # Do not leave a stray temp file behind on failure.
        tmp_path.unlink(missing_ok=True)
        raise


class _AsyncWorker:
    """Background thread with its own event loop for async-safe swe-rex calls.

    Allows sync code to submit async coroutines and block for results,
    even when called from inside another running event loop (e.g. Atropos).

    Lifecycle: ``start()`` -> any number of ``run_coroutine()`` calls ->
    ``stop()``.  After ``stop()`` the loop is closed and ``run_coroutine()``
    raises ``RuntimeError`` immediately.
    """

    def __init__(self):
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._thread: Optional[threading.Thread] = None
        # Set from inside the loop once it is actually running.
        self._started = threading.Event()

    def start(self):
        """Spawn the daemon worker thread and wait until its loop is running.

        Raises:
            RuntimeError: if the loop has not started within 30 seconds.
        """
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        if not self._started.wait(timeout=30):
            raise RuntimeError("AsyncWorker loop failed to start within 30s")

    def _run_loop(self):
        """Thread target: run the event loop until stopped, then close it."""
        loop = asyncio.new_event_loop()
        self._loop = loop
        asyncio.set_event_loop(loop)
        # Signal readiness from inside the loop so that start() only returns
        # once is_running() is true; otherwise an immediate stop() would see
        # a not-yet-running loop and never stop it.
        loop.call_soon(self._started.set)
        try:
            loop.run_forever()
        finally:
            try:
                self._cancel_pending(loop)
            finally:
                # Closing makes the is_closed() guard in run_coroutine() fire
                # and releases the loop's selector/file descriptors.
                loop.close()

    @staticmethod
    def _cancel_pending(loop):
        """Cancel tasks still pending on a stopped loop and let them unwind."""
        pending = asyncio.all_tasks(loop)
        for task in pending:
            task.cancel()
        if pending:
            loop.run_until_complete(
                asyncio.gather(*pending, return_exceptions=True)
            )
        loop.run_until_complete(loop.shutdown_asyncgens())

    def run_coroutine(self, coro, timeout=600):
        """Run ``coro`` on the worker loop and block for its result.

        Args:
            coro: Coroutine object to schedule on the worker loop.
            timeout: Maximum seconds to wait for the result.

        Returns:
            Whatever the coroutine returns.

        Raises:
            RuntimeError: if the worker loop is not running.
            TimeoutError: (``concurrent.futures.TimeoutError``) if no result
                arrives within ``timeout``; the coroutine is cancelled.
            Exception: any exception raised by the coroutine itself.
        """
        loop = self._loop
        if loop is None or loop.is_closed() or not loop.is_running():
            # Close the rejected coroutine so Python does not emit a
            # "coroutine ... was never awaited" warning.
            if asyncio.iscoroutine(coro):
                coro.close()
            raise RuntimeError("AsyncWorker loop is not running")
        future = asyncio.run_coroutine_threadsafe(coro, loop)
        try:
            return future.result(timeout=timeout)
        finally:
            # On timeout (or an interrupt in the waiting thread) the caller
            # has given up; cancel so the coroutine does not keep running
            # unobserved on the worker loop.
            if not future.done():
                future.cancel()

    def stop(self):
        """Ask the loop to stop and wait up to 10 seconds for the thread."""
        loop = self._loop
        if loop is not None and loop.is_running():
            try:
                loop.call_soon_threadsafe(loop.stop)
            except RuntimeError:
                # The loop closed between the check and the call.
                pass
        if self._thread:
            self._thread.join(timeout=10)


class ModalEnvironment(BaseEnvironment):
    """Modal cloud execution via SWE-ReX.

    Uses swe-rex's ModalDeployment directly for sandbox management.
    Adds sudo -S support, sandbox options passed through
    ``modal_sandbox_kwargs``, and optional filesystem persistence via
    Modal's snapshot API.

    All swe-rex calls are funnelled through a private ``_AsyncWorker`` so the
    deployment's async machinery lives on one dedicated event loop.
    """

    def __init__(
        self,
        image: str,
        cwd: str = "/root",
        timeout: int = 60,
        modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
        persistent_filesystem: bool = True,
        task_id: str = "default",
    ):
        """Create and start the Modal sandbox.

        Args:
            image: Registry image reference used when no snapshot is restored.
            cwd: Default working directory for commands.
            timeout: Default per-command timeout in seconds.
            modal_sandbox_kwargs: Extra keyword arguments forwarded to
                ``ModalDeployment`` as ``modal_sandbox_kwargs``.
            persistent_filesystem: When true, restore the snapshot saved for
                ``task_id`` (if any) and save a new one in ``cleanup()``.
            task_id: Key under which the snapshot ID is stored.

        Raises:
            Exception: whatever deployment start-up raises; the worker thread
                is stopped before the exception propagates.
        """
        super().__init__(cwd=cwd, timeout=timeout)

        self._persistent = persistent_filesystem
        self._task_id = task_id
        self._base_image = image
        self._deployment = None
        self._worker = _AsyncWorker()

        sandbox_kwargs = dict(modal_sandbox_kwargs or {})

        # Imported lazily so the module can be imported without these
        # packages installed; both are required from this point on.
        import modal
        from swerex.deployment.modal import ModalDeployment

        # If persistent, try to restore from a previous snapshot.
        restored_image = None
        if self._persistent:
            snapshot_id = _load_snapshots().get(self._task_id)
            if snapshot_id:
                try:
                    restored_image = modal.Image.from_id(snapshot_id)
                    logger.info("Modal: restoring from snapshot %s", snapshot_id[:20])
                except Exception as e:
                    logger.warning("Modal: failed to restore snapshot, using base image: %s", e)
                    restored_image = None

        effective_image = restored_image if restored_image is not None else image

        # Pre-build a modal.Image with pip fix for Modal's legacy image builder.
        # Some task images have broken pip; fix via ensurepip before Modal uses it.
        if isinstance(effective_image, str):
            effective_image = modal.Image.from_registry(
                effective_image,
                setup_dockerfile_commands=[
                    "RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
                    "python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
                ],
            )

        async def _create_and_start():
            deployment = ModalDeployment(
                image=effective_image,
                startup_timeout=180.0,
                runtime_timeout=3600.0,
                deployment_timeout=3600.0,
                install_pipx=True,
                modal_sandbox_kwargs=sandbox_kwargs,
            )
            await deployment.start()
            return deployment

        # Start the async worker thread and create the deployment on it
        # so all gRPC channels are bound to the worker's event loop.
        self._worker.start()
        try:
            self._deployment = self._worker.run_coroutine(_create_and_start())
        except BaseException:
            # Without this the worker thread and its loop would outlive a
            # failed construction with nothing left to stop them.
            self._worker.stop()
            raise

    def execute(self, command: str, cwd: str = "", *,
                timeout: int | None = None,
                stdin_data: str | None = None) -> dict:
        """Run a shell command in the sandbox and return its result.

        Args:
            command: Shell command line to run.
            cwd: Working directory; falls back to the environment default
                when empty.
            timeout: Command timeout in seconds; falls back to the
                environment default when ``None`` or 0.
            stdin_data: Optional text fed to the command through a quoted
                here-document.

        Returns:
            A dict with ``"output"`` (stdout and stderr merged) and
            ``"returncode"``.  Execution failures are reported as return
            code 1 with the error in ``"output"``; an interrupt stops the
            sandbox and is reported as return code 130.
        """
        if stdin_data is not None:
            # Pick a here-doc delimiter that does not occur in the payload.
            marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
            while marker in stdin_data:
                marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
            command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"

        # NOTE(review): _prepare_command is not defined in this class;
        # presumably inherited from BaseEnvironment - confirm its contract.
        exec_command, sudo_stdin = self._prepare_command(command)

        # Modal sandboxes execute commands via the Modal SDK and cannot pipe
        # subprocess stdin directly the way a local Popen can.  When a sudo
        # password is present, use a shell-level pipe from printf so that the
        # password feeds sudo -S without appearing as an echo argument embedded
        # in the shell string.
        if sudo_stdin is not None:
            import shlex
            exec_command = (
                f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}"
            )

        from swerex.runtime.abstract import Command as RexCommand

        effective_cwd = cwd or self.cwd
        effective_timeout = timeout or self.timeout

        # The worker's own wait must outlast the command timeout; otherwise
        # any command allowed more than the worker default (600s) would be
        # reported as failed while still running remotely.  The 600s floor
        # keeps the previous behaviour for shorter commands.
        wait_timeout = max(600, effective_timeout + 30)

        # Run in a background thread so we can poll for interrupts
        result_holder = {"value": None, "error": None}

        def _run():
            try:
                async def _do_execute():
                    return await self._deployment.runtime.execute(
                        RexCommand(
                            command=exec_command,
                            shell=True,
                            check=False,
                            cwd=effective_cwd,
                            timeout=effective_timeout,
                            merge_output_streams=True,
                        )
                    )
                output = self._worker.run_coroutine(
                    _do_execute(), timeout=wait_timeout
                )
                result_holder["value"] = {
                    "output": output.stdout,
                    "returncode": output.exit_code,
                }
            except Exception as e:
                result_holder["error"] = e

        t = threading.Thread(target=_run, daemon=True)
        t.start()
        while t.is_alive():
            t.join(timeout=0.2)
            if is_interrupted():
                # Stopping the sandbox is the only way to abort the remote
                # command; the runner thread is a daemon and is abandoned.
                self._stop_deployment()
                return {
                    "output": "[Command interrupted - Modal sandbox terminated]",
                    "returncode": 130,
                }

        if result_holder["error"] is not None:
            return {"output": f"Modal execution error: {result_holder['error']}", "returncode": 1}
        return result_holder["value"]

    def _stop_deployment(self) -> None:
        """Best-effort stop of the sandbox; failures are logged, never raised."""
        try:
            self._worker.run_coroutine(
                asyncio.wait_for(self._deployment.stop(), timeout=10),
                timeout=15,
            )
        except Exception as e:
            logger.debug("Modal: sandbox stop failed (ignored): %r", e)

    def _save_filesystem_snapshot(self) -> None:
        """Snapshot the sandbox filesystem and record its ID for this task.

        Does nothing when the deployment exposes no sandbox.  Errors
        propagate to the caller.
        """
        # swe-rex keeps the Modal sandbox on a private attribute; tolerate
        # its absence rather than failing cleanup.
        sandbox = getattr(self._deployment, "_sandbox", None)
        if not sandbox:
            return

        async def _snapshot():
            img = await sandbox.snapshot_filesystem.aio()
            return img.object_id

        snapshot_id = self._worker.run_coroutine(_snapshot(), timeout=60)
        if not snapshot_id:
            return

        snapshots = _load_snapshots()
        snapshots[self._task_id] = snapshot_id
        _save_snapshots(snapshots)
        logger.info("Modal: saved filesystem snapshot %s for task %s",
                    snapshot_id[:20], self._task_id)

    def cleanup(self):
        """Snapshot the filesystem (if persistent) then stop the sandbox.

        Safe to call more than once: it is a no-op once the deployment has
        been released.  Snapshot and stop failures are logged, not raised.
        """
        if self._deployment is None:
            return

        if self._persistent:
            try:
                self._save_filesystem_snapshot()
            except Exception as e:
                # %r because a timeout carries no message of its own.
                logger.warning("Modal: filesystem snapshot failed: %r", e)

        try:
            self._stop_deployment()
        finally:
            self._worker.stop()
            self._deployment = None