Drop the mini-swe-agent git submodule. All terminal backends now use hermes-agent's own environment implementations directly. Docker backend: - Inline the `docker run -d` container startup (was 15 lines in minisweagent's DockerEnvironment). Our wrapper already handled execute(), cleanup(), security hardening, volumes, and resource limits. Modal backend: - Import swe-rex's ModalDeployment directly instead of going through minisweagent's 90-line passthrough wrapper. - Bake the _AsyncWorker pattern (from environments/patches.py) directly into ModalEnvironment for Atropos compatibility without monkey-patching. Cleanup: - Remove minisweagent_path.py (submodule path resolution helper) - Remove submodule init/install from install.sh and setup-hermes.sh - Remove mini-swe-agent from .gitmodules - environments/patches.py is now a no-op (kept for backward compat) - terminal_tool.py no longer does sys.path hacking for minisweagent - mini_swe_runner.py guards imports (optional, for RL training only) - Update all affected tests to mock the new direct subprocess calls - Update README.md, CONTRIBUTING.md No functionality change — all Docker, Modal, local, SSH, Singularity, and Daytona backends behave identically. 6093 tests pass.
311 lines
13 KiB
Python
311 lines
13 KiB
Python
"""Tests for Modal sandbox infrastructure fixes (TBLite baseline).
|
|
|
|
Covers the bugs discovered while setting up TBLite evaluation:
|
|
1. Tool resolution — terminal + file tools load correctly
|
|
2. CWD fix — host paths get replaced with /root for container backends
|
|
3. ephemeral_disk version check
|
|
4. Tilde ~ replaced with /root for container backends
|
|
5. ensurepip fix in Modal image builder
|
|
6. install_pipx stays True for swerex-remote
|
|
7. /home/ added to host prefix check
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
import pytest
|
|
|
|
# Put the directory three levels above this file on sys.path so the
# project packages imported below (and inside the tests) can be resolved
# regardless of the directory pytest was launched from.
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

# Every test in this module goes through the terminal tool module; when it
# cannot be imported the whole module is skipped instead of erroring.
try:
    import tools.terminal_tool  # noqa: F401
except ImportError:
    pytest.skip("hermes-agent tools not importable (missing deps)", allow_module_level=True)
else:
    # Fetch the module object from sys.modules rather than via attribute
    # access on the ``tools`` package.
    _tt_mod = sys.modules["tools.terminal_tool"]
|
|
|
|
|
|
# =========================================================================
|
|
# Test 1: Tool resolution includes terminal + file tools
|
|
# =========================================================================
|
|
|
|
class TestToolResolution:
    """Check which tools ``get_tool_definitions`` yields for the eval toolsets."""

    @staticmethod
    def _resolved_tool_names():
        """Return the tool names produced for the terminal + file toolsets.

        The names are returned in the order the definitions were produced.
        """
        from model_tools import get_tool_definitions

        definitions = get_tool_definitions(
            enabled_toolsets=["terminal", "file"],
            quiet_mode=True,
        )
        return [definition["function"]["name"] for definition in definitions]

    def test_terminal_and_file_toolsets_resolve_all_tools(self):
        """The terminal and file toolsets together resolve to exactly six tools."""
        names = set(self._resolved_tool_names())
        expected = {"terminal", "process", "read_file", "write_file", "search_files", "patch"}
        assert expected == names, f"Expected {expected}, got {names}"

    def test_terminal_tool_present(self):
        """The ``terminal`` tool itself must be part of the resolved tools."""
        names = self._resolved_tool_names()
        assert "terminal" in names, f"terminal tool missing! Only got: {names}."
|
|
|
|
|
|
# =========================================================================
|
|
# Test 2-4: CWD handling for container backends
|
|
# =========================================================================
|
|
|
|
class TestCwdHandling:
    """Verify host paths are sanitized for container backends."""

    # Opt-in flag under which the docker backend maps the host cwd to /workspace.
    _MOUNT_FLAG = "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE"

    @staticmethod
    def _env_config(overrides, unset=()):
        """Return ``_get_env_config()`` evaluated under a temporary environment.

        Args:
            overrides: Environment variables to set for the duration of the call.
            unset: Names of environment variables that must be absent, so a
                value exported in the developer's shell or CI cannot leak
                into the test.

        ``patch.dict`` restores ``os.environ`` to its original contents on
        exit, which also undoes the removals performed here.
        """
        with patch.dict(os.environ, overrides):
            for name in unset:
                os.environ.pop(name, None)
            return _tt_mod._get_env_config()

    def test_home_path_replaced_for_modal(self):
        """TERMINAL_CWD=/home/user/... should be replaced with /root for modal."""
        config = self._env_config({
            "TERMINAL_ENV": "modal",
            "TERMINAL_CWD": "/home/dakota/github/hermes-agent",
        })
        assert config["cwd"] == "/root", (
            f"Expected /root, got {config['cwd']}. "
            "/home/ paths should be replaced for modal backend."
        )

    def test_users_path_replaced_for_docker_by_default(self):
        """Docker should keep host paths out of the sandbox unless explicitly enabled."""
        # The mount flag is removed explicitly: this test is about the
        # default, so an ambient value of the flag must not influence it.
        config = self._env_config(
            {
                "TERMINAL_ENV": "docker",
                "TERMINAL_CWD": "/Users/someone/projects",
            },
            unset=(self._MOUNT_FLAG,),
        )
        assert config["cwd"] == "/root", (
            f"Expected /root, got {config['cwd']}. "
            "Host paths should be discarded for docker backend by default."
        )
        assert config["host_cwd"] is None
        assert config["docker_mount_cwd_to_workspace"] is False

    def test_users_path_maps_to_workspace_for_docker_when_enabled(self):
        """Docker should map the host cwd into /workspace only when explicitly enabled."""
        config = self._env_config({
            "TERMINAL_ENV": "docker",
            "TERMINAL_CWD": "/Users/someone/projects",
            self._MOUNT_FLAG: "true",
        })
        assert config["cwd"] == "/workspace"
        assert config["host_cwd"] == "/Users/someone/projects"
        assert config["docker_mount_cwd_to_workspace"] is True

    def test_windows_path_replaced_for_modal(self):
        """TERMINAL_CWD=C:\\Users\\... should be replaced for modal."""
        config = self._env_config({
            "TERMINAL_ENV": "modal",
            "TERMINAL_CWD": "C:\\Users\\someone\\projects",
        })
        assert config["cwd"] == "/root"

    def test_default_cwd_is_root_for_container_backends(self):
        """Container backends should default to /root, not ~."""
        for backend in ("modal", "docker", "singularity", "daytona"):
            # No TERMINAL_CWD so the backend default is used, and no mount
            # flag so docker does not switch to /workspace.
            config = self._env_config(
                {"TERMINAL_ENV": backend},
                unset=("TERMINAL_CWD", self._MOUNT_FLAG),
            )
            assert config["cwd"] == "/root", (
                f"Backend {backend}: expected /root default, got {config['cwd']}"
            )

    def test_docker_default_cwd_maps_current_directory_when_enabled(self):
        """Docker should use /workspace when cwd mounting is explicitly enabled."""
        with patch("tools.terminal_tool.os.getcwd", return_value="/home/user/project"):
            config = self._env_config(
                {
                    "TERMINAL_ENV": "docker",
                    self._MOUNT_FLAG: "true",
                },
                unset=("TERMINAL_CWD",),
            )
        assert config["cwd"] == "/workspace"
        assert config["host_cwd"] == "/home/user/project"

    def test_local_backend_uses_getcwd(self):
        """Local backend should use os.getcwd(), not /root."""
        config = self._env_config({"TERMINAL_ENV": "local"}, unset=("TERMINAL_CWD",))
        assert config["cwd"] == os.getcwd()

    def test_create_environment_passes_docker_host_cwd_and_flag(self, monkeypatch):
        """Docker host cwd and mount flag should reach DockerEnvironment."""
        captured = {}
        sentinel = object()

        def _fake_docker_environment(**kwargs):
            # Record the keyword arguments instead of starting a container.
            captured.update(kwargs)
            return sentinel

        monkeypatch.setattr(_tt_mod, "_DockerEnvironment", _fake_docker_environment)

        env = _tt_mod._create_environment(
            env_type="docker",
            image="python:3.11",
            cwd="/workspace",
            timeout=60,
            container_config={"docker_mount_cwd_to_workspace": True},
            host_cwd="/home/user/project",
        )

        assert env is sentinel
        assert captured["cwd"] == "/workspace"
        assert captured["host_cwd"] == "/home/user/project"
        assert captured["auto_mount_cwd"] is True

    def test_ssh_preserves_home_paths(self):
        """SSH backend should NOT replace /home/ paths (they're valid remotely)."""
        config = self._env_config({
            "TERMINAL_ENV": "ssh",
            "TERMINAL_CWD": "/home/remote-user/work",
            "TERMINAL_SSH_HOST": "example.com",
            "TERMINAL_SSH_USER": "user",
        })
        assert config["cwd"] == "/home/remote-user/work", (
            "SSH backend should preserve /home/ paths"
        )
|
|
|
|
|
|
# =========================================================================
|
|
# Test 5: ephemeral_disk version check
|
|
# =========================================================================
|
|
|
|
class TestEphemeralDiskCheck:
    """Verify ephemeral_disk is only passed when modal supports it."""

    def test_ephemeral_disk_skipped_when_unsupported(self):
        """If modal.Sandbox.create doesn't have ephemeral_disk param, skip it.

        The version check is re-enacted here against a hand-built signature
        rather than exercised through the real Modal SDK.
        """
        import inspect

        # Stand-in for the signature of a Sandbox.create WITHOUT ephemeral_disk.
        keyword_only = inspect.Parameter.KEYWORD_ONLY
        legacy_signature = inspect.Signature(parameters=[
            inspect.Parameter("args", inspect.Parameter.VAR_POSITIONAL),
            inspect.Parameter("image", keyword_only),
            inspect.Parameter("timeout", keyword_only),
            inspect.Parameter("cpu", keyword_only),
            inspect.Parameter("memory", keyword_only),
        ])

        with patch.dict(os.environ, {"TERMINAL_ENV": "modal"}):
            config = _tt_mod._get_env_config()

        # Fall back to 51200 when the config carries no container_disk entry.
        disk = config.get("container_disk", 51200)
        assert disk > 0, "disk should default to > 0"

        # Simulate the version check: only forward the disk size when the
        # signature advertises an ephemeral_disk parameter.
        sandbox_kwargs = {}
        if disk > 0 and "ephemeral_disk" in legacy_signature.parameters:
            sandbox_kwargs["ephemeral_disk"] = disk

        assert "ephemeral_disk" not in sandbox_kwargs, (
            "ephemeral_disk should not be set when Sandbox.create doesn't support it"
        )
|
|
|
|
|
|
# =========================================================================
|
|
# Test 6: ModalEnvironment defaults
|
|
# =========================================================================
|
|
|
|
class TestModalEnvironmentDefaults:
    """Verify ModalEnvironment has correct defaults."""

    def test_default_cwd_is_root(self):
        """ModalEnvironment default cwd should be /root, not ~."""
        # Skip rather than error when the Modal environment module cannot be
        # imported, matching the other ModalEnvironment tests in this file.
        try:
            from tools.environments.modal import ModalEnvironment
        except ImportError:
            pytest.skip("tools.environments.modal not importable")

        import inspect

        # Read the default straight from the constructor signature so no
        # ModalEnvironment instance has to be created.
        sig = inspect.signature(ModalEnvironment.__init__)
        cwd_default = sig.parameters["cwd"].default
        assert cwd_default == "/root", (
            f"ModalEnvironment cwd default should be /root, got {cwd_default!r}. "
            "Tilde ~ is not expanded by subprocess.run(cwd=...)."
        )
|
|
|
|
|
|
# =========================================================================
|
|
# Test 7: ensurepip fix in ModalEnvironment.__init__ (patches.py is now a no-op)
|
|
# =========================================================================
|
|
|
|
class TestEnsurepipFix:
    """Check that ``ModalEnvironment.__init__`` still carries the pip fix."""

    @staticmethod
    def _modal_init_source():
        """Return the source text of ``ModalEnvironment.__init__``.

        Skips the calling test when the Modal environment module cannot be
        imported.
        """
        try:
            from tools.environments.modal import ModalEnvironment
        except ImportError:
            pytest.skip("tools.environments.modal not importable")

        import inspect

        return inspect.getsource(ModalEnvironment.__init__)

    def test_modal_environment_creates_image_with_setup_commands(self):
        """The constructor source mentions ensurepip and setup_dockerfile_commands."""
        init_source = self._modal_init_source()
        assert "ensurepip" in init_source, (
            "ModalEnvironment should include ensurepip fix "
            "for Modal's legacy image builder"
        )
        assert "setup_dockerfile_commands" in init_source, (
            "ModalEnvironment should use setup_dockerfile_commands "
            "to fix pip before Modal's bootstrap"
        )

    def test_modal_environment_uses_install_pipx(self):
        """The constructor source mentions install_pipx."""
        init_source = self._modal_init_source()
        assert "install_pipx" in init_source, (
            "ModalEnvironment should pass install_pipx to ModalDeployment"
        )
|
|
|
|
|
|
# =========================================================================
|
|
# Test 8: Host prefix list completeness
|
|
# =========================================================================
|
|
|
|
class TestHostPrefixList:
    """Check that ``_get_env_config`` knows the common host-only path prefixes."""

    def test_all_common_host_prefixes_caught(self):
        """Each common macOS, Linux and Windows host prefix appears in the source."""
        import inspect

        # This is a textual check against the function's source code; the
        # function itself is not executed here.
        config_source = inspect.getsource(_tt_mod._get_env_config)

        # The Windows backslash entry is spelled the way it would appear in
        # source text (escaped backslashes followed by a closing quote).
        host_prefixes = ("/Users/", "/home/", 'C:\\\\"', "C:/")
        for prefix in host_prefixes:
            # Try the entry without its trailing quote first, then verbatim.
            unquoted = prefix.rstrip('"')
            found = unquoted in config_source or prefix in config_source
            assert found, (
                f"Host prefix {prefix!r} not found in _get_env_config. "
                "Container backends need this to avoid using host paths."
            )
|