Add new environments and enhance tool context functionality

- Introduced new environments: Terminal Test Environment and SWE Environment, each with default configurations for testing and software engineering tasks.
- Added TerminalBench 2.0 evaluation environment with comprehensive setup for agentic LLMs, including task execution and verification.
- Enhanced ToolContext with methods for uploading and downloading files, ensuring binary-safe operations.
- Updated documentation across environments to reflect new features and usage instructions.
- Refactored existing environment configurations for consistency and clarity.
This commit is contained in:
teknium
2026-02-10 19:39:05 +00:00
parent e8343f2d87
commit 35ad3146a8
18 changed files with 1428 additions and 19 deletions

View File

@@ -31,6 +31,8 @@ from .terminal_tool import (
cleanup_vm,
cleanup_all_environments,
get_active_environments_info,
register_task_env_overrides,
clear_task_env_overrides,
TERMINAL_TOOL_DESCRIPTION
)
@@ -139,6 +141,8 @@ __all__ = [
'cleanup_vm',
'cleanup_all_environments',
'get_active_environments_info',
'register_task_env_overrides',
'clear_task_env_overrides',
'TERMINAL_TOOL_DESCRIPTION',
# Terminal tools (Hecate/MorphCloud backend)
'terminal_hecate_tool',

View File

@@ -39,19 +39,24 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
# Create environment OUTSIDE locks so we don't block other rollouts
# during slow Modal/Docker startup (~10s)
if needs_creation:
from tools.terminal_tool import _task_env_overrides
config = _get_env_config()
env_type = config["env_type"]
# Check per-task overrides (set by environments like TerminalBench2Env)
overrides = _task_env_overrides.get(task_id, {})
if env_type == "docker":
image = config["docker_image"]
image = overrides.get("docker_image") or config["docker_image"]
elif env_type == "singularity":
image = config["singularity_image"]
image = overrides.get("singularity_image") or config["singularity_image"]
elif env_type == "modal":
image = config["modal_image"]
image = overrides.get("modal_image") or config["modal_image"]
else:
image = ""
cwd = config["cwd"]
cwd = overrides.get("cwd") or config["cwd"]
_check_disk_usage_warning()
if not os.getenv("HERMES_QUIET"):
print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True)

View File

@@ -976,13 +976,37 @@ class _ModalEnvironment:
Wraps mini-swe-agent's SwerexModalEnvironment but adds:
- SUDO_PASSWORD support via _transform_sudo_command
- Automatic async-safety patches (applied once, before first use)
Note: stdin handling is not needed for Modal since it uses remote async execution.
The patches replace SwerexModalEnvironment's asyncio.run() calls with a
background thread approach, making it safe to use inside any event loop
(e.g., Atropos). Applied here at the point of use rather than relying on
import-time side effects, so ALL callers get the fix automatically.
"""
# Class-level flag: patches only need to be applied once
_patches_applied = False
def __init__(self, image: str, cwd: str = "/root", timeout: int = 60):
# Ensure async-safety patches are applied before creating any
# SwerexModalEnvironment instance. This is the single authoritative
# place -- no other module needs to call apply_patches() for Modal.
if not _ModalEnvironment._patches_applied:
try:
from environments.patches import apply_patches
apply_patches()
except ImportError:
pass # patches module not available (standalone use)
_ModalEnvironment._patches_applied = True
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
self._inner = SwerexModalEnvironment(image=image, cwd=cwd, timeout=timeout)
# Generous startup timeout: sandbox creation can take 30-60s for cold images,
# and the SWE-ReX runtime needs another 10-30s to boot inside it.
self._inner = SwerexModalEnvironment(
image=image, cwd=cwd, timeout=timeout,
startup_timeout=180.0,
runtime_timeout=3600.0,
)
self.cwd = cwd
self.timeout = timeout
@@ -1033,7 +1057,7 @@ TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux environment.
- Run servers/long processes in background
- Monitor disk usage for large tasks
- Install whatever tools you need with apt-get or pip
- Do not be afraid to run pip with --break-system-packages
- Try to create or use a venv with uv or python -m venv to keep isolation from global system packages.
**Things to avoid:**
- Do NOT use interactive tools such as tmux, vim, nano, python repl - you will get stuck.
@@ -1432,7 +1456,9 @@ def terminal_tool(
env = _active_environments[effective_task_id]
if needs_creation:
_check_disk_usage_warning()
# Disk usage warning only relevant for local/singularity backends
if env_type in ("singularity", "local"):
_check_disk_usage_warning()
if not os.getenv("HERMES_QUIET"):
print(f"[Terminal] Creating new {env_type} environment for task {effective_task_id[:8]}...", flush=True)
try: