From 35ad3146a8ab4b64c19dfe1b5b41b192c8d2b60a Mon Sep 17 00:00:00 2001 From: teknium Date: Tue, 10 Feb 2026 19:39:05 +0000 Subject: [PATCH] Add new environments and enhance tool context functionality - Introduced new environments: Terminal Test Environment and SWE Environment, each with default configurations for testing and software engineering tasks. - Added TerminalBench 2.0 evaluation environment with comprehensive setup for agentic LLMs, including task execution and verification. - Enhanced ToolContext with methods for uploading and downloading files, ensuring binary-safe operations. - Updated documentation across environments to reflect new features and usage instructions. - Refactored existing environment configurations for consistency and clarity. --- environments/README.md | 330 ++++++++ environments/__init__.py | 9 +- environments/benchmarks/__init__.py | 0 .../benchmarks/terminalbench_2/__init__.py | 0 .../benchmarks/terminalbench_2/default.yaml | 41 + .../benchmarks/terminalbench_2/run_eval.sh | 32 + .../terminalbench_2/terminalbench2_env.py | 730 ++++++++++++++++++ environments/hermes_swe_env/__init__.py | 0 .../default.yaml} | 3 +- .../{ => hermes_swe_env}/hermes_swe_env.py | 2 +- environments/terminal_test_env/__init__.py | 0 .../default.yaml} | 5 +- .../terminal_test_env.py | 2 +- environments/tool_context.py | 178 ++++- evals/terminal-bench-2/evaluate_config.yaml | 64 ++ tools/__init__.py | 4 + tools/file_tools.py | 13 +- tools/terminal_tool.py | 34 +- 18 files changed, 1428 insertions(+), 19 deletions(-) create mode 100644 environments/README.md create mode 100644 environments/benchmarks/__init__.py create mode 100644 environments/benchmarks/terminalbench_2/__init__.py create mode 100644 environments/benchmarks/terminalbench_2/default.yaml create mode 100755 environments/benchmarks/terminalbench_2/run_eval.sh create mode 100644 environments/benchmarks/terminalbench_2/terminalbench2_env.py create mode 100644 environments/hermes_swe_env/__init__.py 
rename environments/{configs/swe_default.yaml => hermes_swe_env/default.yaml} (88%) rename environments/{ => hermes_swe_env}/hermes_swe_env.py (99%) create mode 100644 environments/terminal_test_env/__init__.py rename environments/{configs/terminal_test_default.yaml => terminal_test_env/default.yaml} (84%) rename environments/{ => terminal_test_env}/terminal_test_env.py (99%) create mode 100644 evals/terminal-bench-2/evaluate_config.yaml diff --git a/environments/README.md b/environments/README.md new file mode 100644 index 00000000..e14a0b9e --- /dev/null +++ b/environments/README.md @@ -0,0 +1,330 @@ +# Hermes-Agent Atropos Environments + +This directory contains the integration layer between **hermes-agent's** tool-calling capabilities and the **Atropos** RL training framework. It provides everything needed to run agentic LLMs through multi-turn tool-calling loops, score their output with arbitrary reward functions, and feed results into Atropos for training or evaluation. + +## Architecture Overview + +``` + Atropos Framework + ┌───────────────────────┐ + │ BaseEnv │ (atroposlib) + │ - Server management │ + │ - Worker scheduling │ + │ - Wandb logging │ + │ - CLI (serve/process/ │ + │ evaluate) │ + └───────────┬───────────┘ + │ inherits + ┌───────────┴───────────┐ + │ HermesAgentBaseEnv │ hermes_base_env.py + │ - Terminal backend │ + │ - Tool resolution │ + │ - Agent loop │ + │ - ToolContext │ + │ - Async patches │ + └───────────┬───────────┘ + │ inherits + ┌─────────────────┼─────────────────┐ + │ │ │ + TerminalTestEnv HermesSweEnv TerminalBench2EvalEnv + (stack testing) (SWE training) (TB2 benchmark eval) +``` + +### Inheritance Chain + +**BaseEnv** (from `atroposlib`) is the Atropos base class. 
It provides: +- Server management (OpenAI-compatible API servers, VLLM, SGLang) +- Worker scheduling for parallel rollouts +- Wandb integration for metrics and rollout logging +- CLI interface with three subcommands: `serve`, `process`, `evaluate` +- `evaluate_log()` for saving eval results to JSON + samples.jsonl + +**HermesAgentBaseEnv** (`hermes_base_env.py`) extends BaseEnv with hermes-agent specifics: +- Sets `os.environ["TERMINAL_ENV"]` to configure the terminal backend (local, docker, modal, ssh, singularity) +- Resolves hermes-agent toolsets via `_resolve_tools_for_group()` (calls `get_tool_definitions()` from `model_tools.py`) +- Implements `collect_trajectory()` which runs the full agent loop and computes rewards +- Supports two-phase operation (Phase 1: OpenAI server, Phase 2: VLLM ManagedServer) +- Applies monkey patches for async-safe tool operation at import time + +Concrete environments inherit from `HermesAgentBaseEnv` and implement: +- `setup()` -- Load dataset, initialize state +- `get_next_item()` -- Return the next item for rollout +- `format_prompt()` -- Convert a dataset item into the user message +- `compute_reward()` -- Score the rollout using ToolContext +- `evaluate()` -- Periodic evaluation logic + +## Core Components + +### Agent Loop (`agent_loop.py`) + +`HermesAgentLoop` is the reusable multi-turn agent engine. It runs the same pattern as hermes-agent's `run_agent.py`: + +1. Send messages + tools to the API via `server.chat_completion()` +2. If the response contains `tool_calls`, execute each one via `handle_function_call()` from `model_tools.py` +3. Append tool results to the conversation and go back to step 1 +4. If the response has no tool_calls, the agent is done + +Tool calls are executed in a thread pool (`run_in_executor`) so backends that use `asyncio.run()` internally (Modal, Docker) don't deadlock inside Atropos's event loop. 
+ +Returns an `AgentResult` containing the full conversation history, turn count, reasoning content per turn, tool errors, and optional ManagedServer state (for Phase 2). + +### Tool Context (`tool_context.py`) + +`ToolContext` is a per-rollout handle that gives reward/verification functions direct access to **all** hermes-agent tools, scoped to the rollout's `task_id`. The same `task_id` means the terminal/browser session is the SAME one the model used during its rollout -- all state (files, processes, browser tabs) is preserved. + +```python +async def compute_reward(self, item, result, ctx: ToolContext): + # Run tests in the model's terminal sandbox + test = ctx.terminal("pytest -v") + if test["exit_code"] == 0: + return 1.0 + + # Check if a file was created + content = ctx.read_file("/workspace/solution.py") + if content.get("content"): + return 0.5 + + # Download files locally for verification (binary-safe) + ctx.download_file("/remote/output.bin", "/local/output.bin") + + return 0.0 +``` + +Available methods: +- **Terminal**: `terminal(command, timeout)` -- run shell commands +- **Files**: `read_file(path)`, `write_file(path, content)`, `search(query, path)` +- **Transfers**: `upload_file()`, `upload_dir()`, `download_file()`, `download_dir()` -- binary-safe file transfers between host and sandbox +- **Web**: `web_search(query)`, `web_extract(urls)` +- **Browser**: `browser_navigate(url)`, `browser_snapshot()` +- **Generic**: `call_tool(name, args)` -- call any hermes-agent tool by name +- **Cleanup**: `cleanup()` -- release all resources (called automatically after `compute_reward`) + +### Patches (`patches.py`) + +**Problem**: Some hermes-agent tools use `asyncio.run()` internally (e.g., mini-swe-agent's Modal backend via SWE-ReX). This crashes when called from inside Atropos's event loop because `asyncio.run()` cannot be nested. 
+ +**Solution**: `patches.py` monkey-patches `SwerexModalEnvironment` to use a dedicated background thread (`_AsyncWorker`) with its own event loop. The calling code sees the same sync interface, but internally the async work happens on a separate thread that doesn't conflict with Atropos's loop. + +What gets patched: +- `SwerexModalEnvironment.__init__` -- creates Modal deployment on a background thread +- `SwerexModalEnvironment.execute` -- runs commands on the same background thread +- `SwerexModalEnvironment.stop` -- stops deployment on the background thread + +The patches are: +- **Idempotent** -- calling `apply_patches()` multiple times is safe +- **Transparent** -- same interface and behavior, only the internal async execution changes +- **Universal** -- works identically in normal CLI use (no running event loop) + +Applied automatically at import time by `hermes_base_env.py`. + +### Tool Call Parsers (`tool_call_parsers/`) + +Client-side parsers that extract structured `tool_calls` from raw model output text. Used in **Phase 2** (VLLM server type) where ManagedServer's `/generate` endpoint returns raw text without tool call parsing. + +Each parser is a standalone reimplementation of the corresponding VLLM parser's `extract_tool_calls()` logic. No VLLM dependency -- only standard library (`re`, `json`, `uuid`) and `openai` types. 
+ +Available parsers: +- `hermes` -- Hermes/ChatML `<tool_call>` XML format +- `mistral` -- Mistral `[TOOL_CALLS]` format +- `llama3_json` -- Llama 3 JSON tool calling +- `qwen` -- Qwen tool calling format +- `qwen3_coder` -- Qwen3 Coder format +- `deepseek_v3` -- DeepSeek V3 format +- `deepseek_v3_1` -- DeepSeek V3.1 format +- `kimi_k2` -- Kimi K2 format +- `longcat` -- Longcat format +- `glm45` / `glm47` -- GLM model formats + +Usage: +```python +from environments.tool_call_parsers import get_parser + +parser = get_parser("hermes") +content, tool_calls = parser.parse(raw_model_output) +``` + +In Phase 1 (OpenAI server type), these parsers are not needed -- the server handles tool call parsing natively. + +## Two-Phase Operation + +### Phase 1: OpenAI Server (Evaluation / SFT Data Generation) + +Uses `server.chat_completion()` with `tools=` parameter. The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing natively. Returns `ChatCompletion` objects with structured `tool_calls`. + +- Good for: evaluation, SFT data generation, testing +- Run with: `serve` (with `run-api`), `process`, or `evaluate` subcommands +- Placeholder tokens are created for the Atropos pipeline + +### Phase 2: VLLM ManagedServer (Full RL Training) + +Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser (from `tool_call_parsers/`) reconstructs structured `tool_calls` from raw output.
+ +- Good for: full RL training with GRPO/PPO +- Run with: `serve` subcommand +- Real tokens, masks, and logprobs flow through the pipeline + +## Directory Structure + +``` +environments/ +├── README.md # This file +├── __init__.py # Package exports +├── hermes_base_env.py # Abstract base (HermesAgentBaseEnv) +├── agent_loop.py # Multi-turn agent engine (HermesAgentLoop) +├── tool_context.py # Per-rollout tool access for reward functions +├── patches.py # Async-safety patches for Modal backend +│ +├── tool_call_parsers/ # Phase 2 client-side parsers +│ ├── __init__.py # Registry + base class +│ ├── hermes_parser.py +│ ├── mistral_parser.py +│ ├── llama_parser.py +│ ├── qwen_parser.py +│ ├── qwen3_coder_parser.py +│ ├── deepseek_v3_parser.py +│ ├── deepseek_v3_1_parser.py +│ ├── kimi_k2_parser.py +│ ├── longcat_parser.py +│ ├── glm45_parser.py +│ └── glm47_parser.py +│ +├── terminal_test_env/ # Stack validation environment +│ └── terminal_test_env.py +│ +├── hermes_swe_env/ # SWE-bench style training environment +│ └── hermes_swe_env.py +│ +└── benchmarks/ # Evaluation benchmarks + └── terminalbench_2/ + └── terminalbench2_env.py +``` + +## Concrete Environments + +### TerminalTestEnv (`terminal_test_env/`) + +A self-contained environment with inline tasks (no external dataset needed) for validating the full stack end-to-end. Each task asks the model to create a file at a known path, and the verifier checks the content matches. + +```bash +# Serve mode (needs run-api) +run-api +python environments/terminal_test_env/terminal_test_env.py serve + +# Process mode (no run-api, saves to JSONL) +python environments/terminal_test_env/terminal_test_env.py process \ + --env.data_path_to_save_groups terminal_test_output.jsonl +``` + +### HermesSweEnv (`hermes_swe_env/`) + +SWE-bench style training environment. The model gets a coding task, uses terminal + file + web tools to solve it, and the reward function runs tests in the same Modal sandbox. 
+ +```bash +python environments/hermes_swe_env/hermes_swe_env.py serve \ + --openai.model_name YourModel \ + --env.dataset_name bigcode/humanevalpack \ + --env.terminal_backend modal +``` + +### TerminalBench2EvalEnv (`benchmarks/terminalbench_2/`) + +**Eval-only** environment for the Terminal-Bench 2.0 benchmark (89 tasks). Each task gets a pre-built Docker Hub image, a natural language instruction, and a test suite. The agent uses terminal + file tools to solve the task, then the test suite verifies correctness. + +Follows the standard Atropos eval pattern (like GPQA, MMLU, etc.): +- Run via `evaluate` subcommand (no `run-api` needed) +- `setup()` loads the dataset, `evaluate()` runs all tasks +- `rollout_and_score_eval()` handles per-task agent loop + test verification +- Downloads verifier output locally for reliable reward checking (Harbor pattern) + +```bash +# Run full benchmark +python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ + --openai.model_name anthropic/claude-opus-4.6 + +# Run subset of tasks +python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ + --openai.model_name anthropic/claude-opus-4.6 \ + --env.task_filter fix-git,git-multibranch + +# Skip specific tasks +python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ + --openai.model_name anthropic/claude-opus-4.6 \ + --env.skip_tasks heavy-task,slow-task +``` + +## Creating a New Environment + +### Training Environment + +1. Create a new directory under `environments/` +2. Create your env file inheriting from `HermesAgentBaseEnv` +3. 
Implement the four abstract methods + `evaluate()` + +```python +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig + +class MyEnvConfig(HermesAgentEnvConfig): + pass # Add custom fields as needed + +class MyEnv(HermesAgentBaseEnv): + name = "my-env" + env_config_cls = MyEnvConfig + + @classmethod + def config_init(cls): + env_config = MyEnvConfig( + enabled_toolsets=["terminal", "file"], + terminal_backend="modal", + # ... other config + ) + server_configs = [APIServerConfig(...)] + return env_config, server_configs + + async def setup(self): + self.dataset = load_dataset(...) + self.iter = 0 + + async def get_next_item(self): + item = self.dataset[self.iter % len(self.dataset)] + self.iter += 1 + return item + + def format_prompt(self, item): + return item["instruction"] + + async def compute_reward(self, item, result, ctx): + # ctx gives you full tool access to the rollout's sandbox + test = ctx.terminal("pytest -v") + return 1.0 if test["exit_code"] == 0 else 0.0 + + async def evaluate(self, *args, **kwargs): + # Periodic evaluation logic + ... + +if __name__ == "__main__": + MyEnv.cli() +``` + +### Eval-Only Environment (Benchmark) + +For eval benchmarks, follow the pattern in `terminalbench2_env.py`: +1. Create under `environments/benchmarks/your-benchmark/` +2. Inherit from `HermesAgentBaseEnv` +3. Set eval-only config: `eval_handling=STOP_TRAIN`, `steps_per_eval=1`, `total_steps=1` +4. Stub the training methods (`collect_trajectories`, `score`) +5. Implement `rollout_and_score_eval()` and `evaluate()` +6. 
Run with `evaluate` subcommand + +## Key Config Fields + +| Field | Description | Default | +|-------|-------------|---------| +| `enabled_toolsets` | Which hermes toolsets to enable | `None` (all) | +| `disabled_toolsets` | Toolsets to disable | `None` | +| `distribution` | Probabilistic toolset distribution name | `None` | +| `max_agent_turns` | Max LLM calls per rollout | `30` | +| `agent_temperature` | Sampling temperature | `1.0` | +| `terminal_backend` | `local`, `docker`, `modal`, `ssh`, `singularity` | `local` | +| `system_prompt` | System message for the agent | `None` | +| `tool_call_parser` | Parser name for Phase 2 | `hermes` | +| `eval_handling` | `STOP_TRAIN`, `LIMIT_TRAIN`, `NONE` | `STOP_TRAIN` | diff --git a/environments/__init__.py b/environments/__init__.py index 5f2fb6c7..f0c959ca 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -4,15 +4,18 @@ Hermes-Agent Atropos Environments Provides a layered integration between hermes-agent's tool-calling capabilities and the Atropos RL training framework. 
-Layers: +Core layers: - agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling - tool_context: Per-rollout tool access handle for reward/verification functions - hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos - tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate) Concrete environments: - - terminal_test_env: Simple file-creation tasks for testing the stack - - hermes_swe_env: SWE-bench style tasks with Modal sandboxes + - terminal_test_env/: Simple file-creation tasks for testing the stack + - hermes_swe_env/: SWE-bench style tasks with Modal sandboxes + +Benchmarks (eval-only): + - benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation """ from environments.agent_loop import AgentResult, HermesAgentLoop diff --git a/environments/benchmarks/__init__.py b/environments/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/environments/benchmarks/terminalbench_2/__init__.py b/environments/benchmarks/terminalbench_2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml new file mode 100644 index 00000000..e6b3014c --- /dev/null +++ b/environments/benchmarks/terminalbench_2/default.yaml @@ -0,0 +1,41 @@ +# Terminal-Bench 2.0 Evaluation -- Default Configuration +# +# Eval-only environment for the TB2 benchmark (89 terminal tasks). +# Uses Modal terminal backend for per-task cloud-isolated sandboxes +# and OpenRouter for inference. 
+# +# Usage: +# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ +# --config environments/benchmarks/terminalbench_2/default.yaml +# +# # Override model: +# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ +# --config environments/benchmarks/terminalbench_2/default.yaml \ +# --openai.model_name anthropic/claude-sonnet-4 + +env: + enabled_toolsets: ["terminal", "file"] + max_agent_turns: 60 + max_token_length: 16000 + agent_temperature: 0.6 + terminal_backend: "modal" + dataset_name: "NousResearch/terminal-bench-2" + test_timeout: 180 + tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B" + use_wandb: true + wandb_name: "terminal-bench-2" + ensure_scores_are_not_same: false + data_dir_to_save_evals: "evals/terminal-bench-2" + system_prompt: > + You are a skilled software engineer and system administrator with + access to a terminal and file tools. You are working inside a Linux + container environment. Complete the user's task by using the available + tools. Be methodical: explore the environment first, plan your approach, + then execute step by step. Verify your work before finishing. 
+ +openai: + base_url: "https://openrouter.ai/api/v1" + model_name: "anthropic/claude-opus-4.6" + server_type: "openai" + health_check: false + # api_key loaded from OPENROUTER_API_KEY in .env diff --git a/environments/benchmarks/terminalbench_2/run_eval.sh b/environments/benchmarks/terminalbench_2/run_eval.sh new file mode 100755 index 00000000..d4f1dcd6 --- /dev/null +++ b/environments/benchmarks/terminalbench_2/run_eval.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Terminal-Bench 2.0 Evaluation +# +# Run from repo root: +# bash environments/benchmarks/terminalbench_2/run_eval.sh +# +# Override model: +# bash environments/benchmarks/terminalbench_2/run_eval.sh \ +# --openai.model_name anthropic/claude-sonnet-4 +# +# Run a subset: +# bash environments/benchmarks/terminalbench_2/run_eval.sh \ +# --env.task_filter fix-git,git-multibranch + +mkdir -p logs evals/terminal-bench-2 +LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log" + +echo "Terminal-Bench 2.0 Evaluation" +echo "Log: $LOG_FILE" +echo "" + +export TERMINAL_ENV=modal +export TERMINAL_TIMEOUT=300 + +python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \ + --config environments/benchmarks/terminalbench_2/default.yaml \ + "$@" \ + 2>&1 | tee "$LOG_FILE" + +echo "" +echo "Log saved to: $LOG_FILE" diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py new file mode 100644 index 00000000..916fdad9 --- /dev/null +++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py @@ -0,0 +1,730 @@ +""" +TerminalBench2Env -- Terminal-Bench 2.0 Evaluation Environment + +Evaluates agentic LLMs on challenging terminal tasks from Terminal-Bench 2.0. +Each task provides a unique Docker environment (pre-built on Docker Hub), a natural +language instruction, and a test suite for verification. The agent uses terminal + +file tools to complete the task, then the test suite runs inside the same sandbox. 
+ +This is an eval-only environment (not a training environment). It is designed to +be run via the `evaluate` subcommand: + + python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \\ + --env.dataset_name NousResearch/terminal-bench-2 + +The evaluate flow: + 1. setup() -- Loads the TB2 dataset from HuggingFace + 2. evaluate() -- Iterates over all tasks, running each through: + a. rollout_and_score_eval() -- Per-task agent loop + test verification + - Resolves Docker image (pre-built Hub image or Dockerfile fallback) + - Registers per-task Modal sandbox via register_task_env_overrides() + - Runs the HermesAgentLoop (terminal + file tools) + - Uploads test suite and runs test.sh in the same sandbox + - Returns binary pass/fail result + b. Aggregates per-task, per-category, and overall pass rates + c. Logs results via evaluate_log() and wandb + +Key features: + - Per-task Modal sandboxes using pre-built Docker Hub images + - Binary reward: 1.0 if all tests pass, 0.0 otherwise + - Concurrency-controlled parallel evaluation via asyncio.Semaphore + - Per-task, per-category, and aggregate pass rate tracking +""" + +import asyncio +import base64 +import io +import json +import logging +import os +import shutil +import sys +import tarfile +import tempfile +import time +import uuid +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +# Ensure repo root is on sys.path for imports +_repo_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from pydantic import Field + +from atroposlib.envs.base import EvalHandlingEnum +from atroposlib.envs.server_handling.server_manager import APIServerConfig + +from environments.agent_loop import AgentResult, HermesAgentLoop +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig +from environments.tool_context import ToolContext +from tools.terminal_tool import (
+ register_task_env_overrides, + clear_task_env_overrides, + cleanup_vm, +) + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Configuration +# ============================================================================= + +class TerminalBench2EvalConfig(HermesAgentEnvConfig): + """ + Configuration for the Terminal-Bench 2.0 evaluation environment. + + Extends HermesAgentEnvConfig with TB2-specific settings for dataset loading, + test execution, task filtering, and eval concurrency. + """ + + # --- Dataset --- + dataset_name: str = Field( + default="NousResearch/terminal-bench-2", + description="HuggingFace dataset containing TB2 tasks.", + ) + + # --- Test execution --- + test_timeout: int = Field( + default=180, + description="Timeout in seconds for running the test suite after agent completes.", + ) + + # --- Image strategy --- + force_build: bool = Field( + default=False, + description="If True, always build from Dockerfile (ignore docker_image). " + "Useful for testing custom Dockerfiles.", + ) + + # --- Task filtering (comma-separated from CLI) --- + task_filter: Optional[str] = Field( + default=None, + description="Comma-separated task names to run (e.g., 'fix-git,broken-pipe'). 
" + "If not set, all tasks are run.", + ) + skip_tasks: Optional[str] = Field( + default=None, + description="Comma-separated task names to skip (e.g., 'heavy-task,slow-task').", + ) + + + +# ============================================================================= +# Tar extraction helper +# ============================================================================= + +def _extract_base64_tar(b64_data: str, target_dir: Path): + """Extract a base64-encoded tar.gz archive into target_dir.""" + if not b64_data: + return + raw = base64.b64decode(b64_data) + buf = io.BytesIO(raw) + with tarfile.open(fileobj=buf, mode="r:gz") as tar: + tar.extractall(path=str(target_dir)) + + +# ============================================================================= +# Main Environment +# ============================================================================= + +class TerminalBench2EvalEnv(HermesAgentBaseEnv): + """ + Terminal-Bench 2.0 evaluation environment (eval-only, no training). + + Inherits from HermesAgentBaseEnv for: + - Terminal backend setup (os.environ["TERMINAL_ENV"]) + - Tool resolution via _resolve_tools_for_group() + - Monkey patches for async-safe tool operation + - Wandb trajectory formatting + + The evaluate flow (triggered by `environment.py evaluate`): + 1. setup() -- Load dataset from HuggingFace + 2. evaluate() -- Run all tasks through rollout_and_score_eval() + + Each task in rollout_and_score_eval(): + 1. Resolve Docker image (pre-built Hub image or Dockerfile fallback) + 2. Register per-task Modal sandbox override + 3. Run HermesAgentLoop with terminal + file tools + 4. Upload test suite and execute test.sh in the same sandbox + 5. Check /logs/verifier/reward.txt for pass/fail + 6. 
Clean up sandbox, overrides, and temp files + """ + + name = "terminal-bench-2" + env_config_cls = TerminalBench2EvalConfig + + @classmethod + def config_init(cls) -> Tuple[TerminalBench2EvalConfig, List[APIServerConfig]]: + """ + Default configuration for Terminal-Bench 2.0 evaluation. + + Uses eval-only settings: + - eval_handling=STOP_TRAIN so the eval flow runs cleanly + - steps_per_eval=1, total_steps=1 so eval triggers immediately + - group_size=1 (one rollout per group, each task is expensive) + + Uses Modal terminal backend (cloud-isolated sandbox per task) and + OpenRouter with Claude for inference. + """ + env_config = TerminalBench2EvalConfig( + # Terminal + file tools only (the agent interacts via shell commands) + enabled_toolsets=["terminal", "file"], + disabled_toolsets=None, + distribution=None, + + # Agent settings -- TB2 tasks are complex, need many turns + max_agent_turns=60, + max_token_length=16000, + agent_temperature=0.6, + system_prompt=( + "You are a skilled software engineer and system administrator with " + "access to a terminal and file tools. You are working inside a Linux " + "container environment. Complete the user's task by using the available " + "tools. Be methodical: explore the environment first, plan your approach, " + "then execute step by step. Verify your work before finishing." 
+ ), + + # Modal backend for per-task cloud-isolated sandboxes + terminal_backend="modal", + + # Test execution timeout (TB2 test scripts can install deps like pytest) + test_timeout=180, + + # --- Eval-only Atropos settings --- + # These settings make the env work as an eval-only environment: + # - STOP_TRAIN: pauses training during eval (standard for eval envs) + # - steps_per_eval=1, total_steps=1: eval triggers immediately + # - group_size=1: one rollout per group (each task is expensive) + eval_handling=EvalHandlingEnum.STOP_TRAIN, + group_size=1, + steps_per_eval=1, + total_steps=1, + + tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B", + use_wandb=True, + wandb_name="terminal-bench-2", + ensure_scores_are_not_same=False, # Binary rewards may all be 0 or 1 + ) + + # OpenRouter with Claude -- API key loaded from .env + server_configs = [ + APIServerConfig( + base_url="https://openrouter.ai/api/v1", + model_name="anthropic/claude-sonnet-4", + server_type="openai", + api_key=os.getenv("OPENROUTER_API_KEY", ""), + health_check=False, + ) + ] + + return env_config, server_configs + + # ========================================================================= + # Setup -- load dataset + # ========================================================================= + + async def setup(self): + """Load the Terminal-Bench 2.0 dataset from HuggingFace.""" + from datasets import load_dataset + + print(f"Loading TB2 dataset from: {self.config.dataset_name}") + ds = load_dataset(self.config.dataset_name, split="train") + + # Apply task filters (comma-separated strings from CLI) + tasks = list(ds) + if self.config.task_filter: + allowed = {name.strip() for name in self.config.task_filter.split(",")} + tasks = [t for t in tasks if t["task_name"] in allowed] + print(f" Filtered to {len(tasks)} tasks: {sorted(allowed)}") + if self.config.skip_tasks: + skip = {name.strip() for name in self.config.skip_tasks.split(",")} + tasks = [t for t in tasks if t["task_name"] not in skip] 
+ print(f" After skip_tasks: {len(tasks)} tasks (skipped: {sorted(skip)})") + + self.all_eval_items = tasks + self.iter = 0 + + # Build category index for per-category metrics + self.category_index: Dict[str, List[int]] = defaultdict(list) + for i, task in enumerate(self.all_eval_items): + self.category_index[task.get("category", "unknown")].append(i) + + # Reward tracking for wandb logging + self.eval_metrics: List[Tuple[str, float]] = [] + + print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories") + for cat, indices in sorted(self.category_index.items()): + print(f" {cat}: {len(indices)} tasks") + + # ========================================================================= + # Training pipeline stubs -- NOT used in eval-only mode + # ========================================================================= + # These satisfy the abstract method requirements from HermesAgentBaseEnv. + # The evaluate subcommand calls setup() -> evaluate() directly, bypassing + # the training pipeline entirely. 
+ + async def get_next_item(self): + """Return next item (stub -- not used in eval-only mode).""" + item = self.all_eval_items[self.iter % len(self.all_eval_items)] + self.iter += 1 + return item + + def format_prompt(self, item: Dict[str, Any]) -> str: + """Return the task's instruction as the user prompt.""" + return item["instruction"] + + async def compute_reward(self, item, result, ctx) -> float: + """Compute reward (stub -- actual verification is in rollout_and_score_eval).""" + return 0.0 + + async def collect_trajectories(self, item): + """Collect trajectories (stub -- not used in eval-only mode).""" + return None, [] + + async def score(self, rollout_group_data): + """Score rollouts (stub -- not used in eval-only mode).""" + return None + + # ========================================================================= + # Docker image resolution + # ========================================================================= + + def _resolve_task_image( + self, item: Dict[str, Any], task_name: str + ) -> Tuple[str, Optional[Path]]: + """ + Resolve the Docker image for a task, with fallback to Dockerfile. + + Strategy (mirrors Harbor's approach): + 1. If force_build=True, always build from Dockerfile in environment_tar + 2. If docker_image is available, use the pre-built Docker Hub image (fast) + 3. Otherwise, extract Dockerfile from environment_tar and build (slow) + + Returns: + (modal_image, temp_dir) -- modal_image is a Docker Hub name or a + Dockerfile path. temp_dir is set if we extracted files that need + cleanup later. 
+ """ + docker_image = item.get("docker_image", "") + environment_tar = item.get("environment_tar", "") + + # Fast path: use pre-built Docker Hub image + if docker_image and not self.config.force_build: + logger.info("Task %s: using pre-built image %s", task_name, docker_image) + return docker_image, None + + # Slow path: extract Dockerfile from environment_tar and build + if environment_tar: + task_dir = Path(tempfile.mkdtemp(prefix=f"tb2-{task_name}-")) + _extract_base64_tar(environment_tar, task_dir) + dockerfile_path = task_dir / "Dockerfile" + if dockerfile_path.exists(): + logger.info( + "Task %s: building from Dockerfile (force_build=%s, docker_image=%s)", + task_name, self.config.force_build, bool(docker_image), + ) + return str(dockerfile_path), task_dir + + # Neither available -- fall back to Hub image if force_build was True + if docker_image: + logger.warning( + "Task %s: force_build=True but no environment_tar, " + "falling back to docker_image %s", task_name, docker_image, + ) + return docker_image, None + + return "", None + + # ========================================================================= + # Per-task evaluation -- agent loop + test verification + # ========================================================================= + + async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict: + """ + Evaluate a single TB2 task: run the agent loop, then verify with tests. + + This is the core evaluation method. For each task it: + 1. Resolves the Docker image and registers the Modal sandbox override + 2. Runs HermesAgentLoop with terminal + file tools + 3. Uploads the test suite into the sandbox + 4. Executes test.sh and checks the result + 5. 
Cleans up the sandbox and temp files + + Args: + eval_item: A single TB2 task dict from the dataset + + Returns: + Dict with 'passed' (bool), 'reward' (float), 'task_name' (str), + 'category' (str), and optional debug info + """ + task_name = eval_item.get("task_name", "unknown") + category = eval_item.get("category", "unknown") + task_id = str(uuid.uuid4()) + task_dir = None # Set if we extract a Dockerfile (needs cleanup) + + try: + # --- 1. Resolve Docker image --- + modal_image, task_dir = self._resolve_task_image(eval_item, task_name) + if not modal_image: + logger.error("Task %s: no docker_image or environment_tar, skipping", task_name) + return { + "passed": False, "reward": 0.0, + "task_name": task_name, "category": category, + "error": "no_image", + } + + # --- 2. Register per-task Modal image override --- + register_task_env_overrides(task_id, {"modal_image": modal_image}) + logger.info( + "Task %s: registered image override for task_id %s", + task_name, task_id[:8], + ) + + # --- 3. Resolve tools and build messages --- + tools, valid_names = self._resolve_tools_for_group() + + messages: List[Dict[str, Any]] = [] + if self.config.system_prompt: + messages.append({"role": "system", "content": self.config.system_prompt}) + messages.append({"role": "user", "content": self.format_prompt(eval_item)}) + + # --- 4. Run agent loop --- + agent = HermesAgentLoop( + server=self.server, + tool_schemas=tools, + valid_tool_names=valid_names, + max_turns=self.config.max_agent_turns, + task_id=task_id, + temperature=self.config.agent_temperature, + max_tokens=self.config.max_token_length, + ) + result = await agent.run(messages) + + # --- 5. 
Verify -- run test suite in the agent's sandbox --- + # Skip verification if the agent produced no meaningful output + only_system_and_user = all( + msg.get("role") in ("system", "user") for msg in result.messages + ) + if result.turns_used == 0 or only_system_and_user: + logger.warning( + "Task %s: agent produced no output (turns=%d). Reward=0.", + task_name, result.turns_used, + ) + reward = 0.0 + else: + ctx = ToolContext(task_id) + try: + reward = self._run_tests(eval_item, ctx, task_name) + except Exception as e: + logger.error("Task %s: test verification failed: %s", task_name, e) + reward = 0.0 + finally: + ctx.cleanup() + + passed = reward == 1.0 + status = "PASS" if passed else "FAIL" + print(f" [{status}] {task_name} (turns={result.turns_used})") + logger.info( + "Task %s: reward=%.1f, turns=%d, finished=%s", + task_name, reward, result.turns_used, result.finished_naturally, + ) + + return { + "passed": passed, + "reward": reward, + "task_name": task_name, + "category": category, + "turns_used": result.turns_used, + "finished_naturally": result.finished_naturally, + } + + except Exception as e: + logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True) + print(f" [ERROR] {task_name}: {e}") + return { + "passed": False, "reward": 0.0, + "task_name": task_name, "category": category, + "error": str(e), + } + + finally: + # --- Cleanup: clear overrides, sandbox, and temp files --- + clear_task_env_overrides(task_id) + try: + cleanup_vm(task_id) + except Exception as e: + logger.debug("VM cleanup for %s: %s", task_id[:8], e) + if task_dir and task_dir.exists(): + shutil.rmtree(task_dir, ignore_errors=True) + + def _run_tests( + self, item: Dict[str, Any], ctx: ToolContext, task_name: str + ) -> float: + """ + Upload and execute the test suite in the agent's sandbox, then + download the verifier output locally to read the reward. + + Follows Harbor's verification pattern: + 1. Upload tests/ directory into the sandbox + 2. 
Execute test.sh inside the sandbox + 3. Download /logs/verifier/ directory to a local temp dir + 4. Read reward.txt locally with native Python I/O + + Downloading locally avoids issues with the file_read tool on + the Modal VM and matches how Harbor handles verification. + + TB2 test scripts (test.sh) typically: + 1. Install pytest via uv/pip + 2. Run pytest against the test files in /tests/ + 3. Write results to /logs/verifier/reward.txt + + Args: + item: The TB2 task dict (contains tests_tar, test_sh) + ctx: ToolContext scoped to this task's sandbox + task_name: For logging + + Returns: + 1.0 if tests pass, 0.0 otherwise + """ + tests_tar = item.get("tests_tar", "") + test_sh = item.get("test_sh", "") + + if not test_sh: + logger.warning("Task %s: no test_sh content, reward=0", task_name) + return 0.0 + + # Create required directories in the sandbox + ctx.terminal("mkdir -p /tests /logs/verifier") + + # Upload test files into the sandbox (binary-safe via base64) + if tests_tar: + tests_temp = Path(tempfile.mkdtemp(prefix=f"tb2-tests-{task_name}-")) + try: + _extract_base64_tar(tests_tar, tests_temp) + ctx.upload_dir(str(tests_temp), "/tests") + except Exception as e: + logger.warning("Task %s: failed to upload test files: %s", task_name, e) + finally: + shutil.rmtree(tests_temp, ignore_errors=True) + + # Write the test runner script (test.sh) + ctx.write_file("/tests/test.sh", test_sh) + ctx.terminal("chmod +x /tests/test.sh") + + # Execute the test suite + logger.info( + "Task %s: running test suite (timeout=%ds)", + task_name, self.config.test_timeout, + ) + test_result = ctx.terminal( + "bash /tests/test.sh", + timeout=self.config.test_timeout, + ) + + exit_code = test_result.get("exit_code", -1) + output = test_result.get("output", "") + + # Download the verifier output directory locally, then read reward.txt + # with native Python I/O. This avoids issues with file_read on the + # Modal VM and matches Harbor's verification pattern. 
+ reward = 0.0 + local_verifier_dir = Path(tempfile.mkdtemp(prefix=f"tb2-verifier-{task_name}-")) + try: + ctx.download_dir("/logs/verifier", str(local_verifier_dir)) + + reward_file = local_verifier_dir / "reward.txt" + if reward_file.exists() and reward_file.stat().st_size > 0: + content = reward_file.read_text().strip() + if content == "1": + reward = 1.0 + elif content == "0": + reward = 0.0 + else: + # Unexpected content -- try parsing as float + try: + reward = float(content) + except (ValueError, TypeError): + logger.warning( + "Task %s: reward.txt content unexpected (%r), " + "falling back to exit_code=%d", + task_name, content, exit_code, + ) + reward = 1.0 if exit_code == 0 else 0.0 + else: + # reward.txt not written -- fall back to exit code + logger.warning( + "Task %s: reward.txt not found after download, " + "falling back to exit_code=%d", + task_name, exit_code, + ) + reward = 1.0 if exit_code == 0 else 0.0 + except Exception as e: + logger.warning( + "Task %s: failed to download verifier dir: %s, " + "falling back to exit_code=%d", + task_name, e, exit_code, + ) + reward = 1.0 if exit_code == 0 else 0.0 + finally: + shutil.rmtree(local_verifier_dir, ignore_errors=True) + + # Log test output for debugging failures + if reward == 0.0: + output_preview = output[-500:] if output else "(no output)" + logger.info( + "Task %s: FAIL (exit_code=%d)\n%s", + task_name, exit_code, output_preview, + ) + + return reward + + # ========================================================================= + # Evaluate -- main entry point for the eval subcommand + # ========================================================================= + + async def evaluate(self, *args, **kwargs) -> None: + """ + Run Terminal-Bench 2.0 evaluation over all tasks. 
+ + This is the main entry point when invoked via: + python environments/terminalbench2_env.py evaluate + + Runs all tasks through rollout_and_score_eval() via asyncio.gather() + (same pattern as GPQA and other Atropos eval envs). Aggregates + per-task, per-category, and overall pass rates, then logs to wandb + and evaluate_log(). + """ + start_time = time.time() + + print(f"\n{'='*60}") + print("Starting Terminal-Bench 2.0 Evaluation") + print(f"{'='*60}") + print(f" Dataset: {self.config.dataset_name}") + print(f" Total tasks: {len(self.all_eval_items)}") + print(f" Max agent turns: {self.config.max_agent_turns}") + print(f" Terminal backend: {self.config.terminal_backend}") + print(f"{'='*60}\n") + + # Fire all tasks -- Atropos / Modal handle scheduling + from tqdm.asyncio import tqdm_asyncio + eval_tasks = [ + self.rollout_and_score_eval(item) for item in self.all_eval_items + ] + results = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating TB2") + + end_time = time.time() + + # Filter out None results (shouldn't happen, but be safe) + valid_results = [r for r in results if r is not None] + + if not valid_results: + print("Warning: No valid evaluation results obtained") + return + + # ---- Compute metrics ---- + total = len(valid_results) + passed = sum(1 for r in valid_results if r.get("passed")) + overall_pass_rate = passed / total if total > 0 else 0.0 + + # Per-category breakdown + cat_results: Dict[str, List[Dict]] = defaultdict(list) + for r in valid_results: + cat_results[r.get("category", "unknown")].append(r) + + # Build metrics dict + eval_metrics = { + "eval/pass_rate": overall_pass_rate, + "eval/total_tasks": total, + "eval/passed_tasks": passed, + "eval/evaluation_time_seconds": end_time - start_time, + } + + # Per-category metrics + for category, cat_items in sorted(cat_results.items()): + cat_passed = sum(1 for r in cat_items if r.get("passed")) + cat_total = len(cat_items) + cat_pass_rate = cat_passed / cat_total if cat_total > 0 else 0.0 + 
cat_key = category.replace(" ", "_").replace("-", "_").lower() + eval_metrics[f"eval/pass_rate_{cat_key}"] = cat_pass_rate + + # Store metrics for wandb_log + self.eval_metrics = [(k, v) for k, v in eval_metrics.items()] + + # ---- Print summary ---- + print(f"\n{'='*60}") + print("Terminal-Bench 2.0 Evaluation Results") + print(f"{'='*60}") + print(f"Overall Pass Rate: {overall_pass_rate:.4f} ({passed}/{total})") + print(f"Evaluation Time: {end_time - start_time:.1f} seconds") + + print("\nCategory Breakdown:") + for category, cat_items in sorted(cat_results.items()): + cat_passed = sum(1 for r in cat_items if r.get("passed")) + cat_total = len(cat_items) + cat_rate = cat_passed / cat_total if cat_total > 0 else 0.0 + print(f" {category}: {cat_rate:.1%} ({cat_passed}/{cat_total})") + + # Print individual task results + print("\nTask Results:") + for r in sorted(valid_results, key=lambda x: x.get("task_name", "")): + status = "PASS" if r.get("passed") else "FAIL" + turns = r.get("turns_used", "?") + error = r.get("error", "") + extra = f" (error: {error})" if error else "" + print(f" [{status}] {r['task_name']} (turns={turns}){extra}") + + print(f"{'='*60}\n") + + # Build sample records for evaluate_log + samples = [ + { + "task_name": r.get("task_name"), + "category": r.get("category"), + "passed": r.get("passed"), + "reward": r.get("reward"), + "turns_used": r.get("turns_used"), + "error": r.get("error"), + } + for r in valid_results + ] + + # Log evaluation results + try: + await self.evaluate_log( + metrics=eval_metrics, + samples=samples, + start_time=start_time, + end_time=end_time, + generation_parameters={ + "temperature": self.config.agent_temperature, + "max_tokens": self.config.max_token_length, + "max_agent_turns": self.config.max_agent_turns, + "terminal_backend": self.config.terminal_backend, + }, + ) + except Exception as e: + print(f"Error logging evaluation results: {e}") + + # 
========================================================================= + # Wandb logging + # ========================================================================= + + async def wandb_log(self, wandb_metrics: Optional[Dict] = None): + """Log TB2-specific metrics to wandb.""" + if wandb_metrics is None: + wandb_metrics = {} + + # Add stored eval metrics + for metric_name, metric_value in self.eval_metrics: + wandb_metrics[metric_name] = metric_value + self.eval_metrics = [] + + await super().wandb_log(wandb_metrics) + + +if __name__ == "__main__": + TerminalBench2EvalEnv.cli() diff --git a/environments/hermes_swe_env/__init__.py b/environments/hermes_swe_env/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/environments/configs/swe_default.yaml b/environments/hermes_swe_env/default.yaml similarity index 88% rename from environments/configs/swe_default.yaml rename to environments/hermes_swe_env/default.yaml index 3477e4b3..2d011334 100644 --- a/environments/configs/swe_default.yaml +++ b/environments/hermes_swe_env/default.yaml @@ -4,7 +4,8 @@ # Uses terminal + file + web toolsets. 
# # Usage: -# python environments/hermes_swe_env.py serve --config environments/configs/swe_default.yaml +# python environments/hermes_swe_env/hermes_swe_env.py serve \ +# --config environments/hermes_swe_env/default.yaml env: enabled_toolsets: ["terminal", "file", "web"] diff --git a/environments/hermes_swe_env.py b/environments/hermes_swe_env/hermes_swe_env.py similarity index 99% rename from environments/hermes_swe_env.py rename to environments/hermes_swe_env/hermes_swe_env.py index 23b3e8f0..49c521e5 100644 --- a/environments/hermes_swe_env.py +++ b/environments/hermes_swe_env/hermes_swe_env.py @@ -36,7 +36,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union # Ensure repo root is on sys.path for imports -_repo_root = Path(__file__).resolve().parent.parent +_repo_root = Path(__file__).resolve().parent.parent.parent if str(_repo_root) not in sys.path: sys.path.insert(0, str(_repo_root)) diff --git a/environments/terminal_test_env/__init__.py b/environments/terminal_test_env/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/environments/configs/terminal_test_default.yaml b/environments/terminal_test_env/default.yaml similarity index 84% rename from environments/configs/terminal_test_default.yaml rename to environments/terminal_test_env/default.yaml index 19b38e33..dc971071 100644 --- a/environments/configs/terminal_test_default.yaml +++ b/environments/terminal_test_env/default.yaml @@ -6,9 +6,8 @@ # # Usage: # run-api -# python environments/terminal_test_env.py serve -# # Or with config file: -# python environments/terminal_test_env.py serve --config environments/configs/terminal_test_default.yaml +# python environments/terminal_test_env/terminal_test_env.py serve \ +# --config environments/terminal_test_env/default.yaml env: enabled_toolsets: ["terminal", "file"] diff --git a/environments/terminal_test_env.py b/environments/terminal_test_env/terminal_test_env.py similarity index 99% rename from 
environments/terminal_test_env.py rename to environments/terminal_test_env/terminal_test_env.py index eb941496..4d151ee7 100644 --- a/environments/terminal_test_env.py +++ b/environments/terminal_test_env/terminal_test_env.py @@ -36,7 +36,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union # Ensure repo root is on sys.path for imports -_repo_root = Path(__file__).resolve().parent.parent +_repo_root = Path(__file__).resolve().parent.parent.parent if str(_repo_root) not in sys.path: sys.path.insert(0, str(_repo_root)) diff --git a/environments/tool_context.py b/environments/tool_context.py index 03a49c11..dc207937 100644 --- a/environments/tool_context.py +++ b/environments/tool_context.py @@ -129,11 +129,14 @@ class ToolContext: def write_file(self, path: str, content: str) -> Dict[str, Any]: """ - Write a file in the rollout's filesystem. + Write a TEXT file in the rollout's filesystem. + + Uses a shell heredoc under the hood, so this is only safe for text content. + For binary files (images, compiled artifacts, etc.), use upload_file() instead. Args: path: File path to write - content: Content to write + content: Text content to write Returns: Dict with success status or error @@ -146,6 +149,177 @@ class ToolContext: except json.JSONDecodeError: return {"error": result} + def upload_file(self, local_path: str, remote_path: str) -> Dict[str, Any]: + """ + Upload a local file to the rollout's sandbox (binary-safe). + + Unlike write_file() which passes content through a shell heredoc (text-only), + this method base64-encodes the file and decodes it inside the sandbox. + Safe for any file type: binaries, images, archives, etc. + + For large files (>1MB), the content is split into chunks to avoid + hitting shell command-length limits. 
+ + Args: + local_path: Path to a local file on the host + remote_path: Destination path inside the sandbox + + Returns: + Dict with 'exit_code' and 'output' + """ + import base64 + from pathlib import Path as _Path + + local = _Path(local_path) + if not local.exists(): + return {"exit_code": -1, "output": f"Local file not found: {local_path}"} + + raw = local.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + + # Ensure parent directory exists in the sandbox + parent = str(_Path(remote_path).parent) + if parent not in (".", "/"): + self.terminal(f"mkdir -p {parent}", timeout=10) + + # For small files, single command is fine + chunk_size = 60_000 # ~60KB per chunk (well within shell limits) + if len(b64) <= chunk_size: + result = self.terminal( + f"printf '%s' '{b64}' | base64 -d > {remote_path}", + timeout=30, + ) + else: + # For larger files, write base64 in chunks then decode + tmp_b64 = "/tmp/_hermes_upload.b64" + self.terminal(f": > {tmp_b64}", timeout=5) # truncate + for i in range(0, len(b64), chunk_size): + chunk = b64[i : i + chunk_size] + self.terminal(f"printf '%s' '{chunk}' >> {tmp_b64}", timeout=15) + result = self.terminal( + f"base64 -d {tmp_b64} > {remote_path} && rm -f {tmp_b64}", + timeout=30, + ) + + return result + + def upload_dir(self, local_dir: str, remote_dir: str) -> List[Dict[str, Any]]: + """ + Upload an entire local directory to the rollout's sandbox (binary-safe). + + Recursively uploads all files, preserving directory structure. 
+ + Args: + local_dir: Path to a local directory on the host + remote_dir: Destination directory inside the sandbox + + Returns: + List of results, one per file uploaded + """ + from pathlib import Path as _Path + + local = _Path(local_dir) + if not local.exists() or not local.is_dir(): + return [{"exit_code": -1, "output": f"Local directory not found: {local_dir}"}] + + results = [] + for file_path in sorted(local.rglob("*")): + if file_path.is_file(): + relative = file_path.relative_to(local) + target = f"{remote_dir}/{relative}" + results.append(self.upload_file(str(file_path), target)) + return results + + def download_file(self, remote_path: str, local_path: str) -> Dict[str, Any]: + """ + Download a file from the rollout's sandbox to the host (binary-safe). + + The inverse of upload_file(). Base64-encodes the file inside the sandbox, + reads the encoded data through the terminal, and decodes it locally. + Safe for any file type. + + Args: + remote_path: Path to the file inside the sandbox + local_path: Destination path on the host + + Returns: + Dict with 'success' (bool) and 'bytes' (int) or 'error' (str) + """ + import base64 + from pathlib import Path as _Path + + # Base64-encode the file inside the sandbox and capture output + result = self.terminal( + f"base64 {remote_path} 2>/dev/null", + timeout=30, + ) + + if result.get("exit_code", -1) != 0: + return { + "success": False, + "error": f"Failed to read remote file: {result.get('output', '')}", + } + + b64_data = result.get("output", "").strip() + if not b64_data: + return {"success": False, "error": f"Remote file is empty or missing: {remote_path}"} + + try: + raw = base64.b64decode(b64_data) + except Exception as e: + return {"success": False, "error": f"Base64 decode failed: {e}"} + + # Write to local host filesystem + local = _Path(local_path) + local.parent.mkdir(parents=True, exist_ok=True) + local.write_bytes(raw) + + return {"success": True, "bytes": len(raw)} + + def download_dir(self, 
remote_dir: str, local_dir: str) -> List[Dict[str, Any]]: + """ + Download a directory from the rollout's sandbox to the host (binary-safe). + + Lists all files in the remote directory, then downloads each one. + Preserves directory structure. + + Args: + remote_dir: Path to the directory inside the sandbox + local_dir: Destination directory on the host + + Returns: + List of results, one per file downloaded + """ + from pathlib import Path as _Path + + # List files in the remote directory + ls_result = self.terminal( + f"find {remote_dir} -type f 2>/dev/null", + timeout=15, + ) + + if ls_result.get("exit_code", -1) != 0: + return [{"success": False, "error": f"Failed to list remote dir: {remote_dir}"}] + + file_list = ls_result.get("output", "").strip() + if not file_list: + return [{"success": False, "error": f"Remote directory is empty or missing: {remote_dir}"}] + + results = [] + for remote_file in file_list.splitlines(): + remote_file = remote_file.strip() + if not remote_file: + continue + # Compute the relative path to preserve directory structure + if remote_file.startswith(remote_dir): + relative = remote_file[len(remote_dir):].lstrip("/") + else: + relative = _Path(remote_file).name + local_file = str(_Path(local_dir) / relative) + results.append(self.download_file(remote_file, local_file)) + + return results + def search(self, query: str, path: str = ".") -> Dict[str, Any]: """ Search for text in the rollout's filesystem. 
diff --git a/evals/terminal-bench-2/evaluate_config.yaml b/evals/terminal-bench-2/evaluate_config.yaml new file mode 100644 index 00000000..1537d63c --- /dev/null +++ b/evals/terminal-bench-2/evaluate_config.yaml @@ -0,0 +1,64 @@ +env: + group_size: 1 + max_num_workers: -1 + max_eval_workers: 16 + max_num_workers_per_node: 8 + steps_per_eval: 1 + max_token_length: 32000 + eval_handling: STOP_TRAIN + eval_limit_ratio: 0.5 + inference_weight: 1.0 + batch_size: -1 + max_batches_offpolicy: 3 + tokenizer_name: NousResearch/Hermes-3-Llama-3.1-8B + use_wandb: false + rollout_server_url: http://localhost:8000 + total_steps: 1 + wandb_name: terminal-bench-2 + num_rollouts_to_keep: 32 + num_rollouts_per_group_for_logging: 1 + ensure_scores_are_not_same: false + data_path_to_save_groups: null + data_dir_to_save_evals: evals/terminal-bench-2 + min_items_sent_before_logging: 2 + include_messages: false + min_batch_allocation: null + worker_timeout: 600.0 + thinking_mode: false + reasoning_effort: null + max_reasoning_tokens: null + custom_thinking_prompt: null + enabled_toolsets: + - terminal + - file + disabled_toolsets: null + distribution: null + max_agent_turns: 60 + system_prompt: 'You are a skilled software engineer and system administrator with + access to a terminal and file tools. You are working inside a Linux container + environment. Complete the user''s task by using the available tools. Be methodical: + explore the environment first, plan your approach, then execute step by step. + Verify your work before finishing.' 
+ agent_temperature: 1.0 + terminal_backend: modal + dataset_name: NousResearch/terminal-bench-2 + dataset_split: train + prompt_field: prompt + tool_call_parser: hermes + test_timeout: 180 + force_build: false + task_filter: fix-git + skip_tasks: null +openai: +- timeout: 1200 + num_max_requests_at_once: 512 + num_requests_for_eval: 64 + model_name: anthropic/claude-sonnet-4 + rolling_buffer_length: 1000 + server_type: openai + api_key: REDACTED  # SECURITY: a live API key was committed here -- rotate it immediately and load the key from an environment variable instead of the config file + base_url: https://openrouter.ai/api/v1 + n_kwarg_is_ignored: false + health_check: false +slurm: false +testing: false diff --git a/tools/__init__.py b/tools/__init__.py index 004a6add..18d9cfcb 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -31,6 +31,8 @@ from .terminal_tool import ( cleanup_vm, cleanup_all_environments, get_active_environments_info, + register_task_env_overrides, + clear_task_env_overrides, TERMINAL_TOOL_DESCRIPTION ) @@ -139,6 +141,8 @@ __all__ = [ 'cleanup_vm', 'cleanup_all_environments', 'get_active_environments_info', + 'register_task_env_overrides', + 'clear_task_env_overrides', 'TERMINAL_TOOL_DESCRIPTION', # Terminal tools (Hecate/MorphCloud backend) 'terminal_hecate_tool', diff --git a/tools/file_tools.py b/tools/file_tools.py index 955bdbd1..89c75eff 100644 --- a/tools/file_tools.py +++ b/tools/file_tools.py @@ -39,19 +39,24 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations: # Create environment OUTSIDE locks so we don't block other rollouts # during slow Modal/Docker startup (~10s) if needs_creation: + from tools.terminal_tool import _task_env_overrides + config = _get_env_config() env_type = config["env_type"] + # Check per-task overrides (set by environments like TerminalBench2Env) + overrides = _task_env_overrides.get(task_id, {}) + if env_type == "docker": - image = config["docker_image"] + image = overrides.get("docker_image") or config["docker_image"] elif env_type == "singularity": - image = 
config["singularity_image"] + image = overrides.get("singularity_image") or config["singularity_image"] elif env_type == "modal": - image = config["modal_image"] + image = overrides.get("modal_image") or config["modal_image"] else: image = "" - cwd = config["cwd"] + cwd = overrides.get("cwd") or config["cwd"] _check_disk_usage_warning() if not os.getenv("HERMES_QUIET"): print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True) diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index 0369459d..db3b8c63 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -976,13 +976,37 @@ class _ModalEnvironment: Wraps mini-swe-agent's SwerexModalEnvironment but adds: - SUDO_PASSWORD support via _transform_sudo_command + - Automatic async-safety patches (applied once, before first use) - Note: stdin handling is not needed for Modal since it uses remote async execution. + The patches replace SwerexModalEnvironment's asyncio.run() calls with a + background thread approach, making it safe to use inside any event loop + (e.g., Atropos). Applied here at the point of use rather than relying on + import-time side effects, so ALL callers get the fix automatically. """ + # Class-level flag: patches only need to be applied once + _patches_applied = False + def __init__(self, image: str, cwd: str = "/root", timeout: int = 60): + # Ensure async-safety patches are applied before creating any + # SwerexModalEnvironment instance. This is the single authoritative + # place -- no other module needs to call apply_patches() for Modal. 
+ if not _ModalEnvironment._patches_applied: + try: + from environments.patches import apply_patches + apply_patches() + except ImportError: + pass # patches module not available (standalone use) + _ModalEnvironment._patches_applied = True + from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment - self._inner = SwerexModalEnvironment(image=image, cwd=cwd, timeout=timeout) + # Generous startup timeout: sandbox creation can take 30-60s for cold images, + # and the SWE-ReX runtime needs another 10-30s to boot inside it. + self._inner = SwerexModalEnvironment( + image=image, cwd=cwd, timeout=timeout, + startup_timeout=180.0, + runtime_timeout=3600.0, + ) self.cwd = cwd self.timeout = timeout @@ -1033,7 +1057,7 @@ TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux environment. - Run servers/long processes in background - Monitor disk usage for large tasks - Install whatever tools you need with apt-get or pip -- Do not be afraid to run pip with --break-system-packages +- Try to create or use a venv with uv or python -m venv to keep isolation from global system packages. **Things to avoid:** - Do NOT use interactive tools such as tmux, vim, nano, python repl - you will get stuck. @@ -1432,7 +1456,9 @@ def terminal_tool( env = _active_environments[effective_task_id] if needs_creation: - _check_disk_usage_warning() + # Disk usage warning only relevant for local/singularity backends + if env_type in ("singularity", "local"): + _check_disk_usage_warning() if not os.getenv("HERMES_QUIET"): print(f"[Terminal] Creating new {env_type} environment for task {effective_task_id[:8]}...", flush=True) try: