From c6df39955ccf38bb513a5bb26809184609675060 Mon Sep 17 00:00:00 2001 From: Blake Johnson Date: Sat, 7 Mar 2026 21:34:06 +0000 Subject: [PATCH] fix: limit concurrent Modal sandbox creations to avoid deadlocks - Add max_concurrent_tasks config (default 8) with semaphore in TB2 eval - Pass cwd: /app via register_task_env_overrides for TB2 tasks - Add /home/ to host path prefixes as safety net for container backends - Make ModalEnvironment.cleanup() return early when _inner was never set (init may have failed) When all 86 TerminalBench2 tasks fire simultaneously, each creates a Modal sandbox via asyncio.run() inside a thread pool worker. Modal's blocking calls deadlock when too many sandboxes are created at once. The semaphore caps the number of concurrently running tasks at max_concurrent_tasks (default 8), which in turn bounds concurrent sandbox creations. Co-Authored-By: hermes-agent[bot] --- .../benchmarks/terminalbench_2/default.yaml | 4 ++++ .../terminalbench_2/terminalbench2_env.py | 24 +++++++++++++++++-- tools/environments/modal.py | 4 ++++ tools/terminal_tool.py | 3 ++- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml index 0c3eeb665..eb675b12e 100644 --- a/environments/benchmarks/terminalbench_2/default.yaml +++ b/environments/benchmarks/terminalbench_2/default.yaml @@ -29,6 +29,10 @@ env: wandb_name: "terminal-bench-2" ensure_scores_are_not_same: false data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2" + # CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks. + # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes + # are created simultaneously inside thread pool workers via asyncio.run(). 
+ max_concurrent_tasks: 8 openai: base_url: "https://openrouter.ai/api/v1" diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py index ccb65b326..6c2da14cb 100644 --- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py +++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py @@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig): "Tasks exceeding this are scored as FAIL. Default 30 minutes.", ) + # --- Concurrency control --- + max_concurrent_tasks: int = Field( + default=8, + description="Maximum number of tasks to run concurrently. " + "Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. " + "Modal has internal limits and creating too many sandboxes simultaneously " + "causes blocking calls to deadlock inside the thread pool.", + ) + # Tasks that cannot run properly on Modal and are excluded from scoring. MODAL_INCOMPATIBLE_TASKS = { @@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): } # --- 2. Register per-task Modal image override --- - register_task_env_overrides(task_id, {"modal_image": modal_image}) + register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"}) logger.info( "Task %s: registered image override for task_id %s", task_name, task_id[:8], @@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): print(f" Tool thread pool: {self.config.tool_pool_size}") print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd") print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)") + print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}") print(f"{'='*60}\n") + # Semaphore to limit concurrent Modal sandbox creations. + # Without this, all 86 tasks fire simultaneously, each creating a Modal + # sandbox via asyncio.run() inside a thread pool worker. Modal's blocking + # calls (App.lookup, etc.) 
deadlock when too many are created at once. + semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks) + + async def _eval_with_semaphore(item): + async with semaphore: + return await self._eval_with_timeout(item) + # Fire all tasks with wall-clock timeout, track live accuracy on the bar total_tasks = len(self.all_eval_items) eval_tasks = [ - asyncio.ensure_future(self._eval_with_timeout(item)) + asyncio.ensure_future(_eval_with_semaphore(item)) for item in self.all_eval_items ] diff --git a/tools/environments/modal.py b/tools/environments/modal.py index 84a9a6d75..d9732529a 100644 --- a/tools/environments/modal.py +++ b/tools/environments/modal.py @@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment): def cleanup(self): """Snapshot the filesystem (if persistent) then stop the sandbox.""" + # Check if _inner was ever set (init may have failed) + if not hasattr(self, '_inner') or self._inner is None: + return + if self._persistent: try: sandbox = getattr(self._inner, 'deployment', None) diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index e123262c5..88064d749 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -424,7 +424,8 @@ def _get_env_config() -> Dict[str, Any]: # SSH is excluded since /home/ paths are valid on remote machines. cwd = os.getenv("TERMINAL_CWD", default_cwd) if env_type in ("modal", "docker", "singularity", "daytona") and cwd: - host_prefixes = ("/Users/", "C:\\", "C:/") + # Host paths that won't exist inside containers + host_prefixes = ("/Users/", "/home/", "C:\\", "C:/") if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd: logger.info("Ignoring TERMINAL_CWD=%r for %s backend " "(host path won't exist in sandbox). Using %r instead.",