From c6df39955ccf38bb513a5bb26809184609675060 Mon Sep 17 00:00:00 2001
From: Blake Johnson <johnsonblake1@gmail.com>
Date: Sat, 7 Mar 2026 21:34:06 +0000
Subject: [PATCH] fix: limit concurrent Modal sandbox creations to avoid
 deadlocks

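- Add max_concurrent_tasks config (default 8), enforced by a semaphore in TB2 eval
- Pass cwd: /app via register_task_env_overrides for TB2 tasks
- Add /home/ to host path prefixes as safety net for container backends
- Make ModalEnvironment.cleanup() return early when _inner was never set
  (e.g. sandbox init failed)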

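When all 86 TerminalBench2 tasks fire simultaneously, each one creates a Modal
sandbox via asyncio.run() inside a thread pool worker. Modal's blocking calls
deadlock when too many sandboxes are created at once. The semaphore is held for
the whole task, so at most max_concurrent_tasks (default 8) tasks are in flight,
which in turn caps concurrent sandbox creations at 8.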

Co-Authored-By: hermes-agent[bot] <hermes-agent[bot]@users.noreply.github.com>
---
 .../benchmarks/terminalbench_2/default.yaml   |  4 ++++
 .../terminalbench_2/terminalbench2_env.py     | 24 +++++++++++++++++--
 tools/environments/modal.py                   |  4 ++++
 tools/terminal_tool.py                        |  3 ++-
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml
index 0c3eeb665..eb675b12e 100644
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@@ -29,6 +29,10 @@ env:
   wandb_name: "terminal-bench-2"
   ensure_scores_are_not_same: false
   data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
+  # CRITICAL: Cap how many tasks run at once (bounds Modal sandbox creations).
+  # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
+  # are created simultaneously inside thread pool workers via asyncio.run().
+  max_concurrent_tasks: 8
 
 openai:
   base_url: "https://openrouter.ai/api/v1"
diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index ccb65b326..6c2da14cb 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
         "Tasks exceeding this are scored as FAIL. Default 30 minutes.",
     )
 
+    # --- Concurrency control ---
+    max_concurrent_tasks: int = Field(
+        default=8,
+        description="Maximum number of tasks to run concurrently. "
+        "Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
+        "Modal has internal limits and creating too many sandboxes simultaneously "
+        "causes blocking calls to deadlock inside the thread pool.",
+    )
+
 
 # Tasks that cannot run properly on Modal and are excluded from scoring.
 MODAL_INCOMPATIBLE_TASKS = {
@@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                 }
 
             # --- 2. Register per-task Modal image override ---
-            register_task_env_overrides(task_id, {"modal_image": modal_image})
+            register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
             logger.info(
                 "Task %s: registered image override for task_id %s",
                 task_name, task_id[:8],
@@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         print(f"  Tool thread pool: {self.config.tool_pool_size}")
         print(f"  Terminal timeout: {self.config.terminal_timeout}s/cmd")
         print(f"  Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
+        print(f"  Max concurrent tasks: {self.config.max_concurrent_tasks}")
         print(f"{'='*60}\n")
 
+        # Semaphore capping in-flight tasks; each slot is held for the whole task,
+        # which also bounds concurrent Modal sandbox creations. Without it, all 86
+        # tasks fire at once, each creating a sandbox via asyncio.run() in a thread
+        # pool worker, and Modal's blocking calls (App.lookup, etc.) deadlock.
+        semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
+
+        async def _eval_with_semaphore(item):
+            async with semaphore:
+                return await self._eval_with_timeout(item)
+
         # Fire all tasks with wall-clock timeout, track live accuracy on the bar
         total_tasks = len(self.all_eval_items)
         eval_tasks = [
-            asyncio.ensure_future(self._eval_with_timeout(item))
+            asyncio.ensure_future(_eval_with_semaphore(item))
             for item in self.all_eval_items
         ]
 
diff --git a/tools/environments/modal.py b/tools/environments/modal.py
index 84a9a6d75..d9732529a 100644
--- a/tools/environments/modal.py
+++ b/tools/environments/modal.py
@@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment):
 
     def cleanup(self):
         """Snapshot the filesystem (if persistent) then stop the sandbox."""
+        # Check if _inner was ever set (init may have failed)
+        if not hasattr(self, '_inner') or self._inner is None:
+            return
+
         if self._persistent:
             try:
                 sandbox = getattr(self._inner, 'deployment', None)
diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py
index e123262c5..88064d749 100644
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@@ -424,7 +424,8 @@ def _get_env_config() -> Dict[str, Any]:
     # SSH is excluded since /home/ paths are valid on remote machines.
     cwd = os.getenv("TERMINAL_CWD", default_cwd)
     if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
-        host_prefixes = ("/Users/", "C:\\", "C:/")
+        # Host-style prefixes unlikely to exist in a sandbox (/home/ is a safety net)
+        host_prefixes = ("/Users/", "/home/", "C:\\", "C:/")
         if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
             logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
                         "(host path won't exist in sandbox). Using %r instead.",