Merge PR #621: fix: limit concurrent Modal sandbox creations to avoid deadlocks

Authored by voteblake.

- Semaphore limits concurrent Modal sandbox creations to 8 (configurable)
  to prevent thread pool deadlocks when 86+ tasks fire simultaneously
- Modal cleanup guard for failed init (prevents AttributeError)
- CWD override to /app for TB2 containers
- Add /home/ to host path validation for container backends
This commit is contained in:
teknium1
2026-03-10 05:57:54 -07:00
4 changed files with 32 additions and 3 deletions

View File

@@ -29,6 +29,10 @@ env:
wandb_name: "terminal-bench-2"
ensure_scores_are_not_same: false
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
# CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
# Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
# are created simultaneously inside thread pool workers via asyncio.run().
max_concurrent_tasks: 8
openai:
base_url: "https://openrouter.ai/api/v1"

View File

@@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
)
# --- Concurrency control ---
max_concurrent_tasks: int = Field(
default=8,
description="Maximum number of tasks to run concurrently. "
"Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
"Modal has internal limits and creating too many sandboxes simultaneously "
"causes blocking calls to deadlock inside the thread pool.",
)
# Tasks that cannot run properly on Modal and are excluded from scoring.
MODAL_INCOMPATIBLE_TASKS = {
@@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
}
# --- 2. Register per-task Modal image override ---
register_task_env_overrides(task_id, {"modal_image": modal_image})
register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
logger.info(
"Task %s: registered image override for task_id %s",
task_name, task_id[:8],
@@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
print(f" Tool thread pool: {self.config.tool_pool_size}")
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}")
print(f"{'='*60}\n")
# Semaphore to limit concurrent Modal sandbox creations.
# Without this, all 86 tasks fire simultaneously, each creating a Modal
# sandbox via asyncio.run() inside a thread pool worker. Modal's blocking
# calls (App.lookup, etc.) deadlock when too many are created at once.
semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
async def _eval_with_semaphore(item):
async with semaphore:
return await self._eval_with_timeout(item)
# Fire all tasks with wall-clock timeout, track live accuracy on the bar
total_tasks = len(self.all_eval_items)
eval_tasks = [
asyncio.ensure_future(self._eval_with_timeout(item))
asyncio.ensure_future(_eval_with_semaphore(item))
for item in self.all_eval_items
]

View File

@@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment):
def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
# Check if _inner was ever set (init may have failed)
if not hasattr(self, '_inner') or self._inner is None:
return
if self._persistent:
try:
sandbox = getattr(self._inner, 'deployment', None)

View File

@@ -437,7 +437,8 @@ def _get_env_config() -> Dict[str, Any]:
# SSH is excluded since /home/ paths are valid on remote machines.
cwd = os.getenv("TERMINAL_CWD", default_cwd)
if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
host_prefixes = ("/Users/", "C:\\", "C:/")
# Host paths that won't exist inside containers
host_prefixes = ("/Users/", "/home/", "C:\\", "C:/")
if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
"(host path won't exist in sandbox). Using %r instead.",