Merge PR #621: fix: limit concurrent Modal sandbox creations to avoid deadlocks
Authored by voteblake. - Semaphore limits concurrent Modal sandbox creations to 8 (configurable) to prevent thread pool deadlocks when 86+ tasks fire simultaneously - Modal cleanup guard for failed init (prevents AttributeError) - CWD override to /app for TB2 containers - Add /home/ to host path validation for container backends
This commit is contained in:
@@ -29,6 +29,10 @@ env:
|
||||
wandb_name: "terminal-bench-2"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
|
||||
# CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
|
||||
# Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
|
||||
# are created simultaneously inside thread pool workers via asyncio.run().
|
||||
max_concurrent_tasks: 8
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
|
||||
@@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
|
||||
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
|
||||
)
|
||||
|
||||
# --- Concurrency control ---
|
||||
max_concurrent_tasks: int = Field(
|
||||
default=8,
|
||||
description="Maximum number of tasks to run concurrently. "
|
||||
"Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
|
||||
"Modal has internal limits and creating too many sandboxes simultaneously "
|
||||
"causes blocking calls to deadlock inside the thread pool.",
|
||||
)
|
||||
|
||||
|
||||
# Tasks that cannot run properly on Modal and are excluded from scoring.
|
||||
MODAL_INCOMPATIBLE_TASKS = {
|
||||
@@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
}
|
||||
|
||||
# --- 2. Register per-task Modal image override ---
|
||||
register_task_env_overrides(task_id, {"modal_image": modal_image})
|
||||
register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
|
||||
logger.info(
|
||||
"Task %s: registered image override for task_id %s",
|
||||
task_name, task_id[:8],
|
||||
@@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
print(f" Tool thread pool: {self.config.tool_pool_size}")
|
||||
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
|
||||
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
|
||||
print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# Semaphore to limit concurrent Modal sandbox creations.
|
||||
# Without this, all 86 tasks fire simultaneously, each creating a Modal
|
||||
# sandbox via asyncio.run() inside a thread pool worker. Modal's blocking
|
||||
# calls (App.lookup, etc.) deadlock when too many are created at once.
|
||||
semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
|
||||
|
||||
async def _eval_with_semaphore(item):
|
||||
async with semaphore:
|
||||
return await self._eval_with_timeout(item)
|
||||
|
||||
# Fire all tasks with wall-clock timeout, track live accuracy on the bar
|
||||
total_tasks = len(self.all_eval_items)
|
||||
eval_tasks = [
|
||||
asyncio.ensure_future(self._eval_with_timeout(item))
|
||||
asyncio.ensure_future(_eval_with_semaphore(item))
|
||||
for item in self.all_eval_items
|
||||
]
|
||||
|
||||
|
||||
@@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment):
|
||||
|
||||
def cleanup(self):
|
||||
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
|
||||
# Check if _inner was ever set (init may have failed)
|
||||
if not hasattr(self, '_inner') or self._inner is None:
|
||||
return
|
||||
|
||||
if self._persistent:
|
||||
try:
|
||||
sandbox = getattr(self._inner, 'deployment', None)
|
||||
|
||||
@@ -437,7 +437,8 @@ def _get_env_config() -> Dict[str, Any]:
|
||||
# SSH is excluded since /home/ paths are valid on remote machines.
|
||||
cwd = os.getenv("TERMINAL_CWD", default_cwd)
|
||||
if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
|
||||
host_prefixes = ("/Users/", "C:\\", "C:/")
|
||||
# Host paths that won't exist inside containers
|
||||
host_prefixes = ("/Users/", "/home/", "C:\\", "C:/")
|
||||
if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
|
||||
logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
|
||||
"(host path won't exist in sandbox). Using %r instead.",
|
||||
|
||||
Reference in New Issue
Block a user