diff --git a/batch_runner.py b/batch_runner.py index 23eeec48e..0b75f1e4c 100644 --- a/batch_runner.py +++ b/batch_runner.py @@ -250,7 +250,7 @@ def _process_single_prompt( task_id = f"task_{prompt_index}" # Per-prompt container image override: if the dataset row has an 'image' field, - # register it for this task's sandbox. Works with Docker, Modal, and Singularity. + # register it for this task's sandbox. Works with Docker, Modal, Singularity, and Daytona. container_image = prompt_data.get("image") or prompt_data.get("docker_image") if container_image: # Verify the image is accessible before spending tokens on the agent loop. @@ -292,6 +292,7 @@ def _process_single_prompt( "docker_image": container_image, "modal_image": container_image, "singularity_image": f"docker://{container_image}", + "daytona_image": container_image, } if prompt_data.get("cwd"): overrides["cwd"] = prompt_data["cwd"] diff --git a/environments/agent_loop.py b/environments/agent_loop.py index 62ab08d61..ce2b1f9b3 100644 --- a/environments/agent_loop.py +++ b/environments/agent_loop.py @@ -23,7 +23,7 @@ from typing import Any, Dict, List, Optional, Set from model_tools import handle_function_call # Thread pool for running sync tool calls that internally use asyncio.run() -# (e.g., mini-swe-agent's modal/docker backends). Running them in a separate +# (e.g., mini-swe-agent's modal/docker/daytona backends). Running them in a separate # thread gives them a clean event loop so they don't deadlock inside Atropos's loop. # Size must be large enough for concurrent eval tasks (e.g., 89 TB2 tasks all # making tool calls). Too small = thread pool starvation, tasks queue for minutes. @@ -336,7 +336,7 @@ class HermesAgentLoop: tool_elapsed = _time.monotonic() - tool_submit_time else: # Run tool calls in a thread pool so backends that - # use asyncio.run() internally (modal, docker) get + # use asyncio.run() internally (modal, docker, daytona) get # a clean event loop instead of deadlocking. loop = asyncio.get_event_loop() # Capture current tool_name/args for the lambda diff --git a/environments/tool_context.py b/environments/tool_context.py index d7fde1fec..10f537d72 100644 --- a/environments/tool_context.py +++ b/environments/tool_context.py @@ -44,7 +44,7 @@ _tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=4) def _run_tool_in_thread(tool_name: str, arguments: Dict[str, Any], task_id: str) -> str: """ Run a tool call in a thread pool executor so backends that use asyncio.run() - internally (modal, docker) get a clean event loop. + internally (modal, docker, daytona) get a clean event loop. If we're already in an async context, executes handle_function_call() in a disposable worker thread and blocks for the result. @@ -95,7 +95,7 @@ class ToolContext: backend = os.getenv("TERMINAL_ENV", "local") logger.debug("ToolContext.terminal [%s backend] task=%s: %s", backend, self.task_id[:8], command[:100]) - # Run via thread helper so modal/docker backends' asyncio.run() doesn't deadlock + # Run via thread helper so modal/docker/daytona backends' asyncio.run() doesn't deadlock result = _run_tool_in_thread( "terminal", {"command": command, "timeout": timeout}, diff --git a/tools/process_registry.py b/tools/process_registry.py index 584f4b112..7f571f206 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -11,7 +11,7 @@ Tracks processes spawned via terminal(background=true), providing: Background processes execute THROUGH the environment interface -- nothing runs on the host machine unless TERMINAL_ENV=local. For Docker, Singularity, -Modal, and SSH backends, the command runs inside the sandbox. +Modal, Daytona, and SSH backends, the command runs inside the sandbox. Usage: from tools.process_registry import process_registry @@ -238,7 +238,7 @@ class ProcessRegistry: """ Spawn a background process through a non-local environment backend. - For Docker/Singularity/Modal/SSH: runs the command inside the sandbox + For Docker/Singularity/Modal/Daytona/SSH: runs the command inside the sandbox using the environment's execute() interface. We wrap the command to capture the in-sandbox PID and redirect output to a log file inside the sandbox, then poll the log via subsequent execute() calls.