diff --git a/.env.example b/.env.example index f319b1151..e77eb617e 100644 --- a/.env.example +++ b/.env.example @@ -2,14 +2,15 @@ # Copy this file to .env and fill in your API keys # ============================================================================= -# LLM PROVIDER (OpenRouter - Primary) +# LLM PROVIDER (OpenRouter) # ============================================================================= # OpenRouter provides access to many models through one API -# Get at: https://openrouter.ai/keys +# All LLM calls go through OpenRouter - no direct provider keys needed +# Get your key at: https://openrouter.ai/keys OPENROUTER_API_KEY= # Default model to use (OpenRouter format: provider/model) -# Examples: anthropic/claude-sonnet-4, openai/gpt-4o, google/gemini-2.0-flash +# Examples: anthropic/claude-sonnet-4, openai/gpt-4o, google/gemini-2.0-flash, zhipuai/glm-4-plus LLM_MODEL=anthropic/claude-sonnet-4 # ============================================================================= @@ -31,14 +32,17 @@ FAL_KEY= # ============================================================================= # TERMINAL TOOL CONFIGURATION (mini-swe-agent backend) # ============================================================================= -# Backend type: "local", "docker", or "modal" +# Backend type: "local", "singularity", "docker", or "modal" # - local: Runs directly on your machine (fastest, no isolation) -# - docker: Runs in Docker containers (isolated, requires Docker installed) +# - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed) +# - docker: Runs in Docker containers (isolated, requires Docker + docker group) # - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account) -TERMINAL_ENV=docker +TERMINAL_ENV=singularity -# Docker image to use (for docker and modal backends) -TERMINAL_DOCKER_IMAGE=python:3.11-slim +# Container images (for singularity/docker/modal backends) +TERMINAL_DOCKER_IMAGE=python:3.11 
+TERMINAL_SINGULARITY_IMAGE=docker://python:3.11
+TERMINAL_MODAL_IMAGE=python:3.11
 
 # Working directory inside the container
 TERMINAL_CWD=/tmp
@@ -57,6 +61,35 @@ TERMINAL_LIFETIME_SECONDS=300
 # This will authenticate via browser and store credentials locally.
 # No API key needed in .env - Modal handles auth automatically.
 
+# =============================================================================
+# BROWSER TOOL CONFIGURATION (agent-browser + Browserbase)
+# =============================================================================
+# Browser automation requires Browserbase cloud service for remote browser execution.
+# This allows the agent to navigate websites, fill forms, and extract information.
+#
+# STEALTH MODES:
+# - Basic Stealth: ALWAYS active (random fingerprints, auto CAPTCHA solving)
+# - Advanced Stealth: Requires BROWSERBASE_ADVANCED_STEALTH=true (Scale Plan only)
+
+# Browserbase API Key - Cloud browser execution
+# Get at: https://browserbase.com/
+BROWSERBASE_API_KEY=
+
+# Browserbase Project ID - From your Browserbase dashboard
+BROWSERBASE_PROJECT_ID=
+
+# Enable residential proxies for better CAPTCHA solving (default: true)
+# Routes traffic through residential IPs, significantly improves success rate
+BROWSERBASE_PROXIES=true
+
+# Enable advanced stealth mode (default: false, requires Scale Plan)
+# Uses custom Chromium build to avoid bot detection altogether
+BROWSERBASE_ADVANCED_STEALTH=false
+
+# Browser session timeout in seconds (optional, default: 300)
+# Sessions are cleaned up after this duration of inactivity
+BROWSER_SESSION_TIMEOUT=300
+
 # =============================================================================
 # LEGACY/OPTIONAL API KEYS
 # =============================================================================
@@ -69,10 +102,6 @@ MORPH_API_KEY=
 HECATE_VM_LIFETIME_SECONDS=300
 HECATE_DEFAULT_SNAPSHOT_ID=snapshot_p5294qxt
 
-# Direct provider keys (optional - OpenRouter is preferred)
-ANTHROPIC_API_KEY=
-OPENAI_API_KEY=
-
 # =============================================================================
 # DEBUG OPTIONS
 # =============================================================================
@@ -80,3 +109,12 @@ WEB_TOOLS_DEBUG=false
 VISION_TOOLS_DEBUG=false
 MOA_TOOLS_DEBUG=false
IMAGE_TOOLS_DEBUG=false + +# Scratch directory for Singularity sandboxes (optional) +# If not set, uses /scratch (if available) or /tmp +# Set this to a directory with lots of space for large pip installs +# TERMINAL_SCRATCH_DIR=/scratch/myuser + +# Disk usage warning threshold in GB (default: 500) +# Warning is printed when total sandbox disk usage exceeds this +TERMINAL_DISK_WARNING_GB=500 diff --git a/README.md b/README.md index d679ca062..9b675494c 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ An AI agent with advanced tool-calling capabilities, featuring a flexible toolse - **Web Tools**: Search, extract content, and crawl websites - **Terminal Tools**: Execute commands via mini-swe-agent (local, Docker, or Modal backends) +- **Browser Tools**: Automate web browsers to navigate, click, type, and extract content - **Vision Tools**: Analyze images from URLs - **Reasoning Tools**: Advanced multi-model reasoning (Mixture of Agents) - **Creative Tools**: Generate images from text prompts @@ -53,9 +54,9 @@ nano .env # or use your preferred editor - `NOUS_API_KEY` - Vision & reasoning tools (get at: https://inference-api.nousresearch.com/) - `FAL_KEY` - Image generation (get at: https://fal.ai/) -**Optional API Keys:** -- `ANTHROPIC_API_KEY` - Direct Anthropic access (if not using OpenRouter) -- `OPENAI_API_KEY` - Direct OpenAI access (if not using OpenRouter) +**Optional API Keys (for specific features):** +- `BROWSERBASE_API_KEY` - Browser automation (get at: https://browserbase.com/) +- `BROWSERBASE_PROJECT_ID` - From Browserbase dashboard - `MORPH_API_KEY` - For legacy Hecate terminal backend (get at: https://morph.so/) ### 4. Configure Terminal Backend @@ -63,19 +64,22 @@ nano .env # or use your preferred editor The terminal tool uses **mini-swe-agent** environments. 
Configure in `.env`: ```bash -# Backend: "local" (host machine), "docker" (containers), or "modal" (cloud) -TERMINAL_ENV=local # Default: runs on host machine -TERMINAL_ENV=docker # Recommended: isolated Docker containers +# Backend: "local", "docker", "singularity", or "modal" +TERMINAL_ENV=local # Default: runs on host machine (no isolation) +TERMINAL_ENV=singularity # Recommended for HPC: Apptainer/Singularity containers +TERMINAL_ENV=docker # Isolated Docker containers TERMINAL_ENV=modal # Cloud execution via Modal -# Docker settings (for docker/modal backends) +# Container image (for docker/singularity/modal backends) TERMINAL_DOCKER_IMAGE=python:3.11-slim +TERMINAL_SINGULARITY_IMAGE=docker://python:3.11-slim TERMINAL_TIMEOUT=60 ``` **Backend Requirements:** -- **local**: No extra setup (runs directly on your machine) -- **docker**: Requires Docker installed and running. User must be in `docker` group. +- **local**: No extra setup (runs directly on your machine, no isolation) +- **singularity**: Requires Apptainer or Singularity installed (common on HPC clusters, no root needed) +- **docker**: Requires Docker installed and user in `docker` group - **modal**: Requires Modal account (see setup below) ### Modal Cloud Backend Setup @@ -95,6 +99,55 @@ TERMINAL_ENV=modal Modal uses CLI-based authentication (stored in `~/.modal/`), so no API key is needed in `.env`. After running `modal setup`, commands will automatically execute in Modal's cloud sandboxes. +### Browser Tools Setup + +Browser tools enable the agent to navigate websites, fill forms, click buttons, and extract content. They use [agent-browser](https://github.com/vercel-labs/agent-browser) CLI with [Browserbase](https://browserbase.com) cloud execution. + +```bash +# 1. Install Node.js (if not already installed) +# Use nvm (recommended) or your package manager + +# 2. Install agent-browser CLI globally +npm install -g agent-browser + +# 3. 
Get Browserbase credentials +# Sign up at https://browserbase.com/ and get your: +# - API Key (from Settings โ†’ API Keys) +# - Project ID (from your project dashboard) + +# 4. Add to your .env file: +BROWSERBASE_API_KEY=your_api_key_here +BROWSERBASE_PROJECT_ID=your_project_id_here +``` + +**Available Browser Tools:** + +| Tool | Description | +|------|-------------| +| `browser_navigate` | Navigate to a URL | +| `browser_snapshot` | Get text-based page snapshot with element refs | +| `browser_click` | Click an element by ref (e.g., `@e5`) | +| `browser_type` | Type text into an input field | +| `browser_scroll` | Scroll up or down | +| `browser_back` | Go back in browser history | +| `browser_press` | Press a keyboard key (Enter, Tab, etc.) | +| `browser_close` | Close the browser session | +| `browser_get_images` | Get list of images on the page | + +**Example Usage:** +```bash +# Use browser tools with web search and vision +python run_agent.py \ + --query "Go to amazon.com and find the price of the latest Kindle" \ + --enabled_toolsets=browser,web,vision + +# Use browser-focused distribution +python batch_runner.py \ + --dataset_file=browser_tasks.jsonl \ + --distribution=browser_use \ + --run_name=browser_run +``` + See `.env.example` for all available configuration options including debug settings. ## Toolsets System @@ -267,10 +320,6 @@ All environment variables can be configured in the `.env` file (copy from `.env. 
- `NOUS_API_KEY`: Vision and reasoning tools - `FAL_KEY`: Image generation tools -**Optional Direct Provider Keys:** -- `ANTHROPIC_API_KEY`: Direct Anthropic access (fallback if OpenRouter not set) -- `OPENAI_API_KEY`: Direct OpenAI access (fallback if OpenRouter not set) - **Terminal Tool Configuration (mini-swe-agent backend):** -- `TERMINAL_ENV`: Backend type - `local`, `docker`, or `modal` (default: `local`) +- `TERMINAL_ENV`: Backend type - `local`, `singularity`, `docker`, or `modal` (default: `local`) - `TERMINAL_DOCKER_IMAGE`: Docker image to use (default: `python:3.11-slim`) @@ -278,6 +327,11 @@ All environment variables can be configured in the `.env` file (copy from `.env. - `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`) - `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`) +**Browser Tool Configuration (agent-browser + Browserbase):** +- `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution +- `BROWSERBASE_PROJECT_ID`: Browserbase project ID +- `BROWSER_SESSION_TIMEOUT`: Session timeout in seconds (default: `300`) + **Legacy Hecate Terminal Backend (optional):** - `MORPH_API_KEY`: For Hecate/MorphCloud terminal backend - `HECATE_VM_LIFETIME_SECONDS`: VM lifetime (default: 300) diff --git a/batch_runner.py b/batch_runner.py index b6caaf30f..90434b5b5 100644 --- a/batch_runner.py +++ b/batch_runner.py @@ -49,8 +49,13 @@ _WORKER_CONFIG = {} # All possible tools - used to ensure consistent schema across all trajectory entries # This is required because Arrow/Parquet (used by HuggingFace datasets) needs identical schemas ALL_POSSIBLE_TOOLS = { - 'terminal', 'web_search', 'web_extract', 'web_crawl', - 'vision_analyze', 'image_generate', 'mixture_of_agents' + 'terminal', 'web_search', 'web_extract', + 'vision_analyze', 'image_generate', 'mixture_of_agents', + # Browser automation tools + 'browser_navigate', 'browser_snapshot', 'browser_click', + 'browser_type', 'browser_scroll', 'browser_back', + 'browser_press', 'browser_close', 'browser_get_images', + 'browser_vision' } # 
Default stats for tools that weren't used @@ -828,8 +833,13 @@ class BatchRunner: combined_file = self.output_dir / "trajectories.jsonl" print(f"\n📦 Combining ALL batch files into {combined_file.name}...") - VALID_TOOLS = {'web_search', 'web_extract', 'web_crawl', 'terminal', 'vision_analyze', - 'image_generate', 'mixture_of_agents'} + VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze', + 'image_generate', 'mixture_of_agents', + # Browser automation tools + 'browser_navigate', 'browser_snapshot', 'browser_click', + 'browser_type', 'browser_scroll', 'browser_back', + 'browser_press', 'browser_close', 'browser_get_images', + 'browser_vision'} total_entries = 0 filtered_entries = 0 @@ -928,9 +938,9 @@ def main( batch_size: int = None, run_name: str = None, distribution: str = "default", - model: str = "claude-opus-4-20250514", + model: str = "anthropic/claude-sonnet-4", api_key: str = None, - base_url: str = "https://api.anthropic.com/v1/", + base_url: str = "https://openrouter.ai/api/v1", max_turns: int = 10, num_workers: int = 4, resume: bool = False, diff --git a/model_tools.py b/model_tools.py index a2b04c50b..dc9a3ef2c 100644 --- a/model_tools.py +++ b/model_tools.py @@ -37,6 +37,22 @@ from tools.terminal_hecate import terminal_hecate_tool, check_hecate_requirement from tools.vision_tools import vision_analyze_tool, check_vision_requirements from tools.mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements from tools.image_generation_tool import image_generate_tool, check_image_generation_requirements +# Browser automation tools (agent-browser + Browserbase) +from tools.browser_tool import ( + browser_navigate, + browser_snapshot, + browser_click, + browser_type, + browser_scroll, + browser_back, + browser_press, + browser_close, + browser_get_images, + browser_vision, + cleanup_browser, + check_browser_requirements, + BROWSER_TOOL_SCHEMAS +) from toolsets import ( get_toolset, resolve_toolset, 
resolve_multiple_toolsets, get_all_toolsets, get_toolset_names, validate_toolset, @@ -55,7 +71,7 @@ def get_web_tool_definitions() -> List[Dict[str, Any]]: "type": "function", "function": { "name": "web_search", - "description": "Search the web for information on any topic. Returns up to 5 relevant results with titles and URLs. Uses advanced search depth for comprehensive results.", + "description": "Search the web for information on any topic. Returns up to 5 relevant results with titles and URLs. Uses advanced search depth for comprehensive results. PREFERRED over browser tools for finding information - faster and more cost-effective. Use browser tools only when you need to interact with pages (click, fill forms, handle dynamic content).", "parameters": { "type": "object", "properties": { @@ -72,7 +88,7 @@ def get_web_tool_definitions() -> List[Dict[str, Any]]: "type": "function", "function": { "name": "web_extract", - "description": "Extract and read the full content from specific web page URLs. Useful for getting detailed information from webpages found through search. The content returned will be excerpts and key points summarized with an LLM to reduce impact on the context window.", + "description": "Extract and read the full content from specific web page URLs. Useful for getting detailed information from webpages found through search. The content returned will be excerpts and key points summarized with an LLM to reduce impact on the context window. PREFERRED over browser tools for reading page content - faster and more cost-effective. Use browser tools only when pages require interaction or have dynamic content.", "parameters": { "type": "object", "properties": { @@ -87,27 +103,6 @@ def get_web_tool_definitions() -> List[Dict[str, Any]]: } } }, - { - "type": "function", - "function": { - "name": "web_crawl", - "description": "Crawl a website with specific instructions to find and extract targeted content. 
Uses AI to intelligently navigate and extract relevant information from across the site. The content returned will be excerpts and key points summarized with an LLM to reduce impact on the context window.", - "parameters": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "The base URL to crawl (can include or exclude https://)" - }, - "instructions": { - "type": "string", - "description": "Specific instructions for what to crawl/extract using AI intelligence (e.g., 'Find pricing information', 'Get documentation pages', 'Extract contact details')" - } - }, - "required": ["url"] - } - } - } ] def get_terminal_tool_definitions() -> List[Dict[str, Any]]: @@ -244,6 +239,18 @@ def get_image_tool_definitions() -> List[Dict[str, Any]]: ] +def get_browser_tool_definitions() -> List[Dict[str, Any]]: + """ + Get tool definitions for browser automation tools in OpenAI's expected format. + + Uses agent-browser CLI with Browserbase cloud execution. + + Returns: + List[Dict]: List of browser tool definitions compatible with OpenAI API + """ + return [{"type": "function", "function": schema} for schema in BROWSER_TOOL_SCHEMAS] + + def get_all_tool_names() -> List[str]: """ Get the names of all available tools across all toolsets. 
@@ -255,7 +262,7 @@ def get_all_tool_names() -> List[str]: # Web tools if check_firecrawl_api_key(): - tool_names.extend(["web_search", "web_extract", "web_crawl"]) + tool_names.extend(["web_search", "web_extract"]) # Terminal tools (mini-swe-agent backend) if check_terminal_requirements(): @@ -273,6 +280,15 @@ def get_all_tool_names() -> List[str]: if check_image_generation_requirements(): tool_names.extend(["image_generate"]) + # Browser automation tools + if check_browser_requirements(): + tool_names.extend([ + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision" + ]) + return tool_names @@ -288,12 +304,22 @@ def get_toolset_for_tool(tool_name: str) -> str: """ toolset_mapping = { "web_search": "web_tools", - "web_extract": "web_tools", - "web_crawl": "web_tools", + "web_extract": "web_tools", "terminal": "terminal_tools", "vision_analyze": "vision_tools", "mixture_of_agents": "moa_tools", - "image_generate": "image_tools" + "image_generate": "image_tools", + # Browser automation tools + "browser_navigate": "browser_tools", + "browser_snapshot": "browser_tools", + "browser_click": "browser_tools", + "browser_type": "browser_tools", + "browser_scroll": "browser_tools", + "browser_back": "browser_tools", + "browser_press": "browser_tools", + "browser_close": "browser_tools", + "browser_get_images": "browser_tools", + "browser_vision": "browser_tools" } return toolset_mapping.get(tool_name, "unknown") @@ -357,6 +383,10 @@ def get_tool_definitions( for tool in get_image_tool_definitions(): all_available_tools_map[tool["function"]["name"]] = tool + if check_browser_requirements(): + for tool in get_browser_tool_definitions(): + all_available_tools_map[tool["function"]["name"]] = tool + # Determine which tools to include based on toolsets tools_to_include = set() @@ -369,14 +399,20 @@ def get_tool_definitions( print(f"โœ… Enabled toolset 
'{toolset_name}': {', '.join(resolved_tools) if resolved_tools else 'no tools'}") else: # Try legacy compatibility - if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools"]: + if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "browser_tools"]: # Map legacy names to new system legacy_map = { - "web_tools": ["web_search", "web_extract", "web_crawl"], + "web_tools": ["web_search", "web_extract"], "terminal_tools": ["terminal"], "vision_tools": ["vision_analyze"], "moa_tools": ["mixture_of_agents"], - "image_tools": ["image_generate"] + "image_tools": ["image_generate"], + "browser_tools": [ + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision" + ] } legacy_tools = legacy_map.get(toolset_name, []) tools_to_include.update(legacy_tools) @@ -404,13 +440,19 @@ def get_tool_definitions( print(f"๐Ÿšซ Disabled toolset '{toolset_name}': {', '.join(resolved_tools) if resolved_tools else 'no tools'}") else: # Try legacy compatibility - if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools"]: + if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "browser_tools"]: legacy_map = { - "web_tools": ["web_search", "web_extract", "web_crawl"], + "web_tools": ["web_search", "web_extract"], "terminal_tools": ["terminal"], "vision_tools": ["vision_analyze"], "moa_tools": ["mixture_of_agents"], - "image_tools": ["image_generate"] + "image_tools": ["image_generate"], + "browser_tools": [ + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision" + ] } legacy_tools = legacy_map.get(toolset_name, []) tools_to_include.difference_update(legacy_tools) @@ -465,12 +507,6 @@ def 
handle_web_function_call(function_name: str, function_args: Dict[str, Any]) # Run async function in event loop return asyncio.run(web_extract_tool(urls, "markdown")) - elif function_name == "web_crawl": - url = function_args.get("url", "") - instructions = function_args.get("instructions") - # Run async function in event loop - return asyncio.run(web_crawl_tool(url, instructions, "basic")) - else: return json.dumps({"error": f"Unknown web function: {function_name}"}, ensure_ascii=False) @@ -603,7 +639,58 @@ def handle_image_function_call(function_name: str, function_args: Dict[str, Any] return json.dumps({"error": f"Unknown image generation function: {function_name}"}, ensure_ascii=False) -def handle_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str: +# Browser tool handlers mapping +BROWSER_HANDLERS = { + "browser_navigate": browser_navigate, + "browser_click": browser_click, + "browser_type": browser_type, + "browser_scroll": browser_scroll, + "browser_back": browser_back, + "browser_press": browser_press, + "browser_close": browser_close, + "browser_get_images": browser_get_images, + "browser_vision": browser_vision, +} + + +def handle_browser_function_call( + function_name: str, + function_args: Dict[str, Any], + task_id: Optional[str] = None, + user_task: Optional[str] = None +) -> str: + """ + Handle function calls for browser automation tools. 
+ + Args: + function_name (str): Name of the browser function to call + function_args (Dict): Arguments for the function + task_id (str): Task identifier for session isolation + user_task (str): User's current task (for task-aware extraction in snapshots) + + Returns: + str: Function result as JSON string + """ + # Special handling for browser_snapshot which needs user_task for extraction + if function_name == "browser_snapshot": + full = function_args.get("full", False) + return browser_snapshot(full=full, task_id=task_id, user_task=user_task) + + # Handle other browser tools + if function_name in BROWSER_HANDLERS: + handler = BROWSER_HANDLERS[function_name] + # Add task_id to args + return handler(**function_args, task_id=task_id) + + return json.dumps({"error": f"Unknown browser function: {function_name}"}, ensure_ascii=False) + + +def handle_function_call( + function_name: str, + function_args: Dict[str, Any], + task_id: Optional[str] = None, + user_task: Optional[str] = None +) -> str: """ Main function call dispatcher that routes calls to appropriate toolsets. 
@@ -614,7 +701,8 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task Args: function_name (str): Name of the function to call function_args (Dict): Arguments for the function - task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional) + task_id (str): Unique identifier for this task to isolate VMs/sessions between concurrent tasks (optional) + user_task (str): The user's original task/query (used for task-aware content extraction) (optional) Returns: str: Function result as JSON string @@ -624,7 +712,7 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task """ try: # Route web tools - if function_name in ["web_search", "web_extract", "web_crawl"]: + if function_name in ["web_search", "web_extract"]: return handle_web_function_call(function_name, function_args) # Route terminal tools @@ -643,6 +731,15 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task elif function_name in ["image_generate"]: return handle_image_function_call(function_name, function_args) + # Route browser automation tools + elif function_name in [ + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision" + ]: + return handle_browser_function_call(function_name, function_args, task_id, user_task) + else: error_msg = f"Unknown function: {function_name}" print(f"โŒ {error_msg}") @@ -664,8 +761,8 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]: toolsets = { "web_tools": { "available": check_firecrawl_api_key(), - "tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"], - "description": "Web search, content extraction, and website crawling tools", + "tools": ["web_search_tool", "web_extract_tool"], + "description": "Web search and content extraction tools", "requirements": ["FIRECRAWL_API_KEY environment variable"] }, 
"terminal_tools": { @@ -691,6 +788,17 @@ get_available_toolsets() -> Dict[str, Dict[str, Any]]: "tools": ["image_generate_tool"], "description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality", "requirements": ["FAL_KEY environment variable", "fal-client package"] + }, + "browser_tools": { + "available": check_browser_requirements(), + "tools": [ + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision" + ], + "description": "Browser automation for web interaction using agent-browser CLI with Browserbase cloud execution", + "requirements": ["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID", "agent-browser npm package"] + } } @@ -708,7 +816,8 @@ def check_toolset_requirements() -> Dict[str, bool]: "terminal_tools": check_terminal_requirements(), "vision_tools": check_vision_requirements(), "moa_tools": check_moa_requirements(), - "image_tools": check_image_generation_requirements() + "image_tools": check_image_generation_requirements(), + "browser_tools": check_browser_requirements() } if __name__ == "__main__": diff --git a/run_agent.py b/run_agent.py index 496f600b5..174c2bf97 100644 --- a/run_agent.py +++ b/run_agent.py @@ -44,6 +44,7 @@ else: # Import our tool system from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements from tools.terminal_tool import cleanup_vm +from tools.browser_tool import cleanup_browser class AIAgent: @@ -58,7 +59,7 @@ class AIAgent: self, base_url: str = None, api_key: str = None, - model: str = "anthropic/claude-sonnet-4-20250514", + model: str = "anthropic/claude-sonnet-4", # OpenRouter format max_iterations: int = 10, tool_delay: float = 1.0, enabled_toolsets: List[str] = None, @@ -156,10 +157,7 @@ class AIAgent: client_kwargs["api_key"] = api_key else: # Primary: 
OPENROUTER_API_KEY, fallback to direct provider keys - client_kwargs["api_key"] = os.getenv( - "OPENROUTER_API_KEY", - os.getenv("ANTHROPIC_API_KEY", os.getenv("OPENAI_API_KEY", "")) - ) + client_kwargs["api_key"] = os.getenv("OPENROUTER_API_KEY", "") try: self.client = OpenAI(**client_kwargs) @@ -339,11 +337,12 @@ class AIAgent: # Check if this message has tool calls if "tool_calls" in msg and msg["tool_calls"]: # Format assistant message with tool calls + # Add tags around reasoning for trajectory storage content = "" # Prepend reasoning in tags if available if msg.get("reasoning") and msg["reasoning"].strip(): - content = f"{msg['reasoning']}" + content = f"\n{msg['reasoning']}\n\n" if msg.get("content") and msg["content"].strip(): content += msg["content"] + "\n" @@ -406,17 +405,18 @@ class AIAgent: else: # Regular assistant message without tool calls + # Add tags around reasoning for trajectory storage content = "" # Prepend reasoning in tags if available if msg.get("reasoning") and msg["reasoning"].strip(): - content = f"{msg['reasoning']}" + content = f"\n{msg['reasoning']}\n\n" content += msg["content"] or "" trajectory.append({ "from": "gpt", - "value": content + "value": content.strip() }) elif msg["role"] == "user": @@ -515,7 +515,31 @@ class AIAgent: # Prepare messages for API call # If we have an ephemeral system prompt, prepend it to the messages - api_messages = messages.copy() + # Note: Reasoning is embedded in content via tags for trajectory storage. + # However, providers like Moonshot AI require a separate 'reasoning_content' field + # on assistant messages with tool_calls. We handle both cases here. 
+ api_messages = [] + for msg in messages: + api_msg = msg.copy() + + # For assistant messages with tool_calls, providers require 'reasoning_content' field + # Extract reasoning from our stored 'reasoning' field and add it as 'reasoning_content' + if msg.get("role") == "assistant" and msg.get("tool_calls"): + reasoning_text = msg.get("reasoning") + if reasoning_text: + # Add reasoning_content for API compatibility (Moonshot AI, Novita, etc.) + api_msg["reasoning_content"] = reasoning_text + + # Remove 'reasoning' field - it's for trajectory storage only + # The reasoning is already in the content via tags AND + # we've added reasoning_content for API compatibility above + if "reasoning" in api_msg: + api_msg.pop("reasoning") + # Remove 'reasoning_details' if present - we use reasoning_content instead + if "reasoning_details" in api_msg: + api_msg.pop("reasoning_details") + api_messages.append(api_msg) + if active_system_prompt: # Insert system message at the beginning api_messages = [{"role": "system", "content": active_system_prompt}] + api_messages @@ -582,7 +606,9 @@ class AIAgent: print(f"{self.log_prefix}โฑ๏ธ API call completed in {api_duration:.2f}s") if self.verbose_logging: - logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") + # Log response with provider info if available + resp_model = getattr(response, 'model', 'N/A') if response else 'N/A' + logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") # Validate response has valid choices before proceeding if response is None or not hasattr(response, 'choices') or response.choices is None or len(response.choices) == 0: @@ -600,12 +626,28 @@ class AIAgent: # Check for error field in response (some providers include this) error_msg = "Unknown" + provider_name = "Unknown" if response and hasattr(response, 'error') and response.error: error_msg = str(response.error) + # Try to 
extract provider from error metadata + if hasattr(response.error, 'metadata') and response.error.metadata: + provider_name = response.error.metadata.get('provider_name', 'Unknown') elif response and hasattr(response, 'message') and response.message: error_msg = str(response.message) + # Try to get provider from model field (OpenRouter often returns actual model used) + if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model: + provider_name = f"model={response.model}" + + # Check for x-openrouter-provider or similar metadata + if provider_name == "Unknown" and response: + # Log all response attributes for debugging + resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')} + if self.verbose_logging: + logging.debug(f"Response attributes for invalid response: {resp_attrs}") + print(f"{self.log_prefix}โš ๏ธ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}") + print(f"{self.log_prefix} ๐Ÿข Provider: {provider_name}") print(f"{self.log_prefix} ๐Ÿ“ Provider message: {error_msg[:200]}") print(f"{self.log_prefix} โฑ๏ธ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)") @@ -623,7 +665,7 @@ class AIAgent: # Longer backoff for rate limiting (likely cause of None choices) wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s print(f"{self.log_prefix}โณ Retrying in {wait_time}s (extended backoff for possible rate limit)...") - logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)}") + logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") time.sleep(wait_time) continue # Retry the API call @@ -639,12 +681,17 @@ class AIAgent: print(f"{self.log_prefix} โช Rolling back to last complete assistant turn") rolled_back_messages = self._get_messages_up_to_last_assistant(messages) - # Clean 
up VM + # Clean up VM and browser try: cleanup_vm(effective_task_id) except Exception as e: if self.verbose_logging: logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}") + try: + cleanup_browser(effective_task_id) + except Exception as e: + if self.verbose_logging: + logging.warning(f"Failed to cleanup browser for task {effective_task_id}: {e}") return { "final_response": None, @@ -799,17 +846,21 @@ class AIAgent: self._invalid_json_retries = 0 # Extract reasoning from response if available (for reasoning models like minimax, kimi, etc.) - reasoning_content = None + # Extract reasoning from response for storage + # The reasoning_content field will be added when preparing API messages + reasoning_text = None if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning: - reasoning_content = assistant_message.reasoning + reasoning_text = assistant_message.reasoning elif hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content: - reasoning_content = assistant_message.reasoning_content + reasoning_text = assistant_message.reasoning_content - # Add assistant message with tool calls to conversation - messages.append({ + # Build assistant message with tool calls + # Content stays as-is; reasoning is stored separately and will be passed + # to the API via reasoning_content field when preparing api_messages + assistant_msg = { "role": "assistant", - "content": assistant_message.content, - "reasoning": reasoning_content, # Store reasoning for trajectory + "content": assistant_message.content or "", + "reasoning": reasoning_text, # Stored for trajectory extraction & API calls "tool_calls": [ { "id": tool_call.id, @@ -821,7 +872,9 @@ class AIAgent: } for tool_call in assistant_message.tool_calls ] - }) + } + + messages.append(assistant_msg) # Execute each tool call for i, tool_call in enumerate(assistant_message.tool_calls, 1): @@ -896,12 +949,17 @@ class AIAgent: rolled_back_messages = 
self._get_messages_up_to_last_assistant(messages) - # Clean up VM + # Clean up VM and browser try: cleanup_vm(effective_task_id) except Exception as e: if self.verbose_logging: logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}") + try: + cleanup_browser(effective_task_id) + except Exception as e: + if self.verbose_logging: + logging.warning(f"Failed to cleanup browser for task {effective_task_id}: {e}") return { "final_response": None, @@ -917,18 +975,21 @@ class AIAgent: self._empty_content_retries = 0 # Extract reasoning from response if available - reasoning_content = None + reasoning_text = None if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning: - reasoning_content = assistant_message.reasoning + reasoning_text = assistant_message.reasoning elif hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content: - reasoning_content = assistant_message.reasoning_content + reasoning_text = assistant_message.reasoning_content - # Add final assistant message - messages.append({ + # Build final assistant message + # Content stays as-is; reasoning stored separately for trajectory extraction + final_msg = { "role": "assistant", "content": final_response, - "reasoning": reasoning_content # Store reasoning for trajectory - }) + "reasoning": reasoning_text # Stored for trajectory extraction + } + + messages.append(final_msg) print(f"๐ŸŽ‰ Conversation completed after {api_call_count} OpenAI-compatible API call(s)") break @@ -963,12 +1024,18 @@ class AIAgent: # Save trajectory if enabled self._save_trajectory(messages, user_message, completed) - # Clean up VM for this task after conversation completes + # Clean up VM and browser for this task after conversation completes try: cleanup_vm(effective_task_id) except Exception as e: if self.verbose_logging: logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}") + + try: + cleanup_browser(effective_task_id) + except Exception as e: + if 
self.verbose_logging: + logging.warning(f"Failed to cleanup browser for task {effective_task_id}: {e}") return { "final_response": final_response, @@ -994,14 +1061,15 @@ class AIAgent: def main( query: str = None, - model: str = "claude-opus-4-20250514", + model: str = "anthropic/claude-sonnet-4-20250514", api_key: str = None, - base_url: str = "https://api.anthropic.com/v1/", + base_url: str = "https://openrouter.ai/api/v1", max_turns: int = 10, enabled_toolsets: str = None, disabled_toolsets: str = None, list_tools: bool = False, save_trajectories: bool = False, + save_sample: bool = False, verbose: bool = False, log_prefix_chars: int = 20 ): @@ -1010,16 +1078,17 @@ def main( Args: query (str): Natural language query for the agent. Defaults to Python 3.13 example. - model (str): Model name to use. Defaults to claude-opus-4-20250514. - api_key (str): API key for authentication. Uses ANTHROPIC_API_KEY env var if not provided. - base_url (str): Base URL for the model API. Defaults to https://api.anthropic.com/v1/ + model (str): Model name to use (OpenRouter format: provider/model). Defaults to anthropic/claude-sonnet-4-20250514. + api_key (str): API key for authentication. Uses OPENROUTER_API_KEY env var if not provided. + base_url (str): Base URL for the model API. Defaults to https://openrouter.ai/api/v1 max_turns (int): Maximum number of API call iterations. Defaults to 10. enabled_toolsets (str): Comma-separated list of toolsets to enable. Supports predefined toolsets (e.g., "research", "development", "safe"). Multiple toolsets can be combined: "web,vision" disabled_toolsets (str): Comma-separated list of toolsets to disable (e.g., "terminal") list_tools (bool): Just list available tools and exit - save_trajectories (bool): Save conversation trajectories to JSONL files. Defaults to False. + save_trajectories (bool): Save conversation trajectories to JSONL files (appends to trajectory_samples.jsonl). Defaults to False. 
+ save_sample (bool): Save a single trajectory sample to a UUID-named JSONL file for inspection. Defaults to False. verbose (bool): Enable verbose logging for debugging. Defaults to False. log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20. @@ -1173,6 +1242,34 @@ def main( print("-" * 30) print(result['final_response']) + # Save sample trajectory to UUID-named file if requested + if save_sample: + import uuid + sample_id = str(uuid.uuid4())[:8] + sample_filename = f"sample_{sample_id}.jsonl" + + # Convert messages to trajectory format (same as batch_runner) + trajectory = agent._convert_to_trajectory_format( + result['messages'], + user_query, + result['completed'] + ) + + entry = { + "conversations": trajectory, + "timestamp": datetime.now().isoformat(), + "model": model, + "completed": result['completed'], + "query": user_query + } + + try: + with open(sample_filename, "w", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + print(f"\n๐Ÿ’พ Sample trajectory saved to: {sample_filename}") + except Exception as e: + print(f"\nโš ๏ธ Failed to save sample: {e}") + print("\n๐Ÿ‘‹ Agent execution completed!") diff --git a/tests/test_modal_terminal.py b/tests/test_modal_terminal.py new file mode 100644 index 000000000..c9f7406f0 --- /dev/null +++ b/tests/test_modal_terminal.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Test Modal Terminal Tool + +This script tests that the Modal terminal backend is correctly configured +and can execute commands in Modal sandboxes. 
+ +Usage: + # Run with Modal backend + TERMINAL_ENV=modal python tests/test_modal_terminal.py + + # Or run directly (will use whatever TERMINAL_ENV is set in .env) + python tests/test_modal_terminal.py +""" + +import os +import sys +import json +from pathlib import Path + +# Try to load .env file if python-dotenv is available +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + # Manually load .env if dotenv not available + env_file = Path(__file__).parent.parent / ".env" + if env_file.exists(): + with open(env_file) as f: + for line in f: + line = line.strip() + if line and not line.startswith('#') and '=' in line: + key, value = line.split('=', 1) + # Remove quotes if present + value = value.strip().strip('"').strip("'") + os.environ.setdefault(key.strip(), value) + +# Add parent directory to path for imports +parent_dir = Path(__file__).parent.parent +sys.path.insert(0, str(parent_dir)) +sys.path.insert(0, str(parent_dir / "mini-swe-agent" / "src")) + +# Import terminal_tool module directly using importlib to avoid tools/__init__.py +import importlib.util +terminal_tool_path = parent_dir / "tools" / "terminal_tool.py" +spec = importlib.util.spec_from_file_location("terminal_tool", terminal_tool_path) +terminal_module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(terminal_module) + +terminal_tool = terminal_module.terminal_tool +check_terminal_requirements = terminal_module.check_terminal_requirements +_get_env_config = terminal_module._get_env_config +cleanup_vm = terminal_module.cleanup_vm +get_active_environments_info = terminal_module.get_active_environments_info + + +def test_modal_requirements(): + """Test that Modal requirements are met.""" + print("\n" + "=" * 60) + print("TEST 1: Modal Requirements Check") + print("=" * 60) + + config = _get_env_config() + print(f"Current TERMINAL_ENV: {config['env_type']}") + print(f"Modal image: {config['modal_image']}") + + # Check for Modal authentication + modal_token = 
os.getenv("MODAL_TOKEN_ID") + modal_toml = Path.home() / ".modal.toml" + + print(f"\nModal authentication:") + print(f" MODAL_TOKEN_ID env var: {'โœ… Set' if modal_token else 'โŒ Not set'}") + print(f" ~/.modal.toml file: {'โœ… Exists' if modal_toml.exists() else 'โŒ Not found'}") + + if config['env_type'] != 'modal': + print(f"\nโš ๏ธ TERMINAL_ENV is '{config['env_type']}', not 'modal'") + print(" Set TERMINAL_ENV=modal in .env or export it to test Modal backend") + return False + + requirements_met = check_terminal_requirements() + print(f"\nRequirements check: {'โœ… Passed' if requirements_met else 'โŒ Failed'}") + + return requirements_met + + +def test_simple_command(): + """Test executing a simple command.""" + print("\n" + "=" * 60) + print("TEST 2: Simple Command Execution") + print("=" * 60) + + test_task_id = "modal_test_simple" + + print("Executing: echo 'Hello from Modal!'") + result = terminal_tool("echo 'Hello from Modal!'", task_id=test_task_id) + result_json = json.loads(result) + + print(f"\nResult:") + print(f" Output: {result_json.get('output', '')[:200]}") + print(f" Exit code: {result_json.get('exit_code')}") + print(f" Error: {result_json.get('error')}") + + success = result_json.get('exit_code') == 0 and 'Hello from Modal!' 
in result_json.get('output', '') + print(f"\nTest: {'โœ… Passed' if success else 'โŒ Failed'}") + + # Cleanup + cleanup_vm(test_task_id) + + return success + + +def test_python_execution(): + """Test executing Python code in Modal.""" + print("\n" + "=" * 60) + print("TEST 3: Python Execution") + print("=" * 60) + + test_task_id = "modal_test_python" + + python_cmd = 'python3 -c "import sys; print(f\'Python {sys.version}\')"' + print(f"Executing: {python_cmd}") + + result = terminal_tool(python_cmd, task_id=test_task_id) + result_json = json.loads(result) + + print(f"\nResult:") + print(f" Output: {result_json.get('output', '')[:200]}") + print(f" Exit code: {result_json.get('exit_code')}") + print(f" Error: {result_json.get('error')}") + + success = result_json.get('exit_code') == 0 and 'Python' in result_json.get('output', '') + print(f"\nTest: {'โœ… Passed' if success else 'โŒ Failed'}") + + # Cleanup + cleanup_vm(test_task_id) + + return success + + +def test_pip_install(): + """Test installing a package with pip in Modal.""" + print("\n" + "=" * 60) + print("TEST 4: Pip Install Test") + print("=" * 60) + + test_task_id = "modal_test_pip" + + # Install a small package and verify + print("Executing: pip install --break-system-packages cowsay && python3 -c \"import cowsay; cowsay.cow('Modal works!')\"") + + result = terminal_tool( + "pip install --break-system-packages cowsay && python3 -c \"import cowsay; cowsay.cow('Modal works!')\"", + task_id=test_task_id, + timeout=120 + ) + result_json = json.loads(result) + + print(f"\nResult:") + output = result_json.get('output', '') + print(f" Output (last 500 chars): ...{output[-500:] if len(output) > 500 else output}") + print(f" Exit code: {result_json.get('exit_code')}") + print(f" Error: {result_json.get('error')}") + + success = result_json.get('exit_code') == 0 and 'Modal works!' 
in result_json.get('output', '') + print(f"\nTest: {'โœ… Passed' if success else 'โŒ Failed'}") + + # Cleanup + cleanup_vm(test_task_id) + + return success + + +def test_filesystem_persistence(): + """Test that filesystem persists between commands in the same task.""" + print("\n" + "=" * 60) + print("TEST 5: Filesystem Persistence") + print("=" * 60) + + test_task_id = "modal_test_persist" + + # Create a file + print("Step 1: Creating test file...") + result1 = terminal_tool("echo 'persistence test' > /tmp/modal_test.txt", task_id=test_task_id) + result1_json = json.loads(result1) + print(f" Exit code: {result1_json.get('exit_code')}") + + # Read the file back + print("Step 2: Reading test file...") + result2 = terminal_tool("cat /tmp/modal_test.txt", task_id=test_task_id) + result2_json = json.loads(result2) + print(f" Output: {result2_json.get('output', '')}") + print(f" Exit code: {result2_json.get('exit_code')}") + + success = ( + result1_json.get('exit_code') == 0 and + result2_json.get('exit_code') == 0 and + 'persistence test' in result2_json.get('output', '') + ) + print(f"\nTest: {'โœ… Passed' if success else 'โŒ Failed'}") + + # Cleanup + cleanup_vm(test_task_id) + + return success + + +def test_environment_isolation(): + """Test that different task_ids get isolated environments.""" + print("\n" + "=" * 60) + print("TEST 6: Environment Isolation") + print("=" * 60) + + task1 = "modal_test_iso_1" + task2 = "modal_test_iso_2" + + # Create file in task1 + print("Step 1: Creating file in task1...") + result1 = terminal_tool("echo 'task1 data' > /tmp/isolated.txt", task_id=task1) + + # Try to read from task2 (should not exist) + print("Step 2: Trying to read file from task2 (should not exist)...") + result2 = terminal_tool("cat /tmp/isolated.txt 2>&1 || echo 'FILE_NOT_FOUND'", task_id=task2) + result2_json = json.loads(result2) + + # The file should either not exist or be empty in task2 + output = result2_json.get('output', '') + isolated = 'task1 data' not 
in output or 'FILE_NOT_FOUND' in output or 'No such file' in output + + print(f" Task2 output: {output[:200]}") + print(f"\nTest: {'โœ… Passed (environments isolated)' if isolated else 'โŒ Failed (environments NOT isolated)'}") + + # Cleanup + cleanup_vm(task1) + cleanup_vm(task2) + + return isolated + + +def main(): + """Run all Modal terminal tests.""" + print("๐Ÿงช Modal Terminal Tool Test Suite") + print("=" * 60) + + # Check current config + config = _get_env_config() + print(f"\nCurrent configuration:") + print(f" TERMINAL_ENV: {config['env_type']}") + print(f" TERMINAL_MODAL_IMAGE: {config['modal_image']}") + print(f" TERMINAL_TIMEOUT: {config['timeout']}s") + + if config['env_type'] != 'modal': + print(f"\nโš ๏ธ WARNING: TERMINAL_ENV is set to '{config['env_type']}', not 'modal'") + print(" To test Modal specifically, set TERMINAL_ENV=modal") + response = input("\n Continue testing with current backend? (y/n): ") + if response.lower() != 'y': + print("Aborting.") + return + + results = {} + + # Run tests + results['requirements'] = test_modal_requirements() + + if not results['requirements']: + print("\nโŒ Requirements not met. 
Cannot continue with other tests.") + return + + results['simple_command'] = test_simple_command() + results['python_execution'] = test_python_execution() + results['pip_install'] = test_pip_install() + results['filesystem_persistence'] = test_filesystem_persistence() + results['environment_isolation'] = test_environment_isolation() + + # Summary + print("\n" + "=" * 60) + print("TEST SUMMARY") + print("=" * 60) + + passed = sum(1 for v in results.values() if v) + total = len(results) + + for test_name, passed_test in results.items(): + status = "โœ… PASSED" if passed_test else "โŒ FAILED" + print(f" {test_name}: {status}") + + print(f"\nTotal: {passed}/{total} tests passed") + + # Show active environments + env_info = get_active_environments_info() + print(f"\nActive environments after tests: {env_info['count']}") + if env_info['count'] > 0: + print(f" Task IDs: {env_info['task_ids']}") + + return passed == total + + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/tools/__init__.py b/tools/__init__.py index b0e2f03c2..a7915081b 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -24,11 +24,13 @@ from .web_tools import ( check_firecrawl_api_key ) -# Primary terminal tool (mini-swe-agent backend: local/docker/modal) +# Primary terminal tool (mini-swe-agent backend: local/docker/singularity/modal) from .terminal_tool import ( terminal_tool, check_terminal_requirements, cleanup_vm, + cleanup_all_environments, + get_active_environments_info, TERMINAL_TOOL_DESCRIPTION ) @@ -54,6 +56,25 @@ from .image_generation_tool import ( check_image_generation_requirements ) +# Browser automation tools (agent-browser + Browserbase) +from .browser_tool import ( + browser_navigate, + browser_snapshot, + browser_click, + browser_type, + browser_scroll, + browser_back, + browser_press, + browser_close, + browser_get_images, + browser_vision, + cleanup_browser, + cleanup_all_browsers, + get_active_browser_sessions, + 
check_browser_requirements, + BROWSER_TOOL_SCHEMAS +) + __all__ = [ # Web tools 'web_search_tool', @@ -64,6 +85,8 @@ __all__ = [ 'terminal_tool', 'check_terminal_requirements', 'cleanup_vm', + 'cleanup_all_environments', + 'get_active_environments_info', 'TERMINAL_TOOL_DESCRIPTION', # Terminal tools (Hecate/MorphCloud backend) 'terminal_hecate_tool', @@ -78,5 +101,21 @@ __all__ = [ # Image generation tools 'image_generate_tool', 'check_image_generation_requirements', + # Browser automation tools + 'browser_navigate', + 'browser_snapshot', + 'browser_click', + 'browser_type', + 'browser_scroll', + 'browser_back', + 'browser_press', + 'browser_close', + 'browser_get_images', + 'browser_vision', + 'cleanup_browser', + 'cleanup_all_browsers', + 'get_active_browser_sessions', + 'check_browser_requirements', + 'BROWSER_TOOL_SCHEMAS', ] diff --git a/tools/browser_tool.py b/tools/browser_tool.py new file mode 100644 index 000000000..917c32560 --- /dev/null +++ b/tools/browser_tool.py @@ -0,0 +1,1454 @@ +#!/usr/bin/env python3 +""" +Browser Tool Module + +This module provides browser automation tools using agent-browser CLI with +Browserbase cloud execution. It enables AI agents to navigate websites, +interact with page elements, and extract information in a text-based format. + +The tool uses agent-browser's accessibility tree (ariaSnapshot) for text-based +page representation, making it ideal for LLM agents without vision capabilities. + +Features: +- Cloud browser execution via Browserbase (no local browser needed) +- Basic Stealth Mode always active (random fingerprints, CAPTCHA solving) +- Proxies enabled by default for better CAPTCHA solving and anti-bot avoidance +- Session isolation per task ID +- Text-based page snapshots using accessibility tree +- Element interaction via ref selectors (@e1, @e2, etc.) 
+- Task-aware content extraction using LLM summarization +- Automatic cleanup of browser sessions + +Environment Variables: +- BROWSERBASE_API_KEY: API key for Browserbase (required) +- BROWSERBASE_PROJECT_ID: Project ID for Browserbase (required) +- BROWSERBASE_PROXIES: Enable/disable residential proxies (default: "true") +- BROWSERBASE_ADVANCED_STEALTH: Enable advanced stealth mode with custom Chromium, + requires Scale Plan (default: "false") +- BROWSERBASE_KEEP_ALIVE: Enable keepAlive for session reconnection after disconnects, + requires paid plan (default: "true") +- BROWSERBASE_SESSION_TIMEOUT: Custom session timeout in milliseconds. Set to extend + beyond project default. Common values: 600000 (10min), 1800000 (30min) (default: none) + +Usage: + from tools.browser_tool import browser_navigate, browser_snapshot, browser_click + + # Navigate to a page + result = browser_navigate("https://example.com", task_id="task_123") + + # Get page snapshot + snapshot = browser_snapshot(task_id="task_123") + + # Click an element + browser_click("@e5", task_id="task_123") +""" + +import atexit +import json +import os +import signal +import subprocess +import shutil +import sys +import asyncio +import requests +from typing import Dict, Any, Optional, List +from pathlib import Path + +# Try to import httpx for async LLM calls +try: + import httpx + HTTPX_AVAILABLE = True +except ImportError: + HTTPX_AVAILABLE = False + + +# ============================================================================ +# Configuration +# ============================================================================ + +# Default timeout for browser commands (seconds) +DEFAULT_COMMAND_TIMEOUT = 30 + +# Default session timeout (seconds) +DEFAULT_SESSION_TIMEOUT = 300 + +# Max tokens for snapshot content before summarization +SNAPSHOT_SUMMARIZE_THRESHOLD = 8000 + +# Model for task-aware extraction +EXTRACTION_MODEL = "google/gemini-3-flash-preview" + +# Track active sessions per task +# Now stores 
tuple of (session_name, browserbase_session_id, cdp_url) +_active_sessions: Dict[str, Dict[str, str]] = {} # task_id -> {session_name, bb_session_id, cdp_url} + +# Flag to track if cleanup has been done +_cleanup_done = False + + +def _emergency_cleanup_all_sessions(): + """ + Emergency cleanup of all active browser sessions. + Called on process exit or interrupt to prevent orphaned sessions. + """ + global _cleanup_done + if _cleanup_done: + return + _cleanup_done = True + + if not _active_sessions: + return + + print(f"\n[browser_tool] Emergency cleanup: closing {len(_active_sessions)} active session(s)...", file=sys.stderr) + + try: + api_key = os.environ.get("BROWSERBASE_API_KEY") + project_id = os.environ.get("BROWSERBASE_PROJECT_ID") + + if not api_key or not project_id: + print("[browser_tool] WARNING: Cannot cleanup - missing BROWSERBASE credentials", file=sys.stderr) + return + + for task_id, session_info in list(_active_sessions.items()): + bb_session_id = session_info.get("bb_session_id") + if bb_session_id: + try: + response = requests.post( + f"https://api.browserbase.com/v1/sessions/{bb_session_id}", + headers={ + "X-BB-API-Key": api_key, + "Content-Type": "application/json" + }, + json={ + "projectId": project_id, + "status": "REQUEST_RELEASE" + }, + timeout=5 # Short timeout for cleanup + ) + if response.status_code in (200, 201, 204): + print(f"[browser_tool] Closed session {bb_session_id}", file=sys.stderr) + else: + print(f"[browser_tool] Failed to close session {bb_session_id}: HTTP {response.status_code}", file=sys.stderr) + except Exception as e: + print(f"[browser_tool] Error closing session {bb_session_id}: {e}", file=sys.stderr) + + _active_sessions.clear() + except Exception as e: + print(f"[browser_tool] Emergency cleanup error: {e}", file=sys.stderr) + + +def _signal_handler(signum, frame): + """Handle interrupt signals to cleanup sessions before exit.""" + print(f"\n[browser_tool] Received signal {signum}, cleaning up...", 
file=sys.stderr) + _emergency_cleanup_all_sessions() + sys.exit(128 + signum) + + +# Register cleanup handlers +atexit.register(_emergency_cleanup_all_sessions) + +# Only register signal handlers in main process (not in multiprocessing workers) +try: + if os.getpid() == os.getpgrp(): # Main process check + signal.signal(signal.SIGINT, _signal_handler) + signal.signal(signal.SIGTERM, _signal_handler) +except (OSError, AttributeError): + pass # Signal handling not available (e.g., Windows or worker process) + + +# ============================================================================ +# Tool Schemas +# ============================================================================ + +BROWSER_TOOL_SCHEMAS = [ + { + "name": "browser_navigate", + "description": "Navigate to a URL in the browser. Opens the page and waits for it to load. Returns the final URL and page title. IMPORTANT: This should be the FIRST browser tool called - it initializes the browser session and loads the target page. Other browser tools require a page to be loaded first. NOTE: For simple information retrieval, prefer using web_search or web_extract first as they are faster and more cost-effective. Use browser tools when you need to interact with a page (click buttons, fill forms, handle dynamic content).", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "The URL to navigate to (e.g., 'https://example.com')" + } + }, + "required": ["url"] + } + }, + { + "name": "browser_snapshot", + "description": "Get a text-based snapshot of the current page's accessibility tree. Returns interactive elements with ref IDs (like @e1, @e2) that can be used with browser_click and browser_type. Use full=true to get the complete page content including all text; use full=false (default) for a compact view focused on interactive elements. 
Requires browser_navigate to be called first.", + "parameters": { + "type": "object", + "properties": { + "full": { + "type": "boolean", + "description": "If true, returns complete page content. If false (default), returns compact view with interactive elements only.", + "default": False + } + }, + "required": [] + } + }, + { + "name": "browser_click", + "description": "Click on an element identified by its ref ID from the snapshot (e.g., '@e5'). The ref IDs are shown in square brackets in the snapshot output. Requires browser_navigate and browser_snapshot to be called first.", + "parameters": { + "type": "object", + "properties": { + "ref": { + "type": "string", + "description": "The element reference from the snapshot (e.g., '@e5', '@e12')" + } + }, + "required": ["ref"] + } + }, + { + "name": "browser_type", + "description": "Type text into an input field identified by its ref ID. Clears the field first, then types the new text. Requires browser_navigate and browser_snapshot to be called first.", + "parameters": { + "type": "object", + "properties": { + "ref": { + "type": "string", + "description": "The element reference from the snapshot (e.g., '@e3')" + }, + "text": { + "type": "string", + "description": "The text to type into the field" + } + }, + "required": ["ref", "text"] + } + }, + { + "name": "browser_scroll", + "description": "Scroll the page in a direction. Use this to reveal more content that may be below or above the current viewport. Requires browser_navigate to be called first.", + "parameters": { + "type": "object", + "properties": { + "direction": { + "type": "string", + "enum": ["up", "down"], + "description": "Direction to scroll" + } + }, + "required": ["direction"] + } + }, + { + "name": "browser_back", + "description": "Navigate back to the previous page in browser history. 
Requires browser_navigate to be called first.", + "parameters": { + "type": "object", + "properties": {}, + "required": [] + } + }, + { + "name": "browser_press", + "description": "Press a keyboard key. Useful for submitting forms (Enter), navigating (Tab), or keyboard shortcuts. Requires browser_navigate to be called first.", + "parameters": { + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "Key to press (e.g., 'Enter', 'Tab', 'Escape', 'ArrowDown')" + } + }, + "required": ["key"] + } + }, + { + "name": "browser_close", + "description": "Close the browser session and release resources. Call this when done with browser tasks to free up Browserbase session quota.", + "parameters": { + "type": "object", + "properties": {}, + "required": [] + } + }, + { + "name": "browser_get_images", + "description": "Get a list of all images on the current page with their URLs and alt text. Useful for finding images to analyze with the vision tool. Requires browser_navigate to be called first.", + "parameters": { + "type": "object", + "properties": {}, + "required": [] + } + }, + { + "name": "browser_vision", + "description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Requires browser_navigate to be called first.", + "parameters": { + "type": "object", + "properties": { + "question": { + "type": "string", + "description": "What you want to know about the page visually. Be specific about what you're looking for." 
+ } + }, + "required": ["question"] + } + }, +] + + +# ============================================================================ +# Utility Functions +# ============================================================================ + +def _create_browserbase_session(task_id: str) -> Dict[str, str]: + """ + Create a Browserbase session with stealth features. + + Browserbase Stealth Modes: + - Basic Stealth: ALWAYS enabled automatically. Generates random fingerprints, + viewports, and solves visual CAPTCHAs. No configuration needed. + - Advanced Stealth: Uses custom Chromium build for better bot detection avoidance. + Requires Scale Plan. Enable via BROWSERBASE_ADVANCED_STEALTH=true. + + Proxies are enabled by default to route traffic through residential IPs, + which significantly improves CAPTCHA solving rates. Can be disabled via + BROWSERBASE_PROXIES=false if needed. + + Args: + task_id: Unique identifier for the task + + Returns: + Dict with session_name, bb_session_id, cdp_url, and feature flags + """ + import uuid + import sys + + config = _get_browserbase_config() + + # Check for optional settings from environment + # Proxies: enabled by default for better CAPTCHA solving + enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false" + # Advanced Stealth: requires Scale Plan, disabled by default + enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" + # keepAlive: enabled by default (requires paid plan) - allows reconnection after disconnects + enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" + # Custom session timeout in milliseconds (optional) - extends session beyond project default + custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT") + + # Track which features are actually enabled for logging/debugging + features_enabled = { + "basic_stealth": True, # Always on + "proxies": False, + "advanced_stealth": False, + "keep_alive": False, + 
"custom_timeout": False, + } + + # Build session configuration + # Note: Basic stealth mode is ALWAYS active - no configuration needed + session_config = { + "projectId": config["project_id"], + } + + # Enable keepAlive for session reconnection (default: true, requires paid plan) + # Allows reconnecting to the same session after network hiccups + if enable_keep_alive: + session_config["keepAlive"] = True + + # Add custom timeout if specified (in milliseconds) + # This extends session duration beyond project's default timeout + if custom_timeout_ms: + try: + timeout_val = int(custom_timeout_ms) + if timeout_val > 0: + session_config["timeout"] = timeout_val + except ValueError: + print(f"[browser_tool] WARNING: Invalid BROWSERBASE_SESSION_TIMEOUT value: {custom_timeout_ms}", + file=sys.stderr) + + # Enable proxies for better CAPTCHA solving (default: true) + # Routes traffic through residential IPs for more reliable access + if enable_proxies: + session_config["proxies"] = True + + # Add advanced stealth if enabled (requires Scale Plan) + # Uses custom Chromium build to avoid bot detection altogether + if enable_advanced_stealth: + session_config["browserSettings"] = { + "advancedStealth": True, + } + + # Create session via Browserbase API + response = requests.post( + "https://api.browserbase.com/v1/sessions", + headers={ + "Content-Type": "application/json", + "X-BB-API-Key": config["api_key"], + }, + json=session_config, + timeout=30 + ) + + # Track if we fell back from paid features + proxies_fallback = False + keepalive_fallback = False + + # Handle 402 Payment Required - likely paid features not available + # Try to identify which feature caused the issue and retry without it + if response.status_code == 402: + # First try without keepAlive (most likely culprit for paid plan requirement) + if enable_keep_alive: + keepalive_fallback = True + print(f"[browser_tool] WARNING: keepAlive may require paid plan (402), retrying without it. 
" + f"Sessions may timeout during long operations.", file=sys.stderr) + session_config.pop("keepAlive", None) + response = requests.post( + "https://api.browserbase.com/v1/sessions", + headers={ + "Content-Type": "application/json", + "X-BB-API-Key": config["api_key"], + }, + json=session_config, + timeout=30 + ) + + # If still 402, try without proxies too + if response.status_code == 402 and enable_proxies: + proxies_fallback = True + print(f"[browser_tool] WARNING: Proxies unavailable (402), retrying without proxies. " + f"Bot detection may be less effective.", file=sys.stderr) + session_config.pop("proxies", None) + response = requests.post( + "https://api.browserbase.com/v1/sessions", + headers={ + "Content-Type": "application/json", + "X-BB-API-Key": config["api_key"], + }, + json=session_config, + timeout=30 + ) + + if not response.ok: + raise RuntimeError(f"Failed to create Browserbase session: {response.status_code} {response.text}") + + session_data = response.json() + session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" + + # Update features based on what actually succeeded + if enable_proxies and not proxies_fallback: + features_enabled["proxies"] = True + if enable_advanced_stealth: + features_enabled["advanced_stealth"] = True + if enable_keep_alive and not keepalive_fallback: + features_enabled["keep_alive"] = True + if custom_timeout_ms and "timeout" in session_config: + features_enabled["custom_timeout"] = True + + # Log session info for debugging + feature_str = ", ".join(k for k, v in features_enabled.items() if v) + print(f"[browser_tool] Created session {session_name} with features: {feature_str}", file=sys.stderr) + + return { + "session_name": session_name, + "bb_session_id": session_data["id"], + "cdp_url": session_data["connectUrl"], + "features": features_enabled, + } + + +def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]: + """ + Get or create session info for the given task. 
+ + Creates a Browserbase session with proxies enabled if one doesn't exist. + + Args: + task_id: Unique identifier for the task + + Returns: + Dict with session_name, bb_session_id, and cdp_url + """ + if task_id is None: + task_id = "default" + + # Check if we already have a session for this task + if task_id in _active_sessions: + return _active_sessions[task_id] + + # Create a new Browserbase session with proxies + session_info = _create_browserbase_session(task_id) + _active_sessions[task_id] = session_info + + return session_info + + +def _get_session_name(task_id: Optional[str] = None) -> str: + """ + Get the session name for agent-browser CLI. + + Args: + task_id: Unique identifier for the task + + Returns: + Session name for agent-browser + """ + session_info = _get_session_info(task_id) + return session_info["session_name"] + + +def _get_browserbase_config() -> Dict[str, str]: + """ + Get Browserbase configuration from environment. + + Returns: + Dict with api_key and project_id + + Raises: + ValueError: If required env vars are not set + """ + api_key = os.environ.get("BROWSERBASE_API_KEY") + project_id = os.environ.get("BROWSERBASE_PROJECT_ID") + + if not api_key or not project_id: + raise ValueError( + "BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment variables are required. " + "Get your credentials at https://browserbase.com" + ) + + return { + "api_key": api_key, + "project_id": project_id + } + + +def _find_agent_browser() -> str: + """ + Find the agent-browser CLI executable. + + Returns: + Path to agent-browser executable + + Raises: + FileNotFoundError: If agent-browser is not installed + """ + # Check if it's in PATH + which_result = shutil.which("agent-browser") + if which_result: + return which_result + + # Check common npx locations + npx_path = shutil.which("npx") + if npx_path: + return "npx agent-browser" + + raise FileNotFoundError( + "agent-browser CLI not found. 
Install it with: npm install -g agent-browser\n" + "Or ensure npx is available in your PATH." + ) + + +def _run_browser_command( + task_id: str, + command: str, + args: List[str] = None, + timeout: int = DEFAULT_COMMAND_TIMEOUT +) -> Dict[str, Any]: + """ + Run an agent-browser CLI command using our pre-created Browserbase session. + + Args: + task_id: Task identifier to get the right session + command: The command to run (e.g., "open", "click") + args: Additional arguments for the command + timeout: Command timeout in seconds + + Returns: + Parsed JSON response from agent-browser + """ + args = args or [] + + # Build the command + try: + browser_cmd = _find_agent_browser() + except FileNotFoundError as e: + return {"success": False, "error": str(e)} + + # Get session info (creates Browserbase session with proxies if needed) + try: + session_info = _get_session_info(task_id) + except Exception as e: + return {"success": False, "error": f"Failed to create browser session: {str(e)}"} + + # Connect via CDP to our pre-created Browserbase session (with proxies) + # Use --cdp flag to connect to existing session instead of creating new one + cmd_parts = browser_cmd.split() + [ + "--session", session_info["session_name"], + "--cdp", session_info["cdp_url"], # Connect to our proxied session + "--json", # Always request JSON output + command + ] + args + + try: + result = subprocess.run( + cmd_parts, + capture_output=True, + text=True, + timeout=timeout, + env={**os.environ} + ) + + # Parse JSON output + if result.stdout.strip(): + try: + return json.loads(result.stdout.strip()) + except json.JSONDecodeError: + # If not valid JSON, return as raw output + return { + "success": True, + "data": {"raw": result.stdout.strip()} + } + + # Check for errors + if result.returncode != 0: + error_msg = result.stderr.strip() if result.stderr else f"Command failed with code {result.returncode}" + return {"success": False, "error": error_msg} + + return {"success": True, "data": {}} + + 
except subprocess.TimeoutExpired: + return {"success": False, "error": f"Command timed out after {timeout} seconds"} + except Exception as e: + return {"success": False, "error": str(e)} + + +async def _extract_relevant_content( + snapshot_text: str, + user_task: Optional[str] = None +) -> str: + """ + Use LLM to extract relevant content from a snapshot based on the user's task. + + This provides task-aware summarization that preserves meaningful text content + (paragraphs, prices, descriptions) relevant to what the user is trying to accomplish. + + Args: + snapshot_text: The full snapshot text + user_task: The user's current task/goal (optional) + + Returns: + Summarized/extracted content + """ + if not HTTPX_AVAILABLE: + # Fall back to simple truncation + return _truncate_snapshot(snapshot_text) + + # Get API key + api_key = os.environ.get("OPENROUTER_API_KEY") + if not api_key: + return _truncate_snapshot(snapshot_text) + + # Build extraction prompt + if user_task: + extraction_prompt = f"""You are a content extractor for a browser automation agent. + +The user's task is: {user_task} + +Given the following page snapshot (accessibility tree representation), extract and summarize the most relevant information for completing this task. Focus on: +1. Interactive elements (buttons, links, inputs) that might be needed +2. Text content relevant to the task (prices, descriptions, headings, important info) +3. Navigation structure if relevant + +Keep ref IDs (like [ref=e5]) for interactive elements so the agent can use them. + +Page Snapshot: +{snapshot_text} + +Provide a concise summary that preserves actionable information and relevant content.""" + else: + extraction_prompt = f"""Summarize this page snapshot, preserving: +1. All interactive elements with their ref IDs (like [ref=e5]) +2. Key text content and headings +3. 
Important information visible on the page + +Page Snapshot: +{snapshot_text} + +Provide a concise summary focused on interactive elements and key content.""" + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + }, + json={ + "model": EXTRACTION_MODEL, + "messages": [ + {"role": "user", "content": extraction_prompt} + ], + "max_tokens": 4000, + "temperature": 0.1 + } + ) + + if response.status_code == 200: + result = response.json() + return result["choices"][0]["message"]["content"] + else: + # Fall back to truncation on API error + return _truncate_snapshot(snapshot_text) + + except Exception: + # Fall back to truncation on any error + return _truncate_snapshot(snapshot_text) + + +def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: + """ + Simple truncation fallback for snapshots. + + Args: + snapshot_text: The snapshot text to truncate + max_chars: Maximum characters to keep + + Returns: + Truncated text with indicator if truncated + """ + if len(snapshot_text) <= max_chars: + return snapshot_text + + return snapshot_text[:max_chars] + "\n\n[... content truncated ...]" + + +# ============================================================================ +# Browser Tool Functions +# ============================================================================ + +def browser_navigate(url: str, task_id: Optional[str] = None) -> str: + """ + Navigate to a URL in the browser. 
+ + Args: + url: The URL to navigate to + task_id: Task identifier for session isolation + + Returns: + JSON string with navigation result (includes stealth features info on first nav) + """ + effective_task_id = task_id or "default" + + # Get session info to check if this is a new session + # (will create one with features logged if not exists) + session_info = _get_session_info(effective_task_id) + is_first_nav = session_info.get("_first_nav", True) + + # Mark that we've done at least one navigation + if is_first_nav: + session_info["_first_nav"] = False + + result = _run_browser_command(effective_task_id, "open", [url], timeout=60) + + if result.get("success"): + data = result.get("data", {}) + title = data.get("title", "") + final_url = data.get("url", url) + + response = { + "success": True, + "url": final_url, + "title": title + } + + # Detect common "blocked" page patterns from title/url + blocked_patterns = [ + "access denied", "access to this page has been denied", + "blocked", "bot detected", "verification required", + "please verify", "are you a robot", "captcha", + "cloudflare", "ddos protection", "checking your browser", + "just a moment", "attention required" + ] + title_lower = title.lower() + + if any(pattern in title_lower for pattern in blocked_patterns): + response["bot_detection_warning"] = ( + f"Page title '{title}' suggests bot detection. The site may have blocked this request. " + "Options: 1) Try adding delays between actions, 2) Access different pages first, " + "3) Enable advanced stealth (BROWSERBASE_ADVANCED_STEALTH=true, requires Scale plan), " + "4) Some sites have very aggressive bot detection that may be unavoidable." 
+ ) + + # Include feature info on first navigation so model knows what's active + if is_first_nav and "features" in session_info: + features = session_info["features"] + active_features = [k for k, v in features.items() if v] + if not features.get("proxies"): + response["stealth_warning"] = ( + "Running WITHOUT residential proxies. Bot detection may be more aggressive. " + "Consider upgrading Browserbase plan for proxy support." + ) + response["stealth_features"] = active_features + + return json.dumps(response, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", "Navigation failed") + }, ensure_ascii=False) + + +def browser_snapshot( + full: bool = False, + task_id: Optional[str] = None, + user_task: Optional[str] = None +) -> str: + """ + Get a text-based snapshot of the current page's accessibility tree. + + Args: + full: If True, return complete snapshot. If False, return compact view. + task_id: Task identifier for session isolation + user_task: The user's current task (for task-aware extraction) + + Returns: + JSON string with page snapshot + """ + effective_task_id = task_id or "default" + + # Build command args based on full flag + args = [] + if not full: + args.extend(["-c"]) # Compact mode + + result = _run_browser_command(effective_task_id, "snapshot", args) + + if result.get("success"): + data = result.get("data", {}) + snapshot_text = data.get("snapshot", "") + refs = data.get("refs", {}) + + # Check if snapshot needs summarization + if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD and user_task: + # Run async extraction + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + snapshot_text = loop.run_until_complete( + _extract_relevant_content(snapshot_text, user_task) + ) + elif len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD: + snapshot_text = _truncate_snapshot(snapshot_text) + + response = { + "success": True, + 
"snapshot": snapshot_text, + "element_count": len(refs) if refs else 0 + } + + return json.dumps(response, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", "Failed to get snapshot") + }, ensure_ascii=False) + + +def browser_click(ref: str, task_id: Optional[str] = None) -> str: + """ + Click on an element. + + Args: + ref: Element reference (e.g., "@e5") + task_id: Task identifier for session isolation + + Returns: + JSON string with click result + """ + effective_task_id = task_id or "default" + + # Ensure ref starts with @ + if not ref.startswith("@"): + ref = f"@{ref}" + + result = _run_browser_command(effective_task_id, "click", [ref]) + + if result.get("success"): + return json.dumps({ + "success": True, + "clicked": ref + }, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", f"Failed to click {ref}") + }, ensure_ascii=False) + + +def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: + """ + Type text into an input field. + + Args: + ref: Element reference (e.g., "@e3") + text: Text to type + task_id: Task identifier for session isolation + + Returns: + JSON string with type result + """ + effective_task_id = task_id or "default" + + # Ensure ref starts with @ + if not ref.startswith("@"): + ref = f"@{ref}" + + # Use fill command (clears then types) + result = _run_browser_command(effective_task_id, "fill", [ref, text]) + + if result.get("success"): + return json.dumps({ + "success": True, + "typed": text, + "element": ref + }, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", f"Failed to type into {ref}") + }, ensure_ascii=False) + + +def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: + """ + Scroll the page. 
+ + Args: + direction: "up" or "down" + task_id: Task identifier for session isolation + + Returns: + JSON string with scroll result + """ + effective_task_id = task_id or "default" + + # Validate direction + if direction not in ["up", "down"]: + return json.dumps({ + "success": False, + "error": f"Invalid direction '{direction}'. Use 'up' or 'down'." + }, ensure_ascii=False) + + result = _run_browser_command(effective_task_id, "scroll", [direction]) + + if result.get("success"): + return json.dumps({ + "success": True, + "scrolled": direction + }, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", f"Failed to scroll {direction}") + }, ensure_ascii=False) + + +def browser_back(task_id: Optional[str] = None) -> str: + """ + Navigate back in browser history. + + Args: + task_id: Task identifier for session isolation + + Returns: + JSON string with navigation result + """ + effective_task_id = task_id or "default" + result = _run_browser_command(effective_task_id, "back", []) + + if result.get("success"): + data = result.get("data", {}) + return json.dumps({ + "success": True, + "url": data.get("url", "") + }, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", "Failed to go back") + }, ensure_ascii=False) + + +def browser_press(key: str, task_id: Optional[str] = None) -> str: + """ + Press a keyboard key. 
+ + Args: + key: Key to press (e.g., "Enter", "Tab") + task_id: Task identifier for session isolation + + Returns: + JSON string with key press result + """ + effective_task_id = task_id or "default" + result = _run_browser_command(effective_task_id, "press", [key]) + + if result.get("success"): + return json.dumps({ + "success": True, + "pressed": key + }, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", f"Failed to press {key}") + }, ensure_ascii=False) + + +def browser_close(task_id: Optional[str] = None) -> str: + """ + Close the browser session. + + Args: + task_id: Task identifier for session isolation + + Returns: + JSON string with close result + """ + effective_task_id = task_id or "default" + result = _run_browser_command(effective_task_id, "close", []) + + # Close the BrowserBase session via API + session_key = task_id if task_id and task_id in _active_sessions else "default" + if session_key in _active_sessions: + session_info = _active_sessions[session_key] + bb_session_id = session_info.get("bb_session_id") + if bb_session_id: + try: + config = _get_browserbase_config() + _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"]) + except Exception as e: + print(f"[browser_tool] Warning: Could not close BrowserBase session: {e}", file=sys.stderr) + del _active_sessions[session_key] + + if result.get("success"): + return json.dumps({ + "success": True, + "closed": True + }, ensure_ascii=False) + else: + # Even if close fails, session was released + return json.dumps({ + "success": True, + "closed": True, + "warning": result.get("error", "Session may not have been active") + }, ensure_ascii=False) + + +def browser_get_images(task_id: Optional[str] = None) -> str: + """ + Get all images on the current page. 
+ + Args: + task_id: Task identifier for session isolation + + Returns: + JSON string with list of images (src and alt) + """ + effective_task_id = task_id or "default" + + # Use eval to run JavaScript that extracts images + js_code = """JSON.stringify( + [...document.images].map(img => ({ + src: img.src, + alt: img.alt || '', + width: img.naturalWidth, + height: img.naturalHeight + })).filter(img => img.src && !img.src.startsWith('data:')) + )""" + + result = _run_browser_command(effective_task_id, "eval", [js_code]) + + if result.get("success"): + data = result.get("data", {}) + raw_result = data.get("result", "[]") + + try: + # Parse the JSON string returned by JavaScript + if isinstance(raw_result, str): + images = json.loads(raw_result) + else: + images = raw_result + + return json.dumps({ + "success": True, + "images": images, + "count": len(images) + }, ensure_ascii=False) + except json.JSONDecodeError: + return json.dumps({ + "success": True, + "images": [], + "count": 0, + "warning": "Could not parse image data" + }, ensure_ascii=False) + else: + return json.dumps({ + "success": False, + "error": result.get("error", "Failed to get images") + }, ensure_ascii=False) + + +def browser_vision(question: str, task_id: Optional[str] = None) -> str: + """ + Take a screenshot of the current page and analyze it with vision AI. + + This tool captures what's visually displayed in the browser and sends it + to Gemini for analysis. Useful for understanding visual content that the + text-based snapshot may not capture (CAPTCHAs, verification challenges, + images, complex layouts, etc.). 
+ + Args: + question: What you want to know about the page visually + task_id: Task identifier for session isolation + + Returns: + JSON string with vision analysis results + """ + import base64 + import tempfile + import uuid as uuid_mod + from pathlib import Path + + effective_task_id = task_id or "default" + + # Check for OpenRouter API key + api_key = os.environ.get("OPENROUTER_API_KEY") + if not api_key: + return json.dumps({ + "success": False, + "error": "OPENROUTER_API_KEY not set. Vision analysis requires this API key." + }, ensure_ascii=False) + + # Create a temporary file for the screenshot + temp_dir = Path(tempfile.gettempdir()) + screenshot_path = temp_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" + + try: + # Take screenshot using agent-browser + result = _run_browser_command( + effective_task_id, + "screenshot", + [str(screenshot_path)], + timeout=30 + ) + + if not result.get("success"): + return json.dumps({ + "success": False, + "error": f"Failed to take screenshot: {result.get('error', 'Unknown error')}" + }, ensure_ascii=False) + + # Check if screenshot file was created + if not screenshot_path.exists(): + return json.dumps({ + "success": False, + "error": "Screenshot file was not created" + }, ensure_ascii=False) + + # Read and convert to base64 + image_data = screenshot_path.read_bytes() + image_base64 = base64.b64encode(image_data).decode("ascii") + data_url = f"data:image/png;base64,{image_base64}" + + # Prepare the vision prompt + vision_prompt = f"""You are analyzing a screenshot of a web browser. + +User's question: {question} + +Provide a detailed and helpful answer based on what you see in the screenshot. +If there are interactive elements, describe them. If there are verification challenges +or CAPTCHAs, describe what type they are and what action might be needed. 
+Focus on answering the user's specific question.""" + + # Call OpenRouter/Gemini for vision analysis + if HTTPX_AVAILABLE: + import asyncio + + async def analyze_screenshot(): + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + }, + json={ + "model": "google/gemini-3-flash-preview", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": vision_prompt}, + { + "type": "image_url", + "image_url": {"url": data_url} + } + ] + } + ], + "max_tokens": 2000, + "temperature": 0.1 + } + ) + + if response.status_code != 200: + return { + "success": False, + "error": f"Vision API error: {response.status_code} - {response.text[:200]}" + } + + result_data = response.json() + analysis = result_data["choices"][0]["message"]["content"] + return { + "success": True, + "analysis": analysis + } + + # Run the async function + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + vision_result = loop.run_until_complete(analyze_screenshot()) + return json.dumps(vision_result, ensure_ascii=False) + + else: + # Fallback: use synchronous requests + response = requests.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + }, + json={ + "model": "google/gemini-3-flash-preview", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": vision_prompt}, + { + "type": "image_url", + "image_url": {"url": data_url} + } + ] + } + ], + "max_tokens": 2000, + "temperature": 0.1 + }, + timeout=60 + ) + + if response.status_code != 200: + return json.dumps({ + "success": False, + "error": f"Vision API error: {response.status_code} - {response.text[:200]}" + }, ensure_ascii=False) + + result_data = response.json() + 
analysis = result_data["choices"][0]["message"]["content"] + return json.dumps({ + "success": True, + "analysis": analysis + }, ensure_ascii=False) + + except Exception as e: + return json.dumps({ + "success": False, + "error": f"Error during vision analysis: {str(e)}" + }, ensure_ascii=False) + + finally: + # Clean up screenshot file + if screenshot_path.exists(): + try: + screenshot_path.unlink() + except Exception: + pass + + +# ============================================================================ +# Cleanup and Management Functions +# ============================================================================ + +def _close_browserbase_session(session_id: str, api_key: str, project_id: str) -> bool: + """ + Close a Browserbase session immediately via the API. + + Uses POST /v1/sessions/{id} with status=REQUEST_RELEASE to immediately + terminate the session without waiting for keepAlive timeout. + + Args: + session_id: The Browserbase session ID + api_key: Browserbase API key + project_id: Browserbase project ID + + Returns: + True if session was successfully closed, False otherwise + """ + try: + # POST to update session status to REQUEST_RELEASE + response = requests.post( + f"https://api.browserbase.com/v1/sessions/{session_id}", + headers={ + "X-BB-API-Key": api_key, + "Content-Type": "application/json" + }, + json={ + "projectId": project_id, + "status": "REQUEST_RELEASE" + }, + timeout=10 + ) + + if response.status_code in (200, 201, 204): + print(f"[browser_tool] Successfully closed BrowserBase session {session_id}", file=sys.stderr) + return True + else: + print(f"[browser_tool] Failed to close session {session_id}: HTTP {response.status_code} - {response.text[:200]}", file=sys.stderr) + return False + + except Exception as e: + print(f"[browser_tool] Exception closing session {session_id}: {e}", file=sys.stderr) + return False + + +def cleanup_browser(task_id: Optional[str] = None) -> None: + """ + Clean up browser session for a task. 
+ + Called automatically when a task completes. + Closes both the agent-browser session and the Browserbase session. + + Args: + task_id: Task identifier to clean up + """ + if task_id is None: + task_id = "default" + + print(f"[browser_tool] cleanup_browser called for task_id: {task_id}", file=sys.stderr) + print(f"[browser_tool] Active sessions: {list(_active_sessions.keys())}", file=sys.stderr) + + if task_id in _active_sessions: + session_info = _active_sessions[task_id] + bb_session_id = session_info.get("bb_session_id", "unknown") + print(f"[browser_tool] Found session for task {task_id}: bb_session_id={bb_session_id}", file=sys.stderr) + + # Try to close via agent-browser first + try: + _run_browser_command(task_id, "close", [], timeout=10) + print(f"[browser_tool] agent-browser close command completed for task {task_id}", file=sys.stderr) + except Exception as e: + print(f"[browser_tool] agent-browser close failed for task {task_id}: {e}", file=sys.stderr) + + # Close the Browserbase session immediately via API + try: + config = _get_browserbase_config() + success = _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"]) + if not success: + print(f"[browser_tool] WARNING: Could not close BrowserBase session {bb_session_id}", file=sys.stderr) + except Exception as e: + print(f"[browser_tool] Exception during BrowserBase session close: {e}", file=sys.stderr) + + del _active_sessions[task_id] + print(f"[browser_tool] Removed task {task_id} from active sessions", file=sys.stderr) + else: + print(f"[browser_tool] No active session found for task_id: {task_id}", file=sys.stderr) + + +def cleanup_all_browsers() -> None: + """ + Clean up all active browser sessions. + + Useful for cleanup on shutdown. + """ + for task_id in list(_active_sessions.keys()): + cleanup_browser(task_id) + + +def get_active_browser_sessions() -> Dict[str, Dict[str, str]]: + """ + Get information about active browser sessions. 
+ + Returns: + Dict mapping task_id to session info (session_name, bb_session_id, cdp_url) + """ + return _active_sessions.copy() + + +# ============================================================================ +# Requirements Check +# ============================================================================ + +def check_browser_requirements() -> bool: + """ + Check if browser tool requirements are met. + + Returns: + True if all requirements are met, False otherwise + """ + # Check for Browserbase credentials + api_key = os.environ.get("BROWSERBASE_API_KEY") + project_id = os.environ.get("BROWSERBASE_PROJECT_ID") + + if not api_key or not project_id: + return False + + # Check for agent-browser CLI + try: + _find_agent_browser() + return True + except FileNotFoundError: + return False + + +# ============================================================================ +# Module Test +# ============================================================================ + +if __name__ == "__main__": + """ + Simple test/demo when run directly + """ + print("๐ŸŒ Browser Tool Module") + print("=" * 40) + + # Check requirements + if check_browser_requirements(): + print("โœ… All requirements met") + else: + print("โŒ Missing requirements:") + if not os.environ.get("BROWSERBASE_API_KEY"): + print(" - BROWSERBASE_API_KEY not set") + if not os.environ.get("BROWSERBASE_PROJECT_ID"): + print(" - BROWSERBASE_PROJECT_ID not set") + try: + _find_agent_browser() + except FileNotFoundError: + print(" - agent-browser CLI not found") + + print("\n๐Ÿ“‹ Available Browser Tools:") + for schema in BROWSER_TOOL_SCHEMAS: + print(f" ๐Ÿ”น {schema['name']}: {schema['description'][:60]}...") + + print("\n๐Ÿ’ก Usage:") + print(" from tools.browser_tool import browser_navigate, browser_snapshot") + print(" result = browser_navigate('https://example.com', task_id='my_task')") + print(" snapshot = browser_snapshot(task_id='my_task')") diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py 
index 139c485d9..672987717 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -32,6 +32,10 @@ import sys import time import threading import atexit +import shutil +import subprocess +import tempfile +import uuid from pathlib import Path from typing import Optional, Dict, Any @@ -40,6 +44,168 @@ mini_swe_path = Path(__file__).parent.parent / "mini-swe-agent" / "src" if mini_swe_path.exists(): sys.path.insert(0, str(mini_swe_path)) + +# ============================================================================= +# Custom Singularity Environment with more space +# ============================================================================= + +def _get_scratch_dir() -> Path: + """Get the best directory for Singularity sandboxes - prefers /scratch if available.""" + # Check for configurable scratch directory first (highest priority) + custom_scratch = os.getenv("TERMINAL_SCRATCH_DIR") + if custom_scratch: + scratch_path = Path(custom_scratch) + scratch_path.mkdir(parents=True, exist_ok=True) + return scratch_path + + # Check for /scratch (common on HPC clusters, especially GPU nodes) + scratch = Path("/scratch") + if scratch.exists() and os.access(scratch, os.W_OK): + # Create user-specific subdirectory + user_scratch = scratch / os.getenv("USER", "hermes") / "hermes-agent" + user_scratch.mkdir(parents=True, exist_ok=True) + print(f"[Terminal] Using /scratch for sandboxes: {user_scratch}") + return user_scratch + + # Fall back to /tmp + print("[Terminal] Warning: /scratch not available, using /tmp (limited space)") + return Path(tempfile.gettempdir()) + + +# Disk usage warning threshold (in GB) +DISK_USAGE_WARNING_THRESHOLD_GB = float(os.getenv("TERMINAL_DISK_WARNING_GB", "500")) + + +def _check_disk_usage_warning(): + """Check if total disk usage exceeds warning threshold.""" + scratch_dir = _get_scratch_dir() + + try: + # Get total size of hermes directories + total_bytes = 0 + import glob + for path in glob.glob(str(scratch_dir / "hermes-*")): + 
for f in Path(path).rglob('*'): + if f.is_file(): + try: + total_bytes += f.stat().st_size + except: + pass + + total_gb = total_bytes / (1024 ** 3) + + if total_gb > DISK_USAGE_WARNING_THRESHOLD_GB: + print(f"⚠️ [Terminal] WARNING: Disk usage ({total_gb:.1f}GB) exceeds threshold ({DISK_USAGE_WARNING_THRESHOLD_GB}GB)") + print(f" Consider running cleanup_all_environments() or reducing parallel workers") + return True + + return False + except Exception as e: + return False + + +class _SingularityEnvironment: + """ + Custom Singularity/Apptainer environment with better space management. + + - Builds sandbox in /scratch (if available) or configurable location + - Binds a large working directory into the container + - Keeps container isolated from host filesystem + """ + + def __init__(self, image: str, cwd: str = "/workspace", timeout: int = 60): + self.image = image + self.cwd = cwd + self.timeout = timeout + + # Use apptainer if available, otherwise singularity + self.executable = "apptainer" if shutil.which("apptainer") else "singularity" + + # Get scratch directory for sandbox + self.scratch_dir = _get_scratch_dir() + + # Create unique sandbox directory + self.sandbox_id = f"hermes-{uuid.uuid4().hex[:12]}" + self.sandbox_dir = self.scratch_dir / self.sandbox_id + + # Create a working directory that will be bound into the container + self.work_dir = self.scratch_dir / f"{self.sandbox_id}-work" + self.work_dir.mkdir(parents=True, exist_ok=True) + + # Build the sandbox + self._build_sandbox() + + def _build_sandbox(self): + """Build a writable sandbox from the container image.""" + try: + result = subprocess.run( + [self.executable, "build", "--sandbox", str(self.sandbox_dir), self.image], + capture_output=True, + text=True, + timeout=300 # 5 min timeout for building + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to build sandbox: {result.stderr}") + + # Create /workspace directory inside the sandbox for bind mounting + workspace_in_sandbox =
self.sandbox_dir / "workspace" + workspace_in_sandbox.mkdir(parents=True, exist_ok=True) + + except subprocess.TimeoutExpired: + shutil.rmtree(self.sandbox_dir, ignore_errors=True) + raise RuntimeError("Sandbox build timed out") + + def execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict: + """Execute a command in the Singularity container.""" + cmd = [self.executable, "exec"] + + # Isolation flags - contain but allow network + cmd.extend(["--contain", "--cleanenv"]) + + # Bind the working directory into the container at /workspace + # This gives the container access to a large writable space + cmd.extend(["--bind", f"{self.work_dir}:/workspace"]) + + # Also bind it to /tmp inside container for pip cache etc. + cmd.extend(["--bind", f"{self.work_dir}:/tmp"]) + + # Set working directory + work_dir = cwd or self.cwd + cmd.extend(["--pwd", work_dir]) + + # Use writable sandbox + cmd.extend(["--writable", str(self.sandbox_dir)]) + + # Execute the command + cmd.extend(["bash", "-c", command]) + + try: + result = subprocess.run( + cmd, + text=True, + timeout=timeout or self.timeout, + encoding="utf-8", + errors="replace", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + return {"output": result.stdout, "returncode": result.returncode} + except subprocess.TimeoutExpired: + return {"output": f"Command timed out after {timeout or self.timeout}s", "returncode": 124} + + def cleanup(self): + """Clean up sandbox and working directory.""" + shutil.rmtree(self.sandbox_dir, ignore_errors=True) + shutil.rmtree(self.work_dir, ignore_errors=True) + + def stop(self): + """Alias for cleanup.""" + self.cleanup() + + def __del__(self): + """Cleanup on destruction.""" + self.cleanup() + # Tool description for LLM TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux environment. @@ -71,6 +237,7 @@ TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux environment. 
# Global state for environment lifecycle management _active_environments: Dict[str, Any] = {} +_task_workdirs: Dict[str, str] = {} # Maps task_id to working directory _last_activity: Dict[str, float] = {} _env_lock = threading.Lock() _cleanup_thread = None @@ -80,9 +247,10 @@ _cleanup_running = False def _get_env_config() -> Dict[str, Any]: """Get terminal environment configuration from environment variables.""" return { - "env_type": os.getenv("TERMINAL_ENV", "local"), # local, docker, or modal - "docker_image": os.getenv("TERMINAL_DOCKER_IMAGE", "python:3.11-slim"), - "modal_image": os.getenv("TERMINAL_MODAL_IMAGE", "python:3.11-slim"), + "env_type": os.getenv("TERMINAL_ENV", "local"), # local, docker, singularity, or modal + "docker_image": os.getenv("TERMINAL_DOCKER_IMAGE", "python:3.11"), + "singularity_image": os.getenv("TERMINAL_SINGULARITY_IMAGE", "docker://python:3.11"), + "modal_image": os.getenv("TERMINAL_MODAL_IMAGE", "python:3.11"), "cwd": os.getenv("TERMINAL_CWD", "/tmp"), "timeout": int(os.getenv("TERMINAL_TIMEOUT", "60")), "lifetime_seconds": int(os.getenv("TERMINAL_LIFETIME_SECONDS", "300")), @@ -94,8 +262,8 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int): Create an execution environment from mini-swe-agent. 
Args: - env_type: One of "local", "docker", "modal" - image: Docker/Modal image name (ignored for local) + env_type: One of "local", "docker", "singularity", "modal" + image: Docker/Singularity/Modal image name (ignored for local) cwd: Working directory timeout: Default command timeout @@ -110,12 +278,16 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int): from minisweagent.environments.docker import DockerEnvironment return DockerEnvironment(image=image, cwd=cwd, timeout=timeout) + elif env_type == "singularity": + # Use custom Singularity environment with better space management + return _SingularityEnvironment(image=image, cwd=cwd, timeout=timeout) + elif env_type == "modal": from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment return SwerexModalEnvironment(image=image, cwd=cwd, timeout=timeout) else: - raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', or 'modal'") + raise ValueError(f"Unknown environment type: {env_type}. 
Use 'local', 'docker', 'singularity', or 'modal'") def _cleanup_inactive_envs(lifetime_seconds: int = 300): @@ -147,6 +319,8 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300): if task_id in _last_activity: del _last_activity[task_id] + if task_id in _task_workdirs: + del _task_workdirs[task_id] except Exception as e: error_str = str(e) @@ -160,6 +334,8 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300): del _active_environments[task_id] if task_id in _last_activity: del _last_activity[task_id] + if task_id in _task_workdirs: + del _task_workdirs[task_id] def _cleanup_thread_worker(): @@ -198,9 +374,63 @@ def _stop_cleanup_thread(): _cleanup_thread.join(timeout=5) +def get_active_environments_info() -> Dict[str, Any]: + """Get information about currently active environments.""" + info = { + "count": len(_active_environments), + "task_ids": list(_active_environments.keys()), + "workdirs": dict(_task_workdirs), + } + + # Calculate total disk usage + total_size = 0 + for task_id in _active_environments.keys(): + # Check sandbox and workdir sizes + scratch_dir = _get_scratch_dir() + for pattern in [f"hermes-*{task_id[:8]}*"]: + import glob + for path in glob.glob(str(scratch_dir / "hermes-*")): + try: + size = sum(f.stat().st_size for f in Path(path).rglob('*') if f.is_file()) + total_size += size + except: + pass + + info["total_disk_usage_mb"] = round(total_size / (1024 * 1024), 2) + return info + + +def cleanup_all_environments(): + """Clean up ALL active environments. 
Use with caution.""" + global _active_environments, _last_activity, _task_workdirs + + task_ids = list(_active_environments.keys()) + cleaned = 0 + + for task_id in task_ids: + try: + cleanup_vm(task_id) + cleaned += 1 + except Exception as e: + print(f"[Terminal Cleanup] Error cleaning {task_id}: {e}") + + # Also clean any orphaned directories + scratch_dir = _get_scratch_dir() + import glob + for path in glob.glob(str(scratch_dir / "hermes-*")): + try: + shutil.rmtree(path, ignore_errors=True) + print(f"[Terminal Cleanup] Removed orphaned: {path}") + except: + pass + + print(f"[Terminal Cleanup] Cleaned {cleaned} environments") + return cleaned + + def cleanup_vm(task_id: str): """Manually clean up a specific environment by task_id.""" - global _active_environments, _last_activity + global _active_environments, _last_activity, _task_workdirs with _env_lock: try: @@ -216,6 +446,9 @@ def cleanup_vm(task_id: str): del _active_environments[task_id] print(f"[Terminal Cleanup] Manually cleaned up environment for task: {task_id}") + if task_id in _task_workdirs: + del _task_workdirs[task_id] + if task_id in _last_activity: del _last_activity[task_id] @@ -268,6 +501,8 @@ def terminal_tool( # Select image based on env type if env_type == "docker": image = config["docker_image"] + elif env_type == "singularity": + image = config["singularity_image"] elif env_type == "modal": image = config["modal_image"] else: @@ -280,12 +515,26 @@ def terminal_tool( # Use task_id for environment isolation effective_task_id = task_id or "default" + # For local environment, create a unique subdirectory per task + # This prevents parallel tasks from overwriting each other's files + if env_type == "local": + import uuid + with _env_lock: + if effective_task_id not in _task_workdirs: + task_workdir = Path(cwd) / f"hermes-{effective_task_id}-{uuid.uuid4().hex[:8]}" + task_workdir.mkdir(parents=True, exist_ok=True) + _task_workdirs[effective_task_id] = str(task_workdir) + cwd = 
_task_workdirs[effective_task_id] + # Start cleanup thread _start_cleanup_thread() # Get or create environment with _env_lock: if effective_task_id not in _active_environments: + # Check disk usage before creating new environment + _check_disk_usage_warning() + try: _active_environments[effective_task_id] = _create_environment( env_type=env_type, @@ -397,6 +646,16 @@ def check_terminal_requirements() -> bool: import subprocess result = subprocess.run(["docker", "version"], capture_output=True, timeout=5) return result.returncode == 0 + elif env_type == "singularity": + from minisweagent.environments.singularity import SingularityEnvironment + # Check if singularity/apptainer is available + import subprocess + import shutil + executable = shutil.which("apptainer") or shutil.which("singularity") + if executable: + result = subprocess.run([executable, "--version"], capture_output=True, timeout=5) + return result.returncode == 0 + return False elif env_type == "modal": from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment # Check for modal token diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 7b02c1043..f0145b7b6 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -155,10 +155,14 @@ async def _download_image(image_url: str, destination: Path, max_retries: int = for attempt in range(max_retries): try: # Download the image with appropriate headers using async httpx - async with httpx.AsyncClient(timeout=30.0) as client: + # Enable follow_redirects to handle image CDNs that redirect (e.g., Imgur, Picsum) + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: response = await client.get( image_url, - headers={"User-Agent": "hermes-agent-vision/1.0"}, + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "image/*,*/*;q=0.8", + }, ) response.raise_for_status() diff --git 
a/toolset_distributions.py b/toolset_distributions.py index 079619478..7eb5980a1 100644 --- a/toolset_distributions.py +++ b/toolset_distributions.py @@ -35,7 +35,8 @@ DISTRIBUTIONS = { "vision": 100, "image_gen": 100, "terminal": 100, - "moa": 100 + "moa": 100, + "browser": 100 } }, @@ -55,22 +56,24 @@ DISTRIBUTIONS = { "research": { "description": "Web research with vision analysis and reasoning", "toolsets": { - "web": 90, # 90% chance of web tools - "vision": 50, # 50% chance of vision tools - "moa": 40, # 40% chance of reasoning tools - "terminal": 10 # 10% chance of terminal tools + "web": 90, # 90% chance of web tools + "browser": 70, # 70% chance of browser tools for deep research + "vision": 50, # 50% chance of vision tools + "moa": 40, # 40% chance of reasoning tools + "terminal": 10 # 10% chance of terminal tools } }, # Scientific problem solving focused distribution "science": { - "description": "Web research with vision analysis and reasoning", + "description": "Scientific research with web, terminal, and browser capabilities", "toolsets": { - "web": 94, # 90% chance of web tools - "vision": 65, # 50% chance of vision tools - "moa": 10, # 40% chance of reasoning tools - "terminal": 94, # 10% chance of terminal tools - "image_gen": 15 # 80% chance of image generation tools + "web": 94, # 94% chance of web tools + "terminal": 94, # 94% chance of terminal tools + "vision": 65, # 65% chance of vision tools + "browser": 50, # 50% chance of browser for accessing papers/databases + "image_gen": 15, # 15% chance of image generation tools + "moa": 10 # 10% chance of reasoning tools } }, @@ -90,6 +93,7 @@ DISTRIBUTIONS = { "description": "All tools except terminal for safety", "toolsets": { "web": 80, + "browser": 70, # Browser is safe (no local filesystem access) "vision": 60, "image_gen": 60, "moa": 50 @@ -104,7 +108,8 @@ DISTRIBUTIONS = { "vision": 50, "image_gen": 50, "terminal": 50, - "moa": 50 + "moa": 50, + "browser": 50 } }, @@ -116,6 +121,23 @@ 
DISTRIBUTIONS = { } }, + # Terminal only + "terminal_only": { + "description": "Only terminal tool for code execution tasks", + "toolsets": { + "terminal": 100 + } + }, + + # Terminal + web (common for coding tasks that need docs) + "terminal_web": { + "description": "Terminal with web search for documentation lookup", + "toolsets": { + "terminal": 100, + "web": 100 + } + }, + # Creative (vision + image generation) "creative": { "description": "Image generation and vision analysis focus", @@ -134,6 +156,58 @@ DISTRIBUTIONS = { "web": 30, "terminal": 20 } + }, + + # Browser-based web interaction + "browser_use": { + "description": "Full browser-based web interaction with search, vision, and page control", + "toolsets": { + "browser": 100, # All browser tools always available + "web": 80, # Web search for finding URLs and quick lookups + "vision": 70 # Vision analysis for images found on pages + } + }, + + # Browser only (no other tools) + "browser_only": { + "description": "Only browser automation tools for pure web interaction tasks", + "toolsets": { + "browser": 100 + } + }, + + # Browser-focused tasks distribution (for browser-use-tasks.jsonl) + "browser_tasks": { + "description": "Browser-focused distribution (browser toolset includes web_search for finding URLs since Google blocks direct browser searches)", + "toolsets": { + "browser": 97, # 97% - browser tools (includes web_search) almost always available + "vision": 12, # 12% - vision analysis occasionally + "terminal": 15 # 15% - terminal occasionally for local operations + } + }, + + # Terminal-focused tasks distribution (for nous-terminal-tasks.jsonl) + "terminal_tasks": { + "description": "Terminal-focused distribution with high terminal availability, occasional other tools", + "toolsets": { + "terminal": 97, # 97% - terminal almost always available + "web": 15, # 15% - web search/scrape for documentation + "browser": 10, # 10% - browser occasionally for web interaction + "vision": 8, # 8% - vision 
analysis rarely + "image_gen": 3 # 3% - image generation very rarely + } + }, + + # Mixed browser+terminal tasks distribution (for mixed-browser-terminal-tasks.jsonl) + "mixed_tasks": { + "description": "Mixed distribution with high browser and terminal availability for complex tasks", + "toolsets": { + "browser": 92, # 92% - browser tools highly available + "terminal": 92, # 92% - terminal highly available + "web": 35, # 35% - web search/scrape fairly common + "vision": 15, # 15% - vision analysis occasionally + "image_gen": 15 # 15% - image generation occasionally + } } } diff --git a/toolsets.py b/toolsets.py index 058abbe4a..8206af85b 100644 --- a/toolsets.py +++ b/toolsets.py @@ -33,10 +33,16 @@ TOOLSETS = { # Basic toolsets - individual tool categories "web": { "description": "Web research and content extraction tools", - "tools": ["web_search", "web_extract", "web_crawl"], + "tools": ["web_search", "web_extract"], "includes": [] # No other toolsets included }, + "search": { + "description": "Web search only (no content extraction/scraping)", + "tools": ["web_search"], + "includes": [] + }, + "vision": { "description": "Image analysis and vision tools", "tools": ["vision_analyze"], @@ -61,6 +67,17 @@ TOOLSETS = { "includes": [] }, + "browser": { + "description": "Browser automation for web interaction (navigate, click, type, scroll, iframes, hold-click) with web search for finding URLs", + "tools": [ + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision", "web_search" + ], + "includes": [] + }, + # Scenario-specific toolsets "debugging": {