From d44b6b7f1b094ef06302dda8069df2802466ca4e Mon Sep 17 00:00:00 2001 From: ShawnPana Date: Tue, 17 Mar 2026 00:16:34 -0700 Subject: [PATCH] feat(browser): multi-provider cloud browser support + Browser Use integration Introduce a cloud browser provider abstraction so users can switch between Local Browser, Browserbase, and Browser Use (or future providers) via hermes tools / hermes setup. Cloud browser providers are behind an ABC (tools/browser_providers/base.py) so adding a new provider is a single-file addition with no changes to browser_tool.py internals. Changes: - tools/browser_providers/ package with ABC, Browserbase extraction, and Browser Use provider - browser_tool.py refactored to use _PROVIDER_REGISTRY + _get_cloud_provider() (cached) instead of hardcoded _is_local_mode() / _create_browserbase_session() - tools_config.py: generic _is_provider_active() / _detect_active_provider_index() replace TTS-only logic; Browser Use added as third browser option - config.py: BROWSER_USE_API_KEY added to OPTIONAL_ENV_VARS + show_config + allowlist - subprocess pipe hang fix: agent-browser daemon inherits pipe fds, communicate() blocks. Replaced with Popen + temp files. Original PR: #1208 Co-authored-by: ShawnPana --- hermes_cli/config.py | 11 +- hermes_cli/tools_config.py | 77 +++-- tools/browser_providers/__init__.py | 10 + tools/browser_providers/base.py | 59 ++++ tools/browser_providers/browser_use.py | 107 +++++++ tools/browser_providers/browserbase.py | 206 +++++++++++++ tools/browser_tool.py | 400 ++++++++----------------- 7 files changed, 567 insertions(+), 303 deletions(-) create mode 100644 tools/browser_providers/__init__.py create mode 100644 tools/browser_providers/base.py create mode 100644 tools/browser_providers/browser_use.py create mode 100644 tools/browser_providers/browserbase.py diff --git a/hermes_cli/config.py b/hermes_cli/config.py index c3a4c701..0700890f 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -507,6 +507,14 @@ OPTIONAL_ENV_VARS = { "password": False, "category": "tool", }, + "BROWSER_USE_API_KEY": { + "description": "Browser Use API key for cloud browser (optional — local browser works without this)", + "prompt": "Browser Use API key", + "url": "https://browser-use.com/", + "tools": ["browser_navigate", "browser_click"], + "password": True, + "category": "tool", + }, "FAL_KEY": { "description": "FAL API key for image generation", "prompt": "FAL API key", @@ -1258,6 +1266,7 @@ def show_config(): ("VOICE_TOOLS_OPENAI_KEY", "OpenAI (STT/TTS)"), ("FIRECRAWL_API_KEY", "Firecrawl"), ("BROWSERBASE_API_KEY", "Browserbase"), + ("BROWSER_USE_API_KEY", "Browser Use"), ("FAL_KEY", "FAL"), ] @@ -1404,7 +1413,7 @@ def set_config_value(key: str, value: str): # Check if it's an API key (goes to .env) api_keys = [ 'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY', - 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', + 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY', 'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN', 'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY', 'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN', diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index b819fafa..186100c0 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -190,6 +190,7 @@ TOOL_CATEGORIES = { "name": "Local Browser", "tag": "Free headless Chromium (no API key needed)", "env_vars": [], + "browser_provider": None, "post_setup": "browserbase", # Same npm install for agent-browser }, { @@ -199,6 +200,16 @@ TOOL_CATEGORIES = { {"key": "BROWSERBASE_API_KEY", "prompt": "Browserbase API key", "url": "https://browserbase.com"}, {"key": "BROWSERBASE_PROJECT_ID", "prompt": "Browserbase project ID"}, ], + "browser_provider": "browserbase", + "post_setup": "browserbase", + }, + { + "name": "Browser Use", + "tag": "Cloud browser with remote execution", + "env_vars": [ + {"key": "BROWSER_USE_API_KEY", "prompt": "Browser Use API key", "url": "https://browser-use.com"}, + ], + "browser_provider": "browser-use", "post_setup": "browserbase", }, ], @@ -575,10 +586,10 @@ def _configure_tool_category(ts_key: str, cat: dict, config: dict): configured = "" env_vars = p.get("env_vars", []) if not env_vars or all(get_env_value(v["key"]) for v in env_vars): - if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]: + if _is_provider_active(p, config): configured = " [active]" elif not env_vars: - configured = " [active]" if config.get("tts", {}).get("provider", "edge") == p.get("tts_provider", "") else "" + configured = "" else: configured = " [configured]" provider_choices.append(f"{p['name']}{tag}{configured}") @@ -587,15 +598,7 @@ def _configure_tool_category(ts_key: str, cat: dict, config: dict): provider_choices.append("Skip — keep defaults / configure later") # Detect current provider as default - default_idx = 0 - for i, p in enumerate(providers): - if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]: - default_idx = i - break - env_vars = p.get("env_vars", []) - if env_vars and all(get_env_value(v["key"]) for v in env_vars): - default_idx = i - break + default_idx = _detect_active_provider_index(providers, config) provider_idx = _prompt_choice(f" {title}:", provider_choices, default_idx) @@ -607,6 +610,28 @@ def _configure_tool_category(ts_key: str, cat: dict, config: dict): _configure_provider(providers[provider_idx], config) +def _is_provider_active(provider: dict, config: dict) -> bool: + """Check if a provider entry matches the currently active config.""" + if provider.get("tts_provider"): + return config.get("tts", {}).get("provider") == provider["tts_provider"] + if "browser_provider" in provider: + current = config.get("browser", {}).get("cloud_provider") + return provider["browser_provider"] == current + return False + + +def _detect_active_provider_index(providers: list, config: dict) -> int: + """Return the index of the currently active provider, or 0.""" + for i, p in enumerate(providers): + if _is_provider_active(p, config): + return i + # Fallback: env vars present → likely configured + env_vars = p.get("env_vars", []) + if env_vars and all(get_env_value(v["key"]) for v in env_vars): + return i + return 0 + + def _configure_provider(provider: dict, config: dict): """Configure a single provider - prompt for API keys and set config.""" env_vars = provider.get("env_vars", []) @@ -615,6 +640,15 @@ def _configure_provider(provider: dict, config: dict): if provider.get("tts_provider"): config.setdefault("tts", {})["provider"] = provider["tts_provider"] + # Set browser cloud provider in config if applicable + if "browser_provider" in provider: + bp = provider["browser_provider"] + if bp: + config.setdefault("browser", {})["cloud_provider"] = bp + _print_success(f" Browser cloud provider set to: {bp}") + else: + config.get("browser", {}).pop("cloud_provider", None) + if not env_vars: _print_success(f" {provider['name']} - no configuration needed!") return @@ -767,7 +801,7 @@ def _configure_tool_category_for_reconfig(ts_key: str, cat: dict, config: dict): configured = "" env_vars = p.get("env_vars", []) if not env_vars or all(get_env_value(v["key"]) for v in env_vars): - if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]: + if _is_provider_active(p, config): configured = " [active]" elif not env_vars: configured = "" @@ -775,15 +809,7 @@ def _configure_tool_category_for_reconfig(ts_key: str, cat: dict, config: dict): configured = " [configured]" provider_choices.append(f"{p['name']}{tag}{configured}") - default_idx = 0 - for i, p in enumerate(providers): - if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]: - default_idx = i - break - env_vars = p.get("env_vars", []) - if env_vars and all(get_env_value(v["key"]) for v in env_vars): - default_idx = i - break + default_idx = _detect_active_provider_index(providers, config) provider_idx = _prompt_choice(" Select provider:", provider_choices, default_idx) _reconfigure_provider(providers[provider_idx], config) @@ -797,6 +823,15 @@ def _reconfigure_provider(provider: dict, config: dict): config.setdefault("tts", {})["provider"] = provider["tts_provider"] _print_success(f" TTS provider set to: {provider['tts_provider']}") + if "browser_provider" in provider: + bp = provider["browser_provider"] + if bp: + config.setdefault("browser", {})["cloud_provider"] = bp + _print_success(f" Browser cloud provider set to: {bp}") + else: + config.get("browser", {}).pop("cloud_provider", None) + _print_success(f" Browser set to local mode") + if not env_vars: _print_success(f" {provider['name']} - no configuration needed!") return diff --git a/tools/browser_providers/__init__.py b/tools/browser_providers/__init__.py new file mode 100644 index 00000000..7fa59ef0 --- /dev/null +++ b/tools/browser_providers/__init__.py @@ -0,0 +1,10 @@ +"""Cloud browser provider abstraction. + +Import the ABC so callers can do:: + + from tools.browser_providers import CloudBrowserProvider +""" + +from tools.browser_providers.base import CloudBrowserProvider + +__all__ = ["CloudBrowserProvider"] diff --git a/tools/browser_providers/base.py b/tools/browser_providers/base.py new file mode 100644 index 00000000..6b8e1ed4 --- /dev/null +++ b/tools/browser_providers/base.py @@ -0,0 +1,59 @@ +"""Abstract base class for cloud browser providers.""" + +from abc import ABC, abstractmethod +from typing import Dict + + +class CloudBrowserProvider(ABC): + """Interface for cloud browser backends (Browserbase, Steel, etc.). + + Implementations live in sibling modules and are registered in + ``browser_tool._PROVIDER_REGISTRY``. The user selects a provider via + ``hermes setup`` / ``hermes tools``; the choice is persisted as + ``config["browser"]["cloud_provider"]``. + """ + + @abstractmethod + def provider_name(self) -> str: + """Short, human-readable name shown in logs and diagnostics.""" + + @abstractmethod + def is_configured(self) -> bool: + """Return True when all required env vars / credentials are present. + + Called at tool-registration time (``check_browser_requirements``) to + gate availability. Must be cheap — no network calls. + """ + + @abstractmethod + def create_session(self, task_id: str) -> Dict[str, object]: + """Create a cloud browser session and return session metadata. + + Must return a dict with at least:: + + { + "session_name": str, # unique name for agent-browser --session + "bb_session_id": str, # provider session ID (for close/cleanup) + "cdp_url": str, # CDP websocket URL + "features": dict, # feature flags that were enabled + } + + ``bb_session_id`` is a legacy key name kept for backward compat with + the rest of browser_tool.py — it holds the provider's session ID + regardless of which provider is in use. + """ + + @abstractmethod + def close_session(self, session_id: str) -> bool: + """Release / terminate a cloud session by its provider session ID. + + Returns True on success, False on failure. Should not raise. + """ + + @abstractmethod + def emergency_cleanup(self, session_id: str) -> None: + """Best-effort session teardown during process exit. + + Called from atexit / signal handlers. Must tolerate missing + credentials, network errors, etc. — log and move on. + """ diff --git a/tools/browser_providers/browser_use.py b/tools/browser_providers/browser_use.py new file mode 100644 index 00000000..48a61840 --- /dev/null +++ b/tools/browser_providers/browser_use.py @@ -0,0 +1,107 @@ +"""Browser Use cloud browser provider.""" + +import logging +import os +import uuid +from typing import Dict + +import requests + +from tools.browser_providers.base import CloudBrowserProvider + +logger = logging.getLogger(__name__) + +_BASE_URL = "https://api.browser-use.com/api/v2" + + +class BrowserUseProvider(CloudBrowserProvider): + """Browser Use (https://browser-use.com) cloud browser backend.""" + + def provider_name(self) -> str: + return "Browser Use" + + def is_configured(self) -> bool: + return bool(os.environ.get("BROWSER_USE_API_KEY")) + + # ------------------------------------------------------------------ + # Session lifecycle + # ------------------------------------------------------------------ + + def _headers(self) -> Dict[str, str]: + api_key = os.environ.get("BROWSER_USE_API_KEY") + if not api_key: + raise ValueError( + "BROWSER_USE_API_KEY environment variable is required. " + "Get your key at https://browser-use.com" + ) + return { + "Content-Type": "application/json", + "X-Browser-Use-API-Key": api_key, + } + + def create_session(self, task_id: str) -> Dict[str, object]: + response = requests.post( + f"{_BASE_URL}/browsers", + headers=self._headers(), + json={}, + timeout=30, + ) + + if not response.ok: + raise RuntimeError( + f"Failed to create Browser Use session: " + f"{response.status_code} {response.text}" + ) + + session_data = response.json() + session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" + + logger.info("Created Browser Use session %s", session_name) + + return { + "session_name": session_name, + "bb_session_id": session_data["id"], + "cdp_url": session_data["cdpUrl"], + "features": {"browser_use": True}, + } + + def close_session(self, session_id: str) -> bool: + try: + response = requests.patch( + f"{_BASE_URL}/browsers/{session_id}", + headers=self._headers(), + json={"action": "stop"}, + timeout=10, + ) + if response.status_code in (200, 201, 204): + logger.debug("Successfully closed Browser Use session %s", session_id) + return True + else: + logger.warning( + "Failed to close Browser Use session %s: HTTP %s - %s", + session_id, + response.status_code, + response.text[:200], + ) + return False + except Exception as e: + logger.error("Exception closing Browser Use session %s: %s", session_id, e) + return False + + def emergency_cleanup(self, session_id: str) -> None: + api_key = os.environ.get("BROWSER_USE_API_KEY") + if not api_key: + logger.warning("Cannot emergency-cleanup Browser Use session %s — missing credentials", session_id) + return + try: + requests.patch( + f"{_BASE_URL}/browsers/{session_id}", + headers={ + "Content-Type": "application/json", + "X-Browser-Use-API-Key": api_key, + }, + json={"action": "stop"}, + timeout=5, + ) + except Exception as e: + logger.debug("Emergency cleanup failed for Browser Use session %s: %s", session_id, e) diff --git a/tools/browser_providers/browserbase.py b/tools/browser_providers/browserbase.py new file mode 100644 index 00000000..1aad8e6e --- /dev/null +++ b/tools/browser_providers/browserbase.py @@ -0,0 +1,206 @@ +"""Browserbase cloud browser provider.""" + +import logging +import os +import uuid +from typing import Dict + +import requests + +from tools.browser_providers.base import CloudBrowserProvider + +logger = logging.getLogger(__name__) + + +class BrowserbaseProvider(CloudBrowserProvider): + """Browserbase (https://browserbase.com) cloud browser backend.""" + + def provider_name(self) -> str: + return "Browserbase" + + def is_configured(self) -> bool: + return bool( + os.environ.get("BROWSERBASE_API_KEY") + and os.environ.get("BROWSERBASE_PROJECT_ID") + ) + + # ------------------------------------------------------------------ + # Session lifecycle + # ------------------------------------------------------------------ + + def _get_config(self) -> Dict[str, str]: + api_key = os.environ.get("BROWSERBASE_API_KEY") + project_id = os.environ.get("BROWSERBASE_PROJECT_ID") + if not api_key or not project_id: + raise ValueError( + "BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment " + "variables are required. Get your credentials at " + "https://browserbase.com" + ) + return {"api_key": api_key, "project_id": project_id} + + def create_session(self, task_id: str) -> Dict[str, object]: + config = self._get_config() + + # Optional env-var knobs + enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false" + enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" + enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" + custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT") + + features_enabled = { + "basic_stealth": True, + "proxies": False, + "advanced_stealth": False, + "keep_alive": False, + "custom_timeout": False, + } + + session_config: Dict[str, object] = {"projectId": config["project_id"]} + + if enable_keep_alive: + session_config["keepAlive"] = True + + if custom_timeout_ms: + try: + timeout_val = int(custom_timeout_ms) + if timeout_val > 0: + session_config["timeout"] = timeout_val + except ValueError: + logger.warning("Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms) + + if enable_proxies: + session_config["proxies"] = True + + if enable_advanced_stealth: + session_config["browserSettings"] = {"advancedStealth": True} + + # --- Create session via API --- + headers = { + "Content-Type": "application/json", + "X-BB-API-Key": config["api_key"], + } + response = requests.post( + "https://api.browserbase.com/v1/sessions", + headers=headers, + json=session_config, + timeout=30, + ) + + proxies_fallback = False + keepalive_fallback = False + + # Handle 402 — paid features unavailable + if response.status_code == 402: + if enable_keep_alive: + keepalive_fallback = True + logger.warning( + "keepAlive may require paid plan (402), retrying without it. " + "Sessions may timeout during long operations." + ) + session_config.pop("keepAlive", None) + response = requests.post( + "https://api.browserbase.com/v1/sessions", + headers=headers, + json=session_config, + timeout=30, + ) + + if response.status_code == 402 and enable_proxies: + proxies_fallback = True + logger.warning( + "Proxies unavailable (402), retrying without proxies. " + "Bot detection may be less effective." + ) + session_config.pop("proxies", None) + response = requests.post( + "https://api.browserbase.com/v1/sessions", + headers=headers, + json=session_config, + timeout=30, + ) + + if not response.ok: + raise RuntimeError( + f"Failed to create Browserbase session: " + f"{response.status_code} {response.text}" + ) + + session_data = response.json() + session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" + + if enable_proxies and not proxies_fallback: + features_enabled["proxies"] = True + if enable_advanced_stealth: + features_enabled["advanced_stealth"] = True + if enable_keep_alive and not keepalive_fallback: + features_enabled["keep_alive"] = True + if custom_timeout_ms and "timeout" in session_config: + features_enabled["custom_timeout"] = True + + feature_str = ", ".join(k for k, v in features_enabled.items() if v) + logger.info("Created Browserbase session %s with features: %s", session_name, feature_str) + + return { + "session_name": session_name, + "bb_session_id": session_data["id"], + "cdp_url": session_data["connectUrl"], + "features": features_enabled, + } + + def close_session(self, session_id: str) -> bool: + try: + config = self._get_config() + except ValueError: + logger.warning("Cannot close Browserbase session %s — missing credentials", session_id) + return False + + try: + response = requests.post( + f"https://api.browserbase.com/v1/sessions/{session_id}", + headers={ + "X-BB-API-Key": config["api_key"], + "Content-Type": "application/json", + }, + json={ + "projectId": config["project_id"], + "status": "REQUEST_RELEASE", + }, + timeout=10, + ) + if response.status_code in (200, 201, 204): + logger.debug("Successfully closed Browserbase session %s", session_id) + return True + else: + logger.warning( + "Failed to close session %s: HTTP %s - %s", + session_id, + response.status_code, + response.text[:200], + ) + return False + except Exception as e: + logger.error("Exception closing Browserbase session %s: %s", session_id, e) + return False + + def emergency_cleanup(self, session_id: str) -> None: + api_key = os.environ.get("BROWSERBASE_API_KEY") + project_id = os.environ.get("BROWSERBASE_PROJECT_ID") + if not api_key or not project_id: + logger.warning("Cannot emergency-cleanup Browserbase session %s — missing credentials", session_id) + return + try: + requests.post( + f"https://api.browserbase.com/v1/sessions/{session_id}", + headers={ + "X-BB-API-Key": api_key, + "Content-Type": "application/json", + }, + json={ + "projectId": project_id, + "status": "REQUEST_RELEASE", + }, + timeout=5, + ) + except Exception as e: + logger.debug("Emergency cleanup failed for Browserbase session %s: %s", session_id, e) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index e595e810..d57eedee 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -65,6 +65,9 @@ import requests from typing import Dict, Any, Optional, List from pathlib import Path from agent.auxiliary_client import call_llm +from tools.browser_providers.base import CloudBrowserProvider +from tools.browser_providers.browserbase import BrowserbaseProvider +from tools.browser_providers.browser_use import BrowserUseProvider logger = logging.getLogger(__name__) @@ -108,16 +111,43 @@ def _get_cdp_override() -> str: return os.environ.get("BROWSER_CDP_URL", "").strip() -def _is_local_mode() -> bool: - """Return True when no Browserbase credentials are configured. +# ============================================================================ +# Cloud Provider Registry +# ============================================================================ - In local mode the browser tools launch a headless Chromium instance via - ``agent-browser --session`` instead of connecting to a remote Browserbase - session via ``--cdp``. +_PROVIDER_REGISTRY: Dict[str, type] = { + "browserbase": BrowserbaseProvider, + "browser-use": BrowserUseProvider, +} + +_cached_cloud_provider: Optional[CloudBrowserProvider] = None +_cloud_provider_resolved = False + + +def _get_cloud_provider() -> Optional[CloudBrowserProvider]: + """Return the configured cloud browser provider, or None for local mode. + + Reads ``config["browser"]["cloud_provider"]`` once and caches the result + for the process lifetime. If unset → local mode (None). """ - if _get_cdp_override(): - return False # CDP override takes priority - return not (os.environ.get("BROWSERBASE_API_KEY") and os.environ.get("BROWSERBASE_PROJECT_ID")) + global _cached_cloud_provider, _cloud_provider_resolved + if _cloud_provider_resolved: + return _cached_cloud_provider + + _cloud_provider_resolved = True + try: + hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) + config_path = hermes_home / "config.yaml" + if config_path.exists(): + import yaml + with open(config_path) as f: + cfg = yaml.safe_load(f) or {} + provider_key = cfg.get("browser", {}).get("cloud_provider") + if provider_key and provider_key in _PROVIDER_REGISTRY: + _cached_cloud_provider = _PROVIDER_REGISTRY[provider_key]() + except Exception as e: + logger.debug("Could not read cloud_provider from config: %s", e) + return _cached_cloud_provider def _socket_safe_tmpdir() -> str: @@ -452,161 +482,6 @@ BROWSER_TOOL_SCHEMAS = [ # Utility Functions # ============================================================================ -def _create_browserbase_session(task_id: str) -> Dict[str, str]: - """ - Create a Browserbase session with stealth features. - - Browserbase Stealth Modes: - - Basic Stealth: ALWAYS enabled automatically. Generates random fingerprints, - viewports, and solves visual CAPTCHAs. No configuration needed. - - Advanced Stealth: Uses custom Chromium build for better bot detection avoidance. - Requires Scale Plan. Enable via BROWSERBASE_ADVANCED_STEALTH=true. - - Proxies are enabled by default to route traffic through residential IPs, - which significantly improves CAPTCHA solving rates. Can be disabled via - BROWSERBASE_PROXIES=false if needed. - - Args: - task_id: Unique identifier for the task - - Returns: - Dict with session_name, bb_session_id, cdp_url, and feature flags - """ - import uuid - import sys - - config = _get_browserbase_config() - - # Check for optional settings from environment - # Proxies: enabled by default for better CAPTCHA solving - enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false" - # Advanced Stealth: requires Scale Plan, disabled by default - enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true" - # keepAlive: enabled by default (requires paid plan) - allows reconnection after disconnects - enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false" - # Custom session timeout in milliseconds (optional) - extends session beyond project default - custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT") - - # Track which features are actually enabled for logging/debugging - features_enabled = { - "basic_stealth": True, # Always on - "proxies": False, - "advanced_stealth": False, - "keep_alive": False, - "custom_timeout": False, - } - - # Build session configuration - # Note: Basic stealth mode is ALWAYS active - no configuration needed - session_config = { - "projectId": config["project_id"], - } - - # Enable keepAlive for session reconnection (default: true, requires paid plan) - # Allows reconnecting to the same session after network hiccups - if enable_keep_alive: - session_config["keepAlive"] = True - - # Add custom timeout if specified (in milliseconds) - # This extends session duration beyond project's default timeout - if custom_timeout_ms: - try: - timeout_val = int(custom_timeout_ms) - if timeout_val > 0: - session_config["timeout"] = timeout_val - except ValueError: - logger.warning("Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms) - - # Enable proxies for better CAPTCHA solving (default: true) - # Routes traffic through residential IPs for more reliable access - if enable_proxies: - session_config["proxies"] = True - - # Add advanced stealth if enabled (requires Scale Plan) - # Uses custom Chromium build to avoid bot detection altogether - if enable_advanced_stealth: - session_config["browserSettings"] = { - "advancedStealth": True, - } - - # Create session via Browserbase API - response = requests.post( - "https://api.browserbase.com/v1/sessions", - headers={ - "Content-Type": "application/json", - "X-BB-API-Key": config["api_key"], - }, - json=session_config, - timeout=30 - ) - - # Track if we fell back from paid features - proxies_fallback = False - keepalive_fallback = False - - # Handle 402 Payment Required - likely paid features not available - # Try to identify which feature caused the issue and retry without it - if response.status_code == 402: - # First try without keepAlive (most likely culprit for paid plan requirement) - if enable_keep_alive: - keepalive_fallback = True - logger.warning("keepAlive may require paid plan (402), retrying without it. " - "Sessions may timeout during long operations.") - session_config.pop("keepAlive", None) - response = requests.post( - "https://api.browserbase.com/v1/sessions", - headers={ - "Content-Type": "application/json", - "X-BB-API-Key": config["api_key"], - }, - json=session_config, - timeout=30 - ) - - # If still 402, try without proxies too - if response.status_code == 402 and enable_proxies: - proxies_fallback = True - logger.warning("Proxies unavailable (402), retrying without proxies. " - "Bot detection may be less effective.") - session_config.pop("proxies", None) - response = requests.post( - "https://api.browserbase.com/v1/sessions", - headers={ - "Content-Type": "application/json", - "X-BB-API-Key": config["api_key"], - }, - json=session_config, - timeout=30 - ) - - if not response.ok: - raise RuntimeError(f"Failed to create Browserbase session: {response.status_code} {response.text}") - - session_data = response.json() - session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}" - - # Update features based on what actually succeeded - if enable_proxies and not proxies_fallback: - features_enabled["proxies"] = True - if enable_advanced_stealth: - features_enabled["advanced_stealth"] = True - if enable_keep_alive and not keepalive_fallback: - features_enabled["keep_alive"] = True - if custom_timeout_ms and "timeout" in session_config: - features_enabled["custom_timeout"] = True - - # Log session info for debugging - feature_str = ", ".join(k for k, v in features_enabled.items() if v) - logger.info("Created session %s with features: %s", session_name, feature_str) - - return { - "session_name": session_name, - "bb_session_id": session_data["id"], - "cdp_url": session_data["connectUrl"], - "features": features_enabled, - } - - def _create_local_session(task_id: str) -> Dict[str, str]: import uuid session_name = f"h_{uuid.uuid4().hex[:10]}" @@ -667,10 +542,12 @@ def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]: cdp_override = _get_cdp_override() if cdp_override: session_info = _create_cdp_session(task_id, cdp_override) - elif _is_local_mode(): - session_info = _create_local_session(task_id) else: - session_info = _create_browserbase_session(task_id) + provider = _get_cloud_provider() + if provider is None: + session_info = _create_local_session(task_id) + else: + session_info = provider.create_session(task_id) with _cleanup_lock: _active_sessions[task_id] = session_info @@ -692,31 +569,6 @@ def _get_session_name(task_id: Optional[str] = None) -> str: return session_info["session_name"] -def _get_browserbase_config() -> Dict[str, str]: - """ - Get Browserbase configuration from environment. - - Returns: - Dict with api_key and project_id - - Raises: - ValueError: If required env vars are not set - """ - api_key = os.environ.get("BROWSERBASE_API_KEY") - project_id = os.environ.get("BROWSERBASE_PROJECT_ID") - - if not api_key or not project_id: - raise ValueError( - "BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment variables are required. " - "Get your credentials at https://browserbase.com" - ) - - return { - "api_key": api_key, - "project_id": project_id - } - - def _find_agent_browser() -> str: """ Find the agent-browser CLI executable. @@ -859,27 +711,62 @@ def _run_browser_command( browser_env["PATH"] = ":".join(path_parts) browser_env["AGENT_BROWSER_SOCKET_DIR"] = task_socket_dir - result = subprocess.run( - cmd_parts, - capture_output=True, - text=True, - timeout=timeout, - env=browser_env, - ) - + # Use temp files for stdout/stderr instead of pipes. + # agent-browser starts a background daemon that inherits file + # descriptors. With capture_output=True (pipes), the daemon keeps + # the pipe fds open after the CLI exits, so communicate() never + # sees EOF and blocks until the timeout fires. + stdout_path = os.path.join(task_socket_dir, f"_stdout_{command}") + stderr_path = os.path.join(task_socket_dir, f"_stderr_{command}") + stdout_fd = os.open(stdout_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) + stderr_fd = os.open(stderr_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) + try: + proc = subprocess.Popen( + cmd_parts, + stdout=stdout_fd, + stderr=stderr_fd, + stdin=subprocess.DEVNULL, + env=browser_env, + ) + finally: + os.close(stdout_fd) + os.close(stderr_fd) + + try: + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)", + command, timeout, task_id, task_socket_dir) + return {"success": False, "error": f"Command timed out after {timeout} seconds"} + + with open(stdout_path, "r") as f: + stdout = f.read() + with open(stderr_path, "r") as f: + stderr = f.read() + returncode = proc.returncode + + # Clean up temp files (best-effort) + for p in (stdout_path, stderr_path): + try: + os.unlink(p) + except OSError: + pass + # Log stderr for diagnostics — use warning level on failure so it's visible - if result.stderr and result.stderr.strip(): - level = logging.WARNING if result.returncode != 0 else logging.DEBUG - logger.log(level, "browser '%s' stderr: %s", command, result.stderr.strip()[:500]) + if stderr and stderr.strip(): + level = logging.WARNING if returncode != 0 else logging.DEBUG + logger.log(level, "browser '%s' stderr: %s", command, stderr.strip()[:500]) # Log empty output as warning — common sign of broken agent-browser - if not result.stdout.strip() and result.returncode == 0: + if not stdout.strip() and returncode == 0: logger.warning("browser '%s' returned empty stdout with rc=0. " "cmd=%s stderr=%s", command, " ".join(cmd_parts[:4]) + "...", - (result.stderr or "")[:200]) + (stderr or "")[:200]) - stdout_text = result.stdout.strip() + stdout_text = stdout.strip() if stdout_text: try: @@ -890,15 +777,15 @@ def _run_browser_command( if not snap_data.get("snapshot") and not snap_data.get("refs"): logger.warning("snapshot returned empty content. " "Possible stale daemon or CDP connection issue. " - "returncode=%s", result.returncode) + "returncode=%s", returncode) return parsed except json.JSONDecodeError: raw = stdout_text[:2000] logger.warning("browser '%s' returned non-JSON output (rc=%s): %s", - command, result.returncode, raw[:500]) + command, returncode, raw[:500]) if command == "screenshot": - stderr_text = (result.stderr or "").strip() + stderr_text = (stderr or "").strip() combined_text = "\n".join( part for part in [stdout_text, stderr_text] if part ) @@ -923,17 +810,13 @@ def _run_browser_command( } # Check for errors - if result.returncode != 0: - error_msg = result.stderr.strip() if result.stderr else f"Command failed with code {result.returncode}" - logger.warning("browser '%s' failed (rc=%s): %s", command, result.returncode, error_msg[:300]) + if returncode != 0: + error_msg = stderr.strip() if stderr else f"Command failed with code {returncode}" + logger.warning("browser '%s' failed (rc=%s): %s", command, returncode, error_msg[:300]) return {"success": False, "error": error_msg} return {"success": True, "data": {}} - except subprocess.TimeoutExpired: - logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)", - command, timeout, task_id, task_socket_dir) - return {"success": False, "error": f"Command timed out after {timeout} seconds"} except Exception as e: logger.warning("browser '%s' exception: %s", command, e, exc_info=True) return {"success": False, "error": str(e)} @@ -1509,7 +1392,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] if not result.get("success"): error_detail = result.get("error", "Unknown error") - mode = "local" if _is_local_mode() else "cloud" + _cp = _get_cloud_provider() + mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})" return json.dumps({ "success": False, "error": f"Failed to take screenshot ({mode} mode): {error_detail}" @@ -1521,7 +1405,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] # Check if screenshot file was created if not screenshot_path.exists(): - mode = "local" if _is_local_mode() else "cloud" + _cp = _get_cloud_provider() + mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})" return json.dumps({ "success": False, "error": ( @@ -1639,48 +1524,6 @@ def _cleanup_old_recordings(max_age_hours=72): # Cleanup and Management Functions # ============================================================================ -def _close_browserbase_session(session_id: str, api_key: str, project_id: str) -> bool: - """ - Close a Browserbase session immediately via the API. - - Uses POST /v1/sessions/{id} with status=REQUEST_RELEASE to immediately - terminate the session without waiting for keepAlive timeout. - - Args: - session_id: The Browserbase session ID - api_key: Browserbase API key - project_id: Browserbase project ID - - Returns: - True if session was successfully closed, False otherwise - """ - try: - # POST to update session status to REQUEST_RELEASE - response = requests.post( - f"https://api.browserbase.com/v1/sessions/{session_id}", - headers={ - "X-BB-API-Key": api_key, - "Content-Type": "application/json" - }, - json={ - "projectId": project_id, - "status": "REQUEST_RELEASE" - }, - timeout=10 - ) - - if response.status_code in (200, 201, 204): - logger.debug("Successfully closed BrowserBase session %s", session_id) - return True - else: - logger.warning("Failed to close session %s: HTTP %s - %s", session_id, response.status_code, response.text[:200]) - return False - - except Exception as e: - logger.error("Exception closing session %s: %s", session_id, e) - return False - - def cleanup_browser(task_id: Optional[str] = None) -> None: """ Clean up browser session for a task. @@ -1721,15 +1564,14 @@ def cleanup_browser(task_id: Optional[str] = None) -> None: _active_sessions.pop(task_id, None) _session_last_activity.pop(task_id, None) - # Cloud mode: close the Browserbase session via API - if bb_session_id and not _is_local_mode(): - try: - config = _get_browserbase_config() - success = _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"]) - if not success: - logger.warning("Could not close BrowserBase session %s", bb_session_id) - except Exception as e: - logger.error("Exception during BrowserBase session close: %s", e) + # Cloud mode: close the cloud browser session via provider API + if bb_session_id: + provider = _get_cloud_provider() + if provider is not None: + try: + provider.close_session(bb_session_id) + except Exception as e: + logger.warning("Could not close cloud browser session: %s", e) # Kill the daemon process and clean up socket directory session_name = session_info.get("session_name", "") @@ -1798,12 +1640,10 @@ def check_browser_requirements() -> bool: except FileNotFoundError: return False - # In cloud mode, also require Browserbase credentials - if not _is_local_mode(): - api_key = os.environ.get("BROWSERBASE_API_KEY") - project_id = os.environ.get("BROWSERBASE_PROJECT_ID") - if not api_key or not project_id: - return False + # In cloud mode, also require provider credentials + provider = _get_cloud_provider() + if provider is not None and not provider.is_configured(): + return False return True @@ -1819,7 +1659,8 @@ if __name__ == "__main__": print("🌐 Browser Tool Module") print("=" * 40) - mode = "local" if _is_local_mode() else "cloud (Browserbase)" + _cp = _get_cloud_provider() + mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})" print(f" Mode: {mode}") # Check requirements @@ -1832,12 +1673,9 @@ if __name__ == "__main__": except FileNotFoundError: print(" - agent-browser CLI not found") print(" Install: npm install -g agent-browser && agent-browser install --with-deps") - if not _is_local_mode(): - if not os.environ.get("BROWSERBASE_API_KEY"): - print(" - BROWSERBASE_API_KEY not set (required for cloud mode)") - if not os.environ.get("BROWSERBASE_PROJECT_ID"): - print(" - BROWSERBASE_PROJECT_ID not set (required for cloud mode)") - print(" Tip: unset BROWSERBASE_API_KEY to use free local mode instead") + if _cp is not None and not _cp.is_configured(): + print(f" - {_cp.provider_name()} credentials not configured") + print(" Tip: remove cloud_provider from config to use free local mode instead") print("\n📋 Available Browser Tools:") for schema in BROWSER_TOOL_SCHEMAS: