feat(browser): multi-provider cloud browser support + Browser Use integration

Introduce a cloud browser provider abstraction so users can switch
between Local Browser, Browserbase, and Browser Use (or future providers)
via hermes tools / hermes setup.

Cloud browser providers are behind an ABC (tools/browser_providers/base.py)
so adding a new provider is a single-file addition with no changes to
browser_tool.py internals.

Changes:
- tools/browser_providers/ package with ABC, Browserbase extraction,
  and Browser Use provider
- browser_tool.py refactored to use _PROVIDER_REGISTRY + _get_cloud_provider()
  (cached) instead of hardcoded _is_local_mode() / _create_browserbase_session()
- tools_config.py: generic _is_provider_active() / _detect_active_provider_index()
  replace TTS-only logic; Browser Use added as third browser option
- config.py: BROWSER_USE_API_KEY added to OPTIONAL_ENV_VARS + show_config + allowlist
- subprocess pipe hang fix: agent-browser daemon inherits pipe fds,
  communicate() blocks. Replaced with Popen + temp files.

Original PR: #1208
Co-authored-by: ShawnPana <shawnpana@users.noreply.github.com>
This commit is contained in:
ShawnPana
2026-03-17 00:16:34 -07:00
committed by teknium1
parent 4768ea624d
commit d44b6b7f1b
7 changed files with 567 additions and 303 deletions

View File

@@ -507,6 +507,14 @@ OPTIONAL_ENV_VARS = {
"password": False,
"category": "tool",
},
"BROWSER_USE_API_KEY": {
"description": "Browser Use API key for cloud browser (optional — local browser works without this)",
"prompt": "Browser Use API key",
"url": "https://browser-use.com/",
"tools": ["browser_navigate", "browser_click"],
"password": True,
"category": "tool",
},
"FAL_KEY": {
"description": "FAL API key for image generation",
"prompt": "FAL API key",
@@ -1258,6 +1266,7 @@ def show_config():
("VOICE_TOOLS_OPENAI_KEY", "OpenAI (STT/TTS)"),
("FIRECRAWL_API_KEY", "Firecrawl"),
("BROWSERBASE_API_KEY", "Browserbase"),
("BROWSER_USE_API_KEY", "Browser Use"),
("FAL_KEY", "FAL"),
]
@@ -1404,7 +1413,7 @@ def set_config_value(key: str, value: str):
# Check if it's an API key (goes to .env)
api_keys = [
'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY',
'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID',
'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY',
'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN',
'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY',
'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN',

View File

@@ -190,6 +190,7 @@ TOOL_CATEGORIES = {
"name": "Local Browser",
"tag": "Free headless Chromium (no API key needed)",
"env_vars": [],
"browser_provider": None,
"post_setup": "browserbase", # Same npm install for agent-browser
},
{
@@ -199,6 +200,16 @@ TOOL_CATEGORIES = {
{"key": "BROWSERBASE_API_KEY", "prompt": "Browserbase API key", "url": "https://browserbase.com"},
{"key": "BROWSERBASE_PROJECT_ID", "prompt": "Browserbase project ID"},
],
"browser_provider": "browserbase",
"post_setup": "browserbase",
},
{
"name": "Browser Use",
"tag": "Cloud browser with remote execution",
"env_vars": [
{"key": "BROWSER_USE_API_KEY", "prompt": "Browser Use API key", "url": "https://browser-use.com"},
],
"browser_provider": "browser-use",
"post_setup": "browserbase",
},
],
@@ -575,10 +586,10 @@ def _configure_tool_category(ts_key: str, cat: dict, config: dict):
configured = ""
env_vars = p.get("env_vars", [])
if not env_vars or all(get_env_value(v["key"]) for v in env_vars):
if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]:
if _is_provider_active(p, config):
configured = " [active]"
elif not env_vars:
configured = " [active]" if config.get("tts", {}).get("provider", "edge") == p.get("tts_provider", "") else ""
configured = ""
else:
configured = " [configured]"
provider_choices.append(f"{p['name']}{tag}{configured}")
@@ -587,15 +598,7 @@ def _configure_tool_category(ts_key: str, cat: dict, config: dict):
provider_choices.append("Skip — keep defaults / configure later")
# Detect current provider as default
default_idx = 0
for i, p in enumerate(providers):
if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]:
default_idx = i
break
env_vars = p.get("env_vars", [])
if env_vars and all(get_env_value(v["key"]) for v in env_vars):
default_idx = i
break
default_idx = _detect_active_provider_index(providers, config)
provider_idx = _prompt_choice(f" {title}:", provider_choices, default_idx)
@@ -607,6 +610,28 @@ def _configure_tool_category(ts_key: str, cat: dict, config: dict):
_configure_provider(providers[provider_idx], config)
def _is_provider_active(provider: dict, config: dict) -> bool:
"""Check if a provider entry matches the currently active config."""
if provider.get("tts_provider"):
return config.get("tts", {}).get("provider") == provider["tts_provider"]
if "browser_provider" in provider:
current = config.get("browser", {}).get("cloud_provider")
return provider["browser_provider"] == current
return False
def _detect_active_provider_index(providers: list, config: dict) -> int:
"""Return the index of the currently active provider, or 0."""
for i, p in enumerate(providers):
if _is_provider_active(p, config):
return i
# Fallback: env vars present → likely configured
env_vars = p.get("env_vars", [])
if env_vars and all(get_env_value(v["key"]) for v in env_vars):
return i
return 0
def _configure_provider(provider: dict, config: dict):
"""Configure a single provider - prompt for API keys and set config."""
env_vars = provider.get("env_vars", [])
@@ -615,6 +640,15 @@ def _configure_provider(provider: dict, config: dict):
if provider.get("tts_provider"):
config.setdefault("tts", {})["provider"] = provider["tts_provider"]
# Set browser cloud provider in config if applicable
if "browser_provider" in provider:
bp = provider["browser_provider"]
if bp:
config.setdefault("browser", {})["cloud_provider"] = bp
_print_success(f" Browser cloud provider set to: {bp}")
else:
config.get("browser", {}).pop("cloud_provider", None)
if not env_vars:
_print_success(f" {provider['name']} - no configuration needed!")
return
@@ -767,7 +801,7 @@ def _configure_tool_category_for_reconfig(ts_key: str, cat: dict, config: dict):
configured = ""
env_vars = p.get("env_vars", [])
if not env_vars or all(get_env_value(v["key"]) for v in env_vars):
if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]:
if _is_provider_active(p, config):
configured = " [active]"
elif not env_vars:
configured = ""
@@ -775,15 +809,7 @@ def _configure_tool_category_for_reconfig(ts_key: str, cat: dict, config: dict):
configured = " [configured]"
provider_choices.append(f"{p['name']}{tag}{configured}")
default_idx = 0
for i, p in enumerate(providers):
if p.get("tts_provider") and config.get("tts", {}).get("provider") == p["tts_provider"]:
default_idx = i
break
env_vars = p.get("env_vars", [])
if env_vars and all(get_env_value(v["key"]) for v in env_vars):
default_idx = i
break
default_idx = _detect_active_provider_index(providers, config)
provider_idx = _prompt_choice(" Select provider:", provider_choices, default_idx)
_reconfigure_provider(providers[provider_idx], config)
@@ -797,6 +823,15 @@ def _reconfigure_provider(provider: dict, config: dict):
config.setdefault("tts", {})["provider"] = provider["tts_provider"]
_print_success(f" TTS provider set to: {provider['tts_provider']}")
if "browser_provider" in provider:
bp = provider["browser_provider"]
if bp:
config.setdefault("browser", {})["cloud_provider"] = bp
_print_success(f" Browser cloud provider set to: {bp}")
else:
config.get("browser", {}).pop("cloud_provider", None)
_print_success(f" Browser set to local mode")
if not env_vars:
_print_success(f" {provider['name']} - no configuration needed!")
return

View File

@@ -0,0 +1,10 @@
"""Cloud browser provider abstraction.
Import the ABC so callers can do::
from tools.browser_providers import CloudBrowserProvider
"""
from tools.browser_providers.base import CloudBrowserProvider
__all__ = ["CloudBrowserProvider"]

View File

@@ -0,0 +1,59 @@
"""Abstract base class for cloud browser providers."""
from abc import ABC, abstractmethod
from typing import Dict
class CloudBrowserProvider(ABC):
"""Interface for cloud browser backends (Browserbase, Steel, etc.).
Implementations live in sibling modules and are registered in
``browser_tool._PROVIDER_REGISTRY``. The user selects a provider via
``hermes setup`` / ``hermes tools``; the choice is persisted as
``config["browser"]["cloud_provider"]``.
"""
@abstractmethod
def provider_name(self) -> str:
"""Short, human-readable name shown in logs and diagnostics."""
@abstractmethod
def is_configured(self) -> bool:
"""Return True when all required env vars / credentials are present.
Called at tool-registration time (``check_browser_requirements``) to
gate availability. Must be cheap — no network calls.
"""
@abstractmethod
def create_session(self, task_id: str) -> Dict[str, object]:
"""Create a cloud browser session and return session metadata.
Must return a dict with at least::
{
"session_name": str, # unique name for agent-browser --session
"bb_session_id": str, # provider session ID (for close/cleanup)
"cdp_url": str, # CDP websocket URL
"features": dict, # feature flags that were enabled
}
``bb_session_id`` is a legacy key name kept for backward compat with
the rest of browser_tool.py — it holds the provider's session ID
regardless of which provider is in use.
"""
@abstractmethod
def close_session(self, session_id: str) -> bool:
"""Release / terminate a cloud session by its provider session ID.
Returns True on success, False on failure. Should not raise.
"""
@abstractmethod
def emergency_cleanup(self, session_id: str) -> None:
"""Best-effort session teardown during process exit.
Called from atexit / signal handlers. Must tolerate missing
credentials, network errors, etc. — log and move on.
"""

View File

@@ -0,0 +1,107 @@
"""Browser Use cloud browser provider."""
import logging
import os
import uuid
from typing import Dict
import requests
from tools.browser_providers.base import CloudBrowserProvider
logger = logging.getLogger(__name__)
_BASE_URL = "https://api.browser-use.com/api/v2"
class BrowserUseProvider(CloudBrowserProvider):
"""Browser Use (https://browser-use.com) cloud browser backend."""
def provider_name(self) -> str:
return "Browser Use"
def is_configured(self) -> bool:
return bool(os.environ.get("BROWSER_USE_API_KEY"))
# ------------------------------------------------------------------
# Session lifecycle
# ------------------------------------------------------------------
def _headers(self) -> Dict[str, str]:
api_key = os.environ.get("BROWSER_USE_API_KEY")
if not api_key:
raise ValueError(
"BROWSER_USE_API_KEY environment variable is required. "
"Get your key at https://browser-use.com"
)
return {
"Content-Type": "application/json",
"X-Browser-Use-API-Key": api_key,
}
def create_session(self, task_id: str) -> Dict[str, object]:
response = requests.post(
f"{_BASE_URL}/browsers",
headers=self._headers(),
json={},
timeout=30,
)
if not response.ok:
raise RuntimeError(
f"Failed to create Browser Use session: "
f"{response.status_code} {response.text}"
)
session_data = response.json()
session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}"
logger.info("Created Browser Use session %s", session_name)
return {
"session_name": session_name,
"bb_session_id": session_data["id"],
"cdp_url": session_data["cdpUrl"],
"features": {"browser_use": True},
}
def close_session(self, session_id: str) -> bool:
try:
response = requests.patch(
f"{_BASE_URL}/browsers/{session_id}",
headers=self._headers(),
json={"action": "stop"},
timeout=10,
)
if response.status_code in (200, 201, 204):
logger.debug("Successfully closed Browser Use session %s", session_id)
return True
else:
logger.warning(
"Failed to close Browser Use session %s: HTTP %s - %s",
session_id,
response.status_code,
response.text[:200],
)
return False
except Exception as e:
logger.error("Exception closing Browser Use session %s: %s", session_id, e)
return False
def emergency_cleanup(self, session_id: str) -> None:
api_key = os.environ.get("BROWSER_USE_API_KEY")
if not api_key:
logger.warning("Cannot emergency-cleanup Browser Use session %s — missing credentials", session_id)
return
try:
requests.patch(
f"{_BASE_URL}/browsers/{session_id}",
headers={
"Content-Type": "application/json",
"X-Browser-Use-API-Key": api_key,
},
json={"action": "stop"},
timeout=5,
)
except Exception as e:
logger.debug("Emergency cleanup failed for Browser Use session %s: %s", session_id, e)

View File

@@ -0,0 +1,206 @@
"""Browserbase cloud browser provider."""
import logging
import os
import uuid
from typing import Dict
import requests
from tools.browser_providers.base import CloudBrowserProvider
logger = logging.getLogger(__name__)
class BrowserbaseProvider(CloudBrowserProvider):
"""Browserbase (https://browserbase.com) cloud browser backend."""
def provider_name(self) -> str:
return "Browserbase"
def is_configured(self) -> bool:
return bool(
os.environ.get("BROWSERBASE_API_KEY")
and os.environ.get("BROWSERBASE_PROJECT_ID")
)
# ------------------------------------------------------------------
# Session lifecycle
# ------------------------------------------------------------------
def _get_config(self) -> Dict[str, str]:
api_key = os.environ.get("BROWSERBASE_API_KEY")
project_id = os.environ.get("BROWSERBASE_PROJECT_ID")
if not api_key or not project_id:
raise ValueError(
"BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment "
"variables are required. Get your credentials at "
"https://browserbase.com"
)
return {"api_key": api_key, "project_id": project_id}
def create_session(self, task_id: str) -> Dict[str, object]:
config = self._get_config()
# Optional env-var knobs
enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false"
enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true"
enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false"
custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT")
features_enabled = {
"basic_stealth": True,
"proxies": False,
"advanced_stealth": False,
"keep_alive": False,
"custom_timeout": False,
}
session_config: Dict[str, object] = {"projectId": config["project_id"]}
if enable_keep_alive:
session_config["keepAlive"] = True
if custom_timeout_ms:
try:
timeout_val = int(custom_timeout_ms)
if timeout_val > 0:
session_config["timeout"] = timeout_val
except ValueError:
logger.warning("Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms)
if enable_proxies:
session_config["proxies"] = True
if enable_advanced_stealth:
session_config["browserSettings"] = {"advancedStealth": True}
# --- Create session via API ---
headers = {
"Content-Type": "application/json",
"X-BB-API-Key": config["api_key"],
}
response = requests.post(
"https://api.browserbase.com/v1/sessions",
headers=headers,
json=session_config,
timeout=30,
)
proxies_fallback = False
keepalive_fallback = False
# Handle 402 — paid features unavailable
if response.status_code == 402:
if enable_keep_alive:
keepalive_fallback = True
logger.warning(
"keepAlive may require paid plan (402), retrying without it. "
"Sessions may timeout during long operations."
)
session_config.pop("keepAlive", None)
response = requests.post(
"https://api.browserbase.com/v1/sessions",
headers=headers,
json=session_config,
timeout=30,
)
if response.status_code == 402 and enable_proxies:
proxies_fallback = True
logger.warning(
"Proxies unavailable (402), retrying without proxies. "
"Bot detection may be less effective."
)
session_config.pop("proxies", None)
response = requests.post(
"https://api.browserbase.com/v1/sessions",
headers=headers,
json=session_config,
timeout=30,
)
if not response.ok:
raise RuntimeError(
f"Failed to create Browserbase session: "
f"{response.status_code} {response.text}"
)
session_data = response.json()
session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}"
if enable_proxies and not proxies_fallback:
features_enabled["proxies"] = True
if enable_advanced_stealth:
features_enabled["advanced_stealth"] = True
if enable_keep_alive and not keepalive_fallback:
features_enabled["keep_alive"] = True
if custom_timeout_ms and "timeout" in session_config:
features_enabled["custom_timeout"] = True
feature_str = ", ".join(k for k, v in features_enabled.items() if v)
logger.info("Created Browserbase session %s with features: %s", session_name, feature_str)
return {
"session_name": session_name,
"bb_session_id": session_data["id"],
"cdp_url": session_data["connectUrl"],
"features": features_enabled,
}
def close_session(self, session_id: str) -> bool:
try:
config = self._get_config()
except ValueError:
logger.warning("Cannot close Browserbase session %s — missing credentials", session_id)
return False
try:
response = requests.post(
f"https://api.browserbase.com/v1/sessions/{session_id}",
headers={
"X-BB-API-Key": config["api_key"],
"Content-Type": "application/json",
},
json={
"projectId": config["project_id"],
"status": "REQUEST_RELEASE",
},
timeout=10,
)
if response.status_code in (200, 201, 204):
logger.debug("Successfully closed Browserbase session %s", session_id)
return True
else:
logger.warning(
"Failed to close session %s: HTTP %s - %s",
session_id,
response.status_code,
response.text[:200],
)
return False
except Exception as e:
logger.error("Exception closing Browserbase session %s: %s", session_id, e)
return False
def emergency_cleanup(self, session_id: str) -> None:
api_key = os.environ.get("BROWSERBASE_API_KEY")
project_id = os.environ.get("BROWSERBASE_PROJECT_ID")
if not api_key or not project_id:
logger.warning("Cannot emergency-cleanup Browserbase session %s — missing credentials", session_id)
return
try:
requests.post(
f"https://api.browserbase.com/v1/sessions/{session_id}",
headers={
"X-BB-API-Key": api_key,
"Content-Type": "application/json",
},
json={
"projectId": project_id,
"status": "REQUEST_RELEASE",
},
timeout=5,
)
except Exception as e:
logger.debug("Emergency cleanup failed for Browserbase session %s: %s", session_id, e)

View File

@@ -65,6 +65,9 @@ import requests
from typing import Dict, Any, Optional, List
from pathlib import Path
from agent.auxiliary_client import call_llm
from tools.browser_providers.base import CloudBrowserProvider
from tools.browser_providers.browserbase import BrowserbaseProvider
from tools.browser_providers.browser_use import BrowserUseProvider
logger = logging.getLogger(__name__)
@@ -108,16 +111,43 @@ def _get_cdp_override() -> str:
return os.environ.get("BROWSER_CDP_URL", "").strip()
def _is_local_mode() -> bool:
"""Return True when no Browserbase credentials are configured.
# ============================================================================
# Cloud Provider Registry
# ============================================================================
In local mode the browser tools launch a headless Chromium instance via
``agent-browser --session`` instead of connecting to a remote Browserbase
session via ``--cdp``.
_PROVIDER_REGISTRY: Dict[str, type] = {
"browserbase": BrowserbaseProvider,
"browser-use": BrowserUseProvider,
}
_cached_cloud_provider: Optional[CloudBrowserProvider] = None
_cloud_provider_resolved = False
def _get_cloud_provider() -> Optional[CloudBrowserProvider]:
"""Return the configured cloud browser provider, or None for local mode.
Reads ``config["browser"]["cloud_provider"]`` once and caches the result
for the process lifetime. If unset → local mode (None).
"""
if _get_cdp_override():
return False # CDP override takes priority
return not (os.environ.get("BROWSERBASE_API_KEY") and os.environ.get("BROWSERBASE_PROJECT_ID"))
global _cached_cloud_provider, _cloud_provider_resolved
if _cloud_provider_resolved:
return _cached_cloud_provider
_cloud_provider_resolved = True
try:
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
config_path = hermes_home / "config.yaml"
if config_path.exists():
import yaml
with open(config_path) as f:
cfg = yaml.safe_load(f) or {}
provider_key = cfg.get("browser", {}).get("cloud_provider")
if provider_key and provider_key in _PROVIDER_REGISTRY:
_cached_cloud_provider = _PROVIDER_REGISTRY[provider_key]()
except Exception as e:
logger.debug("Could not read cloud_provider from config: %s", e)
return _cached_cloud_provider
def _socket_safe_tmpdir() -> str:
@@ -452,161 +482,6 @@ BROWSER_TOOL_SCHEMAS = [
# Utility Functions
# ============================================================================
def _create_browserbase_session(task_id: str) -> Dict[str, str]:
"""
Create a Browserbase session with stealth features.
Browserbase Stealth Modes:
- Basic Stealth: ALWAYS enabled automatically. Generates random fingerprints,
viewports, and solves visual CAPTCHAs. No configuration needed.
- Advanced Stealth: Uses custom Chromium build for better bot detection avoidance.
Requires Scale Plan. Enable via BROWSERBASE_ADVANCED_STEALTH=true.
Proxies are enabled by default to route traffic through residential IPs,
which significantly improves CAPTCHA solving rates. Can be disabled via
BROWSERBASE_PROXIES=false if needed.
Args:
task_id: Unique identifier for the task
Returns:
Dict with session_name, bb_session_id, cdp_url, and feature flags
"""
import uuid
import sys
config = _get_browserbase_config()
# Check for optional settings from environment
# Proxies: enabled by default for better CAPTCHA solving
enable_proxies = os.environ.get("BROWSERBASE_PROXIES", "true").lower() != "false"
# Advanced Stealth: requires Scale Plan, disabled by default
enable_advanced_stealth = os.environ.get("BROWSERBASE_ADVANCED_STEALTH", "false").lower() == "true"
# keepAlive: enabled by default (requires paid plan) - allows reconnection after disconnects
enable_keep_alive = os.environ.get("BROWSERBASE_KEEP_ALIVE", "true").lower() != "false"
# Custom session timeout in milliseconds (optional) - extends session beyond project default
custom_timeout_ms = os.environ.get("BROWSERBASE_SESSION_TIMEOUT")
# Track which features are actually enabled for logging/debugging
features_enabled = {
"basic_stealth": True, # Always on
"proxies": False,
"advanced_stealth": False,
"keep_alive": False,
"custom_timeout": False,
}
# Build session configuration
# Note: Basic stealth mode is ALWAYS active - no configuration needed
session_config = {
"projectId": config["project_id"],
}
# Enable keepAlive for session reconnection (default: true, requires paid plan)
# Allows reconnecting to the same session after network hiccups
if enable_keep_alive:
session_config["keepAlive"] = True
# Add custom timeout if specified (in milliseconds)
# This extends session duration beyond project's default timeout
if custom_timeout_ms:
try:
timeout_val = int(custom_timeout_ms)
if timeout_val > 0:
session_config["timeout"] = timeout_val
except ValueError:
logger.warning("Invalid BROWSERBASE_SESSION_TIMEOUT value: %s", custom_timeout_ms)
# Enable proxies for better CAPTCHA solving (default: true)
# Routes traffic through residential IPs for more reliable access
if enable_proxies:
session_config["proxies"] = True
# Add advanced stealth if enabled (requires Scale Plan)
# Uses custom Chromium build to avoid bot detection altogether
if enable_advanced_stealth:
session_config["browserSettings"] = {
"advancedStealth": True,
}
# Create session via Browserbase API
response = requests.post(
"https://api.browserbase.com/v1/sessions",
headers={
"Content-Type": "application/json",
"X-BB-API-Key": config["api_key"],
},
json=session_config,
timeout=30
)
# Track if we fell back from paid features
proxies_fallback = False
keepalive_fallback = False
# Handle 402 Payment Required - likely paid features not available
# Try to identify which feature caused the issue and retry without it
if response.status_code == 402:
# First try without keepAlive (most likely culprit for paid plan requirement)
if enable_keep_alive:
keepalive_fallback = True
logger.warning("keepAlive may require paid plan (402), retrying without it. "
"Sessions may timeout during long operations.")
session_config.pop("keepAlive", None)
response = requests.post(
"https://api.browserbase.com/v1/sessions",
headers={
"Content-Type": "application/json",
"X-BB-API-Key": config["api_key"],
},
json=session_config,
timeout=30
)
# If still 402, try without proxies too
if response.status_code == 402 and enable_proxies:
proxies_fallback = True
logger.warning("Proxies unavailable (402), retrying without proxies. "
"Bot detection may be less effective.")
session_config.pop("proxies", None)
response = requests.post(
"https://api.browserbase.com/v1/sessions",
headers={
"Content-Type": "application/json",
"X-BB-API-Key": config["api_key"],
},
json=session_config,
timeout=30
)
if not response.ok:
raise RuntimeError(f"Failed to create Browserbase session: {response.status_code} {response.text}")
session_data = response.json()
session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}"
# Update features based on what actually succeeded
if enable_proxies and not proxies_fallback:
features_enabled["proxies"] = True
if enable_advanced_stealth:
features_enabled["advanced_stealth"] = True
if enable_keep_alive and not keepalive_fallback:
features_enabled["keep_alive"] = True
if custom_timeout_ms and "timeout" in session_config:
features_enabled["custom_timeout"] = True
# Log session info for debugging
feature_str = ", ".join(k for k, v in features_enabled.items() if v)
logger.info("Created session %s with features: %s", session_name, feature_str)
return {
"session_name": session_name,
"bb_session_id": session_data["id"],
"cdp_url": session_data["connectUrl"],
"features": features_enabled,
}
def _create_local_session(task_id: str) -> Dict[str, str]:
import uuid
session_name = f"h_{uuid.uuid4().hex[:10]}"
@@ -667,10 +542,12 @@ def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]:
cdp_override = _get_cdp_override()
if cdp_override:
session_info = _create_cdp_session(task_id, cdp_override)
elif _is_local_mode():
session_info = _create_local_session(task_id)
else:
session_info = _create_browserbase_session(task_id)
provider = _get_cloud_provider()
if provider is None:
session_info = _create_local_session(task_id)
else:
session_info = provider.create_session(task_id)
with _cleanup_lock:
_active_sessions[task_id] = session_info
@@ -692,31 +569,6 @@ def _get_session_name(task_id: Optional[str] = None) -> str:
return session_info["session_name"]
def _get_browserbase_config() -> Dict[str, str]:
"""
Get Browserbase configuration from environment.
Returns:
Dict with api_key and project_id
Raises:
ValueError: If required env vars are not set
"""
api_key = os.environ.get("BROWSERBASE_API_KEY")
project_id = os.environ.get("BROWSERBASE_PROJECT_ID")
if not api_key or not project_id:
raise ValueError(
"BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID environment variables are required. "
"Get your credentials at https://browserbase.com"
)
return {
"api_key": api_key,
"project_id": project_id
}
def _find_agent_browser() -> str:
"""
Find the agent-browser CLI executable.
@@ -859,27 +711,62 @@ def _run_browser_command(
browser_env["PATH"] = ":".join(path_parts)
browser_env["AGENT_BROWSER_SOCKET_DIR"] = task_socket_dir
result = subprocess.run(
cmd_parts,
capture_output=True,
text=True,
timeout=timeout,
env=browser_env,
)
# Use temp files for stdout/stderr instead of pipes.
# agent-browser starts a background daemon that inherits file
# descriptors. With capture_output=True (pipes), the daemon keeps
# the pipe fds open after the CLI exits, so communicate() never
# sees EOF and blocks until the timeout fires.
stdout_path = os.path.join(task_socket_dir, f"_stdout_{command}")
stderr_path = os.path.join(task_socket_dir, f"_stderr_{command}")
stdout_fd = os.open(stdout_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
stderr_fd = os.open(stderr_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
try:
proc = subprocess.Popen(
cmd_parts,
stdout=stdout_fd,
stderr=stderr_fd,
stdin=subprocess.DEVNULL,
env=browser_env,
)
finally:
os.close(stdout_fd)
os.close(stderr_fd)
try:
proc.wait(timeout=timeout)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait()
logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)",
command, timeout, task_id, task_socket_dir)
return {"success": False, "error": f"Command timed out after {timeout} seconds"}
with open(stdout_path, "r") as f:
stdout = f.read()
with open(stderr_path, "r") as f:
stderr = f.read()
returncode = proc.returncode
# Clean up temp files (best-effort)
for p in (stdout_path, stderr_path):
try:
os.unlink(p)
except OSError:
pass
# Log stderr for diagnostics — use warning level on failure so it's visible
if result.stderr and result.stderr.strip():
level = logging.WARNING if result.returncode != 0 else logging.DEBUG
logger.log(level, "browser '%s' stderr: %s", command, result.stderr.strip()[:500])
if stderr and stderr.strip():
level = logging.WARNING if returncode != 0 else logging.DEBUG
logger.log(level, "browser '%s' stderr: %s", command, stderr.strip()[:500])
# Log empty output as warning — common sign of broken agent-browser
if not result.stdout.strip() and result.returncode == 0:
if not stdout.strip() and returncode == 0:
logger.warning("browser '%s' returned empty stdout with rc=0. "
"cmd=%s stderr=%s",
command, " ".join(cmd_parts[:4]) + "...",
(result.stderr or "")[:200])
(stderr or "")[:200])
stdout_text = result.stdout.strip()
stdout_text = stdout.strip()
if stdout_text:
try:
@@ -890,15 +777,15 @@ def _run_browser_command(
if not snap_data.get("snapshot") and not snap_data.get("refs"):
logger.warning("snapshot returned empty content. "
"Possible stale daemon or CDP connection issue. "
"returncode=%s", result.returncode)
"returncode=%s", returncode)
return parsed
except json.JSONDecodeError:
raw = stdout_text[:2000]
logger.warning("browser '%s' returned non-JSON output (rc=%s): %s",
command, result.returncode, raw[:500])
command, returncode, raw[:500])
if command == "screenshot":
stderr_text = (result.stderr or "").strip()
stderr_text = (stderr or "").strip()
combined_text = "\n".join(
part for part in [stdout_text, stderr_text] if part
)
@@ -923,17 +810,13 @@ def _run_browser_command(
}
# Check for errors
if result.returncode != 0:
error_msg = result.stderr.strip() if result.stderr else f"Command failed with code {result.returncode}"
logger.warning("browser '%s' failed (rc=%s): %s", command, result.returncode, error_msg[:300])
if returncode != 0:
error_msg = stderr.strip() if stderr else f"Command failed with code {returncode}"
logger.warning("browser '%s' failed (rc=%s): %s", command, returncode, error_msg[:300])
return {"success": False, "error": error_msg}
return {"success": True, "data": {}}
except subprocess.TimeoutExpired:
logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)",
command, timeout, task_id, task_socket_dir)
return {"success": False, "error": f"Command timed out after {timeout} seconds"}
except Exception as e:
logger.warning("browser '%s' exception: %s", command, e, exc_info=True)
return {"success": False, "error": str(e)}
@@ -1509,7 +1392,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
if not result.get("success"):
error_detail = result.get("error", "Unknown error")
mode = "local" if _is_local_mode() else "cloud"
_cp = _get_cloud_provider()
mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})"
return json.dumps({
"success": False,
"error": f"Failed to take screenshot ({mode} mode): {error_detail}"
@@ -1521,7 +1405,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
# Check if screenshot file was created
if not screenshot_path.exists():
mode = "local" if _is_local_mode() else "cloud"
_cp = _get_cloud_provider()
mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})"
return json.dumps({
"success": False,
"error": (
@@ -1639,48 +1524,6 @@ def _cleanup_old_recordings(max_age_hours=72):
# Cleanup and Management Functions
# ============================================================================
def _close_browserbase_session(session_id: str, api_key: str, project_id: str) -> bool:
"""
Close a Browserbase session immediately via the API.
Uses POST /v1/sessions/{id} with status=REQUEST_RELEASE to immediately
terminate the session without waiting for keepAlive timeout.
Args:
session_id: The Browserbase session ID
api_key: Browserbase API key
project_id: Browserbase project ID
Returns:
True if session was successfully closed, False otherwise
"""
try:
# POST to update session status to REQUEST_RELEASE
response = requests.post(
f"https://api.browserbase.com/v1/sessions/{session_id}",
headers={
"X-BB-API-Key": api_key,
"Content-Type": "application/json"
},
json={
"projectId": project_id,
"status": "REQUEST_RELEASE"
},
timeout=10
)
if response.status_code in (200, 201, 204):
logger.debug("Successfully closed BrowserBase session %s", session_id)
return True
else:
logger.warning("Failed to close session %s: HTTP %s - %s", session_id, response.status_code, response.text[:200])
return False
except Exception as e:
logger.error("Exception closing session %s: %s", session_id, e)
return False
def cleanup_browser(task_id: Optional[str] = None) -> None:
"""
Clean up browser session for a task.
@@ -1721,15 +1564,14 @@ def cleanup_browser(task_id: Optional[str] = None) -> None:
_active_sessions.pop(task_id, None)
_session_last_activity.pop(task_id, None)
# Cloud mode: close the Browserbase session via API
if bb_session_id and not _is_local_mode():
try:
config = _get_browserbase_config()
success = _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"])
if not success:
logger.warning("Could not close BrowserBase session %s", bb_session_id)
except Exception as e:
logger.error("Exception during BrowserBase session close: %s", e)
# Cloud mode: close the cloud browser session via provider API
if bb_session_id:
provider = _get_cloud_provider()
if provider is not None:
try:
provider.close_session(bb_session_id)
except Exception as e:
logger.warning("Could not close cloud browser session: %s", e)
# Kill the daemon process and clean up socket directory
session_name = session_info.get("session_name", "")
@@ -1798,12 +1640,10 @@ def check_browser_requirements() -> bool:
except FileNotFoundError:
return False
# In cloud mode, also require Browserbase credentials
if not _is_local_mode():
api_key = os.environ.get("BROWSERBASE_API_KEY")
project_id = os.environ.get("BROWSERBASE_PROJECT_ID")
if not api_key or not project_id:
return False
# In cloud mode, also require provider credentials
provider = _get_cloud_provider()
if provider is not None and not provider.is_configured():
return False
return True
@@ -1819,7 +1659,8 @@ if __name__ == "__main__":
print("🌐 Browser Tool Module")
print("=" * 40)
mode = "local" if _is_local_mode() else "cloud (Browserbase)"
_cp = _get_cloud_provider()
mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})"
print(f" Mode: {mode}")
# Check requirements
@@ -1832,12 +1673,9 @@ if __name__ == "__main__":
except FileNotFoundError:
print(" - agent-browser CLI not found")
print(" Install: npm install -g agent-browser && agent-browser install --with-deps")
if not _is_local_mode():
if not os.environ.get("BROWSERBASE_API_KEY"):
print(" - BROWSERBASE_API_KEY not set (required for cloud mode)")
if not os.environ.get("BROWSERBASE_PROJECT_ID"):
print(" - BROWSERBASE_PROJECT_ID not set (required for cloud mode)")
print(" Tip: unset BROWSERBASE_API_KEY to use free local mode instead")
if _cp is not None and not _cp.is_configured():
print(f" - {_cp.provider_name()} credentials not configured")
print(" Tip: remove cloud_provider from config to use free local mode instead")
print("\n📋 Available Browser Tools:")
for schema in BROWSER_TOOL_SCHEMAS: