diff --git a/.env.example b/.env.example index 638610e4..d273a696 100644 --- a/.env.example +++ b/.env.example @@ -65,10 +65,15 @@ OPENCODE_GO_API_KEY= # TOOL API KEYS # ============================================================================= +# Parallel API Key - AI-native web search and extract +# Get at: https://parallel.ai +PARALLEL_API_KEY= + # Firecrawl API Key - Web search, extract, and crawl # Get at: https://firecrawl.dev/ FIRECRAWL_API_KEY= + # FAL.ai API Key - Image generation # Get at: https://fal.ai/ FAL_KEY= diff --git a/AGENTS.md b/AGENTS.md index c1fa098b..13998fe1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -44,7 +44,7 @@ hermes-agent/ │ ├── terminal_tool.py # Terminal orchestration │ ├── process_registry.py # Background process management │ ├── file_tools.py # File read/write/search/patch -│ ├── web_tools.py # Firecrawl search/extract +│ ├── web_tools.py # Web search/extract (Parallel + Firecrawl) │ ├── browser_tool.py # Browserbase browser automation │ ├── code_execution_tool.py # execute_code sandbox │ ├── delegate_tool.py # Subagent delegation diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d866539a..25cddde6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -147,7 +147,7 @@ hermes-agent/ │ ├── approval.py # Dangerous command detection + per-session approval │ ├── terminal_tool.py # Terminal orchestration (sudo, env lifecycle, backends) │ ├── file_operations.py # read_file, write_file, search, patch, etc. -│ ├── web_tools.py # web_search, web_extract (Firecrawl + Gemini summarization) +│ ├── web_tools.py # web_search, web_extract (Parallel/Firecrawl + Gemini summarization) │ ├── vision_tools.py # Image analysis via multimodal models │ ├── delegate_tool.py # Subagent spawning and parallel task execution │ ├── code_execution_tool.py # Sandboxed Python with RPC tool access diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 3f8ac78c..5d74a2a9 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -550,6 +550,14 @@ OPTIONAL_ENV_VARS = { }, # ── Tool API keys ── + "PARALLEL_API_KEY": { + "description": "Parallel API key for AI-native web search and extract", + "prompt": "Parallel API key", + "url": "https://parallel.ai/", + "tools": ["web_search", "web_extract"], + "password": True, + "category": "tool", + }, "FIRECRAWL_API_KEY": { "description": "Firecrawl API key for web search and scraping", "prompt": "Firecrawl API key", @@ -1506,6 +1514,7 @@ def show_config(): keys = [ ("OPENROUTER_API_KEY", "OpenRouter"), ("VOICE_TOOLS_OPENAI_KEY", "OpenAI (STT/TTS)"), + ("PARALLEL_API_KEY", "Parallel"), ("FIRECRAWL_API_KEY", "Firecrawl"), ("BROWSERBASE_API_KEY", "Browserbase"), ("BROWSER_USE_API_KEY", "Browser Use"), @@ -1655,7 +1664,7 @@ def set_config_value(key: str, value: str): # Check if it's an API key (goes to .env) api_keys = [ 'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY', - 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY', + 'PARALLEL_API_KEY', 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY', 'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN', 'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY', 'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN', diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 18b4485b..b2647d01 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -444,11 +444,11 @@ def _print_setup_summary(config: dict, hermes_home): else: tool_status.append(("Mixture of Agents", False, "OPENROUTER_API_KEY")) - # Firecrawl (web tools) - if get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"): + # Web tools (Parallel or Firecrawl) + if get_env_value("PARALLEL_API_KEY") or get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"): tool_status.append(("Web Search & Extract", True, None)) else: - tool_status.append(("Web Search & Extract", False, "FIRECRAWL_API_KEY")) + tool_status.append(("Web Search & Extract", False, "PARALLEL_API_KEY or FIRECRAWL_API_KEY")) # Browser tools (local Chromium or Browserbase cloud) import shutil diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index aa8bdeaa..2e008cb5 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -151,19 +151,29 @@ TOOL_CATEGORIES = { "web": { "name": "Web Search & Extract", "setup_title": "Select Search Provider", - "setup_note": "A free DuckDuckGo search skill is also included — skip this if you don't need Firecrawl.", + "setup_note": "A free DuckDuckGo search skill is also included — skip this if you don't need a premium provider.", "icon": "🔍", "providers": [ { "name": "Firecrawl Cloud", - "tag": "Recommended - hosted service", + "tag": "Hosted service - search, extract, and crawl", + "web_backend": "firecrawl", "env_vars": [ {"key": "FIRECRAWL_API_KEY", "prompt": "Firecrawl API key", "url": "https://firecrawl.dev"}, ], }, + { + "name": "Parallel", + "tag": "AI-native search and extract", + "web_backend": "parallel", + "env_vars": [ + {"key": "PARALLEL_API_KEY", "prompt": "Parallel API key", "url": "https://parallel.ai"}, + ], + }, { "name": "Firecrawl Self-Hosted", "tag": "Free - run your own instance", + "web_backend": "firecrawl", "env_vars": [ {"key": "FIRECRAWL_API_URL", "prompt": "Your Firecrawl instance URL (e.g., http://localhost:3002)"}, ], @@ -618,6 +628,9 @@ def _is_provider_active(provider: dict, config: dict) -> bool: if "browser_provider" in provider: current = config.get("browser", {}).get("cloud_provider") return provider["browser_provider"] == current + if provider.get("web_backend"): + current = config.get("web", {}).get("backend") + return current == provider["web_backend"] return False @@ -650,6 +663,11 @@ def _configure_provider(provider: dict, config: dict): else: config.get("browser", {}).pop("cloud_provider", None) + # Set web search backend in config if applicable + if provider.get("web_backend"): + config.setdefault("web", {})["backend"] = provider["web_backend"] + _print_success(f" Web backend set to: {provider['web_backend']}") + if not env_vars: _print_success(f" {provider['name']} - no configuration needed!") return diff --git a/pyproject.toml b/pyproject.toml index 861fca13..7e92f907 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "prompt_toolkit", # Tools "firecrawl-py", + "parallel-web>=0.4.2", "fal-client", # Text-to-speech (Edge TTS is free, no API key needed) "edge-tts", diff --git a/requirements.txt b/requirements.txt index 030c8465..67b05659 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ PyJWT[crypto] # Web tools firecrawl-py +parallel-web>=0.4.2 # Image generation fal-client diff --git a/tests/integration/test_web_tools.py b/tests/integration/test_web_tools.py index fb2ea9da..fe96b3ad 100644 --- a/tests/integration/test_web_tools.py +++ b/tests/integration/test_web_tools.py @@ -3,7 +3,7 @@ Comprehensive Test Suite for Web Tools Module This script tests all web tools functionality to ensure they work correctly. -Run this after any updates to the web_tools.py module or Firecrawl library. +Run this after any updates to the web_tools.py module or backend libraries. Usage: python test_web_tools.py # Run all tests @@ -11,7 +11,7 @@ Usage: python test_web_tools.py --verbose # Show detailed output Requirements: - - FIRECRAWL_API_KEY environment variable must be set + - PARALLEL_API_KEY or FIRECRAWL_API_KEY environment variable must be set - An auxiliary LLM provider (OPENROUTER_API_KEY or Nous Portal auth) (optional, for LLM tests) """ @@ -28,12 +28,14 @@ from typing import List # Import the web tools to test (updated path after moving tools/) from tools.web_tools import ( - web_search_tool, - web_extract_tool, + web_search_tool, + web_extract_tool, web_crawl_tool, check_firecrawl_api_key, + check_web_api_key, check_auxiliary_model, - get_debug_session_info + get_debug_session_info, + _get_backend, ) @@ -121,12 +123,13 @@ class WebToolsTester: """Test environment setup and API keys""" print_section("Environment Check") - # Check Firecrawl API key - if not check_firecrawl_api_key(): - self.log_result("Firecrawl API Key", "failed", "FIRECRAWL_API_KEY not set") + # Check web backend API key (Parallel or Firecrawl) + if not check_web_api_key(): + self.log_result("Web Backend API Key", "failed", "PARALLEL_API_KEY or FIRECRAWL_API_KEY not set") return False else: - self.log_result("Firecrawl API Key", "passed", "Found") + backend = _get_backend() + self.log_result("Web Backend API Key", "passed", f"Using {backend} backend") # Check auxiliary LLM provider (optional) if not check_auxiliary_model(): @@ -578,7 +581,9 @@ class WebToolsTester: }, "results": self.test_results, "environment": { + "web_backend": _get_backend() if check_web_api_key() else None, "firecrawl_api_key": check_firecrawl_api_key(), + "parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")), "auxiliary_model": check_auxiliary_model(), "debug_mode": get_debug_session_info()["enabled"] } diff --git a/tests/tools/test_web_tools_config.py b/tests/tools/test_web_tools_config.py index 4bc49166..fdf67074 100644 --- a/tests/tools/test_web_tools_config.py +++ b/tests/tools/test_web_tools_config.py @@ -1,8 +1,11 @@ -"""Tests for Firecrawl client configuration and singleton behavior. +"""Tests for web backend client configuration and singleton behavior. Coverage: _get_firecrawl_client() — configuration matrix, singleton caching, constructor failure recovery, return value verification, edge cases. + _get_backend() — backend selection logic with env var combinations. + _get_parallel_client() — Parallel client configuration, singleton caching. + check_web_api_key() — unified availability check. """ import os @@ -117,3 +120,157 @@ class TestFirecrawlClientConfig: from tools.web_tools import _get_firecrawl_client with pytest.raises(ValueError): _get_firecrawl_client() + + +class TestBackendSelection: + """Test suite for _get_backend() backend selection logic. + + The backend is configured via config.yaml (web.backend), set by + ``hermes tools``. Falls back to key-based detection for legacy/manual + setups. + """ + + _ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL") + + def setup_method(self): + for key in self._ENV_KEYS: + os.environ.pop(key, None) + + def teardown_method(self): + for key in self._ENV_KEYS: + os.environ.pop(key, None) + + # ── Config-based selection (web.backend in config.yaml) ─────────── + + def test_config_parallel(self): + """web.backend=parallel in config → 'parallel' regardless of keys.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "parallel"}): + assert _get_backend() == "parallel" + + def test_config_firecrawl(self): + """web.backend=firecrawl in config → 'firecrawl' even if Parallel key set.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "firecrawl"}), \ + patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): + assert _get_backend() == "firecrawl" + + def test_config_case_insensitive(self): + """web.backend=Parallel (mixed case) → 'parallel'.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "Parallel"}): + assert _get_backend() == "parallel" + + # ── Fallback (no web.backend in config) ─────────────────────────── + + def test_fallback_parallel_only_key(self): + """Only PARALLEL_API_KEY set → 'parallel'.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}), \ + patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): + assert _get_backend() == "parallel" + + def test_fallback_both_keys_defaults_to_firecrawl(self): + """Both keys set, no config → 'firecrawl' (backward compat).""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}), \ + patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key", "FIRECRAWL_API_KEY": "fc-test"}): + assert _get_backend() == "firecrawl" + + def test_fallback_firecrawl_only_key(self): + """Only FIRECRAWL_API_KEY set → 'firecrawl'.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}), \ + patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}): + assert _get_backend() == "firecrawl" + + def test_fallback_no_keys_defaults_to_firecrawl(self): + """No keys, no config → 'firecrawl' (will fail at client init).""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}): + assert _get_backend() == "firecrawl" + + def test_invalid_config_falls_through_to_fallback(self): + """web.backend=invalid → ignored, uses key-based fallback.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "tavily"}), \ + patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): + assert _get_backend() == "parallel" + + +class TestParallelClientConfig: + """Test suite for Parallel client initialization.""" + + def setup_method(self): + import tools.web_tools + tools.web_tools._parallel_client = None + os.environ.pop("PARALLEL_API_KEY", None) + + def teardown_method(self): + import tools.web_tools + tools.web_tools._parallel_client = None + os.environ.pop("PARALLEL_API_KEY", None) + + def test_creates_client_with_key(self): + """PARALLEL_API_KEY set → creates Parallel client.""" + with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): + from tools.web_tools import _get_parallel_client + from parallel import Parallel + client = _get_parallel_client() + assert client is not None + assert isinstance(client, Parallel) + + def test_no_key_raises_with_helpful_message(self): + """No PARALLEL_API_KEY → ValueError with guidance.""" + from tools.web_tools import _get_parallel_client + with pytest.raises(ValueError, match="PARALLEL_API_KEY"): + _get_parallel_client() + + def test_singleton_returns_same_instance(self): + """Second call returns cached client.""" + with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): + from tools.web_tools import _get_parallel_client + client1 = _get_parallel_client() + client2 = _get_parallel_client() + assert client1 is client2 + + +class TestCheckWebApiKey: + """Test suite for check_web_api_key() unified availability check.""" + + _ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL") + + def setup_method(self): + for key in self._ENV_KEYS: + os.environ.pop(key, None) + + def teardown_method(self): + for key in self._ENV_KEYS: + os.environ.pop(key, None) + + def test_parallel_key_only(self): + with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True + + def test_firecrawl_key_only(self): + with patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True + + def test_firecrawl_url_only(self): + with patch.dict(os.environ, {"FIRECRAWL_API_URL": "http://localhost:3002"}): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True + + def test_no_keys_returns_false(self): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is False + + def test_both_keys_returns_true(self): + with patch.dict(os.environ, { + "PARALLEL_API_KEY": "test-key", + "FIRECRAWL_API_KEY": "fc-test", + }): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py index 13f7aedd..9d620b59 100644 --- a/tests/tools/test_website_policy.py +++ b/tests/tools/test_website_policy.py @@ -426,6 +426,8 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch): async def test_web_crawl_short_circuits_blocked_url(monkeypatch): from tools import web_tools + # web_crawl_tool checks for Firecrawl env before website policy + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") monkeypatch.setattr( web_tools, "check_website_access", @@ -453,6 +455,9 @@ async def test_web_crawl_short_circuits_blocked_url(monkeypatch): async def test_web_crawl_blocks_redirected_final_url(monkeypatch): from tools import web_tools + # web_crawl_tool checks for Firecrawl env before website policy + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") + def fake_check(url): if url == "https://allowed.test": return None diff --git a/tools/environments/local.py b/tools/environments/local.py index dc753b41..914192f2 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -82,6 +82,9 @@ def _build_provider_env_blocklist() -> frozenset: "FIREWORKS_API_KEY", # Fireworks AI "XAI_API_KEY", # xAI (Grok) "HELICONE_API_KEY", # LLM Observability proxy + "PARALLEL_API_KEY", + "FIRECRAWL_API_KEY", + "FIRECRAWL_API_URL", # Gateway/runtime config not represented in OPTIONAL_ENV_VARS. "TELEGRAM_HOME_CHANNEL", "TELEGRAM_HOME_CHANNEL_NAME", diff --git a/tools/web_tools.py b/tools/web_tools.py index 92c0ae60..9102b3e7 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -3,16 +3,16 @@ Standalone Web Tools Module This module provides generic web tools that work with multiple backend providers. -Currently uses Firecrawl as the backend, and the interface makes it easy to swap -providers without changing the function signatures. +Backend is selected during ``hermes tools`` setup (web.backend in config.yaml). Available tools: - web_search_tool: Search the web for information - web_extract_tool: Extract content from specific web pages -- web_crawl_tool: Crawl websites with specific instructions +- web_crawl_tool: Crawl websites with specific instructions (Firecrawl only) Backend compatibility: -- Firecrawl: https://docs.firecrawl.dev/introduction +- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl) +- Parallel: https://docs.parallel.ai (search, extract) LLM Processing: - Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction @@ -53,6 +53,39 @@ from tools.website_policy import check_website_access logger = logging.getLogger(__name__) + +# ─── Backend Selection ──────────────────────────────────────────────────────── + +def _load_web_config() -> dict: + """Load the ``web:`` section from ~/.hermes/config.yaml.""" + try: + from hermes_cli.config import load_config + return load_config().get("web", {}) + except (ImportError, Exception): + return {} + + +def _get_backend() -> str: + """Determine which web backend to use. + + Reads ``web.backend`` from config.yaml (set by ``hermes tools``). + Falls back to whichever API key is present for users who configured + keys manually without running setup. + """ + configured = _load_web_config().get("backend", "").lower().strip() + if configured in ("parallel", "firecrawl"): + return configured + # Fallback for manual / legacy config — use whichever key is present. + has_firecrawl = bool(os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")) + has_parallel = bool(os.getenv("PARALLEL_API_KEY")) + if has_parallel and not has_firecrawl: + return "parallel" + # Default to firecrawl (backward compat, or when both are set) + return "firecrawl" + + +# ─── Firecrawl Client ──────────────────────────────────────────────────────── + _firecrawl_client = None def _get_firecrawl_client(): @@ -81,6 +114,47 @@ def _get_firecrawl_client(): _firecrawl_client = Firecrawl(**kwargs) return _firecrawl_client + +# ─── Parallel Client ───────────────────────────────────────────────────────── + +_parallel_client = None +_async_parallel_client = None + +def _get_parallel_client(): + """Get or create the Parallel sync client (lazy initialization). + + Requires PARALLEL_API_KEY environment variable. + """ + from parallel import Parallel + global _parallel_client + if _parallel_client is None: + api_key = os.getenv("PARALLEL_API_KEY") + if not api_key: + raise ValueError( + "PARALLEL_API_KEY environment variable not set. " + "Get your API key at https://parallel.ai" + ) + _parallel_client = Parallel(api_key=api_key) + return _parallel_client + + +def _get_async_parallel_client(): + """Get or create the Parallel async client (lazy initialization). + + Requires PARALLEL_API_KEY environment variable. + """ + from parallel import AsyncParallel + global _async_parallel_client + if _async_parallel_client is None: + api_key = os.getenv("PARALLEL_API_KEY") + if not api_key: + raise ValueError( + "PARALLEL_API_KEY environment variable not set. " + "Get your API key at https://parallel.ai" + ) + _async_parallel_client = AsyncParallel(api_key=api_key) + return _async_parallel_client + DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 # Allow per-task override via env var @@ -428,13 +502,89 @@ def clean_base64_images(text: str) -> str: return cleaned_text +# ─── Parallel Search & Extract Helpers ──────────────────────────────────────── + +def _parallel_search(query: str, limit: int = 5) -> dict: + """Search using the Parallel SDK and return results as a dict.""" + from tools.interrupt import is_interrupted + if is_interrupted(): + return {"error": "Interrupted", "success": False} + + mode = os.getenv("PARALLEL_SEARCH_MODE", "agentic").lower().strip() + if mode not in ("fast", "one-shot", "agentic"): + mode = "agentic" + + logger.info("Parallel search: '%s' (mode=%s, limit=%d)", query, mode, limit) + response = _get_parallel_client().beta.search( + search_queries=[query], + objective=query, + mode=mode, + max_results=min(limit, 20), + ) + + web_results = [] + for i, result in enumerate(response.results or []): + excerpts = result.excerpts or [] + web_results.append({ + "url": result.url or "", + "title": result.title or "", + "description": " ".join(excerpts) if excerpts else "", + "position": i + 1, + }) + + return {"success": True, "data": {"web": web_results}} + + +async def _parallel_extract(urls: List[str]) -> List[Dict[str, Any]]: + """Extract content from URLs using the Parallel async SDK. + + Returns a list of result dicts matching the structure expected by the + LLM post-processing pipeline (url, title, content, metadata). + """ + from tools.interrupt import is_interrupted + if is_interrupted(): + return [{"url": u, "error": "Interrupted", "title": ""} for u in urls] + + logger.info("Parallel extract: %d URL(s)", len(urls)) + response = await _get_async_parallel_client().beta.extract( + urls=urls, + full_content=True, + ) + + results = [] + for result in response.results or []: + content = result.full_content or "" + if not content: + content = "\n\n".join(result.excerpts or []) + url = result.url or "" + title = result.title or "" + results.append({ + "url": url, + "title": title, + "content": content, + "raw_content": content, + "metadata": {"sourceURL": url, "title": title}, + }) + + for error in response.errors or []: + results.append({ + "url": error.url or "", + "title": "", + "content": "", + "error": error.content or error.error_type or "extraction failed", + "metadata": {"sourceURL": error.url or ""}, + }) + + return results + + def web_search_tool(query: str, limit: int = 5) -> str: """ Search the web for information using available search API backend. - + This function provides a generic interface for web search that can work - with multiple backends. Currently uses Firecrawl. - + with multiple backends (Parallel or Firecrawl). + Note: This function returns search result metadata only (URLs, titles, descriptions). Use web_extract_tool to get full content from specific URLs. @@ -478,17 +628,28 @@ def web_search_tool(query: str, limit: int = 5) -> str: if is_interrupted(): return json.dumps({"error": "Interrupted", "success": False}) + # Dispatch to the configured backend + backend = _get_backend() + if backend == "parallel": + response_data = _parallel_search(query, limit) + debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) + result_json = json.dumps(response_data, indent=2, ensure_ascii=False) + debug_call_data["final_response_size"] = len(result_json) + _debug.log_call("web_search_tool", debug_call_data) + _debug.save() + return result_json + logger.info("Searching the web for: '%s' (limit: %d)", query, limit) - + response = _get_firecrawl_client().search( query=query, limit=limit ) - + # The response is a SearchData object with web, news, and images attributes # When not scraping, the results are directly in these attributes web_results = [] - + # Check if response has web attribute (SearchData object) if hasattr(response, 'web'): # Response is a SearchData object with web attribute @@ -596,123 +757,130 @@ async def web_extract_tool( try: logger.info("Extracting content from %d URL(s)", len(urls)) - - # Determine requested formats for Firecrawl v2 - formats: List[str] = [] - if format == "markdown": - formats = ["markdown"] - elif format == "html": - formats = ["html"] + + # Dispatch to the configured backend + backend = _get_backend() + + if backend == "parallel": + results = await _parallel_extract(urls) else: - # Default: request markdown for LLM-readiness and include html as backup - formats = ["markdown", "html"] - - # Always use individual scraping for simplicity and reliability - # Batch scraping adds complexity without much benefit for small numbers of URLs - results: List[Dict[str, Any]] = [] - - from tools.interrupt import is_interrupted as _is_interrupted - for url in urls: - if _is_interrupted(): - results.append({"url": url, "error": "Interrupted", "title": ""}) - continue + # ── Firecrawl extraction ── + # Determine requested formats for Firecrawl v2 + formats: List[str] = [] + if format == "markdown": + formats = ["markdown"] + elif format == "html": + formats = ["html"] + else: + # Default: request markdown for LLM-readiness and include html as backup + formats = ["markdown", "html"] - # Website policy check — block before fetching - blocked = check_website_access(url) - if blocked: - logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"]) - results.append({ - "url": url, "title": "", "content": "", - "error": blocked["message"], - "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}, - }) - continue + # Always use individual scraping for simplicity and reliability + # Batch scraping adds complexity without much benefit for small numbers of URLs + results: List[Dict[str, Any]] = [] - try: - logger.info("Scraping: %s", url) - scrape_result = _get_firecrawl_client().scrape( - url=url, - formats=formats - ) - - # Process the result - properly handle object serialization - metadata = {} - title = "" - content_markdown = None - content_html = None - - # Extract data from the scrape result - if hasattr(scrape_result, 'model_dump'): - # Pydantic model - use model_dump to get dict - result_dict = scrape_result.model_dump() - content_markdown = result_dict.get('markdown') - content_html = result_dict.get('html') - metadata = result_dict.get('metadata', {}) - elif hasattr(scrape_result, '__dict__'): - # Regular object with attributes - content_markdown = getattr(scrape_result, 'markdown', None) - content_html = getattr(scrape_result, 'html', None) - - # Handle metadata - convert to dict if it's an object - metadata_obj = getattr(scrape_result, 'metadata', {}) - if hasattr(metadata_obj, 'model_dump'): - metadata = metadata_obj.model_dump() - elif hasattr(metadata_obj, '__dict__'): - metadata = metadata_obj.__dict__ - elif isinstance(metadata_obj, dict): - metadata = metadata_obj - else: - metadata = {} - elif isinstance(scrape_result, dict): - # Already a dictionary - content_markdown = scrape_result.get('markdown') - content_html = scrape_result.get('html') - metadata = scrape_result.get('metadata', {}) - - # Ensure metadata is a dict (not an object) - if not isinstance(metadata, dict): - if hasattr(metadata, 'model_dump'): - metadata = metadata.model_dump() - elif hasattr(metadata, '__dict__'): - metadata = metadata.__dict__ - else: - metadata = {} - - # Get title from metadata - title = metadata.get("title", "") - - # Re-check final URL after redirect - final_url = metadata.get("sourceURL", url) - final_blocked = check_website_access(final_url) - if final_blocked: - logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"]) + from tools.interrupt import is_interrupted as _is_interrupted + for url in urls: + if _is_interrupted(): + results.append({"url": url, "error": "Interrupted", "title": ""}) + continue + + # Website policy check — block before fetching + blocked = check_website_access(url) + if blocked: + logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"]) results.append({ - "url": final_url, "title": title, "content": "", "raw_content": "", - "error": final_blocked["message"], - "blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]}, + "url": url, "title": "", "content": "", + "error": blocked["message"], + "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}, }) continue - # Choose content based on requested format - chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or "" - - results.append({ - "url": final_url, - "title": title, - "content": chosen_content, - "raw_content": chosen_content, - "metadata": metadata # Now guaranteed to be a dict - }) - - except Exception as scrape_err: - logger.debug("Scrape failed for %s: %s", url, scrape_err) - results.append({ - "url": url, - "title": "", - "content": "", - "raw_content": "", - "error": str(scrape_err) - }) + try: + logger.info("Scraping: %s", url) + scrape_result = _get_firecrawl_client().scrape( + url=url, + formats=formats + ) + + # Process the result - properly handle object serialization + metadata = {} + title = "" + content_markdown = None + content_html = None + + # Extract data from the scrape result + if hasattr(scrape_result, 'model_dump'): + # Pydantic model - use model_dump to get dict + result_dict = scrape_result.model_dump() + content_markdown = result_dict.get('markdown') + content_html = result_dict.get('html') + metadata = result_dict.get('metadata', {}) + elif hasattr(scrape_result, '__dict__'): + # Regular object with attributes + content_markdown = getattr(scrape_result, 'markdown', None) + content_html = getattr(scrape_result, 'html', None) + + # Handle metadata - convert to dict if it's an object + metadata_obj = getattr(scrape_result, 'metadata', {}) + if hasattr(metadata_obj, 'model_dump'): + metadata = metadata_obj.model_dump() + elif hasattr(metadata_obj, '__dict__'): + metadata = metadata_obj.__dict__ + elif isinstance(metadata_obj, dict): + metadata = metadata_obj + else: + metadata = {} + elif isinstance(scrape_result, dict): + # Already a dictionary + content_markdown = scrape_result.get('markdown') + content_html = scrape_result.get('html') + metadata = scrape_result.get('metadata', {}) + + # Ensure metadata is a dict (not an object) + if not isinstance(metadata, dict): + if hasattr(metadata, 'model_dump'): + metadata = metadata.model_dump() + elif hasattr(metadata, '__dict__'): + metadata = metadata.__dict__ + else: + metadata = {} + + # Get title from metadata + title = metadata.get("title", "") + + # Re-check final URL after redirect + final_url = metadata.get("sourceURL", url) + final_blocked = check_website_access(final_url) + if final_blocked: + logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"]) + results.append({ + "url": final_url, "title": title, "content": "", "raw_content": "", + "error": final_blocked["message"], + "blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]}, + }) + continue + + # Choose content based on requested format + chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or "" + + results.append({ + "url": final_url, + "title": title, + "content": chosen_content, + "raw_content": chosen_content, + "metadata": metadata # Now guaranteed to be a dict + }) + + except Exception as scrape_err: + logger.debug("Scrape failed for %s: %s", url, scrape_err) + results.append({ + "url": url, + "title": "", + "content": "", + "raw_content": "", + "error": str(scrape_err) + }) response = {"results": results} @@ -887,6 +1055,14 @@ async def web_crawl_tool( } try: + # web_crawl requires Firecrawl — Parallel has no crawl API + if not (os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")): + return json.dumps({ + "error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, " + "or use web_search + web_extract instead.", + "success": False, + }, ensure_ascii=False) + # Ensure URL has protocol if not url.startswith(('http://', 'https://')): url = f'https://{url}' @@ -1151,13 +1327,22 @@ async def web_crawl_tool( def check_firecrawl_api_key() -> bool: """ Check if the Firecrawl API key is available in environment variables. - + Returns: bool: True if API key is set, False otherwise """ return bool(os.getenv("FIRECRAWL_API_KEY")) +def check_web_api_key() -> bool: + """Check if any web backend API key is available (Parallel or Firecrawl).""" + return bool( + os.getenv("PARALLEL_API_KEY") + or os.getenv("FIRECRAWL_API_KEY") + or os.getenv("FIRECRAWL_API_URL") + ) + + def check_auxiliary_model() -> bool: """Check if an auxiliary text model is available for LLM content processing.""" try: @@ -1184,26 +1369,30 @@ if __name__ == "__main__": print("=" * 40) # Check if API keys are available - firecrawl_available = check_firecrawl_api_key() + web_available = check_web_api_key() nous_available = check_auxiliary_model() - - if not firecrawl_available: - print("❌ FIRECRAWL_API_KEY environment variable not set") - print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'") - print("Get API key at: https://firecrawl.dev/") + + if web_available: + backend = _get_backend() + print(f"✅ Web backend: {backend}") + if backend == "parallel": + print(" Using Parallel API (https://parallel.ai)") + else: + print(" Using Firecrawl API (https://firecrawl.dev)") else: - print("✅ Firecrawl API key found") - + print("❌ No web search backend configured") + print("Set PARALLEL_API_KEY (https://parallel.ai) or FIRECRAWL_API_KEY (https://firecrawl.dev)") + if not nous_available: print("❌ No auxiliary model available for LLM content processing") print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY") print("⚠️ Without an auxiliary model, LLM content processing will be disabled") else: print(f"✅ Auxiliary model available: {DEFAULT_SUMMARIZER_MODEL}") - - if not firecrawl_available: + + if not web_available: exit(1) - + print("🛠️ Web tools ready for use!") if nous_available: @@ -1301,8 +1490,8 @@ registry.register( toolset="web", schema=WEB_SEARCH_SCHEMA, handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=5), - check_fn=check_firecrawl_api_key, - requires_env=["FIRECRAWL_API_KEY"], + check_fn=check_web_api_key, + requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"], emoji="🔍", ) registry.register( @@ -1311,8 +1500,8 @@ registry.register( schema=WEB_EXTRACT_SCHEMA, handler=lambda args, **kw: web_extract_tool( args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"), - check_fn=check_firecrawl_api_key, - requires_env=["FIRECRAWL_API_KEY"], + check_fn=check_web_api_key, + requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"], is_async=True, emoji="📄", ) diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index bd029572..a594b7a6 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -61,6 +61,7 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe | Variable | Description | |----------|-------------| +| `PARALLEL_API_KEY` | AI-native web search ([parallel.ai](https://parallel.ai/)) | | `FIRECRAWL_API_KEY` | Web scraping ([firecrawl.dev](https://firecrawl.dev/)) | | `FIRECRAWL_API_URL` | Custom Firecrawl API endpoint for self-hosted instances (optional) | | `BROWSERBASE_API_KEY` | Browser automation ([browserbase.com](https://browserbase.com/)) |