feat(web): add Parallel as alternative web search/extract backend (#1696)
* feat(web): add Parallel as alternative web search/extract backend Adds Parallel (parallel.ai) as a drop-in alternative to Firecrawl for web_search and web_extract tools using the official parallel-web SDK. - Backend selection via WEB_SEARCH_BACKEND env var (auto/parallel/firecrawl) - Auto mode prefers Firecrawl when both keys present; Parallel when sole backend - web_crawl remains Firecrawl-only with clear error when unavailable - Lazy SDK imports, interrupt support, singleton clients - 16 new unit tests for backend selection and client config Co-authored-by: s-jag <s-jag@users.noreply.github.com> * fix: add PARALLEL_API_KEY to config registry and fix web_crawl policy tests Follow-up for Parallel backend integration: - Add PARALLEL_API_KEY to OPTIONAL_ENV_VARS (hermes doctor, env blocklist) - Add to set_config_value api_keys list (hermes config set) - Add to doctor keys display - Fix 2 web_crawl policy tests that didn't set FIRECRAWL_API_KEY (needed now that web_crawl has a Firecrawl availability guard) * refactor: explicit backend selection via hermes tools, not auto-detect Replace the auto-detect backend selection with explicit user choice: - hermes tools saves WEB_SEARCH_BACKEND to .env when user picks a provider - _get_backend() reads the explicit choice first - Fallback only for manual/legacy config (uses whichever key is present) - _is_provider_active() shows [active] for the selected web backend - Updated tests, docs, and .env.example to remove 'auto' mode language * refactor: use config.yaml for web backend, not env var Match the TTS/browser pattern — web.backend is stored in config.yaml (set by hermes tools), not as a WEB_SEARCH_BACKEND env var. - _load_web_config() reads web: section from config.yaml - _get_backend() reads web.backend from config, falls back to key detection - _configure_provider() saves to config dict (saved to config.yaml) - _is_provider_active() reads from config dict - Removed WEB_SEARCH_BACKEND from .env.example, set_config_value, docs - Updated all tests to mock _load_web_config instead of env vars --------- Co-authored-by: s-jag <s-jag@users.noreply.github.com>
This commit is contained in:
@@ -65,10 +65,15 @@ OPENCODE_GO_API_KEY=
|
||||
# TOOL API KEYS
|
||||
# =============================================================================
|
||||
|
||||
# Parallel API Key - AI-native web search and extract
|
||||
# Get at: https://parallel.ai
|
||||
PARALLEL_API_KEY=
|
||||
|
||||
# Firecrawl API Key - Web search, extract, and crawl
|
||||
# Get at: https://firecrawl.dev/
|
||||
FIRECRAWL_API_KEY=
|
||||
|
||||
|
||||
# FAL.ai API Key - Image generation
|
||||
# Get at: https://fal.ai/
|
||||
FAL_KEY=
|
||||
|
||||
@@ -44,7 +44,7 @@ hermes-agent/
|
||||
│ ├── terminal_tool.py # Terminal orchestration
|
||||
│ ├── process_registry.py # Background process management
|
||||
│ ├── file_tools.py # File read/write/search/patch
|
||||
│ ├── web_tools.py # Firecrawl search/extract
|
||||
│ ├── web_tools.py # Web search/extract (Parallel + Firecrawl)
|
||||
│ ├── browser_tool.py # Browserbase browser automation
|
||||
│ ├── code_execution_tool.py # execute_code sandbox
|
||||
│ ├── delegate_tool.py # Subagent delegation
|
||||
|
||||
@@ -147,7 +147,7 @@ hermes-agent/
|
||||
│ ├── approval.py # Dangerous command detection + per-session approval
|
||||
│ ├── terminal_tool.py # Terminal orchestration (sudo, env lifecycle, backends)
|
||||
│ ├── file_operations.py # read_file, write_file, search, patch, etc.
|
||||
│ ├── web_tools.py # web_search, web_extract (Firecrawl + Gemini summarization)
|
||||
│ ├── web_tools.py # web_search, web_extract (Parallel/Firecrawl + Gemini summarization)
|
||||
│ ├── vision_tools.py # Image analysis via multimodal models
|
||||
│ ├── delegate_tool.py # Subagent spawning and parallel task execution
|
||||
│ ├── code_execution_tool.py # Sandboxed Python with RPC tool access
|
||||
|
||||
@@ -550,6 +550,14 @@ OPTIONAL_ENV_VARS = {
|
||||
},
|
||||
|
||||
# ── Tool API keys ──
|
||||
"PARALLEL_API_KEY": {
|
||||
"description": "Parallel API key for AI-native web search and extract",
|
||||
"prompt": "Parallel API key",
|
||||
"url": "https://parallel.ai/",
|
||||
"tools": ["web_search", "web_extract"],
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"FIRECRAWL_API_KEY": {
|
||||
"description": "Firecrawl API key for web search and scraping",
|
||||
"prompt": "Firecrawl API key",
|
||||
@@ -1506,6 +1514,7 @@ def show_config():
|
||||
keys = [
|
||||
("OPENROUTER_API_KEY", "OpenRouter"),
|
||||
("VOICE_TOOLS_OPENAI_KEY", "OpenAI (STT/TTS)"),
|
||||
("PARALLEL_API_KEY", "Parallel"),
|
||||
("FIRECRAWL_API_KEY", "Firecrawl"),
|
||||
("BROWSERBASE_API_KEY", "Browserbase"),
|
||||
("BROWSER_USE_API_KEY", "Browser Use"),
|
||||
@@ -1655,7 +1664,7 @@ def set_config_value(key: str, value: str):
|
||||
# Check if it's an API key (goes to .env)
|
||||
api_keys = [
|
||||
'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY',
|
||||
'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY',
|
||||
'PARALLEL_API_KEY', 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY',
|
||||
'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN',
|
||||
'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY',
|
||||
'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN',
|
||||
|
||||
@@ -444,11 +444,11 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
else:
|
||||
tool_status.append(("Mixture of Agents", False, "OPENROUTER_API_KEY"))
|
||||
|
||||
# Firecrawl (web tools)
|
||||
if get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"):
|
||||
# Web tools (Parallel or Firecrawl)
|
||||
if get_env_value("PARALLEL_API_KEY") or get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"):
|
||||
tool_status.append(("Web Search & Extract", True, None))
|
||||
else:
|
||||
tool_status.append(("Web Search & Extract", False, "FIRECRAWL_API_KEY"))
|
||||
tool_status.append(("Web Search & Extract", False, "PARALLEL_API_KEY or FIRECRAWL_API_KEY"))
|
||||
|
||||
# Browser tools (local Chromium or Browserbase cloud)
|
||||
import shutil
|
||||
|
||||
@@ -151,19 +151,29 @@ TOOL_CATEGORIES = {
|
||||
"web": {
|
||||
"name": "Web Search & Extract",
|
||||
"setup_title": "Select Search Provider",
|
||||
"setup_note": "A free DuckDuckGo search skill is also included — skip this if you don't need Firecrawl.",
|
||||
"setup_note": "A free DuckDuckGo search skill is also included — skip this if you don't need a premium provider.",
|
||||
"icon": "🔍",
|
||||
"providers": [
|
||||
{
|
||||
"name": "Firecrawl Cloud",
|
||||
"tag": "Recommended - hosted service",
|
||||
"tag": "Hosted service - search, extract, and crawl",
|
||||
"web_backend": "firecrawl",
|
||||
"env_vars": [
|
||||
{"key": "FIRECRAWL_API_KEY", "prompt": "Firecrawl API key", "url": "https://firecrawl.dev"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Parallel",
|
||||
"tag": "AI-native search and extract",
|
||||
"web_backend": "parallel",
|
||||
"env_vars": [
|
||||
{"key": "PARALLEL_API_KEY", "prompt": "Parallel API key", "url": "https://parallel.ai"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Firecrawl Self-Hosted",
|
||||
"tag": "Free - run your own instance",
|
||||
"web_backend": "firecrawl",
|
||||
"env_vars": [
|
||||
{"key": "FIRECRAWL_API_URL", "prompt": "Your Firecrawl instance URL (e.g., http://localhost:3002)"},
|
||||
],
|
||||
@@ -618,6 +628,9 @@ def _is_provider_active(provider: dict, config: dict) -> bool:
|
||||
if "browser_provider" in provider:
|
||||
current = config.get("browser", {}).get("cloud_provider")
|
||||
return provider["browser_provider"] == current
|
||||
if provider.get("web_backend"):
|
||||
current = config.get("web", {}).get("backend")
|
||||
return current == provider["web_backend"]
|
||||
return False
|
||||
|
||||
|
||||
@@ -650,6 +663,11 @@ def _configure_provider(provider: dict, config: dict):
|
||||
else:
|
||||
config.get("browser", {}).pop("cloud_provider", None)
|
||||
|
||||
# Set web search backend in config if applicable
|
||||
if provider.get("web_backend"):
|
||||
config.setdefault("web", {})["backend"] = provider["web_backend"]
|
||||
_print_success(f" Web backend set to: {provider['web_backend']}")
|
||||
|
||||
if not env_vars:
|
||||
_print_success(f" {provider['name']} - no configuration needed!")
|
||||
return
|
||||
|
||||
@@ -27,6 +27,7 @@ dependencies = [
|
||||
"prompt_toolkit",
|
||||
# Tools
|
||||
"firecrawl-py",
|
||||
"parallel-web>=0.4.2",
|
||||
"fal-client",
|
||||
# Text-to-speech (Edge TTS is free, no API key needed)
|
||||
"edge-tts",
|
||||
|
||||
@@ -18,6 +18,7 @@ PyJWT[crypto]
|
||||
|
||||
# Web tools
|
||||
firecrawl-py
|
||||
parallel-web>=0.4.2
|
||||
|
||||
# Image generation
|
||||
fal-client
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Comprehensive Test Suite for Web Tools Module
|
||||
|
||||
This script tests all web tools functionality to ensure they work correctly.
|
||||
Run this after any updates to the web_tools.py module or Firecrawl library.
|
||||
Run this after any updates to the web_tools.py module or backend libraries.
|
||||
|
||||
Usage:
|
||||
python test_web_tools.py # Run all tests
|
||||
@@ -11,7 +11,7 @@ Usage:
|
||||
python test_web_tools.py --verbose # Show detailed output
|
||||
|
||||
Requirements:
|
||||
- FIRECRAWL_API_KEY environment variable must be set
|
||||
- PARALLEL_API_KEY or FIRECRAWL_API_KEY environment variable must be set
|
||||
- An auxiliary LLM provider (OPENROUTER_API_KEY or Nous Portal auth) (optional, for LLM tests)
|
||||
"""
|
||||
|
||||
@@ -32,8 +32,10 @@ from tools.web_tools import (
|
||||
web_extract_tool,
|
||||
web_crawl_tool,
|
||||
check_firecrawl_api_key,
|
||||
check_web_api_key,
|
||||
check_auxiliary_model,
|
||||
get_debug_session_info
|
||||
get_debug_session_info,
|
||||
_get_backend,
|
||||
)
|
||||
|
||||
|
||||
@@ -121,12 +123,13 @@ class WebToolsTester:
|
||||
"""Test environment setup and API keys"""
|
||||
print_section("Environment Check")
|
||||
|
||||
# Check Firecrawl API key
|
||||
if not check_firecrawl_api_key():
|
||||
self.log_result("Firecrawl API Key", "failed", "FIRECRAWL_API_KEY not set")
|
||||
# Check web backend API key (Parallel or Firecrawl)
|
||||
if not check_web_api_key():
|
||||
self.log_result("Web Backend API Key", "failed", "PARALLEL_API_KEY or FIRECRAWL_API_KEY not set")
|
||||
return False
|
||||
else:
|
||||
self.log_result("Firecrawl API Key", "passed", "Found")
|
||||
backend = _get_backend()
|
||||
self.log_result("Web Backend API Key", "passed", f"Using {backend} backend")
|
||||
|
||||
# Check auxiliary LLM provider (optional)
|
||||
if not check_auxiliary_model():
|
||||
@@ -578,7 +581,9 @@ class WebToolsTester:
|
||||
},
|
||||
"results": self.test_results,
|
||||
"environment": {
|
||||
"web_backend": _get_backend() if check_web_api_key() else None,
|
||||
"firecrawl_api_key": check_firecrawl_api_key(),
|
||||
"parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")),
|
||||
"auxiliary_model": check_auxiliary_model(),
|
||||
"debug_mode": get_debug_session_info()["enabled"]
|
||||
}
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
"""Tests for Firecrawl client configuration and singleton behavior.
|
||||
"""Tests for web backend client configuration and singleton behavior.
|
||||
|
||||
Coverage:
|
||||
_get_firecrawl_client() — configuration matrix, singleton caching,
|
||||
constructor failure recovery, return value verification, edge cases.
|
||||
_get_backend() — backend selection logic with env var combinations.
|
||||
_get_parallel_client() — Parallel client configuration, singleton caching.
|
||||
check_web_api_key() — unified availability check.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -117,3 +120,157 @@ class TestFirecrawlClientConfig:
|
||||
from tools.web_tools import _get_firecrawl_client
|
||||
with pytest.raises(ValueError):
|
||||
_get_firecrawl_client()
|
||||
|
||||
|
||||
class TestBackendSelection:
|
||||
"""Test suite for _get_backend() backend selection logic.
|
||||
|
||||
The backend is configured via config.yaml (web.backend), set by
|
||||
``hermes tools``. Falls back to key-based detection for legacy/manual
|
||||
setups.
|
||||
"""
|
||||
|
||||
_ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL")
|
||||
|
||||
def setup_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
def teardown_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
# ── Config-based selection (web.backend in config.yaml) ───────────
|
||||
|
||||
def test_config_parallel(self):
|
||||
"""web.backend=parallel in config → 'parallel' regardless of keys."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "parallel"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
def test_config_firecrawl(self):
|
||||
"""web.backend=firecrawl in config → 'firecrawl' even if Parallel key set."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "firecrawl"}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_config_case_insensitive(self):
|
||||
"""web.backend=Parallel (mixed case) → 'parallel'."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "Parallel"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
# ── Fallback (no web.backend in config) ───────────────────────────
|
||||
|
||||
def test_fallback_parallel_only_key(self):
|
||||
"""Only PARALLEL_API_KEY set → 'parallel'."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
def test_fallback_both_keys_defaults_to_firecrawl(self):
|
||||
"""Both keys set, no config → 'firecrawl' (backward compat)."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key", "FIRECRAWL_API_KEY": "fc-test"}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_fallback_firecrawl_only_key(self):
|
||||
"""Only FIRECRAWL_API_KEY set → 'firecrawl'."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}), \
|
||||
patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_fallback_no_keys_defaults_to_firecrawl(self):
|
||||
"""No keys, no config → 'firecrawl' (will fail at client init)."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={}):
|
||||
assert _get_backend() == "firecrawl"
|
||||
|
||||
def test_invalid_config_falls_through_to_fallback(self):
|
||||
"""web.backend=invalid → ignored, uses key-based fallback."""
|
||||
from tools.web_tools import _get_backend
|
||||
with patch("tools.web_tools._load_web_config", return_value={"backend": "tavily"}), \
|
||||
patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
assert _get_backend() == "parallel"
|
||||
|
||||
|
||||
class TestParallelClientConfig:
|
||||
"""Test suite for Parallel client initialization."""
|
||||
|
||||
def setup_method(self):
|
||||
import tools.web_tools
|
||||
tools.web_tools._parallel_client = None
|
||||
os.environ.pop("PARALLEL_API_KEY", None)
|
||||
|
||||
def teardown_method(self):
|
||||
import tools.web_tools
|
||||
tools.web_tools._parallel_client = None
|
||||
os.environ.pop("PARALLEL_API_KEY", None)
|
||||
|
||||
def test_creates_client_with_key(self):
|
||||
"""PARALLEL_API_KEY set → creates Parallel client."""
|
||||
with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
from tools.web_tools import _get_parallel_client
|
||||
from parallel import Parallel
|
||||
client = _get_parallel_client()
|
||||
assert client is not None
|
||||
assert isinstance(client, Parallel)
|
||||
|
||||
def test_no_key_raises_with_helpful_message(self):
|
||||
"""No PARALLEL_API_KEY → ValueError with guidance."""
|
||||
from tools.web_tools import _get_parallel_client
|
||||
with pytest.raises(ValueError, match="PARALLEL_API_KEY"):
|
||||
_get_parallel_client()
|
||||
|
||||
def test_singleton_returns_same_instance(self):
|
||||
"""Second call returns cached client."""
|
||||
with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
from tools.web_tools import _get_parallel_client
|
||||
client1 = _get_parallel_client()
|
||||
client2 = _get_parallel_client()
|
||||
assert client1 is client2
|
||||
|
||||
|
||||
class TestCheckWebApiKey:
|
||||
"""Test suite for check_web_api_key() unified availability check."""
|
||||
|
||||
_ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL")
|
||||
|
||||
def setup_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
def teardown_method(self):
|
||||
for key in self._ENV_KEYS:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
def test_parallel_key_only(self):
|
||||
with patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
def test_firecrawl_key_only(self):
|
||||
with patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
def test_firecrawl_url_only(self):
|
||||
with patch.dict(os.environ, {"FIRECRAWL_API_URL": "http://localhost:3002"}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
def test_no_keys_returns_false(self):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is False
|
||||
|
||||
def test_both_keys_returns_true(self):
|
||||
with patch.dict(os.environ, {
|
||||
"PARALLEL_API_KEY": "test-key",
|
||||
"FIRECRAWL_API_KEY": "fc-test",
|
||||
}):
|
||||
from tools.web_tools import check_web_api_key
|
||||
assert check_web_api_key() is True
|
||||
|
||||
@@ -426,6 +426,8 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch):
|
||||
async def test_web_crawl_short_circuits_blocked_url(monkeypatch):
|
||||
from tools import web_tools
|
||||
|
||||
# web_crawl_tool checks for Firecrawl env before website policy
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
monkeypatch.setattr(
|
||||
web_tools,
|
||||
"check_website_access",
|
||||
@@ -453,6 +455,9 @@ async def test_web_crawl_short_circuits_blocked_url(monkeypatch):
|
||||
async def test_web_crawl_blocks_redirected_final_url(monkeypatch):
|
||||
from tools import web_tools
|
||||
|
||||
# web_crawl_tool checks for Firecrawl env before website policy
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
|
||||
def fake_check(url):
|
||||
if url == "https://allowed.test":
|
||||
return None
|
||||
|
||||
@@ -82,6 +82,9 @@ def _build_provider_env_blocklist() -> frozenset:
|
||||
"FIREWORKS_API_KEY", # Fireworks AI
|
||||
"XAI_API_KEY", # xAI (Grok)
|
||||
"HELICONE_API_KEY", # LLM Observability proxy
|
||||
"PARALLEL_API_KEY",
|
||||
"FIRECRAWL_API_KEY",
|
||||
"FIRECRAWL_API_URL",
|
||||
# Gateway/runtime config not represented in OPTIONAL_ENV_VARS.
|
||||
"TELEGRAM_HOME_CHANNEL",
|
||||
"TELEGRAM_HOME_CHANNEL_NAME",
|
||||
|
||||
@@ -3,16 +3,16 @@
|
||||
Standalone Web Tools Module
|
||||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Currently uses Firecrawl as the backend, and the interface makes it easy to swap
|
||||
providers without changing the function signatures.
|
||||
Backend is selected during ``hermes tools`` setup (web.backend in config.yaml).
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
- web_extract_tool: Extract content from specific web pages
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
- web_crawl_tool: Crawl websites with specific instructions (Firecrawl only)
|
||||
|
||||
Backend compatibility:
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl)
|
||||
- Parallel: https://docs.parallel.ai (search, extract)
|
||||
|
||||
LLM Processing:
|
||||
- Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction
|
||||
@@ -53,6 +53,39 @@ from tools.website_policy import check_website_access
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Backend Selection ────────────────────────────────────────────────────────
|
||||
|
||||
def _load_web_config() -> dict:
|
||||
"""Load the ``web:`` section from ~/.hermes/config.yaml."""
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
return load_config().get("web", {})
|
||||
except (ImportError, Exception):
|
||||
return {}
|
||||
|
||||
|
||||
def _get_backend() -> str:
|
||||
"""Determine which web backend to use.
|
||||
|
||||
Reads ``web.backend`` from config.yaml (set by ``hermes tools``).
|
||||
Falls back to whichever API key is present for users who configured
|
||||
keys manually without running setup.
|
||||
"""
|
||||
configured = _load_web_config().get("backend", "").lower().strip()
|
||||
if configured in ("parallel", "firecrawl"):
|
||||
return configured
|
||||
# Fallback for manual / legacy config — use whichever key is present.
|
||||
has_firecrawl = bool(os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL"))
|
||||
has_parallel = bool(os.getenv("PARALLEL_API_KEY"))
|
||||
if has_parallel and not has_firecrawl:
|
||||
return "parallel"
|
||||
# Default to firecrawl (backward compat, or when both are set)
|
||||
return "firecrawl"
|
||||
|
||||
|
||||
# ─── Firecrawl Client ────────────────────────────────────────────────────────
|
||||
|
||||
_firecrawl_client = None
|
||||
|
||||
def _get_firecrawl_client():
|
||||
@@ -81,6 +114,47 @@ def _get_firecrawl_client():
|
||||
_firecrawl_client = Firecrawl(**kwargs)
|
||||
return _firecrawl_client
|
||||
|
||||
|
||||
# ─── Parallel Client ─────────────────────────────────────────────────────────
|
||||
|
||||
_parallel_client = None
|
||||
_async_parallel_client = None
|
||||
|
||||
def _get_parallel_client():
|
||||
"""Get or create the Parallel sync client (lazy initialization).
|
||||
|
||||
Requires PARALLEL_API_KEY environment variable.
|
||||
"""
|
||||
from parallel import Parallel
|
||||
global _parallel_client
|
||||
if _parallel_client is None:
|
||||
api_key = os.getenv("PARALLEL_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
"PARALLEL_API_KEY environment variable not set. "
|
||||
"Get your API key at https://parallel.ai"
|
||||
)
|
||||
_parallel_client = Parallel(api_key=api_key)
|
||||
return _parallel_client
|
||||
|
||||
|
||||
def _get_async_parallel_client():
|
||||
"""Get or create the Parallel async client (lazy initialization).
|
||||
|
||||
Requires PARALLEL_API_KEY environment variable.
|
||||
"""
|
||||
from parallel import AsyncParallel
|
||||
global _async_parallel_client
|
||||
if _async_parallel_client is None:
|
||||
api_key = os.getenv("PARALLEL_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
"PARALLEL_API_KEY environment variable not set. "
|
||||
"Get your API key at https://parallel.ai"
|
||||
)
|
||||
_async_parallel_client = AsyncParallel(api_key=api_key)
|
||||
return _async_parallel_client
|
||||
|
||||
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
|
||||
|
||||
# Allow per-task override via env var
|
||||
@@ -428,12 +502,88 @@ def clean_base64_images(text: str) -> str:
|
||||
return cleaned_text
|
||||
|
||||
|
||||
# ─── Parallel Search & Extract Helpers ────────────────────────────────────────
|
||||
|
||||
def _parallel_search(query: str, limit: int = 5) -> dict:
|
||||
"""Search using the Parallel SDK and return results as a dict."""
|
||||
from tools.interrupt import is_interrupted
|
||||
if is_interrupted():
|
||||
return {"error": "Interrupted", "success": False}
|
||||
|
||||
mode = os.getenv("PARALLEL_SEARCH_MODE", "agentic").lower().strip()
|
||||
if mode not in ("fast", "one-shot", "agentic"):
|
||||
mode = "agentic"
|
||||
|
||||
logger.info("Parallel search: '%s' (mode=%s, limit=%d)", query, mode, limit)
|
||||
response = _get_parallel_client().beta.search(
|
||||
search_queries=[query],
|
||||
objective=query,
|
||||
mode=mode,
|
||||
max_results=min(limit, 20),
|
||||
)
|
||||
|
||||
web_results = []
|
||||
for i, result in enumerate(response.results or []):
|
||||
excerpts = result.excerpts or []
|
||||
web_results.append({
|
||||
"url": result.url or "",
|
||||
"title": result.title or "",
|
||||
"description": " ".join(excerpts) if excerpts else "",
|
||||
"position": i + 1,
|
||||
})
|
||||
|
||||
return {"success": True, "data": {"web": web_results}}
|
||||
|
||||
|
||||
async def _parallel_extract(urls: List[str]) -> List[Dict[str, Any]]:
|
||||
"""Extract content from URLs using the Parallel async SDK.
|
||||
|
||||
Returns a list of result dicts matching the structure expected by the
|
||||
LLM post-processing pipeline (url, title, content, metadata).
|
||||
"""
|
||||
from tools.interrupt import is_interrupted
|
||||
if is_interrupted():
|
||||
return [{"url": u, "error": "Interrupted", "title": ""} for u in urls]
|
||||
|
||||
logger.info("Parallel extract: %d URL(s)", len(urls))
|
||||
response = await _get_async_parallel_client().beta.extract(
|
||||
urls=urls,
|
||||
full_content=True,
|
||||
)
|
||||
|
||||
results = []
|
||||
for result in response.results or []:
|
||||
content = result.full_content or ""
|
||||
if not content:
|
||||
content = "\n\n".join(result.excerpts or [])
|
||||
url = result.url or ""
|
||||
title = result.title or ""
|
||||
results.append({
|
||||
"url": url,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"raw_content": content,
|
||||
"metadata": {"sourceURL": url, "title": title},
|
||||
})
|
||||
|
||||
for error in response.errors or []:
|
||||
results.append({
|
||||
"url": error.url or "",
|
||||
"title": "",
|
||||
"content": "",
|
||||
"error": error.content or error.error_type or "extraction failed",
|
||||
"metadata": {"sourceURL": error.url or ""},
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
"""
|
||||
Search the web for information using available search API backend.
|
||||
|
||||
This function provides a generic interface for web search that can work
|
||||
with multiple backends. Currently uses Firecrawl.
|
||||
with multiple backends (Parallel or Firecrawl).
|
||||
|
||||
Note: This function returns search result metadata only (URLs, titles, descriptions).
|
||||
Use web_extract_tool to get full content from specific URLs.
|
||||
@@ -478,6 +628,17 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
if is_interrupted():
|
||||
return json.dumps({"error": "Interrupted", "success": False})
|
||||
|
||||
# Dispatch to the configured backend
|
||||
backend = _get_backend()
|
||||
if backend == "parallel":
|
||||
response_data = _parallel_search(query, limit)
|
||||
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
||||
debug_call_data["final_response_size"] = len(result_json)
|
||||
_debug.log_call("web_search_tool", debug_call_data)
|
||||
_debug.save()
|
||||
return result_json
|
||||
|
||||
logger.info("Searching the web for: '%s' (limit: %d)", query, limit)
|
||||
|
||||
response = _get_firecrawl_client().search(
|
||||
@@ -597,6 +758,13 @@ async def web_extract_tool(
|
||||
try:
|
||||
logger.info("Extracting content from %d URL(s)", len(urls))
|
||||
|
||||
# Dispatch to the configured backend
|
||||
backend = _get_backend()
|
||||
|
||||
if backend == "parallel":
|
||||
results = await _parallel_extract(urls)
|
||||
else:
|
||||
# ── Firecrawl extraction ──
|
||||
# Determine requested formats for Firecrawl v2
|
||||
formats: List[str] = []
|
||||
if format == "markdown":
|
||||
@@ -887,6 +1055,14 @@ async def web_crawl_tool(
|
||||
}
|
||||
|
||||
try:
|
||||
# web_crawl requires Firecrawl — Parallel has no crawl API
|
||||
if not (os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")):
|
||||
return json.dumps({
|
||||
"error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, "
|
||||
"or use web_search + web_extract instead.",
|
||||
"success": False,
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Ensure URL has protocol
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = f'https://{url}'
|
||||
@@ -1158,6 +1334,15 @@ def check_firecrawl_api_key() -> bool:
|
||||
return bool(os.getenv("FIRECRAWL_API_KEY"))
|
||||
|
||||
|
||||
def check_web_api_key() -> bool:
|
||||
"""Check if any web backend API key is available (Parallel or Firecrawl)."""
|
||||
return bool(
|
||||
os.getenv("PARALLEL_API_KEY")
|
||||
or os.getenv("FIRECRAWL_API_KEY")
|
||||
or os.getenv("FIRECRAWL_API_URL")
|
||||
)
|
||||
|
||||
|
||||
def check_auxiliary_model() -> bool:
|
||||
"""Check if an auxiliary text model is available for LLM content processing."""
|
||||
try:
|
||||
@@ -1184,15 +1369,19 @@ if __name__ == "__main__":
|
||||
print("=" * 40)
|
||||
|
||||
# Check if API keys are available
|
||||
firecrawl_available = check_firecrawl_api_key()
|
||||
web_available = check_web_api_key()
|
||||
nous_available = check_auxiliary_model()
|
||||
|
||||
if not firecrawl_available:
|
||||
print("❌ FIRECRAWL_API_KEY environment variable not set")
|
||||
print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://firecrawl.dev/")
|
||||
if web_available:
|
||||
backend = _get_backend()
|
||||
print(f"✅ Web backend: {backend}")
|
||||
if backend == "parallel":
|
||||
print(" Using Parallel API (https://parallel.ai)")
|
||||
else:
|
||||
print("✅ Firecrawl API key found")
|
||||
print(" Using Firecrawl API (https://firecrawl.dev)")
|
||||
else:
|
||||
print("❌ No web search backend configured")
|
||||
print("Set PARALLEL_API_KEY (https://parallel.ai) or FIRECRAWL_API_KEY (https://firecrawl.dev)")
|
||||
|
||||
if not nous_available:
|
||||
print("❌ No auxiliary model available for LLM content processing")
|
||||
@@ -1201,7 +1390,7 @@ if __name__ == "__main__":
|
||||
else:
|
||||
print(f"✅ Auxiliary model available: {DEFAULT_SUMMARIZER_MODEL}")
|
||||
|
||||
if not firecrawl_available:
|
||||
if not web_available:
|
||||
exit(1)
|
||||
|
||||
print("🛠️ Web tools ready for use!")
|
||||
@@ -1301,8 +1490,8 @@ registry.register(
|
||||
toolset="web",
|
||||
schema=WEB_SEARCH_SCHEMA,
|
||||
handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=5),
|
||||
check_fn=check_firecrawl_api_key,
|
||||
requires_env=["FIRECRAWL_API_KEY"],
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"],
|
||||
emoji="🔍",
|
||||
)
|
||||
registry.register(
|
||||
@@ -1311,8 +1500,8 @@ registry.register(
|
||||
schema=WEB_EXTRACT_SCHEMA,
|
||||
handler=lambda args, **kw: web_extract_tool(
|
||||
args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"),
|
||||
check_fn=check_firecrawl_api_key,
|
||||
requires_env=["FIRECRAWL_API_KEY"],
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"],
|
||||
is_async=True,
|
||||
emoji="📄",
|
||||
)
|
||||
|
||||
@@ -61,6 +61,7 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `PARALLEL_API_KEY` | AI-native web search ([parallel.ai](https://parallel.ai/)) |
|
||||
| `FIRECRAWL_API_KEY` | Web scraping ([firecrawl.dev](https://firecrawl.dev/)) |
|
||||
| `FIRECRAWL_API_URL` | Custom Firecrawl API endpoint for self-hosted instances (optional) |
|
||||
| `BROWSERBASE_API_KEY` | Browser automation ([browserbase.com](https://browserbase.com/)) |
|
||||
|
||||
Reference in New Issue
Block a user