From d2b10545dbd432bfde93e437880516217d006176 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 17 Mar 2026 04:28:03 -0700 Subject: [PATCH] feat(web): add Tavily as web search/extract/crawl backend (#1731) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Salvage of PR #1707 by @kshitijk4poor (cherry-picked with authorship preserved). Adds Tavily as a third web backend alongside Firecrawl and Parallel, using the Tavily REST API via httpx. - Backend selection via hermes tools → saved as web.backend in config.yaml - All three tools supported: search, extract, crawl - TAVILY_API_KEY in config registry, doctor, status, setup wizard - 15 new Tavily tests + 9 backend selection tests + 5 config tests - Backward compatible Closes #1707 --- hermes_cli/config.py | 13 +- hermes_cli/setup.py | 6 +- hermes_cli/status.py | 1 + hermes_cli/tools_config.py | 13 ++ tests/hermes_cli/test_config.py | 32 ++++ tests/hermes_cli/test_status.py | 14 ++ tests/tools/test_web_tools_config.py | 61 ++++++- tests/tools/test_web_tools_tavily.py | 255 +++++++++++++++++++++++++++ tools/web_tools.py | 199 ++++++++++++++++++++- 9 files changed, 582 insertions(+), 12 deletions(-) create mode 100644 tests/hermes_cli/test_status.py create mode 100644 tests/tools/test_web_tools_tavily.py diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 5d74a2a98..8c914034c 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -379,6 +379,7 @@ ENV_VARS_BY_VERSION: Dict[int, List[str]] = { 4: ["VOICE_TOOLS_OPENAI_KEY", "ELEVENLABS_API_KEY"], 5: ["WHATSAPP_ENABLED", "WHATSAPP_MODE", "WHATSAPP_ALLOWED_USERS", "SLACK_BOT_TOKEN", "SLACK_APP_TOKEN", "SLACK_ALLOWED_USERS"], + 10: ["TAVILY_API_KEY"], } # Required environment variables with metadata for migration prompts. @@ -574,6 +575,14 @@ OPTIONAL_ENV_VARS = { "category": "tool", "advanced": True, }, + "TAVILY_API_KEY": { + "description": "Tavily API key for AI-native web search, extract, and crawl", + "prompt": "Tavily API key", + "url": "https://app.tavily.com/home", + "tools": ["web_search", "web_extract", "web_crawl"], + "password": True, + "category": "tool", + }, "BROWSERBASE_API_KEY": { "description": "Browserbase API key for cloud browser (optional — local browser works without this)", "prompt": "Browserbase API key", @@ -1516,6 +1525,7 @@ def show_config(): ("VOICE_TOOLS_OPENAI_KEY", "OpenAI (STT/TTS)"), ("PARALLEL_API_KEY", "Parallel"), ("FIRECRAWL_API_KEY", "Firecrawl"), + ("TAVILY_API_KEY", "Tavily"), ("BROWSERBASE_API_KEY", "Browserbase"), ("BROWSER_USE_API_KEY", "Browser Use"), ("FAL_KEY", "FAL"), @@ -1664,7 +1674,8 @@ def set_config_value(key: str, value: str): # Check if it's an API key (goes to .env) api_keys = [ 'OPENROUTER_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'VOICE_TOOLS_OPENAI_KEY', - 'PARALLEL_API_KEY', 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY', + 'PARALLEL_API_KEY', 'FIRECRAWL_API_KEY', 'FIRECRAWL_API_URL', 'TAVILY_API_KEY', + 'BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID', 'BROWSER_USE_API_KEY', 'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN', 'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY', 'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN', diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index b2647d01b..72f36cc77 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -444,11 +444,11 @@ def _print_setup_summary(config: dict, hermes_home): else: tool_status.append(("Mixture of Agents", False, "OPENROUTER_API_KEY")) - # Web tools (Parallel or Firecrawl) - if get_env_value("PARALLEL_API_KEY") or get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"): + # Web tools (Parallel, Firecrawl, or Tavily) + if get_env_value("PARALLEL_API_KEY") or get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL") or get_env_value("TAVILY_API_KEY"): tool_status.append(("Web Search & Extract", True, None)) else: - tool_status.append(("Web Search & Extract", False, "PARALLEL_API_KEY or FIRECRAWL_API_KEY")) + tool_status.append(("Web Search & Extract", False, "PARALLEL_API_KEY, FIRECRAWL_API_KEY, or TAVILY_API_KEY")) # Browser tools (local Chromium or Browserbase cloud) import shutil diff --git a/hermes_cli/status.py b/hermes_cli/status.py index dce87668c..e8db90cf2 100644 --- a/hermes_cli/status.py +++ b/hermes_cli/status.py @@ -120,6 +120,7 @@ def show_status(args): "MiniMax": "MINIMAX_API_KEY", "MiniMax-CN": "MINIMAX_CN_API_KEY", "Firecrawl": "FIRECRAWL_API_KEY", + "Tavily": "TAVILY_API_KEY", "Browserbase": "BROWSERBASE_API_KEY", # Optional — local browser works without this "FAL": "FAL_KEY", "Tinker": "TINKER_API_KEY", diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 2e008cb58..1d6783a2d 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -170,6 +170,14 @@ TOOL_CATEGORIES = { {"key": "PARALLEL_API_KEY", "prompt": "Parallel API key", "url": "https://parallel.ai"}, ], }, + { + "name": "Tavily", + "tag": "AI-native search, extract, and crawl", + "web_backend": "tavily", + "env_vars": [ + {"key": "TAVILY_API_KEY", "prompt": "Tavily API key", "url": "https://app.tavily.com/home"}, + ], + }, { "name": "Firecrawl Self-Hosted", "tag": "Free - run your own instance", @@ -851,6 +859,11 @@ def _reconfigure_provider(provider: dict, config: dict): config.get("browser", {}).pop("cloud_provider", None) _print_success(f" Browser set to local mode") + # Set web search backend in config if applicable + if provider.get("web_backend"): + config.setdefault("web", {})["backend"] = provider["web_backend"] + _print_success(f" Web backend set to: {provider['web_backend']}") + if not env_vars: _print_success(f" {provider['name']} - no configuration needed!") return diff --git a/tests/hermes_cli/test_config.py b/tests/hermes_cli/test_config.py index ba4f5c844..82cb99c64 100644 --- a/tests/hermes_cli/test_config.py +++ b/tests/hermes_cli/test_config.py @@ -316,6 +316,38 @@ class TestSanitizeEnvLines: assert fixes == 0 +class TestOptionalEnvVarsRegistry: + """Verify that key env vars are registered in OPTIONAL_ENV_VARS.""" + + def test_tavily_api_key_registered(self): + """TAVILY_API_KEY is listed in OPTIONAL_ENV_VARS.""" + from hermes_cli.config import OPTIONAL_ENV_VARS + assert "TAVILY_API_KEY" in OPTIONAL_ENV_VARS + + def test_tavily_api_key_is_tool_category(self): + """TAVILY_API_KEY is in the 'tool' category.""" + from hermes_cli.config import OPTIONAL_ENV_VARS + assert OPTIONAL_ENV_VARS["TAVILY_API_KEY"]["category"] == "tool" + + def test_tavily_api_key_is_password(self): + """TAVILY_API_KEY is marked as password.""" + from hermes_cli.config import OPTIONAL_ENV_VARS + assert OPTIONAL_ENV_VARS["TAVILY_API_KEY"]["password"] is True + + def test_tavily_api_key_has_url(self): + """TAVILY_API_KEY has a URL.""" + from hermes_cli.config import OPTIONAL_ENV_VARS + assert OPTIONAL_ENV_VARS["TAVILY_API_KEY"]["url"] == "https://app.tavily.com/home" + + def test_tavily_in_env_vars_by_version(self): + """TAVILY_API_KEY is listed in ENV_VARS_BY_VERSION.""" + from hermes_cli.config import ENV_VARS_BY_VERSION + all_vars = [] + for vars_list in ENV_VARS_BY_VERSION.values(): + all_vars.extend(vars_list) + assert "TAVILY_API_KEY" in all_vars + + class TestAnthropicTokenMigration: """Test that config version 8→9 clears ANTHROPIC_TOKEN.""" diff --git a/tests/hermes_cli/test_status.py b/tests/hermes_cli/test_status.py new file mode 100644 index 000000000..374e57b29 --- /dev/null +++ b/tests/hermes_cli/test_status.py @@ -0,0 +1,14 @@ +from types import SimpleNamespace + +from hermes_cli.status import show_status + + +def test_show_status_includes_tavily_key(monkeypatch, capsys, tmp_path): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setenv("TAVILY_API_KEY", "tvly-1234567890abcdef") + + show_status(SimpleNamespace(all=False, deep=False)) + + output = capsys.readouterr().out + assert "Tavily" in output + assert "tvly...cdef" in output diff --git a/tests/tools/test_web_tools_config.py b/tests/tools/test_web_tools_config.py index fdf670749..d291a005b 100644 --- a/tests/tools/test_web_tools_config.py +++ b/tests/tools/test_web_tools_config.py @@ -130,7 +130,7 @@ class TestBackendSelection: setups. """ - _ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL") + _ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL", "TAVILY_API_KEY") def setup_method(self): for key in self._ENV_KEYS: @@ -155,12 +155,31 @@ class TestBackendSelection: patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): assert _get_backend() == "firecrawl" + def test_config_tavily(self): + """web.backend=tavily in config → 'tavily' regardless of other keys.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "tavily"}): + assert _get_backend() == "tavily" + + def test_config_tavily_overrides_env_keys(self): + """web.backend=tavily in config → 'tavily' even if Firecrawl key set.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "tavily"}), \ + patch.dict(os.environ, {"FIRECRAWL_API_KEY": "fc-test"}): + assert _get_backend() == "tavily" + def test_config_case_insensitive(self): """web.backend=Parallel (mixed case) → 'parallel'.""" from tools.web_tools import _get_backend with patch("tools.web_tools._load_web_config", return_value={"backend": "Parallel"}): assert _get_backend() == "parallel" + def test_config_tavily_case_insensitive(self): + """web.backend=Tavily (mixed case) → 'tavily'.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={"backend": "Tavily"}): + assert _get_backend() == "tavily" + # ── Fallback (no web.backend in config) ─────────────────────────── def test_fallback_parallel_only_key(self): @@ -170,6 +189,28 @@ class TestBackendSelection: patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): assert _get_backend() == "parallel" + def test_fallback_tavily_only_key(self): + """Only TAVILY_API_KEY set → 'tavily'.""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}): + assert _get_backend() == "tavily" + + def test_fallback_tavily_with_firecrawl_prefers_firecrawl(self): + """Tavily + Firecrawl keys, no config → 'firecrawl' (backward compat).""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test", "FIRECRAWL_API_KEY": "fc-test"}): + assert _get_backend() == "firecrawl" + + def test_fallback_tavily_with_parallel_prefers_parallel(self): + """Tavily + Parallel keys, no config → 'parallel' (Parallel takes priority over Tavily).""" + from tools.web_tools import _get_backend + with patch("tools.web_tools._load_web_config", return_value={}), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test", "PARALLEL_API_KEY": "par-test"}): + # Parallel + no Firecrawl → parallel + assert _get_backend() == "parallel" + def test_fallback_both_keys_defaults_to_firecrawl(self): """Both keys set, no config → 'firecrawl' (backward compat).""" from tools.web_tools import _get_backend @@ -193,7 +234,7 @@ class TestBackendSelection: def test_invalid_config_falls_through_to_fallback(self): """web.backend=invalid → ignored, uses key-based fallback.""" from tools.web_tools import _get_backend - with patch("tools.web_tools._load_web_config", return_value={"backend": "tavily"}), \ + with patch("tools.web_tools._load_web_config", return_value={"backend": "nonexistent"}), \ patch.dict(os.environ, {"PARALLEL_API_KEY": "test-key"}): assert _get_backend() == "parallel" @@ -238,7 +279,7 @@ class TestParallelClientConfig: class TestCheckWebApiKey: """Test suite for check_web_api_key() unified availability check.""" - _ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL") + _ENV_KEYS = ("PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "FIRECRAWL_API_URL", "TAVILY_API_KEY") def setup_method(self): for key in self._ENV_KEYS: @@ -263,6 +304,11 @@ class TestCheckWebApiKey: from tools.web_tools import check_web_api_key assert check_web_api_key() is True + def test_tavily_key_only(self): + with patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True + def test_no_keys_returns_false(self): from tools.web_tools import check_web_api_key assert check_web_api_key() is False @@ -274,3 +320,12 @@ class TestCheckWebApiKey: }): from tools.web_tools import check_web_api_key assert check_web_api_key() is True + + def test_all_three_keys_returns_true(self): + with patch.dict(os.environ, { + "PARALLEL_API_KEY": "test-key", + "FIRECRAWL_API_KEY": "fc-test", + "TAVILY_API_KEY": "tvly-test", + }): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True diff --git a/tests/tools/test_web_tools_tavily.py b/tests/tools/test_web_tools_tavily.py new file mode 100644 index 000000000..2e49b72f1 --- /dev/null +++ b/tests/tools/test_web_tools_tavily.py @@ -0,0 +1,255 @@ +"""Tests for Tavily web backend integration. + +Coverage: + _tavily_request() — API key handling, endpoint construction, error propagation. + _normalize_tavily_search_results() — search response normalization. + _normalize_tavily_documents() — extract/crawl response normalization, failed_results. + web_search_tool / web_extract_tool / web_crawl_tool — Tavily dispatch paths. +""" + +import json +import os +import asyncio +import pytest +from unittest.mock import patch, MagicMock + + +# ─── _tavily_request ───────────────────────────────────────────────────────── + +class TestTavilyRequest: + """Test suite for the _tavily_request helper.""" + + def test_raises_without_api_key(self): + """No TAVILY_API_KEY → ValueError with guidance.""" + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("TAVILY_API_KEY", None) + from tools.web_tools import _tavily_request + with pytest.raises(ValueError, match="TAVILY_API_KEY"): + _tavily_request("search", {"query": "test"}) + + def test_posts_with_api_key_in_body(self): + """api_key is injected into the JSON payload.""" + mock_response = MagicMock() + mock_response.json.return_value = {"results": []} + mock_response.raise_for_status = MagicMock() + + with patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test-key"}): + with patch("tools.web_tools.httpx.post", return_value=mock_response) as mock_post: + from tools.web_tools import _tavily_request + result = _tavily_request("search", {"query": "hello"}) + + mock_post.assert_called_once() + call_kwargs = mock_post.call_args + payload = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") + assert payload["api_key"] == "tvly-test-key" + assert payload["query"] == "hello" + assert "api.tavily.com/search" in call_kwargs.args[0] + + def test_raises_on_http_error(self): + """Non-2xx responses propagate as httpx.HTTPStatusError.""" + import httpx as _httpx + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = _httpx.HTTPStatusError( + "401 Unauthorized", request=MagicMock(), response=mock_response + ) + + with patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-bad-key"}): + with patch("tools.web_tools.httpx.post", return_value=mock_response): + from tools.web_tools import _tavily_request + with pytest.raises(_httpx.HTTPStatusError): + _tavily_request("search", {"query": "test"}) + + +# ─── _normalize_tavily_search_results ───────────────────────────────────────── + +class TestNormalizeTavilySearchResults: + """Test search result normalization.""" + + def test_basic_normalization(self): + from tools.web_tools import _normalize_tavily_search_results + raw = { + "results": [ + {"title": "Python Docs", "url": "https://docs.python.org", "content": "Official docs", "score": 0.9}, + {"title": "Tutorial", "url": "https://example.com", "content": "A tutorial", "score": 0.8}, + ] + } + result = _normalize_tavily_search_results(raw) + assert result["success"] is True + web = result["data"]["web"] + assert len(web) == 2 + assert web[0]["title"] == "Python Docs" + assert web[0]["url"] == "https://docs.python.org" + assert web[0]["description"] == "Official docs" + assert web[0]["position"] == 1 + assert web[1]["position"] == 2 + + def test_empty_results(self): + from tools.web_tools import _normalize_tavily_search_results + result = _normalize_tavily_search_results({"results": []}) + assert result["success"] is True + assert result["data"]["web"] == [] + + def test_missing_fields(self): + from tools.web_tools import _normalize_tavily_search_results + result = _normalize_tavily_search_results({"results": [{}]}) + web = result["data"]["web"] + assert web[0]["title"] == "" + assert web[0]["url"] == "" + assert web[0]["description"] == "" + + +# ─── _normalize_tavily_documents ────────────────────────────────────────────── + +class TestNormalizeTavilyDocuments: + """Test extract/crawl document normalization.""" + + def test_basic_document(self): + from tools.web_tools import _normalize_tavily_documents + raw = { + "results": [{ + "url": "https://example.com", + "title": "Example", + "raw_content": "Full page content here", + }] + } + docs = _normalize_tavily_documents(raw) + assert len(docs) == 1 + assert docs[0]["url"] == "https://example.com" + assert docs[0]["title"] == "Example" + assert docs[0]["content"] == "Full page content here" + assert docs[0]["raw_content"] == "Full page content here" + assert docs[0]["metadata"]["sourceURL"] == "https://example.com" + + def test_falls_back_to_content_when_no_raw_content(self): + from tools.web_tools import _normalize_tavily_documents + raw = {"results": [{"url": "https://example.com", "content": "Snippet"}]} + docs = _normalize_tavily_documents(raw) + assert docs[0]["content"] == "Snippet" + + def test_failed_results_included(self): + from tools.web_tools import _normalize_tavily_documents + raw = { + "results": [], + "failed_results": [ + {"url": "https://fail.com", "error": "timeout"}, + ], + } + docs = _normalize_tavily_documents(raw) + assert len(docs) == 1 + assert docs[0]["url"] == "https://fail.com" + assert docs[0]["error"] == "timeout" + assert docs[0]["content"] == "" + + def test_failed_urls_included(self): + from tools.web_tools import _normalize_tavily_documents + raw = { + "results": [], + "failed_urls": ["https://bad.com"], + } + docs = _normalize_tavily_documents(raw) + assert len(docs) == 1 + assert docs[0]["url"] == "https://bad.com" + assert docs[0]["error"] == "extraction failed" + + def test_fallback_url(self): + from tools.web_tools import _normalize_tavily_documents + raw = {"results": [{"content": "data"}]} + docs = _normalize_tavily_documents(raw, fallback_url="https://fallback.com") + assert docs[0]["url"] == "https://fallback.com" + + +# ─── web_search_tool (Tavily dispatch) ──────────────────────────────────────── + +class TestWebSearchTavily: + """Test web_search_tool dispatch to Tavily.""" + + def test_search_dispatches_to_tavily(self): + mock_response = MagicMock() + mock_response.json.return_value = { + "results": [{"title": "Result", "url": "https://r.com", "content": "desc", "score": 0.9}] + } + mock_response.raise_for_status = MagicMock() + + with patch("tools.web_tools._get_backend", return_value="tavily"), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ + patch("tools.web_tools.httpx.post", return_value=mock_response), \ + patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import web_search_tool + result = json.loads(web_search_tool("test query", limit=3)) + assert result["success"] is True + assert len(result["data"]["web"]) == 1 + assert result["data"]["web"][0]["title"] == "Result" + + +# ─── web_extract_tool (Tavily dispatch) ─────────────────────────────────────── + +class TestWebExtractTavily: + """Test web_extract_tool dispatch to Tavily.""" + + def test_extract_dispatches_to_tavily(self): + mock_response = MagicMock() + mock_response.json.return_value = { + "results": [{"url": "https://example.com", "raw_content": "Extracted content", "title": "Page"}] + } + mock_response.raise_for_status = MagicMock() + + with patch("tools.web_tools._get_backend", return_value="tavily"), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ + patch("tools.web_tools.httpx.post", return_value=mock_response), \ + patch("tools.web_tools.process_content_with_llm", return_value=None): + from tools.web_tools import web_extract_tool + result = json.loads(asyncio.get_event_loop().run_until_complete( + web_extract_tool(["https://example.com"], use_llm_processing=False) + )) + assert "results" in result + assert len(result["results"]) == 1 + assert result["results"][0]["url"] == "https://example.com" + + +# ─── web_crawl_tool (Tavily dispatch) ───────────────────────────────────────── + +class TestWebCrawlTavily: + """Test web_crawl_tool dispatch to Tavily.""" + + def test_crawl_dispatches_to_tavily(self): + mock_response = MagicMock() + mock_response.json.return_value = { + "results": [ + {"url": "https://example.com/page1", "raw_content": "Page 1 content", "title": "Page 1"}, + {"url": "https://example.com/page2", "raw_content": "Page 2 content", "title": "Page 2"}, + ] + } + mock_response.raise_for_status = MagicMock() + + with patch("tools.web_tools._get_backend", return_value="tavily"), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ + patch("tools.web_tools.httpx.post", return_value=mock_response), \ + patch("tools.web_tools.check_website_access", return_value=None), \ + patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import web_crawl_tool + result = json.loads(asyncio.get_event_loop().run_until_complete( + web_crawl_tool("https://example.com", use_llm_processing=False) + )) + assert "results" in result + assert len(result["results"]) == 2 + assert result["results"][0]["title"] == "Page 1" + + def test_crawl_sends_instructions(self): + """Instructions are included in the Tavily crawl payload.""" + mock_response = MagicMock() + mock_response.json.return_value = {"results": []} + mock_response.raise_for_status = MagicMock() + + with patch("tools.web_tools._get_backend", return_value="tavily"), \ + patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ + patch("tools.web_tools.httpx.post", return_value=mock_response) as mock_post, \ + patch("tools.web_tools.check_website_access", return_value=None), \ + patch("tools.interrupt.is_interrupted", return_value=False): + from tools.web_tools import web_crawl_tool + asyncio.get_event_loop().run_until_complete( + web_crawl_tool("https://example.com", instructions="Find docs", use_llm_processing=False) + ) + call_kwargs = mock_post.call_args + payload = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") + assert payload["instructions"] == "Find docs" + assert payload["url"] == "https://example.com" diff --git a/tools/web_tools.py b/tools/web_tools.py index 9102b3e77..79444d72b 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -46,6 +46,7 @@ import os import re import asyncio from typing import List, Dict, Any, Optional +import httpx from firecrawl import Firecrawl from agent.auxiliary_client import async_call_llm from tools.debug_helpers import DebugSession @@ -73,11 +74,14 @@ def _get_backend() -> str: keys manually without running setup. """ configured = _load_web_config().get("backend", "").lower().strip() - if configured in ("parallel", "firecrawl"): + if configured in ("parallel", "firecrawl", "tavily"): return configured # Fallback for manual / legacy config — use whichever key is present. has_firecrawl = bool(os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")) has_parallel = bool(os.getenv("PARALLEL_API_KEY")) + has_tavily = bool(os.getenv("TAVILY_API_KEY")) + if has_tavily and not has_firecrawl and not has_parallel: + return "tavily" if has_parallel and not has_firecrawl: return "parallel" # Default to firecrawl (backward compat, or when both are set) @@ -155,6 +159,88 @@ def _get_async_parallel_client(): _async_parallel_client = AsyncParallel(api_key=api_key) return _async_parallel_client +# ─── Tavily Client ─────────────────────────────────────────────────────────── + +_TAVILY_BASE_URL = "https://api.tavily.com" + + +def _tavily_request(endpoint: str, payload: dict) -> dict: + """Send a POST request to the Tavily API. + + Auth is provided via ``api_key`` in the JSON body (no header-based auth). + Raises ``ValueError`` if ``TAVILY_API_KEY`` is not set. + """ + api_key = os.getenv("TAVILY_API_KEY") + if not api_key: + raise ValueError( + "TAVILY_API_KEY environment variable not set. " + "Get your API key at https://app.tavily.com/home" + ) + payload["api_key"] = api_key + url = f"{_TAVILY_BASE_URL}/{endpoint.lstrip('/')}" + logger.info("Tavily %s request to %s", endpoint, url) + response = httpx.post(url, json=payload, timeout=60) + response.raise_for_status() + return response.json() + + +def _normalize_tavily_search_results(response: dict) -> dict: + """Normalize Tavily /search response to the standard web search format. + + Tavily returns ``{results: [{title, url, content, score, ...}]}``. + We map to ``{success, data: {web: [{title, url, description, position}]}}``. + """ + web_results = [] + for i, result in enumerate(response.get("results", [])): + web_results.append({ + "title": result.get("title", ""), + "url": result.get("url", ""), + "description": result.get("content", ""), + "position": i + 1, + }) + return {"success": True, "data": {"web": web_results}} + + +def _normalize_tavily_documents(response: dict, fallback_url: str = "") -> List[Dict[str, Any]]: + """Normalize Tavily /extract or /crawl response to the standard document format. + + Maps results to ``{url, title, content, raw_content, metadata}`` and + includes any ``failed_results`` / ``failed_urls`` as error entries. + """ + documents: List[Dict[str, Any]] = [] + for result in response.get("results", []): + url = result.get("url", fallback_url) + raw = result.get("raw_content", "") or result.get("content", "") + documents.append({ + "url": url, + "title": result.get("title", ""), + "content": raw, + "raw_content": raw, + "metadata": {"sourceURL": url, "title": result.get("title", "")}, + }) + # Handle failed results + for fail in response.get("failed_results", []): + documents.append({ + "url": fail.get("url", fallback_url), + "title": "", + "content": "", + "raw_content": "", + "error": fail.get("error", "extraction failed"), + "metadata": {"sourceURL": fail.get("url", fallback_url)}, + }) + for fail_url in response.get("failed_urls", []): + url_str = fail_url if isinstance(fail_url, str) else str(fail_url) + documents.append({ + "url": url_str, + "title": "", + "content": "", + "raw_content": "", + "error": "extraction failed", + "metadata": {"sourceURL": url_str}, + }) + return documents + + DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 # Allow per-task override via env var @@ -639,6 +725,22 @@ def web_search_tool(query: str, limit: int = 5) -> str: _debug.save() return result_json + if backend == "tavily": + logger.info("Tavily search: '%s' (limit: %d)", query, limit) + raw = _tavily_request("search", { + "query": query, + "max_results": min(limit, 20), + "include_raw_content": False, + "include_images": False, + }) + response_data = _normalize_tavily_search_results(raw) + debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) + result_json = json.dumps(response_data, indent=2, ensure_ascii=False) + debug_call_data["final_response_size"] = len(result_json) + _debug.log_call("web_search_tool", debug_call_data) + _debug.save() + return result_json + logger.info("Searching the web for: '%s' (limit: %d)", query, limit) response = _get_firecrawl_client().search( @@ -763,6 +865,13 @@ async def web_extract_tool( if backend == "parallel": results = await _parallel_extract(urls) + elif backend == "tavily": + logger.info("Tavily extract: %d URL(s)", len(urls)) + raw = _tavily_request("extract", { + "urls": urls, + "include_images": False, + }) + results = _normalize_tavily_documents(raw, fallback_url=urls[0] if urls else "") else: # ── Firecrawl extraction ── # Determine requested formats for Firecrawl v2 @@ -1055,6 +1164,83 @@ async def web_crawl_tool( } try: + backend = _get_backend() + + # Tavily supports crawl via its /crawl endpoint + if backend == "tavily": + # Ensure URL has protocol + if not url.startswith(('http://', 'https://')): + url = f'https://{url}' + + # Website policy check + blocked = check_website_access(url) + if blocked: + logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"]) + return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"], + "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False) + + from tools.interrupt import is_interrupted as _is_int + if _is_int(): + return json.dumps({"error": "Interrupted", "success": False}) + + logger.info("Tavily crawl: %s", url) + payload: Dict[str, Any] = { + "url": url, + "limit": 20, + "extract_depth": depth, + } + if instructions: + payload["instructions"] = instructions + raw = _tavily_request("crawl", payload) + results = _normalize_tavily_documents(raw, fallback_url=url) + + response = {"results": results} + # Fall through to the shared LLM processing and trimming below + # (skip the Firecrawl-specific crawl logic) + pages_crawled = len(response.get('results', [])) + logger.info("Crawled %d pages", pages_crawled) + debug_call_data["pages_crawled"] = pages_crawled + debug_call_data["original_response_size"] = len(json.dumps(response)) + + # Process each result with LLM if enabled + if use_llm_processing: + logger.info("Processing crawled content with LLM (parallel)...") + debug_call_data["processing_applied"].append("llm_processing") + + async def _process_tavily_crawl(result): + page_url = result.get('url', 'Unknown URL') + title = result.get('title', '') + content = result.get('content', '') + if not content: + return result, None, "no_content" + original_size = len(content) + processed = await process_content_with_llm(content, page_url, title, model, min_length) + if processed: + result['raw_content'] = content + result['content'] = processed + metrics = {"url": page_url, "original_size": original_size, "processed_size": len(processed), + "compression_ratio": len(processed) / original_size if original_size else 1.0, "model_used": model} + return result, metrics, "processed" + metrics = {"url": page_url, "original_size": original_size, "processed_size": original_size, + "compression_ratio": 1.0, "model_used": None, "reason": "content_too_short"} + return result, metrics, "too_short" + + tasks = [_process_tavily_crawl(r) for r in response.get('results', [])] + processed_results = await asyncio.gather(*tasks) + for result, metrics, status in processed_results: + if status == "processed": + debug_call_data["compression_metrics"].append(metrics) + debug_call_data["pages_processed_with_llm"] += 1 + + trimmed_results = [{"url": r.get("url", ""), "title": r.get("title", ""), "content": r.get("content", ""), "error": r.get("error"), + **({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {})} for r in response.get("results", [])] + result_json = json.dumps({"results": trimmed_results}, indent=2, ensure_ascii=False) + cleaned_result = clean_base64_images(result_json) + debug_call_data["final_response_size"] = len(cleaned_result) + _debug.log_call("web_crawl_tool", debug_call_data) + _debug.save() + return cleaned_result + # web_crawl requires Firecrawl — Parallel has no crawl API if not (os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL")): return json.dumps({ @@ -1335,11 +1521,12 @@ def check_firecrawl_api_key() -> bool: def check_web_api_key() -> bool: - """Check if any web backend API key is available (Parallel or Firecrawl).""" + """Check if any web backend API key is available (Parallel, Firecrawl, or Tavily).""" return bool( os.getenv("PARALLEL_API_KEY") or os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIRECRAWL_API_URL") + or os.getenv("TAVILY_API_KEY") ) @@ -1377,11 +1564,13 @@ if __name__ == "__main__": print(f"✅ Web backend: {backend}") if backend == "parallel": print(" Using Parallel API (https://parallel.ai)") + elif backend == "tavily": + print(" Using Tavily API (https://tavily.com)") else: print(" Using Firecrawl API (https://firecrawl.dev)") else: print("❌ No web search backend configured") - print("Set PARALLEL_API_KEY (https://parallel.ai) or FIRECRAWL_API_KEY (https://firecrawl.dev)") + print("Set PARALLEL_API_KEY, TAVILY_API_KEY, or FIRECRAWL_API_KEY") if not nous_available: print("❌ No auxiliary model available for LLM content processing") @@ -1491,7 +1680,7 @@ registry.register( schema=WEB_SEARCH_SCHEMA, handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=5), check_fn=check_web_api_key, - requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"], + requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "TAVILY_API_KEY"], emoji="🔍", ) registry.register( @@ -1501,7 +1690,7 @@ registry.register( handler=lambda args, **kw: web_extract_tool( args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"), check_fn=check_web_api_key, - requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY"], + requires_env=["PARALLEL_API_KEY", "FIRECRAWL_API_KEY", "TAVILY_API_KEY"], is_async=True, emoji="📄", )