"""Unit tests for web_search and scrape_url tools (SearXNG + Crawl4AI).

All tests use mocked HTTP — no live services required.
"""

from __future__ import annotations

from contextlib import contextmanager
from unittest.mock import MagicMock, patch

import pytest

from timmy.tools.search import _extract_crawl_content, scrape_url, web_search

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Service endpoints written into the patched settings object. Nothing ever
# listens on them: every HTTP call in this module goes through a mock.
_SEARCH_URL = "http://localhost:8888"
_CRAWL_URL = "http://localhost:11235"

# Dotted path of the settings object as seen by the module under test.
_SETTINGS_TARGET = "timmy.tools.search.settings"


def _mock_requests(json_response=None, status_code=200, raise_exc=None):
    """Build a mock ``requests`` module whose .get/.post return controlled responses.

    Args:
        json_response: Payload returned by ``response.json()``. ``None`` means
            an empty dict; any other value (including ``[]``) is returned as-is.
        status_code: Status code set on the mock response. For codes >= 400,
            ``response.raise_for_status()`` raises the mock ``HTTPError``.
        raise_exc: If given, both ``.get`` and ``.post`` raise this exception
            instead of returning a response.

    Returns:
        A ``MagicMock`` standing in for the ``requests`` module, with an
        ``exceptions`` attribute exposing ``Timeout``, ``HTTPError`` and
        ``RequestException``.
    """

    # Mirror the real requests hierarchy: RequestException derives from
    # OSError, and Timeout/HTTPError derive from RequestException, so an
    # ``except RequestException`` in the code under test also catches the
    # more specific mock errors, exactly as it would with real requests.
    class RequestException(OSError):
        def __init__(self, *args, response=None, request=None):
            super().__init__(*args)
            self.response = response
            self.request = request

    class Timeout(RequestException):
        pass

    class HTTPError(RequestException):
        pass

    exc_mod = MagicMock()
    exc_mod.Timeout = Timeout
    exc_mod.HTTPError = HTTPError
    exc_mod.RequestException = RequestException

    mock_req = MagicMock()
    mock_req.exceptions = exc_mod

    if raise_exc is not None:
        mock_req.get.side_effect = raise_exc
        mock_req.post.side_effect = raise_exc
        return mock_req

    mock_resp = MagicMock()
    mock_resp.status_code = status_code
    # Compare against None explicitly so a falsy payload such as [] is kept.
    mock_resp.json.return_value = {} if json_response is None else json_response
    if status_code >= 400:
        mock_resp.raise_for_status.side_effect = HTTPError(
            response=MagicMock(status_code=status_code)
        )
    mock_req.get.return_value = mock_resp
    mock_req.post.return_value = mock_resp
    return mock_req


def _mock_async_requests(poll_payload, task_id="abc123"):
    """Build a mock ``requests`` module for the async submit-then-poll flow.

    ``.post`` returns a response whose JSON is ``{"task_id": task_id}`` and
    ``.get`` returns a response whose JSON is ``poll_payload``. Neither
    response raises from ``raise_for_status()``.
    """
    submit_response = MagicMock()
    submit_response.json.return_value = {"task_id": task_id}
    submit_response.raise_for_status.return_value = None

    poll_response = MagicMock()
    poll_response.json.return_value = poll_payload
    poll_response.raise_for_status.return_value = None

    mock_req = MagicMock()
    mock_req.post.return_value = submit_response
    mock_req.get.return_value = poll_response
    mock_req.exceptions = MagicMock()
    return mock_req


@contextmanager
def _patched_search_env(requests_module, **settings_attrs):
    """Patch ``sys.modules["requests"]`` and the search module's settings.

    Args:
        requests_module: Object installed as ``sys.modules["requests"]`` for
            the duration of the block. ``None`` makes ``import requests``
            raise ``ImportError``.
        **settings_attrs: Extra attributes to set on the patched settings
            object (e.g. ``search_url`` or ``crawl_url``).

    Yields:
        The ``MagicMock`` replacing ``timmy.tools.search.settings``, with
        ``timmy_search_backend`` set to ``"searxng"``.
    """
    with patch.dict("sys.modules", {"requests": requests_module}):
        with patch(_SETTINGS_TARGET) as mock_settings:
            mock_settings.timmy_search_backend = "searxng"
            for attr_name, attr_value in settings_attrs.items():
                setattr(mock_settings, attr_name, attr_value)
            yield mock_settings


# ---------------------------------------------------------------------------
# web_search tests
# ---------------------------------------------------------------------------


class TestWebSearch:
    """Tests for ``web_search`` against a mocked ``requests`` module."""

    def test_backend_none_short_circuits(self):
        """TIMMY_SEARCH_BACKEND=none returns disabled message immediately."""
        with patch(_SETTINGS_TARGET) as mock_settings:
            mock_settings.timmy_search_backend = "none"
            result = web_search("anything")
        assert "disabled" in result

    def test_missing_requests_package(self):
        """Graceful error when requests is not installed."""
        with _patched_search_env(None, search_url=_SEARCH_URL):
            result = web_search("test query")
        assert "requests" in result and "not installed" in result

    def test_successful_search(self):
        """Happy path: returns formatted result list."""
        mock_data = {
            "results": [
                {
                    "title": "Foo Bar",
                    "url": "https://example.com/foo",
                    "content": "Foo is great",
                },
                {
                    "title": "Baz",
                    "url": "https://example.com/baz",
                    "content": "Baz rules",
                },
            ]
        }
        mock_req = _mock_requests(json_response=mock_data)
        with _patched_search_env(mock_req, search_url=_SEARCH_URL):
            result = web_search("foo bar")
        assert "Foo Bar" in result
        assert "https://example.com/foo" in result
        assert "Baz" in result
        assert "foo bar" in result

    def test_no_results(self):
        """Empty results list returns a helpful no-results message."""
        mock_req = _mock_requests(json_response={"results": []})
        with _patched_search_env(mock_req, search_url=_SEARCH_URL):
            result = web_search("xyzzy")
        assert "No results" in result

    def test_num_results_respected(self):
        """Only up to num_results entries are returned."""
        mock_data = {
            "results": [
                {"title": f"Result {i}", "url": f"https://example.com/{i}", "content": "x"}
                for i in range(10)
            ]
        }
        mock_req = _mock_requests(json_response=mock_data)
        with _patched_search_env(mock_req, search_url=_SEARCH_URL):
            result = web_search("test", num_results=3)
        # Only 3 numbered entries should appear
        assert "1." in result
        assert "3." in result
        assert "4." not in result

    def test_service_unavailable(self):
        """Connection error degrades gracefully."""
        mock_req = MagicMock()
        mock_req.get.side_effect = OSError("connection refused")
        mock_req.exceptions = MagicMock()
        with _patched_search_env(mock_req, search_url=_SEARCH_URL):
            result = web_search("test")
        assert "not reachable" in result or "unavailable" in result

    def test_catalog_entry_exists(self):
        """web_search must appear in the tool catalog."""
        from timmy.tools import get_all_available_tools

        catalog = get_all_available_tools()
        assert "web_search" in catalog
        assert "orchestrator" in catalog["web_search"]["available_in"]
        assert "echo" in catalog["web_search"]["available_in"]


# ---------------------------------------------------------------------------
# scrape_url tests
# ---------------------------------------------------------------------------


class TestScrapeUrl:
    """Tests for ``scrape_url`` against a mocked ``requests`` module."""

    def test_invalid_url_no_scheme(self):
        """URLs without http(s) scheme are rejected before any HTTP call."""
        result = scrape_url("example.com/page")
        assert "Error: invalid URL" in result

    def test_invalid_url_empty(self):
        """An empty URL string yields the invalid-URL error."""
        result = scrape_url("")
        assert "Error: invalid URL" in result

    def test_backend_none_short_circuits(self):
        """Backend "none" returns the disabled message."""
        with patch(_SETTINGS_TARGET) as mock_settings:
            mock_settings.timmy_search_backend = "none"
            result = scrape_url("https://example.com")
        assert "disabled" in result

    def test_missing_requests_package(self):
        """Graceful error when requests is not installed."""
        with _patched_search_env(None, crawl_url=_CRAWL_URL):
            result = scrape_url("https://example.com")
        assert "requests" in result and "not installed" in result

    def test_sync_result_returned_immediately(self):
        """If Crawl4AI returns results in the POST response, use them directly."""
        mock_data = {
            "results": [{"markdown": "# Hello\n\nThis is the page content."}]
        }
        mock_req = _mock_requests(json_response=mock_data)
        with _patched_search_env(mock_req, crawl_url=_CRAWL_URL):
            result = scrape_url("https://example.com")
        assert "Hello" in result
        assert "page content" in result

    def test_async_poll_completed(self):
        """Async task_id flow: polls until completed and returns content."""
        mock_req = _mock_async_requests(
            {
                "status": "completed",
                "results": [{"markdown": "# Async content"}],
            }
        )
        with _patched_search_env(mock_req, crawl_url=_CRAWL_URL):
            # Replace the time module seen by the code under test so that
            # time.sleep is a mock.
            with patch("timmy.tools.search.time") as mock_time:
                mock_time.sleep = MagicMock()
                result = scrape_url("https://example.com")
        assert "Async content" in result

    def test_async_poll_failed_task(self):
        """Crawl4AI task failure is reported clearly."""
        mock_req = _mock_async_requests({"status": "failed", "error": "site blocked"})
        with _patched_search_env(mock_req, crawl_url=_CRAWL_URL):
            # Replace the time module seen by the code under test so that
            # time.sleep is a mock.
            with patch("timmy.tools.search.time") as mock_time:
                mock_time.sleep = MagicMock()
                result = scrape_url("https://example.com")
        assert "failed" in result and "site blocked" in result

    def test_service_unavailable(self):
        """Connection error degrades gracefully."""
        mock_req = MagicMock()
        mock_req.post.side_effect = OSError("connection refused")
        mock_req.exceptions = MagicMock()
        with _patched_search_env(mock_req, crawl_url=_CRAWL_URL):
            result = scrape_url("https://example.com")
        assert "not reachable" in result or "unavailable" in result

    def test_content_truncation(self):
        """Content longer than ~4000 tokens is truncated."""
        long_content = "x" * 20000
        mock_data = {"results": [{"markdown": long_content}]}
        mock_req = _mock_requests(json_response=mock_data)
        with _patched_search_env(mock_req, crawl_url=_CRAWL_URL):
            result = scrape_url("https://example.com")
        assert "[…truncated" in result
        assert len(result) < 17000

    def test_catalog_entry_exists(self):
        """scrape_url must appear in the tool catalog."""
        from timmy.tools import get_all_available_tools

        catalog = get_all_available_tools()
        assert "scrape_url" in catalog
        assert "orchestrator" in catalog["scrape_url"]["available_in"]


# ---------------------------------------------------------------------------
# _extract_crawl_content helper
# ---------------------------------------------------------------------------


class TestExtractCrawlContent:
    """Tests for the ``_extract_crawl_content`` helper."""

    def test_empty_results(self):
        """An empty results list yields a "No content" message."""
        result = _extract_crawl_content([], "https://example.com")
        assert "No content" in result

    def test_markdown_field_preferred(self):
        """The markdown field is used when both markdown and content exist."""
        results = [{"markdown": "# Title", "content": "fallback"}]
        result = _extract_crawl_content(results, "https://example.com")
        assert "Title" in result

    def test_fallback_to_content_field(self):
        """The content field is used when no markdown field is present."""
        results = [{"content": "plain text content"}]
        result = _extract_crawl_content(results, "https://example.com")
        assert "plain text content" in result

    def test_no_content_fields(self):
        """A result with neither field yields a "No readable content" message."""
        results = [{"url": "https://example.com"}]
        result = _extract_crawl_content(results, "https://example.com")
        assert "No readable content" in result