"""Unit tests for web_search and scrape_url tools (SearXNG + Crawl4AI).
|
|
|
|
All tests use mocked HTTP — no live services required.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from timmy.tools.search import _extract_crawl_content, scrape_url, web_search
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _mock_requests(json_response=None, status_code=200, raise_exc=None):
|
|
"""Build a mock requests module whose .get/.post return controlled responses."""
|
|
mock_req = MagicMock()
|
|
|
|
# Exception hierarchy
|
|
class Timeout(Exception):
|
|
pass
|
|
|
|
class HTTPError(Exception):
|
|
def __init__(self, *a, response=None, **kw):
|
|
super().__init__(*a, **kw)
|
|
self.response = response
|
|
|
|
class RequestException(Exception):
|
|
pass
|
|
|
|
exc_mod = MagicMock()
|
|
exc_mod.Timeout = Timeout
|
|
exc_mod.HTTPError = HTTPError
|
|
exc_mod.RequestException = RequestException
|
|
mock_req.exceptions = exc_mod
|
|
|
|
if raise_exc is not None:
|
|
mock_req.get.side_effect = raise_exc
|
|
mock_req.post.side_effect = raise_exc
|
|
else:
|
|
mock_resp = MagicMock()
|
|
mock_resp.status_code = status_code
|
|
mock_resp.json.return_value = json_response or {}
|
|
if status_code >= 400:
|
|
mock_resp.raise_for_status.side_effect = HTTPError(
|
|
response=MagicMock(status_code=status_code)
|
|
)
|
|
mock_req.get.return_value = mock_resp
|
|
mock_req.post.return_value = mock_resp
|
|
|
|
return mock_req
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# web_search tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWebSearch:
    def test_backend_none_short_circuits(self):
        """TIMMY_SEARCH_BACKEND=none returns the disabled message immediately."""
        with patch("timmy.tools.search.settings") as cfg:
            cfg.timmy_search_backend = "none"
            assert "disabled" in web_search("anything")

    def test_missing_requests_package(self):
        """Graceful error when requests is not installed."""
        with patch.dict("sys.modules", {"requests": None}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.search_url = "http://localhost:8888"
                out = web_search("test query")
        assert "requests" in out and "not installed" in out

    def test_successful_search(self):
        """Happy path: the formatted result list contains titles, URLs, and the query."""
        payload = {
            "results": [
                {"title": "Foo Bar", "url": "https://example.com/foo", "content": "Foo is great"},
                {"title": "Baz", "url": "https://example.com/baz", "content": "Baz rules"},
            ]
        }
        backend = _mock_requests(json_response=payload)
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.search_url = "http://localhost:8888"
                out = web_search("foo bar")

        for fragment in ("Foo Bar", "https://example.com/foo", "Baz", "foo bar"):
            assert fragment in out

    def test_no_results(self):
        """An empty results list yields a helpful no-results message."""
        backend = _mock_requests(json_response={"results": []})
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.search_url = "http://localhost:8888"
                assert "No results" in web_search("xyzzy")

    def test_num_results_respected(self):
        """At most num_results entries appear in the output."""
        hits = [
            {"title": f"Result {i}", "url": f"https://example.com/{i}", "content": "x"}
            for i in range(10)
        ]
        backend = _mock_requests(json_response={"results": hits})
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.search_url = "http://localhost:8888"
                out = web_search("test", num_results=3)

        # Only 3 numbered entries should appear
        assert "1." in out
        assert "3." in out
        assert "4." not in out

    def test_service_unavailable(self):
        """A connection error degrades gracefully instead of raising."""
        backend = MagicMock()
        backend.exceptions = MagicMock()
        backend.get.side_effect = OSError("connection refused")
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.search_url = "http://localhost:8888"
                out = web_search("test")
        assert "not reachable" in out or "unavailable" in out

    def test_catalog_entry_exists(self):
        """web_search must appear in the tool catalog with the expected surfaces."""
        from timmy.tools import get_all_available_tools

        catalog = get_all_available_tools()
        assert "web_search" in catalog
        surfaces = catalog["web_search"]["available_in"]
        assert "orchestrator" in surfaces
        assert "echo" in surfaces
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# scrape_url tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestScrapeUrl:
    @staticmethod
    def _poll_backend(submit_payload, poll_payload):
        """Mock requests module: POST returns submit_payload, GET returns poll_payload."""
        def _resp(data):
            r = MagicMock()
            r.json.return_value = data
            r.raise_for_status.return_value = None
            return r

        backend = MagicMock()
        backend.exceptions = MagicMock()
        backend.post.return_value = _resp(submit_payload)
        backend.get.return_value = _resp(poll_payload)
        return backend

    def test_invalid_url_no_scheme(self):
        """URLs without an http(s) scheme are rejected before any HTTP call."""
        assert "Error: invalid URL" in scrape_url("example.com/page")

    def test_invalid_url_empty(self):
        assert "Error: invalid URL" in scrape_url("")

    def test_backend_none_short_circuits(self):
        with patch("timmy.tools.search.settings") as cfg:
            cfg.timmy_search_backend = "none"
            assert "disabled" in scrape_url("https://example.com")

    def test_missing_requests_package(self):
        with patch.dict("sys.modules", {"requests": None}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.crawl_url = "http://localhost:11235"
                out = scrape_url("https://example.com")
        assert "requests" in out and "not installed" in out

    def test_sync_result_returned_immediately(self):
        """If Crawl4AI returns results in the POST response, use them directly."""
        backend = _mock_requests(
            json_response={"results": [{"markdown": "# Hello\n\nThis is the page content."}]}
        )
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.crawl_url = "http://localhost:11235"
                out = scrape_url("https://example.com")

        assert "Hello" in out
        assert "page content" in out

    def test_async_poll_completed(self):
        """Async task_id flow: polls until completed and returns content."""
        backend = self._poll_backend(
            {"task_id": "abc123"},
            {"status": "completed", "results": [{"markdown": "# Async content"}]},
        )
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.crawl_url = "http://localhost:11235"
                with patch("timmy.tools.search.time") as fake_time:
                    fake_time.sleep = MagicMock()
                    out = scrape_url("https://example.com")

        assert "Async content" in out

    def test_async_poll_failed_task(self):
        """Crawl4AI task failure is reported clearly."""
        backend = self._poll_backend(
            {"task_id": "abc123"},
            {"status": "failed", "error": "site blocked"},
        )
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.crawl_url = "http://localhost:11235"
                with patch("timmy.tools.search.time") as fake_time:
                    fake_time.sleep = MagicMock()
                    out = scrape_url("https://example.com")

        assert "failed" in out and "site blocked" in out

    def test_service_unavailable(self):
        """A connection error degrades gracefully instead of raising."""
        backend = MagicMock()
        backend.exceptions = MagicMock()
        backend.post.side_effect = OSError("connection refused")
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.crawl_url = "http://localhost:11235"
                out = scrape_url("https://example.com")
        assert "not reachable" in out or "unavailable" in out

    def test_content_truncation(self):
        """Content longer than ~4000 tokens is truncated."""
        backend = _mock_requests(json_response={"results": [{"markdown": "x" * 20000}]})
        with patch.dict("sys.modules", {"requests": backend}):
            with patch("timmy.tools.search.settings") as cfg:
                cfg.timmy_search_backend = "searxng"
                cfg.crawl_url = "http://localhost:11235"
                out = scrape_url("https://example.com")

        assert "[…truncated" in out
        assert len(out) < 17000

    def test_catalog_entry_exists(self):
        """scrape_url must appear in the tool catalog."""
        from timmy.tools import get_all_available_tools

        catalog = get_all_available_tools()
        assert "scrape_url" in catalog
        assert "orchestrator" in catalog["scrape_url"]["available_in"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _extract_crawl_content helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestExtractCrawlContent:
    def test_empty_results(self):
        """An empty results list yields a no-content message."""
        assert "No content" in _extract_crawl_content([], "https://example.com")

    def test_markdown_field_preferred(self):
        """The markdown field wins over the plain content field."""
        payload = [{"markdown": "# Title", "content": "fallback"}]
        assert "Title" in _extract_crawl_content(payload, "https://example.com")

    def test_fallback_to_content_field(self):
        """Without markdown, the content field is used."""
        payload = [{"content": "plain text content"}]
        assert "plain text content" in _extract_crawl_content(payload, "https://example.com")

    def test_no_content_fields(self):
        """A result with neither field yields a no-readable-content message."""
        payload = [{"url": "https://example.com"}]
        assert "No readable content" in _extract_crawl_content(payload, "https://example.com")