[claude] feat: SearXNG + Crawl4AI self-hosted search backend (#1282) #1299
42
AGENTS.md
42
AGENTS.md
@@ -247,6 +247,48 @@ make docker-agent # add a worker
|
||||
|
||||
---
|
||||
|
||||
## Search Capability (SearXNG + Crawl4AI)
|
||||
|
||||
Timmy has a self-hosted search backend requiring **no paid API key**.
|
||||
|
||||
### Tools
|
||||
|
||||
| Tool | Module | Description |
|
||||
|------|--------|-------------|
|
||||
| `web_search(query)` | `timmy/tools/search.py` | Meta-search via SearXNG — returns ranked results |
|
||||
| `scrape_url(url)` | `timmy/tools/search.py` | Full-page scrape via Crawl4AI → clean markdown |
|
||||
|
||||
Both tools are registered in the **orchestrator** (full) and **echo** (research) toolkits.
|
||||
|
||||
### Configuration
|
||||
|
||||
| Env Var | Default | Description |
|
||||
|---------|---------|-------------|
|
||||
| `TIMMY_SEARCH_BACKEND` | `searxng` | `searxng` or `none` (disable) |
|
||||
| `TIMMY_SEARCH_URL` | `http://localhost:8888` | SearXNG base URL |
|
||||
| `TIMMY_CRAWL_URL` | `http://localhost:11235` | Crawl4AI base URL |
|
||||
|
||||
Inside Docker Compose (when `--profile search` is active), the dashboard
|
||||
uses `http://searxng:8080` and `http://crawl4ai:11235` by default.
|
||||
|
||||
### Starting the services
|
||||
|
||||
```bash
|
||||
# Start SearXNG + Crawl4AI alongside the dashboard:
|
||||
docker compose --profile search up
|
||||
|
||||
# Or start only the search services:
|
||||
docker compose --profile search up searxng crawl4ai
|
||||
```
|
||||
|
||||
### Graceful degradation
|
||||
|
||||
- If `TIMMY_SEARCH_BACKEND=none`: tools return a "disabled" message.
|
||||
- If SearXNG or Crawl4AI is unreachable: tools log a WARNING and return an
|
||||
error string — the app never crashes.
|
||||
|
||||
---
|
||||
|
||||
## Roadmap
|
||||
|
||||
**v2.0 Exodus (in progress):** Voice + Marketplace + Integrations
|
||||
|
||||
@@ -42,6 +42,10 @@ services:
|
||||
GROK_ENABLED: "${GROK_ENABLED:-false}"
|
||||
XAI_API_KEY: "${XAI_API_KEY:-}"
|
||||
GROK_DEFAULT_MODEL: "${GROK_DEFAULT_MODEL:-grok-3-fast}"
|
||||
# Search backend (SearXNG + Crawl4AI) — set TIMMY_SEARCH_BACKEND=none to disable
|
||||
TIMMY_SEARCH_BACKEND: "${TIMMY_SEARCH_BACKEND:-searxng}"
|
||||
TIMMY_SEARCH_URL: "${TIMMY_SEARCH_URL:-http://searxng:8080}"
|
||||
TIMMY_CRAWL_URL: "${TIMMY_CRAWL_URL:-http://crawl4ai:11235}"
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway" # Linux: maps to host IP
|
||||
networks:
|
||||
@@ -74,6 +78,50 @@ services:
|
||||
profiles:
|
||||
- celery
|
||||
|
||||
# ── SearXNG — self-hosted meta-search engine ─────────────────────────
|
||||
searxng:
|
||||
image: searxng/searxng:latest
|
||||
container_name: timmy-searxng
|
||||
profiles:
|
||||
- search
|
||||
ports:
|
||||
- "${SEARXNG_PORT:-8888}:8080"
|
||||
environment:
|
||||
SEARXNG_BASE_URL: "${SEARXNG_BASE_URL:-http://localhost:8888}"
|
||||
volumes:
|
||||
- ./docker/searxng:/etc/searxng:rw
|
||||
networks:
|
||||
- timmy-net
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:8080/healthz"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 20s
|
||||
|
||||
# ── Crawl4AI — self-hosted web scraper ────────────────────────────────
|
||||
crawl4ai:
|
||||
image: unclecode/crawl4ai:latest
|
||||
container_name: timmy-crawl4ai
|
||||
profiles:
|
||||
- search
|
||||
ports:
|
||||
- "${CRAWL4AI_PORT:-11235}:11235"
|
||||
environment:
|
||||
CRAWL4AI_API_TOKEN: "${CRAWL4AI_API_TOKEN:-}"
|
||||
volumes:
|
||||
- timmy-data:/app/data
|
||||
networks:
|
||||
- timmy-net
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
# ── OpenFang — vendored agent runtime sidecar ────────────────────────────
|
||||
openfang:
|
||||
build:
|
||||
|
||||
67
docker/searxng/settings.yml
Normal file
67
docker/searxng/settings.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
# SearXNG configuration for Timmy Time self-hosted search
# https://docs.searxng.org/admin/settings/settings.html
#
# This file only lists values that differ from the upstream defaults.
# `use_default_settings` makes SearXNG load its built-in settings first and
# apply this file on top; `keep_only` restricts the engine list to the four
# engines configured at the bottom of this file.
use_default_settings:
  engines:
    keep_only:
      - google
      - bing
      - duckduckgo
      - wikipedia

general:
  debug: false
  instance_name: "Timmy Search"
  privacypolicy_url: false
  donation_url: false
  contact_url: false
  enable_metrics: false

server:
  port: 8080
  bind_address: "0.0.0.0"
  # Placeholder committed to the repository: do not rely on it outside local
  # development. SearXNG reads the SEARXNG_SECRET environment variable in
  # preference to this value, so set that on the container for real
  # deployments.
  secret_key: "timmy-searxng-key-change-in-production"
  base_url: false
  image_proxy: false

ui:
  static_use_hash: false
  default_locale: ""
  query_in_title: false
  infinite_scroll: false
  default_theme: simple
  center_alignment: false

search:
  safe_search: 0
  autocomplete: ""
  default_lang: "en"
  # `json` must stay enabled: the Timmy web_search tool calls
  # /search?format=json and SearXNG rejects formats not listed here.
  formats:
    - html
    - json

outgoing:
  request_timeout: 6.0
  max_request_timeout: 10.0
  useragent_suffix: "TimmyResearchBot"
  pool_connections: 100
  pool_maxsize: 20

# NOTE(review): upstream documents plugin names with spaces (for example
# 'Hash plugin', 'Tracker URL remover'), and newer releases configure
# plugins through a `plugins:` mapping instead. Confirm these identifiers
# against the SearXNG version actually deployed.
enabled_plugins:
  - Hash_plugin
  - Search_on_category_select
  - Tracker_url_remover

engines:
  - name: google
    engine: google
    shortcut: g
    categories: general

  - name: bing
    engine: bing
    shortcut: b
    categories: general

  - name: duckduckgo
    engine: duckduckgo
    shortcut: d
    categories: general

  - name: wikipedia
    engine: wikipedia
    shortcut: wp
    categories: general
    timeout: 3.0
|
||||
@@ -108,6 +108,16 @@ class Settings(BaseSettings):
|
||||
grok_sats_hard_cap: int = 100 # Absolute ceiling on sats per Grok query
|
||||
grok_free: bool = False # Skip Lightning invoice when user has own API key
|
||||
|
||||
# ── Search Backend (SearXNG + Crawl4AI) ──────────────────────────────
|
||||
# "searxng" — self-hosted SearXNG meta-search engine (default, no API key)
|
||||
# "none" — disable web search (private/offline deployments)
|
||||
# Override with TIMMY_SEARCH_BACKEND env var.
|
||||
timmy_search_backend: Literal["searxng", "none"] = "searxng"
|
||||
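# SearXNG base URL — intended to be overridden with the TIMMY_SEARCH_URL env var.
# NOTE(review): unlike `timmy_search_backend`, this field is named `search_url`;
# unless Settings declares an alias or env prefix for it, the env var derived
# from the field name would be SEARCH_URL and TIMMY_SEARCH_URL would be
# ignored — confirm.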
|
||||
search_url: str = "http://localhost:8888"
|
||||
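# Crawl4AI base URL — intended to be overridden with the TIMMY_CRAWL_URL env var.
# NOTE(review): unlike `timmy_search_backend`, this field is named `crawl_url`;
# unless Settings declares an alias or env prefix for it, the env var derived
# from the field name would be CRAWL_URL and TIMMY_CRAWL_URL would be
# ignored — confirm.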
|
||||
crawl_url: str = "http://localhost:11235"
|
||||
|
||||
# ── Database ──────────────────────────────────────────────────────────
|
||||
db_busy_timeout_ms: int = 5000 # SQLite PRAGMA busy_timeout (ms)
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ from timmy.tools.file_tools import (
|
||||
create_research_tools,
|
||||
create_writing_tools,
|
||||
)
|
||||
from timmy.tools.search import scrape_url, web_search
|
||||
from timmy.tools.system_tools import (
|
||||
_safe_eval,
|
||||
calculator,
|
||||
@@ -72,6 +73,9 @@ __all__ = [
|
||||
"create_data_tools",
|
||||
"create_research_tools",
|
||||
"create_writing_tools",
|
||||
# search
|
||||
"scrape_url",
|
||||
"web_search",
|
||||
# system_tools
|
||||
"_safe_eval",
|
||||
"calculator",
|
||||
|
||||
@@ -28,6 +28,7 @@ from timmy.tools.file_tools import (
|
||||
create_research_tools,
|
||||
create_writing_tools,
|
||||
)
|
||||
from timmy.tools.search import scrape_url, web_search
|
||||
from timmy.tools.system_tools import (
|
||||
calculator,
|
||||
consult_grok,
|
||||
@@ -54,6 +55,16 @@ def _register_web_fetch_tool(toolkit: Toolkit) -> None:
|
||||
raise
|
||||
|
||||
|
||||
def _register_search_tools(toolkit: Toolkit) -> None:
    """Attach the self-hosted search tools to ``toolkit``.

    ``web_search`` (SearXNG) is registered first, then ``scrape_url``
    (Crawl4AI). If anything goes wrong the failure is logged at ERROR
    level and the exception is re-raised to the caller.
    """
    try:
        # Order matters only for parity with the documented tool listing.
        search_tools = (
            ("web_search", web_search),
            ("scrape_url", scrape_url),
        )
        for tool_name, tool_fn in search_tools:
            toolkit.register(tool_fn, name=tool_name)
    except Exception as exc:
        logger.error("Failed to register search tools: %s", exc)
        raise
|
||||
|
||||
|
||||
def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None:
|
||||
"""Register core execution and file tools."""
|
||||
# Python execution
|
||||
@@ -261,6 +272,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
|
||||
|
||||
_register_core_tools(toolkit, base_path)
|
||||
_register_web_fetch_tool(toolkit)
|
||||
_register_search_tools(toolkit)
|
||||
_register_grok_tool(toolkit)
|
||||
_register_memory_tools(toolkit)
|
||||
_register_agentic_loop_tool(toolkit)
|
||||
@@ -433,6 +445,16 @@ def _analysis_tool_catalog() -> dict:
|
||||
"description": "Fetch a web page and extract clean readable text (trafilatura)",
|
||||
"available_in": ["orchestrator"],
|
||||
},
|
||||
"web_search": {
|
||||
"name": "Web Search",
|
||||
"description": "Search the web via self-hosted SearXNG (no API key required)",
|
||||
"available_in": ["echo", "orchestrator"],
|
||||
},
|
||||
"scrape_url": {
|
||||
"name": "Scrape URL",
|
||||
"description": "Scrape a URL with Crawl4AI and return clean markdown content",
|
||||
"available_in": ["echo", "orchestrator"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ def _make_smart_read_file(file_tools: FileTools) -> Callable:
|
||||
def create_research_tools(base_dir: str | Path | None = None):
|
||||
"""Create tools for the research agent (Echo).
|
||||
|
||||
Includes: file reading
|
||||
Includes: file reading, web search (SearXNG), URL scraping (Crawl4AI)
|
||||
"""
|
||||
if not _AGNO_TOOLS_AVAILABLE:
|
||||
raise ImportError(f"Agno tools not available: {_ImportError}")
|
||||
@@ -73,6 +73,12 @@ def create_research_tools(base_dir: str | Path | None = None):
|
||||
toolkit.register(_make_smart_read_file(file_tools), name="read_file")
|
||||
toolkit.register(file_tools.list_files, name="list_files")
|
||||
|
||||
# Web search + scraping (gracefully no-ops when backend=none or service down)
|
||||
from timmy.tools.search import scrape_url, web_search
|
||||
|
||||
toolkit.register(web_search, name="web_search")
|
||||
toolkit.register(scrape_url, name="scrape_url")
|
||||
|
||||
return toolkit
|
||||
|
||||
|
||||
|
||||
186
src/timmy/tools/search.py
Normal file
186
src/timmy/tools/search.py
Normal file
@@ -0,0 +1,186 @@
|
||||
"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.
|
||||
|
||||
Provides:
|
||||
- web_search(query) — SearXNG meta-search (no API key required)
|
||||
- scrape_url(url) — Crawl4AI full-page scrape to clean markdown
|
||||
|
||||
Both tools degrade gracefully when the backing service is unavailable
|
||||
(logs WARNING, returns descriptive error string — never crashes).
|
||||
|
||||
Services are started via `docker compose --profile search up` or configured
|
||||
with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
|
||||
_CRAWL_MAX_POLLS = 6
|
||||
_CRAWL_POLL_INTERVAL = 5 # seconds
|
||||
_CRAWL_CHAR_BUDGET = 4000 * 4 # ~4000 tokens
|
||||
|
||||
|
||||
def web_search(query: str, num_results: int = 5) -> str:
    """Search the web using the self-hosted SearXNG meta-search engine.

    Issues ``GET {settings.search_url}/search`` with ``format=json`` and
    renders the first ``num_results`` hits as numbered title / URL / snippet
    entries. No paid API key is involved. Requires SearXNG running locally
    (docker compose --profile search up) or TIMMY_SEARCH_URL pointing to a
    reachable instance.

    Args:
        query: The search query.
        num_results: Maximum number of results to return (default 5).
            Values that cannot be read as an integer fall back to 5 and
            negative values are treated as 0.

    Returns:
        Formatted search results string, or an error/status message when
        search is disabled, ``requests`` is missing, the service cannot be
        reached, or the response is not usable JSON.
    """
    if settings.timmy_search_backend == "none":
        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    # Tool arguments may arrive as strings or negative numbers; a negative
    # slice bound would silently drop results from the end instead.
    try:
        limit = max(int(num_results), 0)
    except (TypeError, ValueError):
        limit = 5

    base_url = settings.search_url.rstrip("/")
    params: dict = {
        "q": query,
        "format": "json",
        "categories": "general",
    }

    try:
        resp = _requests.get(
            f"{base_url}/search",
            params=params,
            timeout=10,
            headers={"User-Agent": "TimmyResearchBot/1.0"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"

    try:
        data = resp.json()
    except Exception as exc:
        logger.warning("SearXNG response parse error: %s", exc)
        return "Search error: could not parse SearXNG response."

    # Valid JSON is not necessarily the expected object; treat any other
    # top-level shape as a parse failure rather than raising AttributeError.
    if not isinstance(data, dict):
        logger.warning(
            "SearXNG response parse error: unexpected payload type %s",
            type(data).__name__,
        )
        return "Search error: could not parse SearXNG response."

    raw_results = data.get("results")
    if not isinstance(raw_results, list):
        raw_results = []
    # Skip malformed entries before applying the limit so they do not use
    # up result slots.
    results = [entry for entry in raw_results if isinstance(entry, dict)][:limit]
    if not results:
        return f"No results found for: {query!r}"

    lines = [f"Web search results for: {query!r}\n"]
    for i, r in enumerate(results, 1):
        # Fields may be present but null, so `or` is used instead of a
        # .get() default (which only covers a missing key).
        title = r.get("title") or "Untitled"
        url = r.get("url") or ""
        snippet = str(r.get("content") or "").strip()
        lines.append(f"{i}. {title}\n URL: {url}\n {snippet}\n")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def scrape_url(url: str) -> str:
    """Scrape a URL with Crawl4AI and return the main content as clean markdown.

    Submits the URL to ``{settings.crawl_url}/crawl``. If the response
    already carries ``results`` they are used directly; otherwise the
    returned ``task_id`` is polled at ``/task/{task_id}`` up to
    ``_CRAWL_MAX_POLLS`` times, sleeping ``_CRAWL_POLL_INTERVAL`` seconds
    before each poll. Requires Crawl4AI running locally (docker compose
    --profile search up) or TIMMY_CRAWL_URL pointing to a reachable instance.

    Args:
        url: The URL to scrape (must start with http:// or https://).

    Returns:
        Extracted markdown text (up to ~4000 tokens), or an error message
        when the URL is invalid, scraping is disabled, ``requests`` is
        missing, the service is unreachable, the task fails, or polling
        runs out of attempts.
    """
    # isinstance guard: a truthy non-string argument has no .startswith().
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return f"Error: invalid URL — must start with http:// or https://: {url!r}"

    if settings.timmy_search_backend == "none":
        return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    def _as_result_list(value) -> list:
        """Normalise a results payload (list, single dict, or other) to a list."""
        if isinstance(value, dict):
            return [value]
        return value if isinstance(value, list) else []

    base = settings.crawl_url.rstrip("/")

    # NOTE(review): `url` comes from the calling agent and is forwarded to the
    # crawler unchanged; private/loopback/link-local targets are not filtered
    # here. Consider a deny-list if the crawler can reach internal networks.

    # Submit crawl task
    try:
        resp = _requests.post(
            f"{base}/crawl",
            json={"urls": [url], "priority": 10},
            timeout=15,
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
        return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"

    try:
        submit_data = resp.json()
    except Exception as exc:
        logger.warning("Crawl4AI submit parse error: %s", exc)
        return "Scrape error: could not parse Crawl4AI response."

    # Valid JSON that is not an object would otherwise fail on .get() below.
    if not isinstance(submit_data, dict):
        logger.warning(
            "Crawl4AI submit parse error: unexpected payload type %s",
            type(submit_data).__name__,
        )
        return "Scrape error: could not parse Crawl4AI response."

    # Check if result came back synchronously
    if "results" in submit_data:
        return _extract_crawl_content(_as_result_list(submit_data["results"]), url)

    task_id = submit_data.get("task_id")
    if not task_id:
        return f"Scrape error: Crawl4AI returned no task_id for {url}"

    # Poll for async result
    for _ in range(_CRAWL_MAX_POLLS):
        time.sleep(_CRAWL_POLL_INTERVAL)
        try:
            poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
            poll.raise_for_status()
            task_data = poll.json()
        except Exception as exc:
            # A single failed poll is not fatal; try again on the next tick.
            logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
            continue

        if not isinstance(task_data, dict):
            logger.warning(
                "Crawl4AI poll error (task=%s): unexpected payload type %s",
                task_id,
                type(task_data).__name__,
            )
            continue

        status = task_data.get("status", "")
        if status == "completed":
            results = task_data.get("results") or task_data.get("result")
            return _extract_crawl_content(_as_result_list(results), url)
        if status == "failed":
            return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"

    return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"
|
||||
|
||||
|
||||
def _extract_crawl_content(results: list, url: str) -> str:
|
||||
"""Extract and truncate markdown content from Crawl4AI results list."""
|
||||
if not results:
|
||||
return f"No content returned by Crawl4AI for: {url}"
|
||||
|
||||
result = results[0]
|
||||
content = (
|
||||
result.get("markdown")
|
||||
or result.get("markdown_v2", {}).get("raw_markdown")
|
||||
or result.get("extracted_content")
|
||||
or result.get("content")
|
||||
or ""
|
||||
)
|
||||
if not content:
|
||||
return f"No readable content extracted from: {url}"
|
||||
|
||||
if len(content) > _CRAWL_CHAR_BUDGET:
|
||||
content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
|
||||
|
||||
return content
|
||||
308
tests/timmy/test_tools_search.py
Normal file
308
tests/timmy/test_tools_search.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""Unit tests for web_search and scrape_url tools (SearXNG + Crawl4AI).
|
||||
|
||||
All tests use mocked HTTP — no live services required.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from timmy.tools.search import _extract_crawl_content, scrape_url, web_search
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _mock_requests(json_response=None, status_code=200, raise_exc=None):
|
||||
"""Build a mock requests module whose .get/.post return controlled responses."""
|
||||
mock_req = MagicMock()
|
||||
|
||||
# Exception hierarchy
|
||||
class Timeout(Exception):
|
||||
pass
|
||||
|
||||
class HTTPError(Exception):
|
||||
def __init__(self, *a, response=None, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.response = response
|
||||
|
||||
class RequestException(Exception):
|
||||
pass
|
||||
|
||||
exc_mod = MagicMock()
|
||||
exc_mod.Timeout = Timeout
|
||||
exc_mod.HTTPError = HTTPError
|
||||
exc_mod.RequestException = RequestException
|
||||
mock_req.exceptions = exc_mod
|
||||
|
||||
if raise_exc is not None:
|
||||
mock_req.get.side_effect = raise_exc
|
||||
mock_req.post.side_effect = raise_exc
|
||||
else:
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = status_code
|
||||
mock_resp.json.return_value = json_response or {}
|
||||
if status_code >= 400:
|
||||
mock_resp.raise_for_status.side_effect = HTTPError(
|
||||
response=MagicMock(status_code=status_code)
|
||||
)
|
||||
mock_req.get.return_value = mock_resp
|
||||
mock_req.post.return_value = mock_resp
|
||||
|
||||
return mock_req
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# web_search tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestWebSearch:
    """Tests for ``web_search`` against a mocked SearXNG backend."""

    @staticmethod
    def _search(mock_req, query, **kwargs):
        """Call web_search with ``requests`` swapped for ``mock_req``.

        ``mock_req=None`` makes ``import requests`` fail, simulating a
        missing package. Settings are patched to an enabled SearXNG backend.
        """
        with patch.dict("sys.modules", {"requests": mock_req}):
            with patch("timmy.tools.search.settings") as mock_settings:
                mock_settings.timmy_search_backend = "searxng"
                mock_settings.search_url = "http://localhost:8888"
                return web_search(query, **kwargs)

    def test_backend_none_short_circuits(self):
        """TIMMY_SEARCH_BACKEND=none returns disabled message immediately."""
        with patch("timmy.tools.search.settings") as mock_settings:
            mock_settings.timmy_search_backend = "none"
            result = web_search("anything")
        assert "disabled" in result

    def test_missing_requests_package(self):
        """Graceful error when requests is not installed."""
        result = self._search(None, "test query")
        assert "requests" in result and "not installed" in result

    def test_successful_search(self):
        """Happy path: returns formatted result list."""
        mock_data = {
            "results": [
                {"title": "Foo Bar", "url": "https://example.com/foo", "content": "Foo is great"},
                {"title": "Baz", "url": "https://example.com/baz", "content": "Baz rules"},
            ]
        }
        result = self._search(_mock_requests(json_response=mock_data), "foo bar")

        assert "Foo Bar" in result
        assert "https://example.com/foo" in result
        assert "Baz" in result
        assert "foo bar" in result

    def test_no_results(self):
        """Empty results list returns a helpful no-results message."""
        result = self._search(_mock_requests(json_response={"results": []}), "xyzzy")
        assert "No results" in result

    def test_num_results_respected(self):
        """Only up to num_results entries are returned."""
        mock_data = {
            "results": [
                {"title": f"Result {i}", "url": f"https://example.com/{i}", "content": "x"}
                for i in range(10)
            ]
        }
        result = self._search(_mock_requests(json_response=mock_data), "test", num_results=3)

        # Count rendered entries directly rather than matching "N." substrings,
        # which could also occur inside titles, URLs or snippets.
        assert result.count("URL:") == 3
        assert "Result 2" in result
        assert "Result 3" not in result

    def test_service_unavailable(self):
        """Connection error degrades gracefully."""
        mock_req = _mock_requests(raise_exc=OSError("connection refused"))
        result = self._search(mock_req, "test")
        assert "not reachable" in result or "unavailable" in result

    def test_http_error_degrades_gracefully(self):
        """A 5xx from SearXNG is reported as a message, not raised."""
        result = self._search(_mock_requests(status_code=500), "test")
        assert "not reachable" in result or "unavailable" in result

    def test_unparseable_response(self):
        """A body that is not valid JSON yields a parse-error message."""
        mock_req = _mock_requests()
        mock_req.get.return_value.json.side_effect = ValueError("bad json")
        result = self._search(mock_req, "test")
        assert "could not parse" in result

    def test_catalog_entry_exists(self):
        """web_search must appear in the tool catalog."""
        from timmy.tools import get_all_available_tools

        catalog = get_all_available_tools()
        assert "web_search" in catalog
        assert "orchestrator" in catalog["web_search"]["available_in"]
        assert "echo" in catalog["web_search"]["available_in"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# scrape_url tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestScrapeUrl:
    """Tests for ``scrape_url`` against a mocked Crawl4AI backend."""

    @staticmethod
    def _scrape(mock_req, url="https://example.com"):
        """Call scrape_url with ``requests`` swapped for ``mock_req``.

        ``mock_req=None`` makes ``import requests`` fail. ``time.sleep`` in
        the module under test is stubbed so polling tests run instantly.
        """
        with patch.dict("sys.modules", {"requests": mock_req}):
            with patch("timmy.tools.search.settings") as mock_settings:
                mock_settings.timmy_search_backend = "searxng"
                mock_settings.crawl_url = "http://localhost:11235"
                with patch("timmy.tools.search.time") as mock_time:
                    mock_time.sleep = MagicMock()
                    return scrape_url(url)

    @staticmethod
    def _async_requests(poll_payload):
        """Build a mock ``requests`` whose POST yields a task_id and GET yields ``poll_payload``."""
        submit_response = MagicMock()
        submit_response.json.return_value = {"task_id": "abc123"}
        submit_response.raise_for_status.return_value = None

        poll_response = MagicMock()
        poll_response.json.return_value = poll_payload
        poll_response.raise_for_status.return_value = None

        mock_req = MagicMock()
        mock_req.post.return_value = submit_response
        mock_req.get.return_value = poll_response
        mock_req.exceptions = MagicMock()
        return mock_req

    def test_invalid_url_no_scheme(self):
        """URLs without http(s) scheme are rejected before any HTTP call."""
        result = scrape_url("example.com/page")
        assert "Error: invalid URL" in result

    def test_invalid_url_empty(self):
        """An empty URL is rejected."""
        result = scrape_url("")
        assert "Error: invalid URL" in result

    def test_backend_none_short_circuits(self):
        """TIMMY_SEARCH_BACKEND=none returns disabled message immediately."""
        with patch("timmy.tools.search.settings") as mock_settings:
            mock_settings.timmy_search_backend = "none"
            result = scrape_url("https://example.com")
        assert "disabled" in result

    def test_missing_requests_package(self):
        """Graceful error when requests is not installed."""
        result = self._scrape(None)
        assert "requests" in result and "not installed" in result

    def test_sync_result_returned_immediately(self):
        """If Crawl4AI returns results in the POST response, use them directly."""
        mock_data = {"results": [{"markdown": "# Hello\n\nThis is the page content."}]}
        mock_req = _mock_requests(json_response=mock_data)
        result = self._scrape(mock_req)

        assert "Hello" in result
        assert "page content" in result
        # The synchronous path must not fall through to polling.
        mock_req.get.assert_not_called()

    def test_async_poll_completed(self):
        """Async task_id flow: polls until completed and returns content."""
        mock_req = self._async_requests(
            {"status": "completed", "results": [{"markdown": "# Async content"}]}
        )
        result = self._scrape(mock_req)
        assert "Async content" in result

    def test_async_poll_failed_task(self):
        """Crawl4AI task failure is reported clearly."""
        mock_req = self._async_requests({"status": "failed", "error": "site blocked"})
        result = self._scrape(mock_req)
        assert "failed" in result and "site blocked" in result

    def test_async_poll_timeout(self):
        """A task that never completes ends with a timeout message."""
        mock_req = self._async_requests({"status": "pending"})
        result = self._scrape(mock_req)
        assert "timed out" in result
        assert mock_req.get.call_count >= 1

    def test_service_unavailable(self):
        """Connection error degrades gracefully."""
        mock_req = _mock_requests(raise_exc=OSError("connection refused"))
        result = self._scrape(mock_req)
        assert "not reachable" in result or "unavailable" in result

    def test_content_truncation(self):
        """Content longer than ~4000 tokens is truncated."""
        long_content = "x" * 20000
        mock_req = _mock_requests(json_response={"results": [{"markdown": long_content}]})
        result = self._scrape(mock_req)

        assert "[…truncated" in result
        assert len(result) < 17000

    def test_catalog_entry_exists(self):
        """scrape_url must appear in the tool catalog for both toolkits."""
        from timmy.tools import get_all_available_tools

        catalog = get_all_available_tools()
        assert "scrape_url" in catalog
        assert "orchestrator" in catalog["scrape_url"]["available_in"]
        assert "echo" in catalog["scrape_url"]["available_in"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _extract_crawl_content helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExtractCrawlContent:
    """Tests for the ``_extract_crawl_content`` helper."""

    URL = "https://example.com"

    def test_empty_results(self):
        """An empty results list yields the no-content message."""
        result = _extract_crawl_content([], self.URL)
        assert "No content" in result

    def test_markdown_field_preferred(self):
        """``markdown`` wins over the plain ``content`` fallback."""
        results = [{"markdown": "# Title", "content": "fallback"}]
        result = _extract_crawl_content(results, self.URL)
        assert "Title" in result
        assert "fallback" not in result

    def test_markdown_v2_raw_markdown_used(self):
        """``markdown_v2.raw_markdown`` is used when ``markdown`` is absent."""
        results = [{"markdown_v2": {"raw_markdown": "# From v2"}, "content": "fallback"}]
        result = _extract_crawl_content(results, self.URL)
        assert "From v2" in result
        assert "fallback" not in result

    def test_fallback_to_content_field(self):
        """``content`` is used when no markdown field is present."""
        results = [{"content": "plain text content"}]
        result = _extract_crawl_content(results, self.URL)
        assert "plain text content" in result

    def test_no_content_fields(self):
        """A result with no recognised content field yields a clear message."""
        results = [{"url": self.URL}]
        result = _extract_crawl_content(results, self.URL)
        assert "No readable content" in result

    def test_only_first_result_is_used(self):
        """Later entries in the results list are ignored."""
        results = [{"markdown": "first page"}, {"markdown": "second page"}]
        result = _extract_crawl_content(results, self.URL)
        assert "first page" in result
        assert "second page" not in result

    def test_long_content_is_truncated(self):
        """Content over the character budget is cut and marked as truncated."""
        result = _extract_crawl_content([{"markdown": "x" * 20000}], self.URL)
        assert "[…truncated" in result
        assert len(result) < 17000
|
||||
Reference in New Issue
Block a user