[claude] feat: SearXNG + Crawl4AI self-hosted search backend (#1282) (#1299)

2026-03-24 01:52:51 +00:00
parent f1f67e62a7
commit a7ccfbddc9
9 changed files with 694 additions and 1 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -247,6 +247,48 @@ make docker-agent       # add a worker

 ---

+## Search Capability (SearXNG + Crawl4AI)
+
+Timmy has a self-hosted search backend requiring **no paid API key**.
+
+### Tools
+
+| Tool | Module | Description |
+|------|--------|-------------|
+| `web_search(query, num_results=5)` | `timmy/tools/search.py` | Meta-search via SearXNG — returns up to `num_results` ranked results (title, URL, snippet) |
+| `scrape_url(url)` | `timmy/tools/search.py` | Full-page scrape via Crawl4AI → clean markdown |
+
+Both tools are registered in the **orchestrator** (full) and **echo** (research) toolkits.
+
+### Configuration
+
+| Env Var | Default | Description |
+|---------|---------|-------------|
+| `TIMMY_SEARCH_BACKEND` | `searxng` | `searxng` or `none` (disable) |
+| `TIMMY_SEARCH_URL` | `http://localhost:8888` | SearXNG base URL |
+| `TIMMY_CRAWL_URL` | `http://localhost:11235` | Crawl4AI base URL |
+
+Inside Docker Compose (when `--profile search` is active), the dashboard
+uses `http://searxng:8080` and `http://crawl4ai:11235` by default.
+
+### Starting the services
+
+```bash
+# Start SearXNG + Crawl4AI alongside the dashboard:
+docker compose --profile search up
+
+# Or start only the search services:
+docker compose --profile search up searxng crawl4ai
+```
+
+### Graceful degradation
+
+- If `TIMMY_SEARCH_BACKEND=none`: tools return a "disabled" message.
+- If SearXNG or Crawl4AI is unreachable: tools log a WARNING and return an
+  error string — the app never crashes.
+
+---
+
 ## Roadmap

 **v2.0 Exodus (in progress):** Voice + Marketplace + Integrations
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -42,6 +42,10 @@ services:
      GROK_ENABLED: "${GROK_ENABLED:-false}"
      XAI_API_KEY: "${XAI_API_KEY:-}"
      GROK_DEFAULT_MODEL: "${GROK_DEFAULT_MODEL:-grok-3-fast}"
+      # Search backend (SearXNG + Crawl4AI) — set TIMMY_SEARCH_BACKEND=none to disable
+      TIMMY_SEARCH_BACKEND: "${TIMMY_SEARCH_BACKEND:-searxng}"
+      TIMMY_SEARCH_URL: "${TIMMY_SEARCH_URL:-http://searxng:8080}"
+      TIMMY_CRAWL_URL: "${TIMMY_CRAWL_URL:-http://crawl4ai:11235}"
    extra_hosts:
      - "host.docker.internal:host-gateway"  # Linux: maps to host IP
    networks:
@@ -74,6 +78,50 @@ services:
    profiles:
      - celery

+  # ── SearXNG — self-hosted meta-search engine ─────────────────────────
+  searxng:
+    image: searxng/searxng:latest
+    container_name: timmy-searxng
+    profiles:
+      - search
+    ports:
+      - "${SEARXNG_PORT:-8888}:8080"
+    environment:
+      SEARXNG_BASE_URL: "${SEARXNG_BASE_URL:-http://localhost:8888}"
+    volumes:
+      - ./docker/searxng:/etc/searxng:rw
+    networks:
+      - timmy-net
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "-qO-", "http://localhost:8080/healthz"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 20s
+
+  # ── Crawl4AI — self-hosted web scraper ────────────────────────────────
+  # Only started with `--profile search`. The dashboard reaches it on the
+  # compose network as http://crawl4ai:11235 (see TIMMY_CRAWL_URL above).
+  crawl4ai:
+    image: unclecode/crawl4ai:latest
+    container_name: timmy-crawl4ai
+    profiles:
+      - search
+    ports:
+      - "${CRAWL4AI_PORT:-11235}:11235"
+    environment:
+      # NOTE(review): scrape_url() in timmy/tools/search.py sends no
+      # Authorization header, so a non-empty token here presumably makes every
+      # scrape request fail authentication — confirm before setting it.
+      CRAWL4AI_API_TOKEN: "${CRAWL4AI_API_TOKEN:-}"
+    volumes:
+      # NOTE(review): presumably the same named volume the dashboard uses for
+      # its own data — confirm a third-party container should have write access.
+      - timmy-data:/app/data
+    networks:
+      - timmy-net
+    restart: unless-stopped
+    healthcheck:
+      # Assumes `curl` is available inside the image — TODO confirm.
+      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
+
  # ── OpenFang — vendored agent runtime sidecar ────────────────────────────
  openfang:
    build:
--- a/docker/searxng/settings.yml
+++ b/docker/searxng/settings.yml
@@ -0,0 +1,67 @@
+# SearXNG configuration for Timmy Time self-hosted search
+# https://docs.searxng.org/admin/settings/settings.html
+
+general:
+  debug: false
+  instance_name: "Timmy Search"
+  privacypolicy_url: false
+  donation_url: false
+  contact_url: false
+  enable_metrics: false
+
+server:
+  # Must match the container port published by docker-compose ("…:8080") and
+  # the port used by the compose healthcheck.
+  port: 8080
+  bind_address: "0.0.0.0"
+  # Placeholder secret committed to the repository — replace it with a
+  # per-deployment random value before exposing this instance.
+  secret_key: "timmy-searxng-key-change-in-production"
+  base_url: false
+  image_proxy: false
+
+ui:
+  static_use_hash: false
+  default_locale: ""
+  query_in_title: false
+  infinite_scroll: false
+  default_theme: simple
+  center_alignment: false
+
+search:
+  safe_search: 0
+  autocomplete: ""
+  default_lang: "en"
+  # Output formats the instance will serve.
+  formats:
+    - html
+    # Required: web_search() in timmy/tools/search.py calls /search with
+    # format=json, so removing this entry breaks the tool.
+    - json
+
+outgoing:
+  request_timeout: 6.0
+  max_request_timeout: 10.0
+  useragent_suffix: "TimmyResearchBot"
+  pool_connections: 100
+  pool_maxsize: 20
+
+# NOTE(review): SearXNG's reference settings list plugin names with spaces
+# (e.g. 'Hash plugin', 'Tracker URL remover'); the underscore spellings below
+# may not match any plugin and would then have no effect — verify against the
+# SearXNG version pulled by the `latest` image tag.
+enabled_plugins:
+  - Hash_plugin
+  - Search_on_category_select
+  - Tracker_url_remover
+
+engines:
+  - name: google
+    engine: google
+    shortcut: g
+    categories: general
+
+  - name: bing
+    engine: bing
+    shortcut: b
+    categories: general
+
+  - name: duckduckgo
+    engine: duckduckgo
+    shortcut: d
+    categories: general
+
+  - name: wikipedia
+    engine: wikipedia
+    shortcut: wp
+    categories: general
+    timeout: 3.0
--- a/src/config.py
+++ b/src/config.py
@@ -109,6 +109,16 @@ class Settings(BaseSettings):
    grok_sats_hard_cap: int = 100  # Absolute ceiling on sats per Grok query
    grok_free: bool = False  # Skip Lightning invoice when user has own API key

+    # ── Search Backend (SearXNG + Crawl4AI) ──────────────────────────────
+    # "searxng" — self-hosted SearXNG meta-search engine (default, no API key)
+    # "none"    — disable web search (private/offline deployments)
+    # Override with TIMMY_SEARCH_BACKEND env var.
+    timmy_search_backend: Literal["searxng", "none"] = "searxng"
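+    # SearXNG base URL — documented override is the TIMMY_SEARCH_URL env var.
+    # NOTE(review): the field is named `search_url`, not `timmy_search_url`; unless
+    # an alias or env prefix is configured outside this hunk, the value would be
+    # read from SEARCH_URL and TIMMY_SEARCH_URL would be ignored — confirm.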
+    search_url: str = "http://localhost:8888"
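+    # Crawl4AI base URL — documented override is the TIMMY_CRAWL_URL env var.
+    # NOTE(review): the field is named `crawl_url`, not `timmy_crawl_url`; unless
+    # an alias or env prefix is configured outside this hunk, the value would be
+    # read from CRAWL_URL and TIMMY_CRAWL_URL would be ignored — confirm.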
+    crawl_url: str = "http://localhost:11235"
+
    # ── Database ──────────────────────────────────────────────────────────
    db_busy_timeout_ms: int = 5000  # SQLite PRAGMA busy_timeout (ms)

--- a/src/timmy/tools/init.py
+++ b/src/timmy/tools/init.py
@@ -46,6 +46,7 @@ from timmy.tools.file_tools import (
    create_research_tools,
    create_writing_tools,
 )
+from timmy.tools.search import scrape_url, web_search
 from timmy.tools.system_tools import (
    _safe_eval,
    calculator,
@@ -72,6 +73,9 @@ __all__ = [
    "create_data_tools",
    "create_research_tools",
    "create_writing_tools",
+    # search
+    "scrape_url",
+    "web_search",
    # system_tools
    "_safe_eval",
    "calculator",
--- a/src/timmy/tools/_registry.py
+++ b/src/timmy/tools/_registry.py
@@ -28,6 +28,7 @@ from timmy.tools.file_tools import (
    create_research_tools,
    create_writing_tools,
 )
+from timmy.tools.search import scrape_url, web_search
 from timmy.tools.system_tools import (
    calculator,
    consult_grok,
@@ -54,6 +55,16 @@ def _register_web_fetch_tool(toolkit: Toolkit) -> None:
        raise


+def _register_search_tools(toolkit: Toolkit) -> None:
+    """Register SearXNG web_search and Crawl4AI scrape_url tools."""
+    try:
+        toolkit.register(web_search, name="web_search")
+        toolkit.register(scrape_url, name="scrape_url")
+    except Exception as exc:
+        logger.error("Failed to register search tools: %s", exc)
+        raise
+
+
 def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None:
    """Register core execution and file tools."""
    # Python execution
@@ -261,6 +272,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):

    _register_core_tools(toolkit, base_path)
    _register_web_fetch_tool(toolkit)
+    _register_search_tools(toolkit)
    _register_grok_tool(toolkit)
    _register_memory_tools(toolkit)
    _register_agentic_loop_tool(toolkit)
@@ -433,6 +445,16 @@ def _analysis_tool_catalog() -> dict:
            "description": "Fetch a web page and extract clean readable text (trafilatura)",
            "available_in": ["orchestrator"],
        },
+        "web_search": {
+            "name": "Web Search",
+            "description": "Search the web via self-hosted SearXNG (no API key required)",
+            "available_in": ["echo", "orchestrator"],
+        },
+        "scrape_url": {
+            "name": "Scrape URL",
+            "description": "Scrape a URL with Crawl4AI and return clean markdown content",
+            "available_in": ["echo", "orchestrator"],
+        },
    }


--- a/src/timmy/tools/file_tools.py
+++ b/src/timmy/tools/file_tools.py
@@ -59,7 +59,7 @@ def _make_smart_read_file(file_tools: FileTools) -> Callable:
 def create_research_tools(base_dir: str | Path | None = None):
    """Create tools for the research agent (Echo).

-    Includes: file reading
+    Includes: file reading, web search (SearXNG), URL scraping (Crawl4AI)
    """
    if not _AGNO_TOOLS_AVAILABLE:
        raise ImportError(f"Agno tools not available: {_ImportError}")
@@ -73,6 +73,12 @@ def create_research_tools(base_dir: str | Path | None = None):
    toolkit.register(_make_smart_read_file(file_tools), name="read_file")
    toolkit.register(file_tools.list_files, name="list_files")

+    # Web search + scraping (gracefully no-ops when backend=none or service down)
+    from timmy.tools.search import scrape_url, web_search
+
+    toolkit.register(web_search, name="web_search")
+    toolkit.register(scrape_url, name="scrape_url")
+
    return toolkit


--- a/src/timmy/tools/search.py
+++ b/src/timmy/tools/search.py
@@ -0,0 +1,186 @@
+"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.
+
+Provides:
+- web_search(query) — SearXNG meta-search (no API key required)
+- scrape_url(url)   — Crawl4AI full-page scrape to clean markdown
+
+Both tools degrade gracefully when the backing service is unavailable
+(logs WARNING, returns descriptive error string — never crashes).
+
+Services are started via `docker compose --profile search up` or configured
+with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
+# (6 × 5 s = 30 s with the values below) before scrape_url reports a timeout.
+_CRAWL_MAX_POLLS = 6  # number of status polls per async crawl task
+_CRAWL_POLL_INTERVAL = 5  # seconds slept before each poll
+# Maximum characters of scraped markdown returned to the caller
+# (~4000 tokens, budgeted at roughly 4 characters per token).
+_CRAWL_CHAR_BUDGET = 4000 * 4  # ~4000 tokens
+
+
+def web_search(query: str, num_results: int = 5) -> str:
+    """Search the web using the self-hosted SearXNG meta-search engine.
+
+    Returns ranked results (title + URL + snippet) without requiring any
+    paid API key.  Requires SearXNG running locally (docker compose
+    --profile search up) or TIMMY_SEARCH_URL pointing to a reachable instance.
+
+    Args:
+        query: The search query.
+        num_results: Maximum number of results to return (default 5).
+
+    Returns:
+        Formatted search results string, or an error/status message on failure.
+    """
+    if settings.timmy_search_backend == "none":
+        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."
+
+    try:
+        import requests as _requests
+    except ImportError:
+        return "Error: 'requests' package is not installed."
+
+    base_url = settings.search_url.rstrip("/")
+    params: dict = {
+        "q": query,
+        "format": "json",
+        "categories": "general",
+    }
+
+    try:
+        resp = _requests.get(
+            f"{base_url}/search",
+            params=params,
+            timeout=10,
+            headers={"User-Agent": "TimmyResearchBot/1.0"},
+        )
+        resp.raise_for_status()
+    except Exception as exc:
+        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
+        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"
+
+    try:
+        data = resp.json()
+    except Exception as exc:
+        logger.warning("SearXNG response parse error: %s", exc)
+        return "Search error: could not parse SearXNG response."
+
+    results = data.get("results", [])[:num_results]
+    if not results:
+        return f"No results found for: {query!r}"
+
+    lines = [f"Web search results for: {query!r}\n"]
+    for i, r in enumerate(results, 1):
+        title = r.get("title", "Untitled")
+        url = r.get("url", "")
+        snippet = r.get("content", "").strip()
+        lines.append(f"{i}. {title}\n   URL: {url}\n   {snippet}\n")
+
+    return "\n".join(lines)
+
+
+def scrape_url(url: str) -> str:
+    """Scrape a URL with Crawl4AI and return the main content as clean markdown.
+
+    Crawl4AI extracts well-structured markdown from any public page —
+    articles, docs, product pages — suitable for LLM consumption.
+    Requires Crawl4AI running locally (docker compose --profile search up)
+    or TIMMY_CRAWL_URL pointing to a reachable instance.
+
+    Args:
+        url: The URL to scrape (must start with http:// or https://).
+
+    Returns:
+        Extracted markdown text (up to ~4000 tokens), or an error message.
+    """
+    if not url or not url.startswith(("http://", "https://")):
+        return f"Error: invalid URL — must start with http:// or https://: {url!r}"
+
+    if settings.timmy_search_backend == "none":
+        return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."
+
+    try:
+        import requests as _requests
+    except ImportError:
+        return "Error: 'requests' package is not installed."
+
+    base = settings.crawl_url.rstrip("/")
+
+    # Submit crawl task
+    try:
+        resp = _requests.post(
+            f"{base}/crawl",
+            json={"urls": [url], "priority": 10},
+            timeout=15,
+            headers={"Content-Type": "application/json"},
+        )
+        resp.raise_for_status()
+    except Exception as exc:
+        logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
+        return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"
+
+    try:
+        submit_data = resp.json()
+    except Exception as exc:
+        logger.warning("Crawl4AI submit parse error: %s", exc)
+        return "Scrape error: could not parse Crawl4AI response."
+
+    # Check if result came back synchronously
+    if "results" in submit_data:
+        return _extract_crawl_content(submit_data["results"], url)
+
+    task_id = submit_data.get("task_id")
+    if not task_id:
+        return f"Scrape error: Crawl4AI returned no task_id for {url}"
+
+    # Poll for async result
+    for _ in range(_CRAWL_MAX_POLLS):
+        time.sleep(_CRAWL_POLL_INTERVAL)
+        try:
+            poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
+            poll.raise_for_status()
+            task_data = poll.json()
+        except Exception as exc:
+            logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
+            continue
+
+        status = task_data.get("status", "")
+        if status == "completed":
+            results = task_data.get("results") or task_data.get("result")
+            if isinstance(results, dict):
+                results = [results]
+            return _extract_crawl_content(results or [], url)
+        if status == "failed":
+            return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"
+
+    return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"
+
+
+def _extract_crawl_content(results: list, url: str) -> str:
+    """Extract and truncate markdown content from Crawl4AI results list."""
+    if not results:
+        return f"No content returned by Crawl4AI for: {url}"
+
+    result = results[0]
+    content = (
+        result.get("markdown")
+        or result.get("markdown_v2", {}).get("raw_markdown")
+        or result.get("extracted_content")
+        or result.get("content")
+        or ""
+    )
+    if not content:
+        return f"No readable content extracted from: {url}"
+
+    if len(content) > _CRAWL_CHAR_BUDGET:
+        content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
+
+    return content
--- a/tests/timmy/test_tools_search.py
+++ b/tests/timmy/test_tools_search.py
@@ -0,0 +1,308 @@
+"""Unit tests for web_search and scrape_url tools (SearXNG + Crawl4AI).
+
+All tests use mocked HTTP — no live services required.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from timmy.tools.search import _extract_crawl_content, scrape_url, web_search
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _mock_requests(json_response=None, status_code=200, raise_exc=None):
+    """Build a mock requests module whose .get/.post return controlled responses."""
+    # json_response: payload returned by response.json(); falsy values become {}.
+    # status_code:   response status; values >= 400 make raise_for_status() raise.
+    # raise_exc:     if given, both .get and .post raise it instead of returning.
+    mock_req = MagicMock()
+
+    # Exception hierarchy
+    # Local stand-ins exposed as mock_req.exceptions.* so the mock resembles
+    # the real module's exception namespace.
+    class Timeout(Exception):
+        pass
+
+    class HTTPError(Exception):
+        def __init__(self, *a, response=None, **kw):
+            super().__init__(*a, **kw)
+            self.response = response
+
+    class RequestException(Exception):
+        pass
+
+    exc_mod = MagicMock()
+    exc_mod.Timeout = Timeout
+    exc_mod.HTTPError = HTTPError
+    exc_mod.RequestException = RequestException
+    mock_req.exceptions = exc_mod
+
+    if raise_exc is not None:
+        # Transport-level failure: the call itself raises.
+        mock_req.get.side_effect = raise_exc
+        mock_req.post.side_effect = raise_exc
+    else:
+        # One shared response object is returned by both .get and .post.
+        mock_resp = MagicMock()
+        mock_resp.status_code = status_code
+        mock_resp.json.return_value = json_response or {}
+        if status_code >= 400:
+            mock_resp.raise_for_status.side_effect = HTTPError(
+                response=MagicMock(status_code=status_code)
+            )
+        mock_req.get.return_value = mock_resp
+        mock_req.post.return_value = mock_resp
+
+    return mock_req
+
+
+# ---------------------------------------------------------------------------
+# web_search tests
+# ---------------------------------------------------------------------------
+
+
+class TestWebSearch:
+    # web_search imports `requests` inside the function body, so each HTTP test
+    # swaps sys.modules["requests"] for a mock and patches the module-level
+    # `settings` object in timmy.tools.search.
+    def test_backend_none_short_circuits(self):
+        """TIMMY_SEARCH_BACKEND=none returns disabled message immediately."""
+        with patch("timmy.tools.search.settings") as mock_settings:
+            mock_settings.timmy_search_backend = "none"
+            result = web_search("anything")
+        assert "disabled" in result
+
+    def test_missing_requests_package(self):
+        """Graceful error when requests is not installed."""
+        # A None entry in sys.modules makes `import requests` raise ImportError.
+        with patch.dict("sys.modules", {"requests": None}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.search_url = "http://localhost:8888"
+                result = web_search("test query")
+        assert "requests" in result and "not installed" in result
+
+    def test_successful_search(self):
+        """Happy path: returns formatted result list."""
+        mock_data = {
+            "results": [
+                {"title": "Foo Bar", "url": "https://example.com/foo", "content": "Foo is great"},
+                {"title": "Baz", "url": "https://example.com/baz", "content": "Baz rules"},
+            ]
+        }
+        mock_req = _mock_requests(json_response=mock_data)
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.search_url = "http://localhost:8888"
+                result = web_search("foo bar")
+
+        assert "Foo Bar" in result
+        assert "https://example.com/foo" in result
+        assert "Baz" in result
+        # The query itself is echoed in the header line of the output.
+        assert "foo bar" in result
+
+    def test_no_results(self):
+        """Empty results list returns a helpful no-results message."""
+        mock_req = _mock_requests(json_response={"results": []})
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.search_url = "http://localhost:8888"
+                result = web_search("xyzzy")
+        assert "No results" in result
+
+    def test_num_results_respected(self):
+        """Only up to num_results entries are returned."""
+        mock_data = {
+            "results": [
+                {"title": f"Result {i}", "url": f"https://example.com/{i}", "content": "x"}
+                for i in range(10)
+            ]
+        }
+        mock_req = _mock_requests(json_response=mock_data)
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.search_url = "http://localhost:8888"
+                result = web_search("test", num_results=3)
+
+        # Only 3 numbered entries should appear
+        # (output numbering starts at 1, so "4." would indicate a fourth entry).
+        assert "1." in result
+        assert "3." in result
+        assert "4." not in result
+
+    def test_service_unavailable(self):
+        """Connection error degrades gracefully."""
+        # OSError stands in for a connection failure raised by requests.get.
+        mock_req = MagicMock()
+        mock_req.get.side_effect = OSError("connection refused")
+        mock_req.exceptions = MagicMock()
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.search_url = "http://localhost:8888"
+                result = web_search("test")
+        assert "not reachable" in result or "unavailable" in result
+
+    def test_catalog_entry_exists(self):
+        """web_search must appear in the tool catalog."""
+        # Imported here rather than at module top.
+        from timmy.tools import get_all_available_tools
+
+        catalog = get_all_available_tools()
+        assert "web_search" in catalog
+        assert "orchestrator" in catalog["web_search"]["available_in"]
+        assert "echo" in catalog["web_search"]["available_in"]
+
+
+# ---------------------------------------------------------------------------
+# scrape_url tests
+# ---------------------------------------------------------------------------
+
+
+class TestScrapeUrl:
+    # scrape_url imports `requests` inside the function body, so each HTTP test
+    # swaps sys.modules["requests"] for a mock and patches the module-level
+    # `settings` object in timmy.tools.search.
+    def test_invalid_url_no_scheme(self):
+        """URLs without http(s) scheme are rejected before any HTTP call."""
+        # No patching needed: URL validation runs before settings are read.
+        result = scrape_url("example.com/page")
+        assert "Error: invalid URL" in result
+
+    def test_invalid_url_empty(self):
+        result = scrape_url("")
+        assert "Error: invalid URL" in result
+
+    def test_backend_none_short_circuits(self):
+        with patch("timmy.tools.search.settings") as mock_settings:
+            mock_settings.timmy_search_backend = "none"
+            result = scrape_url("https://example.com")
+        assert "disabled" in result
+
+    def test_missing_requests_package(self):
+        # A None entry in sys.modules makes `import requests` raise ImportError.
+        with patch.dict("sys.modules", {"requests": None}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.crawl_url = "http://localhost:11235"
+                result = scrape_url("https://example.com")
+        assert "requests" in result and "not installed" in result
+
+    def test_sync_result_returned_immediately(self):
+        """If Crawl4AI returns results in the POST response, use them directly."""
+        mock_data = {
+            "results": [{"markdown": "# Hello\n\nThis is the page content."}]
+        }
+        mock_req = _mock_requests(json_response=mock_data)
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.crawl_url = "http://localhost:11235"
+                result = scrape_url("https://example.com")
+
+        assert "Hello" in result
+        assert "page content" in result
+
+    def test_async_poll_completed(self):
+        """Async task_id flow: polls until completed and returns content."""
+        # POST returns only a task id, which sends scrape_url down the polling path.
+        submit_response = MagicMock()
+        submit_response.json.return_value = {"task_id": "abc123"}
+        submit_response.raise_for_status.return_value = None
+
+        poll_response = MagicMock()
+        poll_response.json.return_value = {
+            "status": "completed",
+            "results": [{"markdown": "# Async content"}],
+        }
+        poll_response.raise_for_status.return_value = None
+
+        mock_req = MagicMock()
+        mock_req.post.return_value = submit_response
+        mock_req.get.return_value = poll_response
+        mock_req.exceptions = MagicMock()
+
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.crawl_url = "http://localhost:11235"
+                # Replace the module's `time` so the poll-interval sleep is a no-op.
+                with patch("timmy.tools.search.time") as mock_time:
+                    mock_time.sleep = MagicMock()
+                    result = scrape_url("https://example.com")
+
+        assert "Async content" in result
+
+    def test_async_poll_failed_task(self):
+        """Crawl4AI task failure is reported clearly."""
+        submit_response = MagicMock()
+        submit_response.json.return_value = {"task_id": "abc123"}
+        submit_response.raise_for_status.return_value = None
+
+        poll_response = MagicMock()
+        poll_response.json.return_value = {"status": "failed", "error": "site blocked"}
+        poll_response.raise_for_status.return_value = None
+
+        mock_req = MagicMock()
+        mock_req.post.return_value = submit_response
+        mock_req.get.return_value = poll_response
+        mock_req.exceptions = MagicMock()
+
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.crawl_url = "http://localhost:11235"
+                # Replace the module's `time` so the poll-interval sleep is a no-op.
+                with patch("timmy.tools.search.time") as mock_time:
+                    mock_time.sleep = MagicMock()
+                    result = scrape_url("https://example.com")
+
+        assert "failed" in result and "site blocked" in result
+
+    def test_service_unavailable(self):
+        """Connection error degrades gracefully."""
+        # OSError stands in for a connection failure raised by requests.post.
+        mock_req = MagicMock()
+        mock_req.post.side_effect = OSError("connection refused")
+        mock_req.exceptions = MagicMock()
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.crawl_url = "http://localhost:11235"
+                result = scrape_url("https://example.com")
+        assert "not reachable" in result or "unavailable" in result
+
+    def test_content_truncation(self):
+        """Content longer than ~4000 tokens is truncated."""
+        # 20000 chars exceeds the 16000-char budget (4000 * 4) in search.py.
+        long_content = "x" * 20000
+        mock_data = {"results": [{"markdown": long_content}]}
+        mock_req = _mock_requests(json_response=mock_data)
+        with patch.dict("sys.modules", {"requests": mock_req}):
+            with patch("timmy.tools.search.settings") as mock_settings:
+                mock_settings.timmy_search_backend = "searxng"
+                mock_settings.crawl_url = "http://localhost:11235"
+                result = scrape_url("https://example.com")
+
+        assert "[…truncated" in result
+        # Budget plus the short truncation suffix stays well under 17000 chars.
+        assert len(result) < 17000
+
+    def test_catalog_entry_exists(self):
+        """scrape_url must appear in the tool catalog."""
+        from timmy.tools import get_all_available_tools
+
+        catalog = get_all_available_tools()
+        assert "scrape_url" in catalog
+        # NOTE(review): unlike the web_search catalog test, "echo" is not asserted here.
+        assert "orchestrator" in catalog["scrape_url"]["available_in"]
+
+
+# ---------------------------------------------------------------------------
+# _extract_crawl_content helper
+# ---------------------------------------------------------------------------
+
+
+class TestExtractCrawlContent:
+    # Exercises the helper directly; no HTTP or settings patching is involved.
+    def test_empty_results(self):
+        result = _extract_crawl_content([], "https://example.com")
+        assert "No content" in result
+
+    def test_markdown_field_preferred(self):
+        # Both fields are present; the "markdown" value must win over "content".
+        results = [{"markdown": "# Title", "content": "fallback"}]
+        result = _extract_crawl_content(results, "https://example.com")
+        assert "Title" in result
+
+    def test_fallback_to_content_field(self):
+        # With no markdown-style field, the plain "content" field is used.
+        results = [{"content": "plain text content"}]
+        result = _extract_crawl_content(results, "https://example.com")
+        assert "plain text content" in result
+
+    def test_no_content_fields(self):
+        # A result without any recognised text field yields a status message.
+        results = [{"url": "https://example.com"}]
+        result = _extract_crawl_content(results, "https://example.com")
+        assert "No readable content" in result