diff --git a/AGENTS.md b/AGENTS.md index 7710702a..d3d8f7c4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -247,6 +247,48 @@ make docker-agent # add a worker --- +## Search Capability (SearXNG + Crawl4AI) + +Timmy has a self-hosted search backend requiring **no paid API key**. + +### Tools + +| Tool | Module | Description | +|------|--------|-------------| +| `web_search(query)` | `timmy/tools/search.py` | Meta-search via SearXNG — returns ranked results | +| `scrape_url(url)` | `timmy/tools/search.py` | Full-page scrape via Crawl4AI → clean markdown | + +Both tools are registered in the **orchestrator** (full) and **echo** (research) toolkits. + +### Configuration + +| Env Var | Default | Description | +|---------|---------|-------------| +| `TIMMY_SEARCH_BACKEND` | `searxng` | `searxng` or `none` (disable) | +| `TIMMY_SEARCH_URL` | `http://localhost:8888` | SearXNG base URL | +| `TIMMY_CRAWL_URL` | `http://localhost:11235` | Crawl4AI base URL | + +Inside Docker Compose (when `--profile search` is active), the dashboard +uses `http://searxng:8080` and `http://crawl4ai:11235` by default. + +### Starting the services + +```bash +# Start SearXNG + Crawl4AI alongside the dashboard: +docker compose --profile search up + +# Or start only the search services: +docker compose --profile search up searxng crawl4ai +``` + +### Graceful degradation + +- If `TIMMY_SEARCH_BACKEND=none`: tools return a "disabled" message. +- If SearXNG or Crawl4AI is unreachable: tools log a WARNING and return an + error string — the app never crashes. 
+ +--- + ## Roadmap **v2.0 Exodus (in progress):** Voice + Marketplace + Integrations diff --git a/docker-compose.yml b/docker-compose.yml index 0b62d266..a33efbd7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,6 +42,10 @@ services: GROK_ENABLED: "${GROK_ENABLED:-false}" XAI_API_KEY: "${XAI_API_KEY:-}" GROK_DEFAULT_MODEL: "${GROK_DEFAULT_MODEL:-grok-3-fast}" + # Search backend (SearXNG + Crawl4AI) — set TIMMY_SEARCH_BACKEND=none to disable + TIMMY_SEARCH_BACKEND: "${TIMMY_SEARCH_BACKEND:-searxng}" + TIMMY_SEARCH_URL: "${TIMMY_SEARCH_URL:-http://searxng:8080}" + TIMMY_CRAWL_URL: "${TIMMY_CRAWL_URL:-http://crawl4ai:11235}" extra_hosts: - "host.docker.internal:host-gateway" # Linux: maps to host IP networks: @@ -74,6 +78,50 @@ services: profiles: - celery + # ── SearXNG — self-hosted meta-search engine ───────────────────────── + searxng: + image: searxng/searxng:latest + container_name: timmy-searxng + profiles: + - search + ports: + - "${SEARXNG_PORT:-8888}:8080" + environment: + SEARXNG_BASE_URL: "${SEARXNG_BASE_URL:-http://localhost:8888}" + volumes: + - ./docker/searxng:/etc/searxng:rw + networks: + - timmy-net + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:8080/healthz"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 20s + + # ── Crawl4AI — self-hosted web scraper ──────────────────────────────── + crawl4ai: + image: unclecode/crawl4ai:latest + container_name: timmy-crawl4ai + profiles: + - search + ports: + - "${CRAWL4AI_PORT:-11235}:11235" + environment: + CRAWL4AI_API_TOKEN: "${CRAWL4AI_API_TOKEN:-}" + volumes: + - timmy-data:/app/data + networks: + - timmy-net + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + # ── OpenFang — vendored agent runtime sidecar ──────────────────────────── openfang: build: diff --git a/docker/searxng/settings.yml 
b/docker/searxng/settings.yml new file mode 100644 index 00000000..543ec04d --- /dev/null +++ b/docker/searxng/settings.yml @@ -0,0 +1,67 @@ +# SearXNG configuration for Timmy Time self-hosted search +# https://docs.searxng.org/admin/settings/settings.html + +general: + debug: false + instance_name: "Timmy Search" + privacypolicy_url: false + donation_url: false + contact_url: false + enable_metrics: false + +server: + port: 8080 + bind_address: "0.0.0.0" + secret_key: "timmy-searxng-key-change-in-production" + base_url: false + image_proxy: false + +ui: + static_use_hash: false + default_locale: "" + query_in_title: false + infinite_scroll: false + default_theme: simple + center_alignment: false + +search: + safe_search: 0 + autocomplete: "" + default_lang: "en" + formats: + - html + - json + +outgoing: + request_timeout: 6.0 + max_request_timeout: 10.0 + useragent_suffix: "TimmyResearchBot" + pool_connections: 100 + pool_maxsize: 20 + +enabled_plugins: + - Hash_plugin + - Search_on_category_select + - Tracker_url_remover + +engines: + - name: google + engine: google + shortcut: g + categories: general + + - name: bing + engine: bing + shortcut: b + categories: general + + - name: duckduckgo + engine: duckduckgo + shortcut: d + categories: general + + - name: wikipedia + engine: wikipedia + shortcut: wp + categories: general + timeout: 3.0 diff --git a/src/config.py b/src/config.py index 507c9517..56cca371 100644 --- a/src/config.py +++ b/src/config.py @@ -109,6 +109,16 @@ class Settings(BaseSettings): grok_sats_hard_cap: int = 100 # Absolute ceiling on sats per Grok query grok_free: bool = False # Skip Lightning invoice when user has own API key + # ── Search Backend (SearXNG + Crawl4AI) ────────────────────────────── + # "searxng" — self-hosted SearXNG meta-search engine (default, no API key) + # "none" — disable web search (private/offline deployments) + # Override with TIMMY_SEARCH_BACKEND env var. 
+ timmy_search_backend: Literal["searxng", "none"] = "searxng" + # SearXNG base URL — override with TIMMY_SEARCH_URL env var + search_url: str = "http://localhost:8888" + # Crawl4AI base URL — override with TIMMY_CRAWL_URL env var + crawl_url: str = "http://localhost:11235" + # ── Database ────────────────────────────────────────────────────────── db_busy_timeout_ms: int = 5000 # SQLite PRAGMA busy_timeout (ms) diff --git a/src/timmy/tools/__init__.py b/src/timmy/tools/__init__.py index d8efc1db..1f2f378b 100644 --- a/src/timmy/tools/__init__.py +++ b/src/timmy/tools/__init__.py @@ -46,6 +46,7 @@ from timmy.tools.file_tools import ( create_research_tools, create_writing_tools, ) +from timmy.tools.search import scrape_url, web_search from timmy.tools.system_tools import ( _safe_eval, calculator, @@ -72,6 +73,9 @@ __all__ = [ "create_data_tools", "create_research_tools", "create_writing_tools", + # search + "scrape_url", + "web_search", # system_tools "_safe_eval", "calculator", diff --git a/src/timmy/tools/_registry.py b/src/timmy/tools/_registry.py index 567b4fa6..1bed37e4 100644 --- a/src/timmy/tools/_registry.py +++ b/src/timmy/tools/_registry.py @@ -28,6 +28,7 @@ from timmy.tools.file_tools import ( create_research_tools, create_writing_tools, ) +from timmy.tools.search import scrape_url, web_search from timmy.tools.system_tools import ( calculator, consult_grok, @@ -54,6 +55,16 @@ def _register_web_fetch_tool(toolkit: Toolkit) -> None: raise +def _register_search_tools(toolkit: Toolkit) -> None: + """Register SearXNG web_search and Crawl4AI scrape_url tools.""" + try: + toolkit.register(web_search, name="web_search") + toolkit.register(scrape_url, name="scrape_url") + except Exception as exc: + logger.error("Failed to register search tools: %s", exc) + raise + + def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None: """Register core execution and file tools.""" # Python execution @@ -261,6 +272,7 @@ def create_full_toolkit(base_dir: str | Path | 
None = None): _register_core_tools(toolkit, base_path) _register_web_fetch_tool(toolkit) + _register_search_tools(toolkit) _register_grok_tool(toolkit) _register_memory_tools(toolkit) _register_agentic_loop_tool(toolkit) @@ -433,6 +445,16 @@ def _analysis_tool_catalog() -> dict: "description": "Fetch a web page and extract clean readable text (trafilatura)", "available_in": ["orchestrator"], }, + "web_search": { + "name": "Web Search", + "description": "Search the web via self-hosted SearXNG (no API key required)", + "available_in": ["echo", "orchestrator"], + }, + "scrape_url": { + "name": "Scrape URL", + "description": "Scrape a URL with Crawl4AI and return clean markdown content", + "available_in": ["echo", "orchestrator"], + }, } diff --git a/src/timmy/tools/file_tools.py b/src/timmy/tools/file_tools.py index 9d292124..a9fb4ecd 100644 --- a/src/timmy/tools/file_tools.py +++ b/src/timmy/tools/file_tools.py @@ -59,7 +59,7 @@ def _make_smart_read_file(file_tools: FileTools) -> Callable: def create_research_tools(base_dir: str | Path | None = None): """Create tools for the research agent (Echo). 
def web_search(query: str, num_results: int = 5) -> str:
    """Search the web using the self-hosted SearXNG meta-search engine.

    Returns ranked results (title + URL + snippet) without requiring any
    paid API key. Requires SearXNG running locally (docker compose
    --profile search up) or TIMMY_SEARCH_URL pointing to a reachable instance.

    Args:
        query: The search query.
        num_results: Maximum number of results to return (default 5).

    Returns:
        Formatted search results string, or an error/status message on failure.
    """
    # Short-circuit when search is disabled for private/offline deployments.
    if settings.timmy_search_backend == "none":
        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."

    # Imported lazily so the tool degrades gracefully (and is mockable in
    # tests) when the 'requests' package is absent.
    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base_url = settings.search_url.rstrip("/")
    params: dict = {
        "q": query,
        "format": "json",  # SearXNG must have 'json' in search.formats
        "categories": "general",
    }

    try:
        resp = _requests.get(
            f"{base_url}/search",
            params=params,
            timeout=10,
            headers={"User-Agent": "TimmyResearchBot/1.0"},
        )
        resp.raise_for_status()
    except Exception as exc:
        # Never crash the agent: log and return a descriptive error string.
        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"

    try:
        data = resp.json()
    except Exception as exc:
        logger.warning("SearXNG response parse error: %s", exc)
        return "Search error: could not parse SearXNG response."

    results = data.get("results", [])[:num_results]
    if not results:
        return f"No results found for: {query!r}"

    lines = [f"Web search results for: {query!r}\n"]
    for i, r in enumerate(results, 1):
        # Some engines emit JSON null for title/content; `.get(key, default)`
        # does NOT cover that case (the key exists), so coerce with `or` to
        # avoid None.strip() crashing.
        title = r.get("title") or "Untitled"
        url = r.get("url") or ""
        snippet = (r.get("content") or "").strip()
        lines.append(f"{i}. {title}\n URL: {url}\n {snippet}\n")

    return "\n".join(lines)
+ + # Check if result came back synchronously + if "results" in submit_data: + return _extract_crawl_content(submit_data["results"], url) + + task_id = submit_data.get("task_id") + if not task_id: + return f"Scrape error: Crawl4AI returned no task_id for {url}" + + # Poll for async result + for _ in range(_CRAWL_MAX_POLLS): + time.sleep(_CRAWL_POLL_INTERVAL) + try: + poll = _requests.get(f"{base}/task/{task_id}", timeout=10) + poll.raise_for_status() + task_data = poll.json() + except Exception as exc: + logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc) + continue + + status = task_data.get("status", "") + if status == "completed": + results = task_data.get("results") or task_data.get("result") + if isinstance(results, dict): + results = [results] + return _extract_crawl_content(results or [], url) + if status == "failed": + return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}" + + return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}" + + +def _extract_crawl_content(results: list, url: str) -> str: + """Extract and truncate markdown content from Crawl4AI results list.""" + if not results: + return f"No content returned by Crawl4AI for: {url}" + + result = results[0] + content = ( + result.get("markdown") + or result.get("markdown_v2", {}).get("raw_markdown") + or result.get("extracted_content") + or result.get("content") + or "" + ) + if not content: + return f"No readable content extracted from: {url}" + + if len(content) > _CRAWL_CHAR_BUDGET: + content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]" + + return content diff --git a/tests/timmy/test_tools_search.py b/tests/timmy/test_tools_search.py new file mode 100644 index 00000000..dec00f2a --- /dev/null +++ b/tests/timmy/test_tools_search.py @@ -0,0 +1,308 @@ +"""Unit tests for web_search and scrape_url tools (SearXNG + Crawl4AI). + +All tests use mocked HTTP — no live services required. 
+""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from timmy.tools.search import _extract_crawl_content, scrape_url, web_search + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _mock_requests(json_response=None, status_code=200, raise_exc=None): + """Build a mock requests module whose .get/.post return controlled responses.""" + mock_req = MagicMock() + + # Exception hierarchy + class Timeout(Exception): + pass + + class HTTPError(Exception): + def __init__(self, *a, response=None, **kw): + super().__init__(*a, **kw) + self.response = response + + class RequestException(Exception): + pass + + exc_mod = MagicMock() + exc_mod.Timeout = Timeout + exc_mod.HTTPError = HTTPError + exc_mod.RequestException = RequestException + mock_req.exceptions = exc_mod + + if raise_exc is not None: + mock_req.get.side_effect = raise_exc + mock_req.post.side_effect = raise_exc + else: + mock_resp = MagicMock() + mock_resp.status_code = status_code + mock_resp.json.return_value = json_response or {} + if status_code >= 400: + mock_resp.raise_for_status.side_effect = HTTPError( + response=MagicMock(status_code=status_code) + ) + mock_req.get.return_value = mock_resp + mock_req.post.return_value = mock_resp + + return mock_req + + +# --------------------------------------------------------------------------- +# web_search tests +# --------------------------------------------------------------------------- + + +class TestWebSearch: + def test_backend_none_short_circuits(self): + """TIMMY_SEARCH_BACKEND=none returns disabled message immediately.""" + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "none" + result = web_search("anything") + assert "disabled" in result + + def test_missing_requests_package(self): + """Graceful error when requests is not 
installed.""" + with patch.dict("sys.modules", {"requests": None}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.search_url = "http://localhost:8888" + result = web_search("test query") + assert "requests" in result and "not installed" in result + + def test_successful_search(self): + """Happy path: returns formatted result list.""" + mock_data = { + "results": [ + {"title": "Foo Bar", "url": "https://example.com/foo", "content": "Foo is great"}, + {"title": "Baz", "url": "https://example.com/baz", "content": "Baz rules"}, + ] + } + mock_req = _mock_requests(json_response=mock_data) + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.search_url = "http://localhost:8888" + result = web_search("foo bar") + + assert "Foo Bar" in result + assert "https://example.com/foo" in result + assert "Baz" in result + assert "foo bar" in result + + def test_no_results(self): + """Empty results list returns a helpful no-results message.""" + mock_req = _mock_requests(json_response={"results": []}) + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.search_url = "http://localhost:8888" + result = web_search("xyzzy") + assert "No results" in result + + def test_num_results_respected(self): + """Only up to num_results entries are returned.""" + mock_data = { + "results": [ + {"title": f"Result {i}", "url": f"https://example.com/{i}", "content": "x"} + for i in range(10) + ] + } + mock_req = _mock_requests(json_response=mock_data) + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.search_url = 
"http://localhost:8888" + result = web_search("test", num_results=3) + + # Only 3 numbered entries should appear + assert "1." in result + assert "3." in result + assert "4." not in result + + def test_service_unavailable(self): + """Connection error degrades gracefully.""" + mock_req = MagicMock() + mock_req.get.side_effect = OSError("connection refused") + mock_req.exceptions = MagicMock() + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.search_url = "http://localhost:8888" + result = web_search("test") + assert "not reachable" in result or "unavailable" in result + + def test_catalog_entry_exists(self): + """web_search must appear in the tool catalog.""" + from timmy.tools import get_all_available_tools + + catalog = get_all_available_tools() + assert "web_search" in catalog + assert "orchestrator" in catalog["web_search"]["available_in"] + assert "echo" in catalog["web_search"]["available_in"] + + +# --------------------------------------------------------------------------- +# scrape_url tests +# --------------------------------------------------------------------------- + + +class TestScrapeUrl: + def test_invalid_url_no_scheme(self): + """URLs without http(s) scheme are rejected before any HTTP call.""" + result = scrape_url("example.com/page") + assert "Error: invalid URL" in result + + def test_invalid_url_empty(self): + result = scrape_url("") + assert "Error: invalid URL" in result + + def test_backend_none_short_circuits(self): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "none" + result = scrape_url("https://example.com") + assert "disabled" in result + + def test_missing_requests_package(self): + with patch.dict("sys.modules", {"requests": None}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + 
mock_settings.crawl_url = "http://localhost:11235" + result = scrape_url("https://example.com") + assert "requests" in result and "not installed" in result + + def test_sync_result_returned_immediately(self): + """If Crawl4AI returns results in the POST response, use them directly.""" + mock_data = { + "results": [{"markdown": "# Hello\n\nThis is the page content."}] + } + mock_req = _mock_requests(json_response=mock_data) + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.crawl_url = "http://localhost:11235" + result = scrape_url("https://example.com") + + assert "Hello" in result + assert "page content" in result + + def test_async_poll_completed(self): + """Async task_id flow: polls until completed and returns content.""" + submit_response = MagicMock() + submit_response.json.return_value = {"task_id": "abc123"} + submit_response.raise_for_status.return_value = None + + poll_response = MagicMock() + poll_response.json.return_value = { + "status": "completed", + "results": [{"markdown": "# Async content"}], + } + poll_response.raise_for_status.return_value = None + + mock_req = MagicMock() + mock_req.post.return_value = submit_response + mock_req.get.return_value = poll_response + mock_req.exceptions = MagicMock() + + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.crawl_url = "http://localhost:11235" + with patch("timmy.tools.search.time") as mock_time: + mock_time.sleep = MagicMock() + result = scrape_url("https://example.com") + + assert "Async content" in result + + def test_async_poll_failed_task(self): + """Crawl4AI task failure is reported clearly.""" + submit_response = MagicMock() + submit_response.json.return_value = {"task_id": "abc123"} + submit_response.raise_for_status.return_value = 
None + + poll_response = MagicMock() + poll_response.json.return_value = {"status": "failed", "error": "site blocked"} + poll_response.raise_for_status.return_value = None + + mock_req = MagicMock() + mock_req.post.return_value = submit_response + mock_req.get.return_value = poll_response + mock_req.exceptions = MagicMock() + + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.crawl_url = "http://localhost:11235" + with patch("timmy.tools.search.time") as mock_time: + mock_time.sleep = MagicMock() + result = scrape_url("https://example.com") + + assert "failed" in result and "site blocked" in result + + def test_service_unavailable(self): + """Connection error degrades gracefully.""" + mock_req = MagicMock() + mock_req.post.side_effect = OSError("connection refused") + mock_req.exceptions = MagicMock() + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.crawl_url = "http://localhost:11235" + result = scrape_url("https://example.com") + assert "not reachable" in result or "unavailable" in result + + def test_content_truncation(self): + """Content longer than ~4000 tokens is truncated.""" + long_content = "x" * 20000 + mock_data = {"results": [{"markdown": long_content}]} + mock_req = _mock_requests(json_response=mock_data) + with patch.dict("sys.modules", {"requests": mock_req}): + with patch("timmy.tools.search.settings") as mock_settings: + mock_settings.timmy_search_backend = "searxng" + mock_settings.crawl_url = "http://localhost:11235" + result = scrape_url("https://example.com") + + assert "[…truncated" in result + assert len(result) < 17000 + + def test_catalog_entry_exists(self): + """scrape_url must appear in the tool catalog.""" + from timmy.tools import get_all_available_tools + + catalog = 
class TestExtractCrawlContent:
    """Behavioral checks for the _extract_crawl_content helper."""

    def test_empty_results(self):
        assert "No content" in _extract_crawl_content([], "https://example.com")

    def test_markdown_field_preferred(self):
        # When both fields are present, `markdown` wins over `content`.
        out = _extract_crawl_content(
            [{"markdown": "# Title", "content": "fallback"}], "https://example.com"
        )
        assert "Title" in out

    def test_fallback_to_content_field(self):
        out = _extract_crawl_content(
            [{"content": "plain text content"}], "https://example.com"
        )
        assert "plain text content" in out

    def test_no_content_fields(self):
        out = _extract_crawl_content(
            [{"url": "https://example.com"}], "https://example.com"
        )
        assert "No readable content" in out