[claude] feat: SearXNG + Crawl4AI self-hosted search backend (#1282) (#1299)

2026-03-24 01:52:51 +00:00
parent f1f67e62a7
commit a7ccfbddc9
9 changed files with 694 additions and 1 deletion


@@ -109,6 +109,16 @@ class Settings(BaseSettings):
    grok_sats_hard_cap: int = 100  # Absolute ceiling on sats per Grok query
    grok_free: bool = False  # Skip Lightning invoice when user has own API key

    # ── Search Backend (SearXNG + Crawl4AI) ──────────────────────────────
    # "searxng" — self-hosted SearXNG meta-search engine (default, no API key)
    # "none"    — disable web search (private/offline deployments)
    # Override with TIMMY_SEARCH_BACKEND env var.
    timmy_search_backend: Literal["searxng", "none"] = "searxng"

    # SearXNG base URL — override with TIMMY_SEARCH_URL env var
    search_url: str = "http://localhost:8888"

    # Crawl4AI base URL — override with TIMMY_CRAWL_URL env var
    crawl_url: str = "http://localhost:11235"

    # ── Database ──────────────────────────────────────────────────────────
    db_busy_timeout_ms: int = 5000  # SQLite PRAGMA busy_timeout (ms)
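The three new fields resolve from TIMMY_* environment variables. As a minimal standalone sketch of that mapping, assuming pydantic-settings v2 with explicit validation_alias entries (the real class lives in the project's config module and may wire this up differently):

# Illustrative only, not the project's actual Settings class.
from typing import Literal

from pydantic import Field
from pydantic_settings import BaseSettings


class SearchSettings(BaseSettings):
    # Field names and defaults copied from the diff; the alias mechanism is assumed.
    timmy_search_backend: Literal["searxng", "none"] = Field(
        "searxng", validation_alias="TIMMY_SEARCH_BACKEND"
    )
    search_url: str = Field("http://localhost:8888", validation_alias="TIMMY_SEARCH_URL")
    crawl_url: str = Field("http://localhost:11235", validation_alias="TIMMY_CRAWL_URL")


# TIMMY_SEARCH_BACKEND=none python app.py  -> web search disabled
print(SearchSettings().timmy_search_backend)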


@@ -46,6 +46,7 @@ from timmy.tools.file_tools import (
    create_research_tools,
    create_writing_tools,
)
from timmy.tools.search import scrape_url, web_search
from timmy.tools.system_tools import (
    _safe_eval,
    calculator,

@@ -72,6 +73,9 @@ __all__ = [
    "create_data_tools",
    "create_research_tools",
    "create_writing_tools",
    # search
    "scrape_url",
    "web_search",
    # system_tools
    "_safe_eval",
    "calculator",


@@ -28,6 +28,7 @@ from timmy.tools.file_tools import (
    create_research_tools,
    create_writing_tools,
)
from timmy.tools.search import scrape_url, web_search
from timmy.tools.system_tools import (
    calculator,
    consult_grok,

@@ -54,6 +55,16 @@ def _register_web_fetch_tool(toolkit: Toolkit) -> None:
        raise


def _register_search_tools(toolkit: Toolkit) -> None:
    """Register SearXNG web_search and Crawl4AI scrape_url tools."""
    try:
        toolkit.register(web_search, name="web_search")
        toolkit.register(scrape_url, name="scrape_url")
    except Exception as exc:
        logger.error("Failed to register search tools: %s", exc)
        raise


def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None:
    """Register core execution and file tools."""
    # Python execution

@@ -261,6 +272,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
    _register_core_tools(toolkit, base_path)
    _register_web_fetch_tool(toolkit)
    _register_search_tools(toolkit)
    _register_grok_tool(toolkit)
    _register_memory_tools(toolkit)
    _register_agentic_loop_tool(toolkit)

@@ -433,6 +445,16 @@ def _analysis_tool_catalog() -> dict:
            "description": "Fetch a web page and extract clean readable text (trafilatura)",
            "available_in": ["orchestrator"],
        },
        "web_search": {
            "name": "Web Search",
            "description": "Search the web via self-hosted SearXNG (no API key required)",
            "available_in": ["echo", "orchestrator"],
        },
        "scrape_url": {
            "name": "Scrape URL",
            "description": "Scrape a URL with Crawl4AI and return clean markdown content",
            "available_in": ["echo", "orchestrator"],
        },
    }
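Since _analysis_tool_catalog() returns a plain dict in the shape shown above, the new entries can be filtered per agent. A hedged sketch; _analysis_tool_catalog is private and its import path is assumed here:

# Sketch only: module path assumed, and this is not a supported public API.
from timmy.tools.toolkit import _analysis_tool_catalog

echo_tools = {
    key: meta["description"]
    for key, meta in _analysis_tool_catalog().items()
    if "echo" in meta["available_in"]
}
print(echo_tools)  # should now include "web_search" and "scrape_url"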


@@ -59,7 +59,7 @@ def _make_smart_read_file(file_tools: FileTools) -> Callable:
def create_research_tools(base_dir: str | Path | None = None):
    """Create tools for the research agent (Echo).

    Includes: file reading, web search (SearXNG), URL scraping (Crawl4AI)
    """
    if not _AGNO_TOOLS_AVAILABLE:
        raise ImportError(f"Agno tools not available: {_ImportError}")

@@ -73,6 +73,12 @@ def create_research_tools(base_dir: str | Path | None = None):
    toolkit.register(_make_smart_read_file(file_tools), name="read_file")
    toolkit.register(file_tools.list_files, name="list_files")

    # Web search + scraping (gracefully no-ops when backend=none or service down)
    from timmy.tools.search import scrape_url, web_search

    toolkit.register(web_search, name="web_search")
    toolkit.register(scrape_url, name="scrape_url")

    return toolkit
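Because both tools return plain strings even on failure, they can be exercised directly, outside any toolkit, which is the easiest way to see the graceful no-op behavior:

from timmy.tools.search import scrape_url, web_search

# With SearXNG/Crawl4AI up (docker compose --profile search up) these return
# formatted results; with the services down they return descriptive error
# strings instead of raising.
print(web_search("SearXNG JSON API", num_results=3))
print(scrape_url("https://example.com")[:500])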

src/timmy/tools/search.py Normal file (186 lines)

@@ -0,0 +1,186 @@
"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.
Provides:
- web_search(query) — SearXNG meta-search (no API key required)
- scrape_url(url) — Crawl4AI full-page scrape to clean markdown
Both tools degrade gracefully when the backing service is unavailable
(logs WARNING, returns descriptive error string — never crashes).
Services are started via `docker compose --profile search up` or configured
with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
"""
from __future__ import annotations
import logging
import time
from config import settings
logger = logging.getLogger(__name__)
# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
_CRAWL_MAX_POLLS = 6
_CRAWL_POLL_INTERVAL = 5 # seconds
_CRAWL_CHAR_BUDGET = 4000 * 4 # ~4000 tokens
def web_search(query: str, num_results: int = 5) -> str:
    """Search the web using the self-hosted SearXNG meta-search engine.

    Returns ranked results (title + URL + snippet) without requiring any
    paid API key. Requires SearXNG running locally (docker compose
    --profile search up) or TIMMY_SEARCH_URL pointing to a reachable instance.

    Args:
        query: The search query.
        num_results: Maximum number of results to return (default 5).

    Returns:
        Formatted search results string, or an error/status message on failure.
    """
    if settings.timmy_search_backend == "none":
        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base_url = settings.search_url.rstrip("/")
    params: dict = {
        "q": query,
        "format": "json",
        "categories": "general",
    }
    try:
        resp = _requests.get(
            f"{base_url}/search",
            params=params,
            timeout=10,
            headers={"User-Agent": "TimmyResearchBot/1.0"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"

    try:
        data = resp.json()
    except Exception as exc:
        logger.warning("SearXNG response parse error: %s", exc)
        return "Search error: could not parse SearXNG response."

    results = data.get("results", [])[:num_results]
    if not results:
        return f"No results found for: {query!r}"

    lines = [f"Web search results for: {query!r}\n"]
    for i, r in enumerate(results, 1):
        title = r.get("title", "Untitled")
        url = r.get("url", "")
        snippet = r.get("content", "").strip()
        lines.append(f"{i}. {title}\n   URL: {url}\n   {snippet}\n")
    return "\n".join(lines)
def scrape_url(url: str) -> str:
    """Scrape a URL with Crawl4AI and return the main content as clean markdown.

    Crawl4AI extracts well-structured markdown from any public page —
    articles, docs, product pages — suitable for LLM consumption.
    Requires Crawl4AI running locally (docker compose --profile search up)
    or TIMMY_CRAWL_URL pointing to a reachable instance.

    Args:
        url: The URL to scrape (must start with http:// or https://).

    Returns:
        Extracted markdown text (up to ~4000 tokens), or an error message.
    """
    if not url or not url.startswith(("http://", "https://")):
        return f"Error: invalid URL — must start with http:// or https://: {url!r}"
    if settings.timmy_search_backend == "none":
        return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base = settings.crawl_url.rstrip("/")

    # Submit crawl task
    try:
        resp = _requests.post(
            f"{base}/crawl",
            json={"urls": [url], "priority": 10},
            timeout=15,
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
        return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"

    try:
        submit_data = resp.json()
    except Exception as exc:
        logger.warning("Crawl4AI submit parse error: %s", exc)
        return "Scrape error: could not parse Crawl4AI response."

    # Check if result came back synchronously
    if "results" in submit_data:
        return _extract_crawl_content(submit_data["results"], url)

    task_id = submit_data.get("task_id")
    if not task_id:
        return f"Scrape error: Crawl4AI returned no task_id for {url}"

    # Poll for async result
    for _ in range(_CRAWL_MAX_POLLS):
        time.sleep(_CRAWL_POLL_INTERVAL)
        try:
            poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
            poll.raise_for_status()
            task_data = poll.json()
        except Exception as exc:
            logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
            continue
        status = task_data.get("status", "")
        if status == "completed":
            results = task_data.get("results") or task_data.get("result")
            if isinstance(results, dict):
                results = [results]
            return _extract_crawl_content(results or [], url)
        if status == "failed":
            return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"

    return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"
def _extract_crawl_content(results: list, url: str) -> str:
    """Extract and truncate markdown content from Crawl4AI results list."""
    if not results:
        return f"No content returned by Crawl4AI for: {url}"
    result = results[0]
    content = (
        result.get("markdown")
        # Guard: "markdown_v2" may be present but None, so fall back to {}.
        or (result.get("markdown_v2") or {}).get("raw_markdown")
        or result.get("extracted_content")
        or result.get("content")
        or ""
    )
    if not content:
        return f"No readable content extracted from: {url}"
    if len(content) > _CRAWL_CHAR_BUDGET:
        content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
    return content
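A quick sanity check of the degradation paths above, sketched as a throwaway script (mutating the shared settings object in place is assumed acceptable outside production code):

from config import settings
from timmy.tools.search import scrape_url, web_search

settings.timmy_search_backend = "none"
assert web_search("anything").startswith("Web search is disabled")
assert scrape_url("https://example.com").startswith("Web scraping is disabled")

settings.timmy_search_backend = "searxng"
# URL validation runs before the backend check, so malformed input is
# rejected regardless of which backend is configured:
print(scrape_url("not-a-url"))  # -> "Error: invalid URL — ..."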