forked from Rockachopa/Timmy-time-dashboard
This commit is contained in:
@@ -109,6 +109,16 @@ class Settings(BaseSettings):
|
||||
grok_sats_hard_cap: int = 100 # Absolute ceiling on sats per Grok query
|
||||
grok_free: bool = False # Skip Lightning invoice when user has own API key
|
||||
|
||||
# ── Search Backend (SearXNG + Crawl4AI) ──────────────────────────────
|
||||
# "searxng" — self-hosted SearXNG meta-search engine (default, no API key)
|
||||
# "none" — disable web search (private/offline deployments)
|
||||
# Override with TIMMY_SEARCH_BACKEND env var.
|
||||
timmy_search_backend: Literal["searxng", "none"] = "searxng"
|
||||
# SearXNG base URL — override with TIMMY_SEARCH_URL env var
|
||||
search_url: str = "http://localhost:8888"
|
||||
# Crawl4AI base URL — override with TIMMY_CRAWL_URL env var
|
||||
crawl_url: str = "http://localhost:11235"
|
||||
|
||||
# ── Database ──────────────────────────────────────────────────────────
|
||||
db_busy_timeout_ms: int = 5000 # SQLite PRAGMA busy_timeout (ms)
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ from timmy.tools.file_tools import (
|
||||
create_research_tools,
|
||||
create_writing_tools,
|
||||
)
|
||||
from timmy.tools.search import scrape_url, web_search
|
||||
from timmy.tools.system_tools import (
|
||||
_safe_eval,
|
||||
calculator,
|
||||
@@ -72,6 +73,9 @@ __all__ = [
|
||||
"create_data_tools",
|
||||
"create_research_tools",
|
||||
"create_writing_tools",
|
||||
# search
|
||||
"scrape_url",
|
||||
"web_search",
|
||||
# system_tools
|
||||
"_safe_eval",
|
||||
"calculator",
|
||||
|
||||
@@ -28,6 +28,7 @@ from timmy.tools.file_tools import (
|
||||
create_research_tools,
|
||||
create_writing_tools,
|
||||
)
|
||||
from timmy.tools.search import scrape_url, web_search
|
||||
from timmy.tools.system_tools import (
|
||||
calculator,
|
||||
consult_grok,
|
||||
@@ -54,6 +55,16 @@ def _register_web_fetch_tool(toolkit: Toolkit) -> None:
|
||||
raise
|
||||
|
||||
|
||||
def _register_search_tools(toolkit: Toolkit) -> None:
|
||||
"""Register SearXNG web_search and Crawl4AI scrape_url tools."""
|
||||
try:
|
||||
toolkit.register(web_search, name="web_search")
|
||||
toolkit.register(scrape_url, name="scrape_url")
|
||||
except Exception as exc:
|
||||
logger.error("Failed to register search tools: %s", exc)
|
||||
raise
|
||||
|
||||
|
||||
def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None:
|
||||
"""Register core execution and file tools."""
|
||||
# Python execution
|
||||
@@ -261,6 +272,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
|
||||
|
||||
_register_core_tools(toolkit, base_path)
|
||||
_register_web_fetch_tool(toolkit)
|
||||
_register_search_tools(toolkit)
|
||||
_register_grok_tool(toolkit)
|
||||
_register_memory_tools(toolkit)
|
||||
_register_agentic_loop_tool(toolkit)
|
||||
@@ -433,6 +445,16 @@ def _analysis_tool_catalog() -> dict:
|
||||
"description": "Fetch a web page and extract clean readable text (trafilatura)",
|
||||
"available_in": ["orchestrator"],
|
||||
},
|
||||
"web_search": {
|
||||
"name": "Web Search",
|
||||
"description": "Search the web via self-hosted SearXNG (no API key required)",
|
||||
"available_in": ["echo", "orchestrator"],
|
||||
},
|
||||
"scrape_url": {
|
||||
"name": "Scrape URL",
|
||||
"description": "Scrape a URL with Crawl4AI and return clean markdown content",
|
||||
"available_in": ["echo", "orchestrator"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ def _make_smart_read_file(file_tools: FileTools) -> Callable:
|
||||
def create_research_tools(base_dir: str | Path | None = None):
|
||||
"""Create tools for the research agent (Echo).
|
||||
|
||||
Includes: file reading
|
||||
Includes: file reading, web search (SearXNG), URL scraping (Crawl4AI)
|
||||
"""
|
||||
if not _AGNO_TOOLS_AVAILABLE:
|
||||
raise ImportError(f"Agno tools not available: {_ImportError}")
|
||||
@@ -73,6 +73,12 @@ def create_research_tools(base_dir: str | Path | None = None):
|
||||
toolkit.register(_make_smart_read_file(file_tools), name="read_file")
|
||||
toolkit.register(file_tools.list_files, name="list_files")
|
||||
|
||||
# Web search + scraping (gracefully no-ops when backend=none or service down)
|
||||
from timmy.tools.search import scrape_url, web_search
|
||||
|
||||
toolkit.register(web_search, name="web_search")
|
||||
toolkit.register(scrape_url, name="scrape_url")
|
||||
|
||||
return toolkit
|
||||
|
||||
|
||||
|
||||
186
src/timmy/tools/search.py
Normal file
186
src/timmy/tools/search.py
Normal file
@@ -0,0 +1,186 @@
|
||||
"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.
|
||||
|
||||
Provides:
|
||||
- web_search(query) — SearXNG meta-search (no API key required)
|
||||
- scrape_url(url) — Crawl4AI full-page scrape to clean markdown
|
||||
|
||||
Both tools degrade gracefully when the backing service is unavailable
|
||||
(logs WARNING, returns descriptive error string — never crashes).
|
||||
|
||||
Services are started via `docker compose --profile search up` or configured
|
||||
with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
|
||||
_CRAWL_MAX_POLLS = 6
|
||||
_CRAWL_POLL_INTERVAL = 5 # seconds
|
||||
_CRAWL_CHAR_BUDGET = 4000 * 4 # ~4000 tokens
|
||||
|
||||
|
||||
def web_search(query: str, num_results: int = 5) -> str:
|
||||
"""Search the web using the self-hosted SearXNG meta-search engine.
|
||||
|
||||
Returns ranked results (title + URL + snippet) without requiring any
|
||||
paid API key. Requires SearXNG running locally (docker compose
|
||||
--profile search up) or TIMMY_SEARCH_URL pointing to a reachable instance.
|
||||
|
||||
Args:
|
||||
query: The search query.
|
||||
num_results: Maximum number of results to return (default 5).
|
||||
|
||||
Returns:
|
||||
Formatted search results string, or an error/status message on failure.
|
||||
"""
|
||||
if settings.timmy_search_backend == "none":
|
||||
return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."
|
||||
|
||||
try:
|
||||
import requests as _requests
|
||||
except ImportError:
|
||||
return "Error: 'requests' package is not installed."
|
||||
|
||||
base_url = settings.search_url.rstrip("/")
|
||||
params: dict = {
|
||||
"q": query,
|
||||
"format": "json",
|
||||
"categories": "general",
|
||||
}
|
||||
|
||||
try:
|
||||
resp = _requests.get(
|
||||
f"{base_url}/search",
|
||||
params=params,
|
||||
timeout=10,
|
||||
headers={"User-Agent": "TimmyResearchBot/1.0"},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except Exception as exc:
|
||||
logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
|
||||
return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"
|
||||
|
||||
try:
|
||||
data = resp.json()
|
||||
except Exception as exc:
|
||||
logger.warning("SearXNG response parse error: %s", exc)
|
||||
return "Search error: could not parse SearXNG response."
|
||||
|
||||
results = data.get("results", [])[:num_results]
|
||||
if not results:
|
||||
return f"No results found for: {query!r}"
|
||||
|
||||
lines = [f"Web search results for: {query!r}\n"]
|
||||
for i, r in enumerate(results, 1):
|
||||
title = r.get("title", "Untitled")
|
||||
url = r.get("url", "")
|
||||
snippet = r.get("content", "").strip()
|
||||
lines.append(f"{i}. {title}\n URL: {url}\n {snippet}\n")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def scrape_url(url: str) -> str:
|
||||
"""Scrape a URL with Crawl4AI and return the main content as clean markdown.
|
||||
|
||||
Crawl4AI extracts well-structured markdown from any public page —
|
||||
articles, docs, product pages — suitable for LLM consumption.
|
||||
Requires Crawl4AI running locally (docker compose --profile search up)
|
||||
or TIMMY_CRAWL_URL pointing to a reachable instance.
|
||||
|
||||
Args:
|
||||
url: The URL to scrape (must start with http:// or https://).
|
||||
|
||||
Returns:
|
||||
Extracted markdown text (up to ~4000 tokens), or an error message.
|
||||
"""
|
||||
if not url or not url.startswith(("http://", "https://")):
|
||||
return f"Error: invalid URL — must start with http:// or https://: {url!r}"
|
||||
|
||||
if settings.timmy_search_backend == "none":
|
||||
return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."
|
||||
|
||||
try:
|
||||
import requests as _requests
|
||||
except ImportError:
|
||||
return "Error: 'requests' package is not installed."
|
||||
|
||||
base = settings.crawl_url.rstrip("/")
|
||||
|
||||
# Submit crawl task
|
||||
try:
|
||||
resp = _requests.post(
|
||||
f"{base}/crawl",
|
||||
json={"urls": [url], "priority": 10},
|
||||
timeout=15,
|
||||
headers={"Content-Type": "application/json"},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except Exception as exc:
|
||||
logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
|
||||
return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"
|
||||
|
||||
try:
|
||||
submit_data = resp.json()
|
||||
except Exception as exc:
|
||||
logger.warning("Crawl4AI submit parse error: %s", exc)
|
||||
return "Scrape error: could not parse Crawl4AI response."
|
||||
|
||||
# Check if result came back synchronously
|
||||
if "results" in submit_data:
|
||||
return _extract_crawl_content(submit_data["results"], url)
|
||||
|
||||
task_id = submit_data.get("task_id")
|
||||
if not task_id:
|
||||
return f"Scrape error: Crawl4AI returned no task_id for {url}"
|
||||
|
||||
# Poll for async result
|
||||
for _ in range(_CRAWL_MAX_POLLS):
|
||||
time.sleep(_CRAWL_POLL_INTERVAL)
|
||||
try:
|
||||
poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
|
||||
poll.raise_for_status()
|
||||
task_data = poll.json()
|
||||
except Exception as exc:
|
||||
logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
|
||||
continue
|
||||
|
||||
status = task_data.get("status", "")
|
||||
if status == "completed":
|
||||
results = task_data.get("results") or task_data.get("result")
|
||||
if isinstance(results, dict):
|
||||
results = [results]
|
||||
return _extract_crawl_content(results or [], url)
|
||||
if status == "failed":
|
||||
return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"
|
||||
|
||||
return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"
|
||||
|
||||
|
||||
def _extract_crawl_content(results: list, url: str) -> str:
|
||||
"""Extract and truncate markdown content from Crawl4AI results list."""
|
||||
if not results:
|
||||
return f"No content returned by Crawl4AI for: {url}"
|
||||
|
||||
result = results[0]
|
||||
content = (
|
||||
result.get("markdown")
|
||||
or result.get("markdown_v2", {}).get("raw_markdown")
|
||||
or result.get("extracted_content")
|
||||
or result.get("content")
|
||||
or ""
|
||||
)
|
||||
if not content:
|
||||
return f"No readable content extracted from: {url}"
|
||||
|
||||
if len(content) > _CRAWL_CHAR_BUDGET:
|
||||
content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
|
||||
|
||||
return content
|
||||
Reference in New Issue
Block a user