Timmy-time-dashboard/src/timmy/tools/search.py
"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.
Provides:
- web_search(query) — SearXNG meta-search (no API key required)
- scrape_url(url) — Crawl4AI full-page scrape to clean markdown
Both tools degrade gracefully when the backing service is unavailable
(logs WARNING, returns descriptive error string — never crashes).
Services are started via `docker compose --profile search up` or configured
with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
"""
from __future__ import annotations
import logging
import time
from config import settings
logger = logging.getLogger(__name__)
# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
_CRAWL_MAX_POLLS = 6
_CRAWL_POLL_INTERVAL = 5 # seconds
_CRAWL_CHAR_BUDGET = 4000 * 4 # ~4000 tokens


def web_search(query: str, num_results: int = 5) -> str:
    """Search the web using the self-hosted SearXNG meta-search engine.

    Returns ranked results (title + URL + snippet) without requiring any
    paid API key. Requires SearXNG running locally (docker compose
    --profile search up) or TIMMY_SEARCH_URL pointing to a reachable instance.

    Args:
        query: The search query.
        num_results: Maximum number of results to return (default 5).

    Returns:
        Formatted search results string, or an error/status message on failure.
    """
    if settings.timmy_search_backend == "none":
        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base_url = settings.search_url.rstrip("/")
    params: dict = {
        "q": query,
        "format": "json",
        "categories": "general",
    }

    try:
        resp = _requests.get(
            f"{base_url}/search",
            params=params,
            timeout=10,
            headers={"User-Agent": "TimmyResearchBot/1.0"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"

    try:
        data = resp.json()
    except Exception as exc:
        logger.warning("SearXNG response parse error: %s", exc)
        return "Search error: could not parse SearXNG response."

    results = data.get("results", [])[:num_results]
    if not results:
        return f"No results found for: {query!r}"

    lines = [f"Web search results for: {query!r}\n"]
    for i, r in enumerate(results, 1):
        title = r.get("title", "Untitled")
        url = r.get("url", "")
        snippet = r.get("content", "").strip()
        lines.append(f"{i}. {title}\n URL: {url}\n {snippet}\n")
    return "\n".join(lines)


def scrape_url(url: str) -> str:
    """Scrape a URL with Crawl4AI and return the main content as clean markdown.

    Crawl4AI extracts well-structured markdown from any public page —
    articles, docs, product pages — suitable for LLM consumption.
    Requires Crawl4AI running locally (docker compose --profile search up)
    or TIMMY_CRAWL_URL pointing to a reachable instance.

    Args:
        url: The URL to scrape (must start with http:// or https://).

    Returns:
        Extracted markdown text (up to ~4000 tokens), or an error message.
    """
    if not url or not url.startswith(("http://", "https://")):
        return f"Error: invalid URL — must start with http:// or https://: {url!r}"
    if settings.timmy_search_backend == "none":
        return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base = settings.crawl_url.rstrip("/")

    # Submit crawl task
    try:
        resp = _requests.post(
            f"{base}/crawl",
            json={"urls": [url], "priority": 10},
            timeout=15,
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
        return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"

    try:
        submit_data = resp.json()
    except Exception as exc:
        logger.warning("Crawl4AI submit parse error: %s", exc)
        return "Scrape error: could not parse Crawl4AI response."

    # Check if result came back synchronously
    if "results" in submit_data:
        return _extract_crawl_content(submit_data["results"], url)

    task_id = submit_data.get("task_id")
    if not task_id:
        return f"Scrape error: Crawl4AI returned no task_id for {url}"

    # Poll for async result
    for _ in range(_CRAWL_MAX_POLLS):
        time.sleep(_CRAWL_POLL_INTERVAL)
        try:
            poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
            poll.raise_for_status()
            task_data = poll.json()
        except Exception as exc:
            logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
            continue

        status = task_data.get("status", "")
        if status == "completed":
            results = task_data.get("results") or task_data.get("result")
            if isinstance(results, dict):
                results = [results]
            return _extract_crawl_content(results or [], url)
        if status == "failed":
            return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"

    return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"


def _extract_crawl_content(results: list, url: str) -> str:
    """Extract and truncate markdown content from Crawl4AI results list."""
    if not results:
        return f"No content returned by Crawl4AI for: {url}"

    result = results[0]
    content = (
        result.get("markdown")
        or result.get("markdown_v2", {}).get("raw_markdown")
        or result.get("extracted_content")
        or result.get("content")
        or ""
    )
    if not content:
        return f"No readable content extracted from: {url}"

    if len(content) > _CRAWL_CHAR_BUDGET:
        content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
    return content
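

# --- Manual smoke test (not part of the original module API) -----------------
# A minimal sketch for exercising both tools against a locally running stack
# started with `docker compose --profile search up`. The query and URL below
# are arbitrary placeholders, not values used elsewhere in the project; both
# calls fall back to the graceful error strings when the services are down.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Requires SearXNG reachable at settings.search_url (or TIMMY_SEARCH_URL).
    print(web_search("self-hosted meta search engine", num_results=3))

    # Requires Crawl4AI reachable at settings.crawl_url (or TIMMY_CRAWL_URL).
    print(scrape_url("https://example.com"))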