# Forked from Rockachopa/Timmy-time-dashboard
"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.

Provides:

- web_search(query) — SearXNG meta-search (no API key required)
- scrape_url(url) — Crawl4AI full-page scrape to clean markdown

Both tools degrade gracefully when the backing service is unavailable
(logs WARNING, returns descriptive error string — never crashes).

Services are started via `docker compose --profile search up` or configured
with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
"""

from __future__ import annotations

import logging
import time

from config import settings

logger = logging.getLogger(__name__)

# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
_CRAWL_MAX_POLLS = 6
_CRAWL_POLL_INTERVAL = 5  # seconds
_CRAWL_CHAR_BUDGET = 4000 * 4  # ~4 chars per token → ~4000 tokens
||
def web_search(query: str, num_results: int = 5) -> str:
    """Search the web via the self-hosted SearXNG meta-search engine.

    Produces ranked results (title + URL + snippet) without any paid API
    key. Requires SearXNG running locally (docker compose
    --profile search up) or TIMMY_SEARCH_URL pointing at a reachable instance.

    Args:
        query: The search query.
        num_results: Maximum number of results to return (default 5).

    Returns:
        Formatted search results string, or an error/status message on failure.
    """
    if settings.timmy_search_backend == "none":
        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base_url = settings.search_url.rstrip("/")

    # Query SearXNG's JSON endpoint; any transport/HTTP failure degrades
    # to a descriptive string rather than raising.
    try:
        resp = _requests.get(
            f"{base_url}/search",
            params={"q": query, "format": "json", "categories": "general"},
            timeout=10,
            headers={"User-Agent": "TimmyResearchBot/1.0"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"

    try:
        data = resp.json()
    except Exception as exc:
        logger.warning("SearXNG response parse error: %s", exc)
        return "Search error: could not parse SearXNG response."

    hits = data.get("results", [])[:num_results]
    if not hits:
        return f"No results found for: {query!r}"

    # Render each hit as "N. title / URL / snippet".
    out = [f"Web search results for: {query!r}\n"]
    for rank, hit in enumerate(hits, 1):
        title = hit.get("title", "Untitled")
        link = hit.get("url", "")
        snippet = hit.get("content", "").strip()
        out.append(f"{rank}. {title}\n   URL: {link}\n   {snippet}\n")

    return "\n".join(out)
|
||
|
||
|
||
def scrape_url(url: str) -> str:
    """Scrape a URL with Crawl4AI and return the main content as clean markdown.

    Crawl4AI extracts well-structured markdown from any public page —
    articles, docs, product pages — suitable for LLM consumption.
    Requires Crawl4AI running locally (docker compose --profile search up)
    or TIMMY_CRAWL_URL pointing to a reachable instance.

    Args:
        url: The URL to scrape (must start with http:// or https://).

    Returns:
        Extracted markdown text (up to ~4000 tokens), or an error message.
    """
    if not url or not url.startswith(("http://", "https://")):
        return f"Error: invalid URL — must start with http:// or https://: {url!r}"

    if settings.timmy_search_backend == "none":
        return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base = settings.crawl_url.rstrip("/")

    # Submit crawl task
    try:
        resp = _requests.post(
            f"{base}/crawl",
            json={"urls": [url], "priority": 10},
            timeout=15,
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
        return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"

    try:
        submit_data = resp.json()
    except Exception as exc:
        logger.warning("Crawl4AI submit parse error: %s", exc)
        return "Scrape error: could not parse Crawl4AI response."

    # Check if result came back synchronously
    if "results" in submit_data:
        results = submit_data["results"]
        # Fix: normalize a single-dict payload to a list here too, matching
        # the async path below — previously a dict reached
        # _extract_crawl_content and crashed on results[0].
        if isinstance(results, dict):
            results = [results]
        return _extract_crawl_content(results or [], url)

    task_id = submit_data.get("task_id")
    if not task_id:
        return f"Scrape error: Crawl4AI returned no task_id for {url}"

    # Poll for async result; transient poll errors are logged and retried
    # until the poll budget is exhausted.
    for _ in range(_CRAWL_MAX_POLLS):
        time.sleep(_CRAWL_POLL_INTERVAL)
        try:
            poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
            poll.raise_for_status()
            task_data = poll.json()
        except Exception as exc:
            logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
            continue

        status = task_data.get("status", "")
        if status == "completed":
            # Different Crawl4AI versions use "results" vs "result".
            results = task_data.get("results") or task_data.get("result")
            if isinstance(results, dict):
                results = [results]
            return _extract_crawl_content(results or [], url)
        if status == "failed":
            return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"

    return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"
|
||
|
||
|
||
def _extract_crawl_content(results: list, url: str) -> str:
|
||
"""Extract and truncate markdown content from Crawl4AI results list."""
|
||
if not results:
|
||
return f"No content returned by Crawl4AI for: {url}"
|
||
|
||
result = results[0]
|
||
content = (
|
||
result.get("markdown")
|
||
or result.get("markdown_v2", {}).get("raw_markdown")
|
||
or result.get("extracted_content")
|
||
or result.get("content")
|
||
or ""
|
||
)
|
||
if not content:
|
||
return f"No readable content extracted from: {url}"
|
||
|
||
if len(content) > _CRAWL_CHAR_BUDGET:
|
||
content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
|
||
|
||
return content
|