# Forked from Rockachopa/Timmy-time-dashboard
"""Self-hosted web search and scraping tools using SearXNG + Crawl4AI.

Provides:

- web_search(query) — SearXNG meta-search (no API key required)
- scrape_url(url) — Crawl4AI full-page scrape to clean markdown

Both tools degrade gracefully when the backing service is unavailable
(logs WARNING, returns descriptive error string — never crashes).

Services are started via `docker compose --profile search up` or configured
with TIMMY_SEARCH_URL / TIMMY_CRAWL_URL environment variables.
"""

from __future__ import annotations

import logging
import time

from config import settings

logger = logging.getLogger(__name__)

# Crawl4AI polling: up to _CRAWL_MAX_POLLS × _CRAWL_POLL_INTERVAL seconds
_CRAWL_MAX_POLLS = 6
_CRAWL_POLL_INTERVAL = 5  # seconds
_CRAWL_CHAR_BUDGET = 4000 * 4  # ~4 chars per token → ~4000 tokens
||
def web_search(query: str, num_results: int = 5) -> str:
    """Search the web via the self-hosted SearXNG meta-search engine.

    Produces ranked results (title + URL + snippet) without any paid API
    key. Requires SearXNG running locally (docker compose
    --profile search up) or TIMMY_SEARCH_URL pointing at a reachable instance.

    Args:
        query: The search query.
        num_results: Maximum number of results to return (default 5).

    Returns:
        Formatted search results string, or an error/status message on failure.
    """
    if settings.timmy_search_backend == "none":
        return "Web search is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base_url = settings.search_url.rstrip("/")

    # Query SearXNG's JSON endpoint; any transport/HTTP failure degrades
    # to a descriptive string rather than raising.
    try:
        resp = _requests.get(
            f"{base_url}/search",
            params={"q": query, "format": "json", "categories": "general"},
            timeout=10,
            headers={"User-Agent": "TimmyResearchBot/1.0"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("SearXNG unavailable at %s: %s", base_url, exc)
        return f"Search unavailable — SearXNG not reachable ({base_url}): {exc}"

    try:
        data = resp.json()
    except Exception as exc:
        logger.warning("SearXNG response parse error: %s", exc)
        return "Search error: could not parse SearXNG response."

    hits = data.get("results", [])[:num_results]
    if not hits:
        return f"No results found for: {query!r}"

    # Render each hit as "N. title / URL / snippet".
    out = [f"Web search results for: {query!r}\n"]
    for rank, hit in enumerate(hits, 1):
        title = hit.get("title", "Untitled")
        link = hit.get("url", "")
        snippet = hit.get("content", "").strip()
        out.append(f"{rank}. {title}\n   URL: {link}\n   {snippet}\n")

    return "\n".join(out)
|
||
|
||
|
||
def scrape_url(url: str) -> str:
    """Scrape a URL with Crawl4AI and return the main content as clean markdown.

    Crawl4AI extracts well-structured markdown from any public page —
    articles, docs, product pages — suitable for LLM consumption.
    Requires Crawl4AI running locally (docker compose --profile search up)
    or TIMMY_CRAWL_URL pointing to a reachable instance.

    Args:
        url: The URL to scrape (must start with http:// or https://).

    Returns:
        Extracted markdown text (up to ~4000 tokens), or an error message.
    """
    if not url or not url.startswith(("http://", "https://")):
        return f"Error: invalid URL — must start with http:// or https://: {url!r}"

    if settings.timmy_search_backend == "none":
        return "Web scraping is disabled (TIMMY_SEARCH_BACKEND=none)."

    try:
        import requests as _requests
    except ImportError:
        return "Error: 'requests' package is not installed."

    base = settings.crawl_url.rstrip("/")

    # Submit crawl task
    try:
        resp = _requests.post(
            f"{base}/crawl",
            json={"urls": [url], "priority": 10},
            timeout=15,
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("Crawl4AI unavailable at %s: %s", base, exc)
        return f"Scrape unavailable — Crawl4AI not reachable ({base}): {exc}"

    try:
        submit_data = resp.json()
    except Exception as exc:
        logger.warning("Crawl4AI submit parse error: %s", exc)
        return "Scrape error: could not parse Crawl4AI response."

    # Check if result came back synchronously
    if "results" in submit_data:
        results = submit_data["results"]
        # Fix: normalize a single-dict payload to a list here too, matching
        # the async path below — previously a dict reached
        # _extract_crawl_content and crashed on results[0].
        if isinstance(results, dict):
            results = [results]
        return _extract_crawl_content(results or [], url)

    task_id = submit_data.get("task_id")
    if not task_id:
        return f"Scrape error: Crawl4AI returned no task_id for {url}"

    # Poll for async result; transient poll errors are logged and retried
    # until the poll budget is exhausted.
    for _ in range(_CRAWL_MAX_POLLS):
        time.sleep(_CRAWL_POLL_INTERVAL)
        try:
            poll = _requests.get(f"{base}/task/{task_id}", timeout=10)
            poll.raise_for_status()
            task_data = poll.json()
        except Exception as exc:
            logger.warning("Crawl4AI poll error (task=%s): %s", task_id, exc)
            continue

        status = task_data.get("status", "")
        if status == "completed":
            # Different Crawl4AI versions use "results" vs "result".
            results = task_data.get("results") or task_data.get("result")
            if isinstance(results, dict):
                results = [results]
            return _extract_crawl_content(results or [], url)
        if status == "failed":
            return f"Scrape failed for {url}: {task_data.get('error', 'unknown error')}"

    return f"Scrape timed out after {_CRAWL_MAX_POLLS * _CRAWL_POLL_INTERVAL}s for {url}"
|
||
|
||
|
||
def _extract_crawl_content(results: list, url: str) -> str:
|
||
"""Extract and truncate markdown content from Crawl4AI results list."""
|
||
if not results:
|
||
return f"No content returned by Crawl4AI for: {url}"
|
||
|
||
result = results[0]
|
||
content = (
|
||
result.get("markdown")
|
||
or result.get("markdown_v2", {}).get("raw_markdown")
|
||
or result.get("extracted_content")
|
||
or result.get("content")
|
||
or ""
|
||
)
|
||
if not content:
|
||
return f"No readable content extracted from: {url}"
|
||
|
||
if len(content) > _CRAWL_CHAR_BUDGET:
|
||
content = content[:_CRAWL_CHAR_BUDGET] + "\n\n[…truncated to ~4000 tokens]"
|
||
|
||
return content
|