From 0c5bbb1b4b4a2045e5f269edbae5296265a9b922 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Sun, 22 Mar 2026 19:03:08 -0400 Subject: [PATCH] feat: add web_fetch tool for full-page content extraction (trafilatura) Implements web_fetch(url, max_tokens) tool that downloads a URL, extracts clean readable text via trafilatura, and truncates to a token budget. Registered as an Agno tool in the full toolkit. - Validates URL scheme before attempting fetch - Uses requests with 15s timeout and TimmyResearchBot/1.0 user-agent - Graceful degradation: missing packages, timeouts, HTTP errors, empty pages - Added trafilatura as optional dependency with 'research' extra - 11 unit tests covering all acceptance criteria Fixes #973 Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 2 + src/timmy/tools.py | 69 ++++++++++++ tests/timmy/test_tools_web_fetch.py | 158 ++++++++++++++++++++++++++++ 3 files changed, 229 insertions(+) create mode 100644 tests/timmy/test_tools_web_fetch.py diff --git a/pyproject.toml b/pyproject.toml index 6e65674c..b4c0b42a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ sounddevice = { version = ">=0.4.6", optional = true } sentence-transformers = { version = ">=2.0.0", optional = true } numpy = { version = ">=1.24.0", optional = true } requests = { version = ">=2.31.0", optional = true } +trafilatura = { version = ">=1.6.0", optional = true } GitPython = { version = ">=3.1.40", optional = true } pytest = { version = ">=8.0.0", optional = true } pytest-asyncio = { version = ">=0.24.0", optional = true } @@ -67,6 +68,7 @@ voice = ["pyttsx3", "openai-whisper", "piper-tts", "sounddevice"] celery = ["celery"] embeddings = ["sentence-transformers", "numpy"] git = ["GitPython"] +research = ["requests", "trafilatura"] dev = ["pytest", "pytest-asyncio", "pytest-cov", "pytest-timeout", "pytest-randomly", "pytest-xdist", "selenium"] [tool.poetry.group.dev.dependencies] diff --git a/src/timmy/tools.py b/src/timmy/tools.py index 37009092..bf3e409c 100644 --- a/src/timmy/tools.py +++ b/src/timmy/tools.py @@ -473,6 +473,69 @@ def consult_grok(query: str) -> str: return response +def web_fetch(url: str, max_tokens: int = 4000) -> str: + """Fetch a web page and return its main text content. + + Downloads the URL, extracts readable text using trafilatura, and + truncates to a token budget. Use this to read full articles, docs, + or blog posts that web_search only returns snippets for. + + Args: + url: The URL to fetch (must start with http:// or https://). + max_tokens: Maximum approximate token budget (default 4000). + Text is truncated to max_tokens * 4 characters. + + Returns: + Extracted text content, or an error message on failure. + """ + if not url or not url.startswith(("http://", "https://")): + return f"Error: invalid URL — must start with http:// or https://: {url!r}" + + try: + import requests as _requests + except ImportError: + return "Error: 'requests' package is not installed. Install with: pip install requests" + + try: + import trafilatura + except ImportError: + return ( + "Error: 'trafilatura' package is not installed. Install with: pip install trafilatura" + ) + + try: + resp = _requests.get( + url, + timeout=15, + headers={"User-Agent": "TimmyResearchBot/1.0"}, + ) + resp.raise_for_status() + except _requests.exceptions.Timeout: + return f"Error: request timed out after 15 seconds for {url}" + except _requests.exceptions.HTTPError as exc: + return f"Error: HTTP {exc.response.status_code} for {url}" + except _requests.exceptions.RequestException as exc: + return f"Error: failed to fetch {url} — {exc}" + + text = trafilatura.extract(resp.text, include_tables=True, include_links=True) + if not text: + return f"Error: could not extract readable content from {url}" + + char_budget = max_tokens * 4 + if len(text) > char_budget: + text = text[:char_budget] + f"\n\n[…truncated to ~{max_tokens} tokens]" + + return text + + +def _register_web_fetch_tool(toolkit: Toolkit) -> None: + """Register the web_fetch tool for full-page content extraction.""" + try: + toolkit.register(web_fetch, name="web_fetch") + except Exception as exc: + logger.warning("Tool execution failed (web_fetch registration): %s", exc) + + def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None: """Register core execution and file tools.""" # Python execution @@ -672,6 +735,7 @@ def create_full_toolkit(base_dir: str | Path | None = None): base_path = Path(base_dir) if base_dir else Path(settings.repo_root) _register_core_tools(toolkit, base_path) + _register_web_fetch_tool(toolkit) _register_grok_tool(toolkit) _register_memory_tools(toolkit) _register_agentic_loop_tool(toolkit) @@ -829,6 +893,11 @@ def _analysis_tool_catalog() -> dict: "description": "Evaluate mathematical expressions with exact results", "available_in": ["orchestrator"], }, + "web_fetch": { + "name": "Web Fetch", + "description": "Fetch a web page and extract clean readable text (trafilatura)", + "available_in": ["orchestrator"], + }, } diff --git a/tests/timmy/test_tools_web_fetch.py b/tests/timmy/test_tools_web_fetch.py new file mode 100644 index 00000000..ba0ec27f --- /dev/null +++ b/tests/timmy/test_tools_web_fetch.py @@ -0,0 +1,158 @@ +"""Unit tests for the web_fetch tool in timmy.tools.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from timmy.tools import web_fetch + + +class TestWebFetch: + """Tests for web_fetch function.""" + + def test_invalid_url_no_scheme(self): + """URLs without http(s) scheme are rejected.""" + result = web_fetch("example.com") + assert "Error: invalid URL" in result + + def test_invalid_url_empty(self): + """Empty URL is rejected.""" + result = web_fetch("") + assert "Error: invalid URL" in result + + def test_invalid_url_ftp(self): + """Non-HTTP schemes are rejected.""" + result = web_fetch("ftp://example.com") + assert "Error: invalid URL" in result + + @patch("timmy.tools.trafilatura", create=True) + @patch("timmy.tools._requests", create=True) + def test_successful_fetch(self, mock_requests, mock_trafilatura): + """Happy path: fetch + extract returns text.""" + # We need to patch at import level inside the function + mock_resp = MagicMock() + mock_resp.text = "

Hello world

" + + with patch.dict( + "sys.modules", {"requests": mock_requests, "trafilatura": mock_trafilatura} + ): + mock_requests.get.return_value = mock_resp + mock_requests.exceptions = _make_exceptions() + mock_trafilatura.extract.return_value = "Hello world" + + result = web_fetch("https://example.com") + + assert result == "Hello world" + + @patch.dict("sys.modules", {"requests": MagicMock(), "trafilatura": MagicMock()}) + def test_truncation(self): + """Long text is truncated to max_tokens * 4 chars.""" + import sys + + mock_trafilatura = sys.modules["trafilatura"] + mock_requests = sys.modules["requests"] + + long_text = "a" * 20000 + mock_resp = MagicMock() + mock_resp.text = "" + long_text + "" + mock_requests.get.return_value = mock_resp + mock_requests.exceptions = _make_exceptions() + mock_trafilatura.extract.return_value = long_text + + result = web_fetch("https://example.com", max_tokens=100) + + # 100 tokens * 4 chars = 400 chars max + assert len(result) < 500 + assert "[…truncated" in result + + @patch.dict("sys.modules", {"requests": MagicMock(), "trafilatura": MagicMock()}) + def test_extraction_failure(self): + """Returns error when trafilatura can't extract text.""" + import sys + + mock_trafilatura = sys.modules["trafilatura"] + mock_requests = sys.modules["requests"] + + mock_resp = MagicMock() + mock_resp.text = "" + mock_requests.get.return_value = mock_resp + mock_requests.exceptions = _make_exceptions() + mock_trafilatura.extract.return_value = None + + result = web_fetch("https://example.com") + assert "Error: could not extract" in result + + @patch.dict("sys.modules", {"trafilatura": MagicMock()}) + def test_timeout(self): + """Timeout errors are handled gracefully.""" + + mock_requests = MagicMock() + exc_mod = _make_exceptions() + mock_requests.exceptions = exc_mod + mock_requests.get.side_effect = exc_mod.Timeout("timed out") + + with patch.dict("sys.modules", {"requests": mock_requests}): + result = web_fetch("https://example.com") + + assert "timed out" in result + + @patch.dict("sys.modules", {"trafilatura": MagicMock()}) + def test_http_error(self): + """HTTP errors (404, 500, etc.) are handled gracefully.""" + + mock_requests = MagicMock() + exc_mod = _make_exceptions() + mock_requests.exceptions = exc_mod + + mock_response = MagicMock() + mock_response.status_code = 404 + mock_requests.get.return_value.raise_for_status.side_effect = exc_mod.HTTPError( + response=mock_response + ) + + with patch.dict("sys.modules", {"requests": mock_requests}): + result = web_fetch("https://example.com/nope") + + assert "404" in result + + def test_missing_requests(self): + """Graceful error when requests not installed.""" + with patch.dict("sys.modules", {"requests": None}): + result = web_fetch("https://example.com") + assert "requests" in result and "not installed" in result + + def test_missing_trafilatura(self): + """Graceful error when trafilatura not installed.""" + mock_requests = MagicMock() + with patch.dict("sys.modules", {"requests": mock_requests, "trafilatura": None}): + result = web_fetch("https://example.com") + assert "trafilatura" in result and "not installed" in result + + def test_catalog_entry_exists(self): + """web_fetch should appear in the tool catalog.""" + from timmy.tools import get_all_available_tools + + catalog = get_all_available_tools() + assert "web_fetch" in catalog + assert "orchestrator" in catalog["web_fetch"]["available_in"] + + +def _make_exceptions(): + """Create a mock exceptions module with real exception classes.""" + + class Timeout(Exception): + pass + + class HTTPError(Exception): + def __init__(self, *args, response=None, **kwargs): + super().__init__(*args, **kwargs) + self.response = response + + class RequestException(Exception): + pass + + mod = MagicMock() + mod.Timeout = Timeout + mod.HTTPError = HTTPError + mod.RequestException = RequestException + return mod -- 2.43.0