1
0

[claude] Add web_fetch tool (trafilatura) for full-page content extraction (#973) (#1004)

This commit is contained in:
2026-03-22 23:03:38 +00:00
parent 9656a5e0d0
commit c0f6ca9fc2
3 changed files with 229 additions and 0 deletions

View File

@@ -473,6 +473,69 @@ def consult_grok(query: str) -> str:
return response
def web_fetch(url: str, max_tokens: int = 4000) -> str:
"""Fetch a web page and return its main text content.
Downloads the URL, extracts readable text using trafilatura, and
truncates to a token budget. Use this to read full articles, docs,
or blog posts that web_search only returns snippets for.
Args:
url: The URL to fetch (must start with http:// or https://).
max_tokens: Maximum approximate token budget (default 4000).
Text is truncated to max_tokens * 4 characters.
Returns:
Extracted text content, or an error message on failure.
"""
if not url or not url.startswith(("http://", "https://")):
return f"Error: invalid URL — must start with http:// or https://: {url!r}"
try:
import requests as _requests
except ImportError:
return "Error: 'requests' package is not installed. Install with: pip install requests"
try:
import trafilatura
except ImportError:
return (
"Error: 'trafilatura' package is not installed. Install with: pip install trafilatura"
)
try:
resp = _requests.get(
url,
timeout=15,
headers={"User-Agent": "TimmyResearchBot/1.0"},
)
resp.raise_for_status()
except _requests.exceptions.Timeout:
return f"Error: request timed out after 15 seconds for {url}"
except _requests.exceptions.HTTPError as exc:
return f"Error: HTTP {exc.response.status_code} for {url}"
except _requests.exceptions.RequestException as exc:
return f"Error: failed to fetch {url}{exc}"
text = trafilatura.extract(resp.text, include_tables=True, include_links=True)
if not text:
return f"Error: could not extract readable content from {url}"
char_budget = max_tokens * 4
if len(text) > char_budget:
text = text[:char_budget] + f"\n\n[…truncated to ~{max_tokens} tokens]"
return text
def _register_web_fetch_tool(toolkit: Toolkit) -> None:
"""Register the web_fetch tool for full-page content extraction."""
try:
toolkit.register(web_fetch, name="web_fetch")
except Exception as exc:
logger.warning("Tool execution failed (web_fetch registration): %s", exc)
def _register_core_tools(toolkit: Toolkit, base_path: Path) -> None:
"""Register core execution and file tools."""
# Python execution
@@ -672,6 +735,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
base_path = Path(base_dir) if base_dir else Path(settings.repo_root)
_register_core_tools(toolkit, base_path)
_register_web_fetch_tool(toolkit)
_register_grok_tool(toolkit)
_register_memory_tools(toolkit)
_register_agentic_loop_tool(toolkit)
@@ -829,6 +893,11 @@ def _analysis_tool_catalog() -> dict:
"description": "Evaluate mathematical expressions with exact results",
"available_in": ["orchestrator"],
},
"web_fetch": {
"name": "Web Fetch",
"description": "Fetch a web page and extract clean readable text (trafilatura)",
"available_in": ["orchestrator"],
},
}