266 lines
9.5 KiB
Python
266 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Standalone Web Tools Module
|
|
|
|
This module provides generic web tools that work with multiple backend providers.
|
|
Currently uses Tavily as the backend, but the interface makes it easy to swap
|
|
to other providers like Firecrawl without changing the function signatures.
|
|
|
|
Available tools:
|
|
- web_search_tool: Search the web for information
|
|
- web_extract_tool: Extract content from specific web pages
|
|
- web_crawl_tool: Crawl websites with specific instructions
|
|
|
|
Backend compatibility:
|
|
- Tavily: https://docs.tavily.com/
|
|
- Firecrawl: https://docs.firecrawl.dev/features/search
|
|
|
|
Usage:
|
|
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
|
|
|
|
# Search the web
|
|
results = web_search_tool("Python machine learning libraries", limit=3)
|
|
|
|
# Extract content from URLs
|
|
content = web_extract_tool(["https://example.com"], format="markdown")
|
|
|
|
# Crawl a website
|
|
crawl_data = web_crawl_tool("example.com", "Find contact information")
|
|
"""
|
|
|
|
#TODO: Search Capabilities over the scraped pages
|
|
# TODO: Persist the scraped pages in some storage backend (file, database, ...)
|
|
#TODO: Tool to see what pages are available/saved to search over
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from typing import List
|
|
from tavily import TavilyClient
|
|
|
|
# Initialize Tavily client once at module level
|
|
# NOTE(review): the client is built at import time, so TAVILY_API_KEY is read
# from the environment as soon as this module is imported. os.getenv returns
# None when the variable is unset; presumably TavilyClient rejects a missing
# key -- confirm. If it does, importing this module without the variable set
# fails before check_tavily_api_key() can ever be called.
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
|
|
|
|
|
def clean_base64_images(text: str) -> str:
    """
    Replace base64-encoded inline images in text with a short placeholder.

    Inline images bloat scraped content (and token counts) without carrying
    useful information, so every ``data:image/...;base64,...`` URI is replaced
    with ``[BASE64_IMAGE_REMOVED]``. Recognised forms:
    - (data:image/png;base64,...)            markdown-style, parentheses removed too
    - data:image/jpeg;base64,...             bare URI
    - data:image/svg+xml;charset=utf-8;base64,...   URI with media-type parameters

    Args:
        text: The text content to clean

    Returns:
        The text with each base64 image replaced by ``[BASE64_IMAGE_REMOVED]``.
        Text that contains no base64 image is returned unchanged.
    """
    # Fast path: every data URI we care about contains this marker.
    if 'base64,' not in text:
        return text

    # The subtype and any ";name=value" parameters may not contain ';', ','
    # or whitespace. That keeps the match confined to a real URI instead of
    # letting it run across ordinary prose up to a later ";base64,".
    data_uri = r'data:image/[^;,\s]+(?:;[^;,\s]+)*?;base64,[A-Za-z0-9+/=]+'
    placeholder = '[BASE64_IMAGE_REMOVED]'

    # Parenthesised URIs go first so the surrounding "(" and ")" are removed
    # together with the payload; the second pass catches whatever is left.
    cleaned_text = re.sub(r'\(' + data_uri + r'\)', placeholder, text)
    cleaned_text = re.sub(data_uri, placeholder, cleaned_text)

    return cleaned_text
|
|
|
|
|
|
def web_search_tool(query: str, limit: int = 5) -> str:
    """
    Run a web search through the configured backend and return JSON text.

    The query is sent to the module-level Tavily client with
    ``search_depth="advanced"``. The backend response is serialised with
    ``json.dumps(..., indent=2)`` and passed through ``clean_base64_images``
    before being returned.

    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to request (default: 5)

    Returns:
        str: JSON string. On success it is the backend response, expected to
        look like:
            {
                "query": str,
                "results": [
                    {"title": str, "url": str, "content": str, "score": float},
                    ...
                ]
            }
        On failure it is ``{"error": "Error searching web: <message>"}``.
        Exceptions raised during the search are caught and reported this way
        rather than propagated.
    """
    try:
        print(f"🔍 Searching the web for: '{query}' (limit: {limit})")

        # Delegate the actual search to Tavily
        search_response = tavily_client.search(
            query=query,
            max_results=limit,
            search_depth="advanced",
        )

        hits = search_response.get('results', [])
        print(f"✅ Found {len(hits)} results")

        # Serialise, then strip inline base64 images from the JSON text
        serialized = json.dumps(search_response, indent=2)
        return clean_base64_images(serialized)

    except Exception as exc:
        failure = f"Error searching web: {str(exc)}"
        print(f"❌ {failure}")
        return json.dumps({"error": failure})
|
|
|
|
|
|
def web_extract_tool(urls: List[str], format: str = None) -> str:
    """
    Extract content from specific web pages using the configured backend.

    The URLs are passed to the module-level Tavily client's ``extract`` call.
    The response is serialised with ``json.dumps(..., indent=2)`` and passed
    through ``clean_base64_images`` before being returned.

    Args:
        urls (List[str]): URLs to extract content from. A single URL given as
            a plain string is accepted and treated as a one-element list.
        format (str): Desired output format ("markdown" or "html", optional).
            Forwarded unchanged to the backend.

    Returns:
        str: JSON string. On success it is the backend response, expected to
        look like:
            {
                "results": [
                    {"url": str, "title": str, "raw_content": str, "content": str},
                    ...
                ]
            }
        On failure it is ``{"error": "Error extracting content: <message>"}``.
        Exceptions raised during extraction are caught and reported this way
        rather than propagated.
    """
    try:
        # A bare string would otherwise be counted character by character in
        # the progress message below.
        if isinstance(urls, str):
            urls = [urls]

        print(f"📄 Extracting content from {len(urls)} URL(s)")

        # Use Tavily's extract functionality
        response = tavily_client.extract(urls=urls, format=format)

        # "or []" also covers a key that is present but null.
        pages = response.get('results') or []
        print(f"✅ Extracted content from {len(pages)} pages")

        # Print summary of extracted pages for debugging
        for page in pages:
            page_url = page.get('url', 'Unknown URL')
            # raw_content may be present but None; len(None) would abort the
            # whole call and discard an otherwise successful extraction.
            content_length = len(page.get('raw_content') or '')
            print(f" 📝 {page_url} ({content_length} characters)")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from extracted content
        return clean_base64_images(result_json)

    except Exception as e:
        error_msg = f"Error extracting content: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
|
|
|
|
|
def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic", limit: int = 20) -> str:
    """
    Crawl a website with optional instructions using the configured backend.

    The request is sent to the module-level Tavily client's ``crawl`` call.
    The response is serialised with ``json.dumps(..., indent=2)`` and passed
    through ``clean_base64_images`` before being returned.

    Args:
        url (str): The base URL to crawl (can include or exclude https://)
        instructions (str): What to crawl/extract (optional). When empty or
            None, "Get all available content" is sent instead.
        depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
        limit (int): Maximum number of pages to crawl (default: 20, the value
            that was previously hard-coded)

    Returns:
        str: JSON string. On success it is the backend response, expected to
        contain a "results" list with one entry per crawled page, each with a
        "url" and the page text (presumably under "raw_content"; older
        descriptions of this function name the field "content" -- verify
        against the backend).
        On failure it is ``{"error": "Error crawling website: <message>"}``.
        Exceptions raised during the crawl are caught and reported this way
        rather than propagated.
    """
    try:
        instructions_text = f" with instructions: '{instructions}'" if instructions else ""
        print(f"🕷️ Crawling {url}{instructions_text}")

        # Use Tavily's crawl functionality
        response = tavily_client.crawl(
            url=url,
            limit=limit,
            instructions=instructions or "Get all available content",
            extract_depth=depth,
        )

        # "or []" also covers a key that is present but null.
        pages = response.get('results') or []
        print(f"✅ Crawled {len(pages)} pages")

        # Print summary of crawled pages for debugging
        for page in pages:
            page_url = page.get('url', 'Unknown URL')
            # Look at both field names so the summary does not report 0
            # characters when the text lives under "raw_content"; the
            # "or ''" guards against explicit None values.
            page_text = page.get('raw_content') or page.get('content') or ''
            print(f" 🌐 {page_url} ({len(page_text)} characters)")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from crawled content
        return clean_base64_images(result_json)

    except Exception as e:
        error_msg = f"Error crawling website: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
|
|
|
|
|
# Convenience function to check if API key is available
|
|
def check_tavily_api_key() -> bool:
    """
    Report whether a Tavily API key is configured in the environment.

    Returns:
        bool: True when the TAVILY_API_KEY environment variable is set to a
        non-empty value, False when it is unset or empty
    """
    api_key = os.environ.get("TAVILY_API_KEY")
    return api_key is not None and api_key != ""
|
|
|
|
|
|
if __name__ == "__main__":
    # Simple self-check/demo when the module is run directly: confirm that an
    # API key is configured and print example usage. No web request is made.
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)

    # Check if API key is available
    if not check_tavily_api_key():
        print("❌ TAVILY_API_KEY environment variable not set")
        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
        print("Get API key at: https://tavily.com/")
        # The builtin exit() is injected by the site module for interactive
        # use and is not guaranteed to exist; SystemExit always is.
        raise SystemExit(1)

    print("✅ Tavily API key found")
    print("🛠️ Web tools ready for use!")
    print("\nExample usage:")
    print(" from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
    print(" results = web_search_tool('Python tutorials')")
    print(" content = web_extract_tool(['https://example.com'])")
    print(" crawl_data = web_crawl_tool('example.com', 'Find documentation')")
|