#!/usr/bin/env python3
"""
Standalone Web Tools Module

This module provides generic web tools that work with multiple backend providers.
Currently uses Tavily as the backend, but the interface makes it easy to swap
to other providers like Firecrawl without changing the function signatures.

Available tools:
- web_search_tool: Search the web for information
- web_extract_tool: Extract content from specific web pages
- web_crawl_tool: Crawl websites with specific instructions

Backend compatibility:
- Tavily: https://docs.tavily.com/
- Firecrawl: https://docs.firecrawl.dev/features/search

LLM Processing:
- Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction
- Extracts key excerpts and creates markdown summaries to reduce token usage

Usage:
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool

    # Search the web
    results = web_search_tool("Python machine learning libraries", limit=3)

    # Extract content from URLs
    content = web_extract_tool(["https://example.com"], format="markdown")

    # Crawl a website
    crawl_data = web_crawl_tool("example.com", "Find contact information")
"""
#TODO: Search Capabilities over the scraped pages
#TODO: Store the pages in something
#TODO: Tool to see what pages are available/saved to search over
import json
import os
import re
2025-07-31 10:11:27 -07:00
import asyncio
from typing import List , Dict , Any , Optional
2025-07-22 18:32:44 -07:00
from tavily import TavilyClient
2025-07-31 10:11:27 -07:00
from openai import AsyncOpenAI
# Module-level backend clients, constructed once at import time.
# Tavily handles search/extract/crawl.
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

# Nous Research exposes an OpenAI-compatible endpoint; used for async LLM
# post-processing of scraped content.
nous_client = AsyncOpenAI(
    api_key=os.getenv("NOUS_API_KEY"),
    base_url="https://inference-api.nousresearch.com/v1",
)

# Defaults for the LLM summarization pass.
DEFAULT_SUMMARIZER_MODEL = "gemini-2.5-flash"
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
async def process_content_with_llm(
    content: str,
    url: str = "",
    title: str = "",
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> Optional[str]:
    """
    Summarize web content into compact markdown via the Nous Research API.

    Uses Gemini 2.5 Flash (or the specified model) to pull out key excerpts
    and produce a markdown digest, cutting token usage while preserving the
    important information.

    Args:
        content (str): Raw page text to condense.
        url (str): Source URL, added to the prompt for context (optional).
        title (str): Page title, added to the prompt for context (optional).
        model (str): Model identifier for the chat-completions endpoint.
        min_length (int): Pages shorter than this many characters are not
            processed (the function returns None).

    Returns:
        Optional[str]: Processed markdown content, or None when the content
        is below min_length or the API call fails.
    """
    try:
        # Short pages are cheap to pass through unchanged; skip the API call.
        if len(content) < min_length:
            print(f"📏 Content too short ({len(content)} < {min_length} chars), skipping LLM processing")
            return None

        print(f"🧠 Processing content with LLM ({len(content)} characters)")

        # Optional provenance header prepended to the user prompt.
        context_lines = []
        if title:
            context_lines.append(f"Title: {title}")
        if url:
            context_lines.append(f"Source: {url}")
        context_str = "\n".join(context_lines) + "\n\n" if context_lines else ""

        system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.

Create a well-structured markdown summary that includes:
1. Key excerpts (quotes, code snippets, important facts) in their original format
2. Comprehensive summary of all other important information
3. Proper markdown formatting with headers, bullets, and emphasis

Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""

        user_prompt = f"""Please process this web content and create a comprehensive markdown summary:

{context_str}CONTENT TO PROCESS:
{content}

Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""

        # Low temperature keeps extraction consistent; 4000 tokens is a
        # generous ceiling for a comprehensive summary.
        response = await nous_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            max_tokens=4000,
        )

        summary = response.choices[0].message.content.strip()

        # Log how much the summarization pass shrank the content.
        before = len(content)
        after = len(summary)
        ratio = after / before if before > 0 else 1.0
        print(f"✅ Content processed: {before} → {after} chars ({ratio:.1%})")
        return summary
    except Exception as e:
        print(f"❌ Error processing content with LLM: {str(e)}")
        return None
def clean_base64_images(text: str) -> str:
    """
    Strip inline base64-encoded images from text, replacing each with a
    placeholder to cut token count and clutter.

    Handles both forms found in scraped markdown/HTML:
    - (data:image/[type];base64,...)  -- wrapped in parentheses
    - data:image/[type];base64,...    -- bare data URI

    Args:
        text: The text content to clean.

    Returns:
        The text with every base64 image replaced by [BASE64_IMAGE_REMOVED].
    """
    placeholder = '[BASE64_IMAGE_REMOVED]'
    # The parenthesized form runs first so the wrapping parentheses are
    # consumed along with the data URI; the bare form then catches the rest.
    patterns = (
        r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)',
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+',
    )
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, placeholder, cleaned)
    return cleaned
def web_search_tool(query: str, limit: int = 5) -> str:
    """
    Search the web using the configured search backend (currently Tavily).

    The interface is backend-agnostic so the provider can be swapped without
    changing the signature. Search results are already concise snippets, so
    no LLM processing is applied here.

    Args:
        query (str): The search query to look up.
        limit (int): Maximum number of results to return (default: 5).

    Returns:
        str: JSON string with the backend response, e.g.
            {"query": ..., "results": [{"title", "url", "content", "score"}, ...]}
        On failure, a JSON string of the form {"error": "..."} is returned
        instead of raising.
    """
    try:
        print(f"🔍 Searching the web for: '{query}' (limit: {limit})")
        # Tavily's "advanced" depth yields richer snippets per result.
        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")
        print(f"✅ Found {len(response.get('results', []))} results")
        # Base64 images would bloat the payload; scrub before returning.
        return clean_base64_images(json.dumps(response, indent=2))
    except Exception as e:
        error_msg = f"Error searching web: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
async def web_extract_tool(
    urls: List[str],
    format: str = None,
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Extract content from specific web pages using the configured backend
    (currently Tavily), optionally condensing each page with an LLM.

    Args:
        urls (List[str]): List of URLs to extract content from.
        format (str): Desired output format ("markdown" or "html", optional).
        use_llm_processing (bool): Whether to summarize content with the LLM
            (default: True; requires NOUS_API_KEY to be set).
        model (str): Model used for LLM processing (default: gemini-2.5-flash).
        min_length (int): Minimum content length to trigger LLM processing
            (default: 5000).

    Returns:
        str: JSON string containing the extraction response. When LLM
        processing succeeds for a page, its 'content' field holds the
        markdown summary and the original text is kept in 'raw_content'.
        On failure, a JSON string {"error": "..."} is returned instead of
        raising.
    """
    try:
        print(f"📄 Extracting content from {len(urls)} URL(s)")
        response = tavily_client.extract(urls=urls, format=format)
        results = response.get('results', [])
        print(f"✅ Extracted content from {len(results)} pages")

        if use_llm_processing and os.getenv("NOUS_API_KEY"):
            print("🧠 Processing extracted content with LLM...")
            for result in results:
                url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                # Tavily may deliver text under either key depending on format.
                raw_content = result.get('raw_content', '') or result.get('content', '')
                if raw_content:
                    processed = await process_content_with_llm(
                        raw_content, url, title, model, min_length
                    )
                    if processed:
                        # Swap in the summary; preserve the original text.
                        result['content'] = processed
                        result['raw_content'] = raw_content
                        print(f"📝 {url} (processed)")
                    else:
                        print(f"📝 {url} (no processing - content too short)")
                else:
                    print(f"⚠️ {url} (no content to process)")
        else:
            if use_llm_processing and not os.getenv("NOUS_API_KEY"):
                print("⚠️ LLM processing requested but NOUS_API_KEY not set, returning raw content")
            # Debug summary of extracted pages. Fix: fall back to 'content'
            # (matching the LLM path and web_crawl_tool) so pages whose text
            # arrives under 'content' no longer report 0 characters.
            for result in results:
                url = result.get('url', 'Unknown URL')
                content_length = len(result.get('raw_content', '') or result.get('content', ''))
                print(f"📝 {url} ({content_length} characters)")

        # Base64 images would bloat the payload; scrub before returning.
        return clean_base64_images(json.dumps(response, indent=2))
    except Exception as e:
        error_msg = f"Error extracting content: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
async def web_crawl_tool(
    url: str,
    instructions: str = None,
    depth: str = "basic",
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Crawl a website with optional instructions using the configured backend
    (currently Tavily), optionally condensing each page with an LLM.

    Args:
        url (str): Base URL to crawl (with or without https://).
        instructions (str): What to crawl/extract, interpreted by the backend
            (optional).
        depth (str): Extraction depth, "basic" or "advanced" (default: "basic").
        use_llm_processing (bool): Whether to summarize each page with the LLM
            (default: True; requires NOUS_API_KEY to be set).
        model (str): Model used for LLM processing (default: gemini-2.5-flash).
        min_length (int): Minimum content length to trigger LLM processing
            (default: 5000).

    Returns:
        str: JSON string with the crawl response. When LLM processing succeeds
        for a page, its 'content' field holds the markdown summary and the
        original text is kept in 'raw_content'. On failure, a JSON string
        {"error": "..."} is returned instead of raising.
    """
    try:
        instructions_text = f"with instructions: '{instructions}'" if instructions else ""
        print(f"🕷️ Crawling {url} {instructions_text}")

        # limit=20 is a reasonable page cap for most use cases.
        response = tavily_client.crawl(
            url=url,
            limit=20,
            instructions=instructions or "Get all available content",
            extract_depth=depth,
        )
        pages = response.get('results', [])
        print(f"✅ Crawled {len(pages)} pages")

        if use_llm_processing and os.getenv("NOUS_API_KEY"):
            print("🧠 Processing crawled content with LLM...")
            for page in pages:
                page_url = page.get('url', 'Unknown URL')
                page_title = page.get('title', '')
                page_text = page.get('content', '')
                if not page_text:
                    print(f"⚠️ {page_url} (no content to process)")
                    continue
                summary = await process_content_with_llm(
                    page_text, page_url, page_title, model, min_length
                )
                if summary:
                    # Preserve the original text, then swap in the summary.
                    page['raw_content'] = page_text
                    page['content'] = summary
                    print(f"🌐 {page_url} (processed)")
                else:
                    print(f"🌐 {page_url} (no processing - content too short)")
        else:
            if use_llm_processing and not os.getenv("NOUS_API_KEY"):
                print("⚠️ LLM processing requested but NOUS_API_KEY not set, returning raw content")
            # Debug summary of crawled pages (original behavior).
            for page in pages:
                page_url = page.get('url', 'Unknown URL')
                print(f"🌐 {page_url} ({len(page.get('content', ''))} characters)")

        # Base64 images would bloat the payload; scrub before returning.
        return clean_base64_images(json.dumps(response, indent=2))
    except Exception as e:
        error_msg = f"Error crawling website: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
# Convenience function to check if API key is available
def check_tavily_api_key() -> bool:
    """
    Check whether the Tavily API key is present in environment variables.

    Returns:
        bool: True if TAVILY_API_KEY is set and non-empty, False otherwise.
    """
    key = os.getenv("TAVILY_API_KEY")
    return key is not None and key != ""
def check_nous_api_key() -> bool:
    """
    Check whether the Nous Research API key is present in environment variables.

    Returns:
        bool: True if NOUS_API_KEY is set and non-empty, False otherwise.
    """
    key = os.getenv("NOUS_API_KEY")
    return key is not None and key != ""
if __name__ == "__main__":
    # Simple self-test/demo when the module is run directly: report which
    # API keys are configured and print usage examples.
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)

    tavily_available = check_tavily_api_key()
    nous_available = check_nous_api_key()

    if tavily_available:
        print("✅ Tavily API key found")
    else:
        print("❌ TAVILY_API_KEY environment variable not set")
        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
        print("Get API key at: https://tavily.com/")

    if nous_available:
        print("✅ Nous Research API key found")
    else:
        print("❌ NOUS_API_KEY environment variable not set")
        print("Please set your API key: export NOUS_API_KEY='your-key-here'")
        print("Get API key at: https://inference-api.nousresearch.com/")
        print("⚠️ Without Nous API key, LLM content processing will be disabled")

    # Tavily is mandatory; bail out without it.
    if not tavily_available:
        exit(1)

    print("🛠️ Web tools ready for use!")
    if nous_available:
        print("🧠 LLM content processing available with Gemini 2.5 Flash")
        print(f"Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")

    print("\nBasic usage:")
    print("from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
    print("import asyncio")
    print("")
    print("# Search (synchronous)")
    print("results = web_search_tool('Python tutorials')")
    print("")
    print("# Extract and crawl (asynchronous)")
    print("async def main():")
    print("    content = await web_extract_tool(['https://example.com'])")
    print("    crawl_data = await web_crawl_tool('example.com', 'Find docs')")
    print("asyncio.run(main())")

    if nous_available:
        print("\nLLM-enhanced usage:")
        print("# Content automatically processed for pages >5000 chars (default)")
        print("content = await web_extract_tool(['https://python.org/about/'])")
        print("")
        print("# Customize processing parameters")
        print("crawl_data = await web_crawl_tool(")
        print("    'docs.python.org',")
        print("    'Find key concepts',")
        print("    model='gemini-2.5-flash',")
        print("    min_length=3000")
        print(")")
        print("")
        print("# Disable LLM processing")
        print("raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")

    print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")