#!/usr/bin/env python3
"""
Standalone Web Tools Module

This module provides generic web tools that work with multiple backend providers.
Currently uses Firecrawl as the backend, and the interface makes it easy to swap
providers without changing the function signatures.

Available tools:
- web_search_tool: Search the web for information
- web_extract_tool: Extract content from specific web pages
- web_crawl_tool: Crawl websites with specific instructions

Backend compatibility:
- Firecrawl: https://docs.firecrawl.dev/introduction

LLM Processing:
- Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction
- Extracts key excerpts and creates markdown summaries to reduce token usage

Debug Mode:
- Set WEB_TOOLS_DEBUG=true to enable detailed logging
- Creates web_tools_debug_UUID.json in ./logs directory
- Captures all tool calls, results, and compression metrics

Usage:
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool

    # Search the web
    results = web_search_tool("Python machine learning libraries", limit=3)

    # Extract content from URLs
    content = web_extract_tool(["https://example.com"], format="markdown")

    # Crawl a website
    crawl_data = web_crawl_tool("example.com", "Find contact information")
"""
#TODO: Search Capabilities over the scraped pages
#TODO: Store the pages in something
#TODO: Tool to see what pages are available/saved to search over
import json
import logging
import os
import re
import asyncio
from typing import List, Dict, Any, Optional

# Third-party backend client (Firecrawl cloud or self-hosted).
from firecrawl import Firecrawl
# NOTE(review): AsyncOpenAI appears unused in this module's visible code —
# presumably kept for type compatibility with the auxiliary client; confirm
# before removing.
from openai import AsyncOpenAI

# Project-local helpers: auxiliary LLM client factory and debug logging.
from agent.auxiliary_client import get_async_text_auxiliary_client
from tools.debug_helpers import DebugSession

logger = logging.getLogger(__name__)
2026-01-29 19:59:59 +00:00
# Module-level cache so only one Firecrawl client is ever constructed.
_firecrawl_client = None


def _get_firecrawl_client():
    """Return the shared Firecrawl client, creating it on first use.

    Uses the cloud API by default (requires FIRECRAWL_API_KEY).
    Set FIRECRAWL_API_URL to point at a self-hosted instance instead —
    in that case the API key is optional (set USE_DB_AUTHENTICATION=false
    on your Firecrawl server to disable auth entirely).
    """
    global _firecrawl_client
    if _firecrawl_client is not None:
        return _firecrawl_client

    api_key = os.getenv("FIRECRAWL_API_KEY")
    api_url = os.getenv("FIRECRAWL_API_URL")
    # Without either a key (cloud) or a URL (self-hosted) we cannot connect.
    if not api_key and not api_url:
        raise ValueError(
            "FIRECRAWL_API_KEY environment variable not set. "
            "Set it for cloud Firecrawl, or set FIRECRAWL_API_URL "
            "to use a self-hosted instance."
        )

    client_kwargs = {}
    if api_key:
        client_kwargs["api_key"] = api_key
    if api_url:
        client_kwargs["api_url"] = api_url
    _firecrawl_client = Firecrawl(**client_kwargs)
    return _firecrawl_client
2025-10-01 23:29:25 +00:00
# Pages shorter than this many characters are returned raw; longer ones are
# routed through the LLM summarizer.
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000

# Resolve async auxiliary client at module level.
# Handles Codex Responses API adapter transparently.
# NOTE: _aux_async_client may be None when no auxiliary model is configured;
# callers below check for that before making LLM calls.
_aux_async_client, DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client()

# Shared debug session; activated via the WEB_TOOLS_DEBUG environment variable.
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
2025-10-01 23:29:25 +00:00
async def process_content_with_llm(
    content: str,
    url: str = "",
    title: str = "",
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> Optional[str]:
    """
    Process web content using LLM to create intelligent summaries with key excerpts.

    Uses the configured auxiliary model to extract key information and produce a
    markdown summary, significantly reducing token usage while preserving the
    important content.

    For very large content (>500k chars), uses chunked processing with synthesis.
    For extremely large content (>2M chars), refuses to process entirely.

    Args:
        content (str): The raw content to process
        url (str): The source URL (for context, optional)
        title (str): The page title (for context, optional)
        model (str): The model to use for processing
        min_length (int): Minimum content length to trigger processing (default: 5000)

    Returns:
        Optional[str]: Processed markdown content, or None if content too short
            or processing fails
    """
    # Size thresholds
    MAX_CONTENT_SIZE = 2_000_000  # 2M chars - refuse entirely above this
    CHUNK_THRESHOLD = 500_000     # 500k chars - use chunked processing above this
    CHUNK_SIZE = 100_000          # 100k chars per chunk
    MAX_OUTPUT_SIZE = 5000        # Hard cap on final output size

    try:
        content_len = len(content)

        # Refuse absurdly large inputs outright; tell the caller what to try instead.
        if content_len > MAX_CONTENT_SIZE:
            size_mb = content_len / 1_000_000
            logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb)
            return f"[Content too large to process: {size_mb:.1f}MB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]"

        # Short content is not worth an LLM round-trip; None means "keep the raw text".
        if content_len < min_length:
            logger.debug("Content too short (%d < %d chars), skipping LLM processing", content_len, min_length)
            return None

        # Optional context header (title / source URL) prepended to the prompt.
        context_parts = []
        if title:
            context_parts.append(f"Title: {title}")
        if url:
            context_parts.append(f"Source: {url}")
        context_str = "\n".join(context_parts) + "\n\n" if context_parts else ""

        # Oversized documents go through per-chunk summarization plus synthesis.
        if content_len > CHUNK_THRESHOLD:
            logger.info("Content large (%d chars). Using chunked processing...", content_len)
            return await _process_large_content_chunked(
                content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE
            )

        # Normal-size content: a single summarization pass.
        logger.info("Processing content with LLM (%d characters)", content_len)
        summary = await _call_summarizer_llm(content, context_str, model)
        if not summary:
            return None

        # Enforce the hard output cap so one page cannot flood the context.
        if len(summary) > MAX_OUTPUT_SIZE:
            summary = summary[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]"

        # Report compression metrics for observability.
        summary_len = len(summary)
        ratio = summary_len / content_len if content_len > 0 else 1.0
        logger.info("Content processed: %d -> %d chars (%.1f%%)", content_len, summary_len, ratio * 100)
        return summary
    except Exception as e:
        logger.debug("Error processing content with LLM: %s", e)
        return f"[Failed to process content: {str(e)[:100]}. Content size: {len(content):,} chars]"
async def _call_summarizer_llm(
    content: str,
    context_str: str,
    model: str,
    max_tokens: int = 20000,
    is_chunk: bool = False,
    chunk_info: str = ""
) -> Optional[str]:
    """
    Make a single LLM call to summarize content.

    Args:
        content: The content to summarize
        context_str: Context information (title, URL)
        model: Model to use
        max_tokens: Maximum output tokens
        is_chunk: Whether this is a chunk of a larger document
        chunk_info: Information about chunk position (e.g., "Chunk 2/5")

    Returns:
        Summarized content or None on failure (no auxiliary client configured)

    Raises:
        Exception: Re-raises the last API error after all retries are exhausted.
    """
    if is_chunk:
        # Chunk-specific prompt - aware that this is partial content
        system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.

Important guidelines for chunk processing:
1. Do NOT write introductions or conclusions - this is a partial document
2. Focus on extracting ALL key facts, figures, data points, and insights from this section
3. Preserve important quotes, code snippets, and specific details verbatim
4. Use bullet points and structured formatting for easy synthesis later
5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them

Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow."""

        user_prompt = f"""Extract key information from this SECTION of a larger document:

{context_str}{chunk_info}

SECTION CONTENT:
{content}

Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions."""
    else:
        # Standard full-document prompt
        system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.

Create a well-structured markdown summary that includes:
1. Key excerpts (quotes, code snippets, important facts) in their original format
2. Comprehensive summary of all other important information
3. Proper markdown formatting with headers, bullets, and emphasis

Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""

        user_prompt = f"""Please process this web content and create a comprehensive markdown summary:

{context_str}CONTENT TO PROCESS:
{content}

Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""

    # Call the LLM with retry logic (exponential backoff, capped at 60s).
    max_retries = 6
    retry_delay = 2
    last_error = None
    for attempt in range(max_retries):
        try:
            # No auxiliary model configured: summarization is simply unavailable.
            if _aux_async_client is None:
                logger.warning("No auxiliary model available for web content processing")
                return None
            # Imported lazily to avoid import cycles at module load time —
            # TODO confirm that is the reason for the function-scope import.
            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
            _extra = get_auxiliary_extra_body()
            response = await _aux_async_client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                # Low temperature: summaries should be deterministic and factual.
                temperature=0.1,
                # Provider-specific max-token parameter name is resolved by the helper.
                **auxiliary_max_tokens_param(max_tokens),
                **({} if not _extra else {"extra_body": _extra}),
            )
            return response.choices[0].message.content.strip()
        except Exception as api_error:
            last_error = api_error
            if attempt < max_retries - 1:
                logger.warning("LLM API call failed (attempt %d/%d): %s", attempt + 1, max_retries, str(api_error)[:100])
                logger.warning("Retrying in %ds...", retry_delay)
                await asyncio.sleep(retry_delay)
                retry_delay = min(retry_delay * 2, 60)
            else:
                # Final attempt failed: surface the error to the caller.
                raise last_error
    return None
async def _process_large_content_chunked(
    content: str,
    context_str: str,
    model: str,
    chunk_size: int,
    max_output_size: int
) -> Optional[str]:
    """
    Process large content by chunking, summarizing each chunk in parallel,
    then synthesizing the summaries.

    Args:
        content: The large content to process
        context_str: Context information
        model: Model to use
        chunk_size: Size of each chunk in characters
        max_output_size: Maximum final output size

    Returns:
        Synthesized summary or None on failure
    """
    # Split content into fixed-size character chunks (no overlap).
    chunks = []
    for i in range(0, len(content), chunk_size):
        chunk = content[i:i + chunk_size]
        chunks.append(chunk)
    logger.info("Split into %d chunks of ~%d chars each", len(chunks), chunk_size)

    # Summarize each chunk in parallel
    async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]:
        """Summarize a single chunk; returns (index, summary-or-None)."""
        try:
            chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]"
            summary = await _call_summarizer_llm(
                chunk_content,
                context_str,
                model,
                # Smaller per-chunk budget than the single-pass path.
                max_tokens=10000,
                is_chunk=True,
                chunk_info=chunk_info
            )
            if summary:
                logger.info("Chunk %d/%d summarized: %d -> %d chars", chunk_idx + 1, len(chunks), len(chunk_content), len(summary))
            return chunk_idx, summary
        except Exception as e:
            # A failed chunk is dropped rather than failing the whole document.
            logger.warning("Chunk %d/%d failed: %s", chunk_idx + 1, len(chunks), str(e)[:50])
            return chunk_idx, None

    # Run all chunk summarizations in parallel
    tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
    results = await asyncio.gather(*tasks)

    # Collect successful summaries in original document order.
    summaries = []
    for chunk_idx, summary in sorted(results, key=lambda x: x[0]):
        if summary:
            summaries.append(f"## Section {chunk_idx + 1}\n{summary}")
    if not summaries:
        logger.debug("All chunk summarizations failed")
        return "[Failed to process large content: all chunk summarizations failed]"
    logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks))

    # If only one chunk succeeded, just return it (with cap)
    if len(summaries) == 1:
        result = summaries[0]
        if len(result) > max_output_size:
            result = result[:max_output_size] + "\n\n[... truncated ...]"
        return result

    # Synthesize the per-section summaries into one final summary.
    logger.info("Synthesizing %d summaries...", len(summaries))
    combined_summaries = "\n\n---\n\n".join(summaries)
    synthesis_prompt = f"""You have been given summaries of different sections of a large document.

Synthesize these into ONE cohesive, comprehensive summary that:
1. Removes redundancy between sections
2. Preserves all key facts, figures, and actionable information
3. Is well-organized with clear structure
4. Is under {max_output_size} characters

{context_str}SECTION SUMMARIES:
{combined_summaries}

Create a single, unified markdown summary."""

    try:
        # Without an auxiliary model we cannot synthesize; fall back to
        # concatenating the chunk summaries (truncated to the cap).
        if _aux_async_client is None:
            logger.warning("No auxiliary model for synthesis, concatenating summaries")
            fallback = "\n\n".join(summaries)
            if len(fallback) > max_output_size:
                fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
            return fallback
        # Lazy import, mirroring _call_summarizer_llm.
        from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
        _extra = get_auxiliary_extra_body()
        response = await _aux_async_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
                {"role": "user", "content": synthesis_prompt}
            ],
            temperature=0.1,
            **auxiliary_max_tokens_param(20000),
            **({} if not _extra else {"extra_body": _extra}),
        )
        final_summary = response.choices[0].message.content.strip()

        # Enforce hard cap
        if len(final_summary) > max_output_size:
            final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"

        original_len = len(content)
        final_len = len(final_summary)
        compression = final_len / original_len if original_len > 0 else 1.0
        logger.info("Synthesis complete: %d -> %d chars (%.2f%%)", original_len, final_len, compression * 100)
        return final_summary
    except Exception as e:
        logger.warning("Synthesis failed: %s", str(e)[:100])
        # Fall back to concatenated summaries with truncation
        fallback = "\n\n".join(summaries)
        if len(fallback) > max_output_size:
            fallback = fallback[:max_output_size] + "\n\n[... truncated due to synthesis failure ...]"
        return fallback
2025-10-01 23:29:25 +00:00
def clean_base64_images(text: str) -> str:
    """
    Strip inline base64-encoded images from text to reduce token count and clutter.

    Handles the data-URI forms that scraped markdown/HTML produces:
    - wrapped in parentheses: (data:image/png;base64,...)
    - bare: data:image/[type];base64,... (any image subtype, e.g. jpeg, svg+xml)

    Args:
        text: The text content to clean

    Returns:
        The text with every base64 image replaced by a placeholder marker
    """
    placeholder = '[BASE64_IMAGE_REMOVED]'
    # Order matters: the parenthesized form runs first so the surrounding
    # parens are consumed together with the data URI; the bare pattern then
    # catches whatever is left.
    patterns = (
        r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)',
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+',
    )
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, placeholder, cleaned)
    return cleaned
def web_search_tool(query: str, limit: int = 5) -> str:
    """
    Search the web for information using available search API backend.

    This function provides a generic interface for web search that can work
    with multiple backends. Currently uses Firecrawl.

    Note: This function returns search result metadata only (URLs, titles, descriptions).
    Use web_extract_tool to get full content from specific URLs.

    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to return (default: 5)

    Returns:
        str: JSON string containing search results with the following structure:
            {
                "success": bool,
                "data": {
                    "web": [
                        {
                            "title": str,
                            "url": str,
                            "description": str,
                            "position": int
                        },
                        ...
                    ]
                }
            }
            On failure, a JSON object of the form {"error": "..."} is returned
            instead of raising.
    """
    # Per-call debug record, persisted via the module-level DebugSession.
    debug_call_data = {
        "parameters": {
            "query": query,
            "limit": limit
        },
        "error": None,
        "results_count": 0,
        "original_response_size": 0,
        "final_response_size": 0
    }
    try:
        # Honor a pending user interrupt before doing any network work.
        from tools.interrupt import is_interrupted
        if is_interrupted():
            return json.dumps({"error": "Interrupted", "success": False})
        logger.info("Searching the web for: '%s' (limit: %d)", query, limit)
        response = _get_firecrawl_client().search(
            query=query,
            limit=limit
        )
        # The response is a SearchData object with web, news, and images attributes.
        # When not scraping, the results are directly in these attributes.
        # Normalize the various SDK return shapes into a plain list of dicts.
        web_results = []
        # Check if response has web attribute (SearchData object)
        if hasattr(response, 'web'):
            # Response is a SearchData object with web attribute
            if response.web:
                # Convert each SearchResultWeb object to dict
                for result in response.web:
                    if hasattr(result, 'model_dump'):
                        # Pydantic model - use model_dump
                        web_results.append(result.model_dump())
                    elif hasattr(result, '__dict__'):
                        # Regular object - use __dict__
                        web_results.append(result.__dict__)
                    elif isinstance(result, dict):
                        # Already a dict
                        web_results.append(result)
        elif hasattr(response, 'model_dump'):
            # Response has model_dump method - use it to get dict
            response_dict = response.model_dump()
            if 'web' in response_dict and response_dict['web']:
                web_results = response_dict['web']
        elif isinstance(response, dict):
            # Response is already a dictionary
            if 'web' in response and response['web']:
                web_results = response['web']

        results_count = len(web_results)
        logger.info("Found %d search results", results_count)

        # Build response with just search metadata (URLs, titles, descriptions)
        response_data = {
            "success": True,
            "data": {
                "web": web_results
            }
        }
        # Capture debug information
        debug_call_data["results_count"] = results_count
        # Convert to JSON (ensure_ascii=False keeps non-Latin titles readable)
        result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
        debug_call_data["final_response_size"] = len(result_json)
        # Log debug information
        _debug.log_call("web_search_tool", debug_call_data)
        _debug.save()
        return result_json
    except Exception as e:
        # Errors are returned as JSON rather than raised so tool callers
        # always get a parseable string back.
        error_msg = f"Error searching web: {str(e)}"
        logger.debug("%s", error_msg)
        debug_call_data["error"] = error_msg
        _debug.log_call("web_search_tool", debug_call_data)
        _debug.save()
        return json.dumps({"error": error_msg}, ensure_ascii=False)
2025-10-01 23:29:25 +00:00
async def web_extract_tool(
    urls: List[str],
    format: str = None,
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Extract content from specific web pages using available extraction API backend.

    This function provides a generic interface for web content extraction that
    can work with multiple backends. Currently uses Firecrawl.

    Args:
        urls (List[str]): List of URLs to extract content from
        format (str): Desired output format ("markdown" or "html", optional)
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (str): The model to use for LLM processing
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)

    Returns:
        str: JSON string containing extracted content. If LLM processing is enabled and successful,
            the 'content' field will contain the processed markdown summary instead of raw content.
            On failure, a JSON object of the form {"error": "..."} is returned.
    """
    # Per-call debug record, persisted via the module-level DebugSession.
    debug_call_data = {
        "parameters": {
            "urls": urls,
            "format": format,
            "use_llm_processing": use_llm_processing,
            "model": model,
            "min_length": min_length
        },
        "error": None,
        "pages_extracted": 0,
        "pages_processed_with_llm": 0,
        "original_response_size": 0,
        "final_response_size": 0,
        "compression_metrics": [],
        "processing_applied": []
    }
    try:
        logger.info("Extracting content from %d URL(s)", len(urls))

        # Determine requested formats for Firecrawl v2
        formats: List[str] = []
        if format == "markdown":
            formats = ["markdown"]
        elif format == "html":
            formats = ["html"]
        else:
            # Default: request markdown for LLM-readiness and include html as backup
            formats = ["markdown", "html"]

        # Always use individual scraping for simplicity and reliability
        # Batch scraping adds complexity without much benefit for small numbers of URLs
        results: List[Dict[str, Any]] = []
        from tools.interrupt import is_interrupted as _is_interrupted
        for url in urls:
            # Honor a pending user interrupt between per-URL scrapes.
            if _is_interrupted():
                results.append({"url": url, "error": "Interrupted", "title": ""})
                continue
            try:
                logger.info("Scraping: %s", url)
                scrape_result = _get_firecrawl_client().scrape(
                    url=url,
                    formats=formats
                )

                # Process the result - properly handle object serialization
                metadata = {}
                title = ""
                content_markdown = None
                content_html = None

                # Extract data from the scrape result, normalizing the SDK's
                # possible return shapes (pydantic model / plain object / dict).
                if hasattr(scrape_result, 'model_dump'):
                    # Pydantic model - use model_dump to get dict
                    result_dict = scrape_result.model_dump()
                    content_markdown = result_dict.get('markdown')
                    content_html = result_dict.get('html')
                    metadata = result_dict.get('metadata', {})
                elif hasattr(scrape_result, '__dict__'):
                    # Regular object with attributes
                    content_markdown = getattr(scrape_result, 'markdown', None)
                    content_html = getattr(scrape_result, 'html', None)
                    # Handle metadata - convert to dict if it's an object
                    metadata_obj = getattr(scrape_result, 'metadata', {})
                    if hasattr(metadata_obj, 'model_dump'):
                        metadata = metadata_obj.model_dump()
                    elif hasattr(metadata_obj, '__dict__'):
                        metadata = metadata_obj.__dict__
                    elif isinstance(metadata_obj, dict):
                        metadata = metadata_obj
                    else:
                        metadata = {}
                elif isinstance(scrape_result, dict):
                    # Already a dictionary
                    content_markdown = scrape_result.get('markdown')
                    content_html = scrape_result.get('html')
                    metadata = scrape_result.get('metadata', {})

                # Ensure metadata is a dict (not an object)
                if not isinstance(metadata, dict):
                    if hasattr(metadata, 'model_dump'):
                        metadata = metadata.model_dump()
                    elif hasattr(metadata, '__dict__'):
                        metadata = metadata.__dict__
                    else:
                        metadata = {}

                # Get title from metadata
                title = metadata.get("title", "")

                # Choose content based on requested format; fall back through
                # html then markdown then empty string.
                chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""

                results.append({
                    "url": metadata.get("sourceURL", url),
                    "title": title,
                    "content": chosen_content,
                    "raw_content": chosen_content,
                    "metadata": metadata  # Now guaranteed to be a dict
                })
            except Exception as scrape_err:
                # A single failed URL is recorded and does not abort the batch.
                logger.debug("Scrape failed for %s: %s", url, scrape_err)
                results.append({
                    "url": url,
                    "title": "",
                    "content": "",
                    "raw_content": "",
                    "error": str(scrape_err)
                })

        response = {"results": results}
        pages_extracted = len(response.get('results', []))
        logger.info("Extracted content from %d pages", pages_extracted)
        debug_call_data["pages_extracted"] = pages_extracted
        debug_call_data["original_response_size"] = len(json.dumps(response))

        # Process each result with LLM if enabled and auxiliary client is available
        if use_llm_processing and _aux_async_client is not None:
            logger.info("Processing extracted content with LLM (parallel)...")
            debug_call_data["processing_applied"].append("llm_processing")

            # Prepare tasks for parallel processing
            async def process_single_result(result):
                """Process a single result with LLM and return updated result with metrics."""
                url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                raw_content = result.get('raw_content', '') or result.get('content', '')
                if not raw_content:
                    return result, None, "no_content"
                original_size = len(raw_content)
                # Process content with LLM
                processed = await process_content_with_llm(
                    raw_content, url, title, model, min_length
                )
                if processed:
                    processed_size = len(processed)
                    compression_ratio = processed_size / original_size if original_size > 0 else 1.0
                    # Update result with processed content (raw kept alongside)
                    result['content'] = processed
                    result['raw_content'] = raw_content
                    metrics = {
                        "url": url,
                        "original_size": original_size,
                        "processed_size": processed_size,
                        "compression_ratio": compression_ratio,
                        "model_used": model
                    }
                    return result, metrics, "processed"
                else:
                    # LLM skipped the page (below min_length); record a no-op metric.
                    metrics = {
                        "url": url,
                        "original_size": original_size,
                        "processed_size": original_size,
                        "compression_ratio": 1.0,
                        "model_used": None,
                        "reason": "content_too_short"
                    }
                    return result, metrics, "too_short"

            # Run all LLM processing in parallel
            results_list = response.get('results', [])
            tasks = [process_single_result(result) for result in results_list]
            processed_results = await asyncio.gather(*tasks)

            # Collect metrics and log per-page outcomes
            for result, metrics, status in processed_results:
                url = result.get('url', 'Unknown URL')
                if status == "processed":
                    debug_call_data["compression_metrics"].append(metrics)
                    debug_call_data["pages_processed_with_llm"] += 1
                    logger.info("%s (processed)", url)
                elif status == "too_short":
                    debug_call_data["compression_metrics"].append(metrics)
                    logger.info("%s (no processing - content too short)", url)
                else:
                    logger.warning("%s (no content to process)", url)
        else:
            if use_llm_processing and _aux_async_client is None:
                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
                debug_call_data["processing_applied"].append("llm_processing_unavailable")
            # Print summary of extracted pages for debugging (original behavior)
            for result in response.get('results', []):
                url = result.get('url', 'Unknown URL')
                content_length = len(result.get('raw_content', ''))
                logger.info("%s (%d characters)", url, content_length)

        # Trim output to minimal fields per entry: title, content, error
        trimmed_results = [
            {
                "title": r.get("title", ""),
                "content": r.get("content", ""),
                "error": r.get("error"),
            }
            for r in response.get("results", [])
        ]
        trimmed_response = {"results": trimmed_results}

        if trimmed_response.get("results") == []:
            # Nothing was extracted at all: return a uniform error payload.
            result_json = json.dumps({"error": "Content was inaccessible or not found"}, ensure_ascii=False)
            cleaned_result = clean_base64_images(result_json)
        else:
            result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False)
            cleaned_result = clean_base64_images(result_json)

        debug_call_data["final_response_size"] = len(cleaned_result)
        debug_call_data["processing_applied"].append("base64_image_removal")
        # Log debug information
        _debug.log_call("web_extract_tool", debug_call_data)
        _debug.save()
        return cleaned_result
    except Exception as e:
        # Errors are returned as JSON rather than raised so tool callers
        # always get a parseable string back.
        error_msg = f"Error extracting content: {str(e)}"
        logger.debug("%s", error_msg)
        debug_call_data["error"] = error_msg
        _debug.log_call("web_extract_tool", debug_call_data)
        _debug.save()
        return json.dumps({"error": error_msg}, ensure_ascii=False)
2025-10-01 23:29:25 +00:00
async def web_crawl_tool(
    url: str,
    instructions: Optional[str] = None,
    depth: str = "basic",
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Crawl a website and return its pages as a JSON string.

    Uses the Firecrawl v2 crawl API (markdown format, capped at 20 pages) and,
    when an auxiliary LLM client is configured, summarizes each page's content
    in parallel to reduce token usage.

    Args:
        url (str): The base URL to crawl (a missing ``https://`` prefix is added).
        instructions (Optional[str]): Accepted for interface compatibility but
            currently IGNORED — the crawl endpoint does not support a prompt
            (instructions belong to the Extract endpoint). Only logged.
        depth (str): "basic" or "advanced". Currently only recorded in the
            debug log; it does not influence the crawl parameters.
        use_llm_processing (bool): Whether to post-process page content with
            the auxiliary LLM (default: True). Silently falls back to raw
            content when no auxiliary client is available.
        model (str): Model identifier for LLM processing
            (default: DEFAULT_SUMMARIZER_MODEL).
        min_length (int): Minimum content length (chars) before a page is
            considered for LLM summarization (default: 5000).

    Returns:
        str: JSON string of the form ``{"results": [{"title", "content",
        "error"}, ...]}`` with base64 images stripped. When LLM processing
        ran, ``content`` holds the processed markdown summary. On failure
        (including interruption) returns ``{"error": ...}`` JSON instead of
        raising.
    """
    # Per-call record for the debug session; filled in as the call progresses
    # and flushed via _debug.log_call()/_debug.save() on both success and error.
    debug_call_data = {
        "parameters": {
            "url": url,
            "instructions": instructions,
            "depth": depth,
            "use_llm_processing": use_llm_processing,
            "model": model,
            "min_length": min_length
        },
        "error": None,
        "pages_crawled": 0,
        "pages_processed_with_llm": 0,
        "original_response_size": 0,
        "final_response_size": 0,
        "compression_metrics": [],
        "processing_applied": []
    }
    try:
        # Ensure URL has protocol; Firecrawl expects a full URL.
        if not url.startswith(('http://', 'https://')):
            url = f'https://{url}'
            logger.info("Added https:// prefix to URL: %s", url)
        instructions_text = f"with instructions: '{instructions}'" if instructions else ""
        logger.info("Crawling %s %s", url, instructions_text)
        # Use Firecrawl's v2 crawl functionality
        # Docs: https://docs.firecrawl.dev/features/crawl
        # The crawl() method automatically waits for completion and returns all data.
        # Build crawl parameters - keep it simple.
        crawl_params = {
            "limit": 20,  # Limit number of pages to crawl
            "scrape_options": {
                "formats": ["markdown"]  # Just markdown for simplicity
            }
        }
        # Note: The 'prompt' parameter is not documented for crawl.
        # Instructions are typically used with the Extract endpoint, not Crawl.
        if instructions:
            logger.info("Instructions parameter ignored (not supported in crawl API)")
        # Local import to avoid import-time coupling; bail out early if the
        # user interrupted before we start the (potentially slow) crawl.
        from tools.interrupt import is_interrupted as _is_int
        if _is_int():
            return json.dumps({"error": "Interrupted", "success": False})
        try:
            crawl_result = _get_firecrawl_client().crawl(
                url=url,
                **crawl_params
            )
        except Exception as e:
            # Log at debug level then re-raise so the outer handler builds the
            # error JSON and records it in the debug session.
            logger.debug("Crawl API call failed: %s", e)
            raise
        pages: List[Dict[str, Any]] = []
        # Process crawl results - the crawl method returns a CrawlJob object
        # with a 'data' attribute, but we defensively also accept a plain dict.
        data_list: List[Any] = []
        # The crawl_result is a CrawlJob object with a 'data' attribute
        # containing a list of Document objects.
        if hasattr(crawl_result, 'data'):
            data_list = crawl_result.data if crawl_result.data else []
            logger.info("Status: %s", getattr(crawl_result, 'status', 'unknown'))
            logger.info("Retrieved %d pages", len(data_list))
            # Debug: inspect other attributes when the crawl returned no data.
            if not data_list:
                logger.debug("CrawlJob attributes: %s", [attr for attr in dir(crawl_result) if not attr.startswith('_')])
                logger.debug("Status: %s", getattr(crawl_result, 'status', 'N/A'))
                logger.debug("Total: %s", getattr(crawl_result, 'total', 'N/A'))
                logger.debug("Completed: %s", getattr(crawl_result, 'completed', 'N/A'))
        elif isinstance(crawl_result, dict) and 'data' in crawl_result:
            data_list = crawl_result.get("data", [])
        else:
            logger.warning("Unexpected crawl result type")
            logger.debug("Result type: %s", type(crawl_result))
            if hasattr(crawl_result, '__dict__'):
                logger.debug("Result attributes: %s", list(crawl_result.__dict__.keys()))
        for item in data_list:
            # Process each crawled page - items may be pydantic models, plain
            # objects, or dicts depending on the SDK version, so probe in order.
            page_url = "Unknown URL"
            title = ""
            content_markdown = None
            content_html = None
            metadata: Dict[str, Any] = {}
            # Extract data from the item.
            if hasattr(item, 'model_dump'):
                # Pydantic model - use model_dump to get a dict.
                item_dict = item.model_dump()
                content_markdown = item_dict.get('markdown')
                content_html = item_dict.get('html')
                metadata = item_dict.get('metadata', {})
            elif hasattr(item, '__dict__'):
                # Regular object with attributes.
                content_markdown = getattr(item, 'markdown', None)
                content_html = getattr(item, 'html', None)
                # Handle metadata - convert to dict if it's an object.
                metadata_obj = getattr(item, 'metadata', {})
                if hasattr(metadata_obj, 'model_dump'):
                    metadata = metadata_obj.model_dump()
                elif hasattr(metadata_obj, '__dict__'):
                    metadata = metadata_obj.__dict__
                elif isinstance(metadata_obj, dict):
                    metadata = metadata_obj
                else:
                    metadata = {}
            elif isinstance(item, dict):
                # Already a dictionary.
                content_markdown = item.get('markdown')
                content_html = item.get('html')
                metadata = item.get('metadata', {})
            # Ensure metadata is a dict (not an object) so it is JSON-serializable.
            if not isinstance(metadata, dict):
                if hasattr(metadata, 'model_dump'):
                    metadata = metadata.model_dump()
                elif hasattr(metadata, '__dict__'):
                    metadata = metadata.__dict__
                else:
                    metadata = {}
            # Extract URL and title from metadata (sourceURL preferred).
            page_url = metadata.get("sourceURL", metadata.get("url", "Unknown URL"))
            title = metadata.get("title", "")
            # Choose content (prefer markdown over html).
            content = content_markdown or content_html or ""
            pages.append({
                "url": page_url,
                "title": title,
                "content": content,
                "raw_content": content,
                "metadata": metadata  # Now guaranteed to be a dict
            })
        response = {"results": pages}
        pages_crawled = len(response.get('results', []))
        logger.info("Crawled %d pages", pages_crawled)
        debug_call_data["pages_crawled"] = pages_crawled
        debug_call_data["original_response_size"] = len(json.dumps(response))
        # Process each result with LLM if enabled and auxiliary client is available.
        if use_llm_processing and _aux_async_client is not None:
            logger.info("Processing crawled content with LLM (parallel)...")
            debug_call_data["processing_applied"].append("llm_processing")
            # Prepare tasks for parallel processing.
            async def process_single_crawl_result(result):
                """Process a single crawl result with LLM and return updated result with metrics."""
                page_url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                content = result.get('content', '')
                if not content:
                    return result, None, "no_content"
                original_size = len(content)
                # Process content with LLM; returns a summary, or a falsy value
                # when the page was below min_length (see metrics below).
                processed = await process_content_with_llm(
                    content, page_url, title, model, min_length
                )
                if processed:
                    processed_size = len(processed)
                    compression_ratio = processed_size / original_size if original_size > 0 else 1.0
                    # Update result in place: keep the original under
                    # 'raw_content', expose the summary as 'content'.
                    result['raw_content'] = content
                    result['content'] = processed
                    metrics = {
                        "url": page_url,
                        "original_size": original_size,
                        "processed_size": processed_size,
                        "compression_ratio": compression_ratio,
                        "model_used": model
                    }
                    return result, metrics, "processed"
                else:
                    metrics = {
                        "url": page_url,
                        "original_size": original_size,
                        "processed_size": original_size,
                        "compression_ratio": 1.0,
                        "model_used": None,
                        "reason": "content_too_short"
                    }
                    return result, metrics, "too_short"
            # Run all LLM processing in parallel.
            # NOTE(review): gather() without return_exceptions — a single
            # failing LLM call aborts the whole batch unless
            # process_content_with_llm handles errors internally; confirm.
            results_list = response.get('results', [])
            tasks = [process_single_crawl_result(result) for result in results_list]
            processed_results = await asyncio.gather(*tasks)
            # Collect metrics and log per-page outcomes.
            for result, metrics, status in processed_results:
                page_url = result.get('url', 'Unknown URL')
                if status == "processed":
                    debug_call_data["compression_metrics"].append(metrics)
                    debug_call_data["pages_processed_with_llm"] += 1
                    logger.info("%s (processed)", page_url)
                elif status == "too_short":
                    debug_call_data["compression_metrics"].append(metrics)
                    logger.info("%s (no processing - content too short)", page_url)
                else:
                    logger.warning("%s (no content to process)", page_url)
        else:
            if use_llm_processing and _aux_async_client is None:
                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
                debug_call_data["processing_applied"].append("llm_processing_unavailable")
            # Print summary of crawled pages for debugging (original behavior).
            for result in response.get('results', []):
                page_url = result.get('url', 'Unknown URL')
                content_length = len(result.get('content', ''))
                logger.info("%s (%d characters)", page_url, content_length)
        # Trim output to minimal fields per entry: title, content, error.
        trimmed_results = [
            {
                "title": r.get("title", ""),
                "content": r.get("content", ""),
                "error": r.get("error")
            }
            for r in response.get("results", [])
        ]
        trimmed_response = {"results": trimmed_results}
        result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False)
        # Clean base64 images from crawled content to keep payloads small.
        cleaned_result = clean_base64_images(result_json)
        debug_call_data["final_response_size"] = len(cleaned_result)
        debug_call_data["processing_applied"].append("base64_image_removal")
        # Log debug information.
        _debug.log_call("web_crawl_tool", debug_call_data)
        _debug.save()
        return cleaned_result
    except Exception as e:
        # Never raise to the caller: encode the failure as an error JSON and
        # still record the attempt in the debug session.
        error_msg = f"Error crawling website: {str(e)}"
        logger.debug("%s", error_msg)
        debug_call_data["error"] = error_msg
        _debug.log_call("web_crawl_tool", debug_call_data)
        _debug.save()
        return json.dumps({"error": error_msg}, ensure_ascii=False)
2025-10-01 23:29:25 +00:00
# Convenience function to check if API key is available
def check_firecrawl_api_key() -> bool:
    """
    Report whether a Firecrawl API key is configured.

    Returns:
        bool: True when FIRECRAWL_API_KEY is set to a non-empty value,
        False when it is unset or empty.
    """
    key = os.environ.get("FIRECRAWL_API_KEY")
    return key is not None and len(key) > 0
2026-02-22 02:16:11 -08:00
def check_auxiliary_model() -> bool:
    """Report whether an auxiliary text model client is configured for LLM content processing."""
    has_client = _aux_async_client is not None
    return has_client
2025-10-01 23:29:25 +00:00
def get_debug_session_info() -> Dict[str, Any]:
    """Return metadata about the current debug session (delegates to the module-level DebugSession)."""
    info = _debug.get_session_info()
    return info
2025-10-01 23:29:25 +00:00
if __name__ == "__main__":
    # Simple test/demo when run directly: report configuration status and
    # print usage examples. No network calls are made here.
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)
    # Check if API keys are available.
    firecrawl_available = check_firecrawl_api_key()
    nous_available = check_auxiliary_model()
    if not firecrawl_available:
        print("❌ FIRECRAWL_API_KEY environment variable not set")
        print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'")
        print("Get API key at: https://firecrawl.dev/")
    else:
        print("✅ Firecrawl API key found")
    if not nous_available:
        print("❌ No auxiliary model available for LLM content processing")
        print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
        print("⚠️ Without an auxiliary model, LLM content processing will be disabled")
    else:
        print(f"✅ Auxiliary model available: {DEFAULT_SUMMARIZER_MODEL}")
    if not firecrawl_available:
        # `exit()` is a site-module convenience not guaranteed to exist under
        # `python -S` or embedded interpreters; raise SystemExit directly.
        raise SystemExit(1)
    print("🛠️ Web tools ready for use!")
    if nous_available:
        print(f"🧠 LLM content processing available with {DEFAULT_SUMMARIZER_MODEL}")
        print(f"Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")
    # Show debug mode status.
    if _debug.active:
        print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
        print(f"Debug logs will be saved to: {_debug.log_dir}/web_tools_debug_{_debug.session_id}.json")
    else:
        print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)")
    print("\nBasic usage:")
    print("from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
    print("import asyncio")
    print("")
    print("# Search (synchronous)")
    print("results = web_search_tool('Python tutorials')")
    print("")
    print("# Extract and crawl (asynchronous)")
    print("async def main():")
    print("    content = await web_extract_tool(['https://example.com'])")
    print("    crawl_data = await web_crawl_tool('example.com', 'Find docs')")
    print("asyncio.run(main())")
    if nous_available:
        print("\nLLM-enhanced usage:")
        print("# Content automatically processed for pages >5000 chars (default)")
        print("content = await web_extract_tool(['https://python.org/about/'])")
        print("")
        print("# Customize processing parameters")
        print("crawl_data = await web_crawl_tool(")
        print("    'docs.python.org',")
        print("    'Find key concepts',")
        print("    model='google/gemini-3-flash-preview',")
        print("    min_length=3000")
        print(")")
        print("")
        print("# Disable LLM processing")
        print("raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")
    print("\nDebug mode:")
    print("# Enable debug logging")
    print("export WEB_TOOLS_DEBUG=true")
    print("# Debug logs capture:")
    print("#   - All tool calls with parameters")
    print("#   - Original API responses")
    print("#   - LLM compression metrics")
    print("#   - Final processed results")
    print("# Logs saved to: ./logs/web_tools_debug_UUID.json")
    # No f-string needed: the line contains no placeholders.
    print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")
2026-02-21 20:22:33 -08:00
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
from tools.registry import registry

WEB_SEARCH_SCHEMA = {
    "name": "web_search",
    "description": "Search the web for information on any topic. Returns up to 5 relevant results with titles, URLs, and descriptions.",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "The search query to look up on the web"
            }
        },
        "required": ["query"]
    }
}

WEB_EXTRACT_SCHEMA = {
    "name": "web_extract",
    "description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
    "parameters": {
        "type": "object",
        "properties": {
            "urls": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of URLs to extract content from (max 5 URLs per call)",
                "maxItems": 5
            }
        },
        "required": ["urls"]
    }
}


def _handle_web_search(args, **kw):
    """Registry adapter: search the web for the supplied query (up to 5 results)."""
    return _handle_web_search_impl(args)


def _handle_web_search_impl(args):
    # Separated so the adapter signature stays a plain (args, **kw) callable.
    return web_search_tool(args.get("query", ""), limit=5)


def _handle_web_extract(args, **kw):
    """Registry adapter: extract markdown from at most 5 of the supplied URLs (async)."""
    urls = args.get("urls", [])
    capped = urls[:5] if isinstance(urls, list) else []
    return web_extract_tool(capped, "markdown")


registry.register(
    name="web_search",
    toolset="web",
    schema=WEB_SEARCH_SCHEMA,
    handler=_handle_web_search,
    check_fn=check_firecrawl_api_key,
    requires_env=["FIRECRAWL_API_KEY"],
)

registry.register(
    name="web_extract",
    toolset="web",
    schema=WEB_EXTRACT_SCHEMA,
    handler=_handle_web_extract,
    check_fn=check_firecrawl_api_key,
    requires_env=["FIRECRAWL_API_KEY"],
    is_async=True,
)