update to firecrawl
This commit is contained in:
@@ -24,7 +24,7 @@ import asyncio
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# Import toolsets
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_tavily_api_key
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key
|
||||
from terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION
|
||||
from vision_tools import vision_analyze_tool, check_vision_requirements
|
||||
from mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements
|
||||
@@ -272,7 +272,7 @@ def get_all_tool_names() -> List[str]:
|
||||
tool_names = []
|
||||
|
||||
# Web tools
|
||||
if check_tavily_api_key():
|
||||
if check_firecrawl_api_key():
|
||||
tool_names.extend(["web_search", "web_extract", "web_crawl"])
|
||||
|
||||
# Terminal tools
|
||||
@@ -395,7 +395,7 @@ def get_tool_definitions(
|
||||
|
||||
# Collect all available tools from each toolset
|
||||
toolset_tools = {
|
||||
"web_tools": get_web_tool_definitions() if check_tavily_api_key() else [],
|
||||
"web_tools": get_web_tool_definitions() if check_firecrawl_api_key() else [],
|
||||
"terminal_tools": get_terminal_tool_definitions() if check_hecate_requirements() else [],
|
||||
"vision_tools": get_vision_tool_definitions() if check_vision_requirements() else [],
|
||||
"moa_tools": get_moa_tool_definitions() if check_moa_requirements() else [],
|
||||
@@ -687,10 +687,10 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
|
||||
"""
|
||||
toolsets = {
|
||||
"web_tools": {
|
||||
"available": check_tavily_api_key(),
|
||||
"available": check_firecrawl_api_key(),
|
||||
"tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"],
|
||||
"description": "Web search, content extraction, and website crawling tools",
|
||||
"requirements": ["TAVILY_API_KEY environment variable"]
|
||||
"requirements": ["FIRECRAWL_API_KEY environment variable"]
|
||||
},
|
||||
"terminal_tools": {
|
||||
"available": check_hecate_requirements(),
|
||||
@@ -714,7 +714,7 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
|
||||
"available": check_image_generation_requirements(),
|
||||
"tools": ["image_generate_tool"],
|
||||
"description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality",
|
||||
"requirements": ["FAL_API_KEY environment variable", "fal-client package"]
|
||||
"requirements": ["FAL_KEY environment variable", "fal-client package"]
|
||||
}
|
||||
# Future toolsets can be added here
|
||||
}
|
||||
@@ -729,7 +729,7 @@ def check_toolset_requirements() -> Dict[str, bool]:
|
||||
Dict: Status of each toolset's requirements
|
||||
"""
|
||||
return {
|
||||
"web_tools": check_tavily_api_key(),
|
||||
"web_tools": check_firecrawl_api_key(),
|
||||
"terminal_tools": check_hecate_requirements(),
|
||||
"vision_tools": check_vision_requirements(),
|
||||
"moa_tools": check_moa_requirements(),
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
tavily-python
|
||||
firecrawl-py
|
||||
openai
|
||||
fal-client
|
||||
172
web_tools.py
172
web_tools.py
@@ -3,8 +3,8 @@
|
||||
Standalone Web Tools Module
|
||||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Currently uses Tavily as the backend, but the interface makes it easy to swap
|
||||
to other providers like Firecrawl without changing the function signatures.
|
||||
Currently uses Firecrawl as the backend, and the interface makes it easy to swap
|
||||
providers without changing the function signatures.
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
@@ -12,8 +12,7 @@ Available tools:
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
|
||||
Backend compatibility:
|
||||
- Tavily: https://docs.tavily.com/
|
||||
- Firecrawl: https://docs.firecrawl.dev/features/search
|
||||
- Firecrawl: https://docs.firecrawl.dev/introduction
|
||||
|
||||
LLM Processing:
|
||||
- Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction
|
||||
@@ -49,11 +48,11 @@ import uuid
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
from tavily import TavilyClient
|
||||
from firecrawl import FirecrawlApp, ScrapeOptions
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
# Initialize Tavily client once at module level
|
||||
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
||||
# Initialize Firecrawl client once at module level
|
||||
firecrawl_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
|
||||
|
||||
# Initialize Nous Research API client for LLM processing (async)
|
||||
nous_client = AsyncOpenAI(
|
||||
@@ -250,7 +249,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
Search the web for information using available search API backend.
|
||||
|
||||
This function provides a generic interface for web search that can work
|
||||
with multiple backends. Currently uses Tavily but can be easily swapped.
|
||||
with multiple backends. Currently uses Firecrawl.
|
||||
|
||||
Note: Search results are already concise snippets, so no LLM processing is applied.
|
||||
|
||||
@@ -290,18 +289,36 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
try:
|
||||
print(f"🔍 Searching the web for: '{query}' (limit: {limit})")
|
||||
|
||||
# Use Tavily's search functionality
|
||||
response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")
|
||||
# Use Firecrawl's search functionality
|
||||
# Firecrawl Search: search the web and get full content from results
|
||||
# Docs: https://docs.firecrawl.dev/introduction
|
||||
# Note: Firecrawl SDK supports search via app.search(query, limit=...)
|
||||
response = firecrawl_app.search(query=query, limit=limit)
|
||||
|
||||
results_count = len(response.get('results', []))
|
||||
# Determine results count and trim to minimal structure: { success, data: [{markdown}] }
|
||||
results_list = []
|
||||
success_flag = True
|
||||
if isinstance(response, dict):
|
||||
success_flag = bool(response.get("success", True))
|
||||
if "data" in response and isinstance(response["data"], list):
|
||||
results_list = response["data"]
|
||||
elif "results" in response and isinstance(response["results"], list):
|
||||
results_list = response["results"]
|
||||
results_count = len(results_list)
|
||||
print(f"✅ Found {results_count} results")
|
||||
|
||||
# Capture debug information
|
||||
debug_call_data["results_count"] = results_count
|
||||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
|
||||
result_json = json.dumps(response, indent=2)
|
||||
# Clean base64 images from search results
|
||||
# Build minimal response
|
||||
minimal_data = []
|
||||
for item in results_list:
|
||||
if isinstance(item, dict) and ("markdown" in item):
|
||||
minimal_data.append({"markdown": item.get("markdown", "")})
|
||||
minimal_response = {"success": success_flag, "data": minimal_data}
|
||||
|
||||
result_json = json.dumps(minimal_response, indent=2)
|
||||
cleaned_result = clean_base64_images(result_json)
|
||||
|
||||
debug_call_data["final_response_size"] = len(cleaned_result)
|
||||
@@ -335,7 +352,7 @@ async def web_extract_tool(
|
||||
Extract content from specific web pages using available extraction API backend.
|
||||
|
||||
This function provides a generic interface for web content extraction that
|
||||
can work with multiple backends. Currently uses Tavily but can be easily swapped.
|
||||
can work with multiple backends. Currently uses Firecrawl.
|
||||
|
||||
Args:
|
||||
urls (List[str]): List of URLs to extract content from
|
||||
@@ -371,8 +388,49 @@ async def web_extract_tool(
|
||||
try:
|
||||
print(f"📄 Extracting content from {len(urls)} URL(s)")
|
||||
|
||||
# Use Tavily's extract functionality
|
||||
response = tavily_client.extract(urls=urls, format=format)
|
||||
# Use Firecrawl's scrape functionality per URL and normalize to a common shape
|
||||
results: List[Dict[str, Any]] = []
|
||||
for url in urls:
|
||||
try:
|
||||
# Determine requested formats for Firecrawl
|
||||
formats: List[str] = []
|
||||
if format == "markdown":
|
||||
formats = ["markdown"]
|
||||
elif format == "html":
|
||||
formats = ["html"]
|
||||
else:
|
||||
# Default: request markdown for LLM-readiness and include html as backup
|
||||
formats = ["markdown", "html"]
|
||||
|
||||
scrape_result = firecrawl_app.scrape_url(url, formats=formats)
|
||||
|
||||
# Firecrawl returns {success, data: {markdown?, html?, metadata}}
|
||||
data = scrape_result.get("data", {}) if isinstance(scrape_result, dict) else {}
|
||||
metadata = data.get("metadata", {})
|
||||
title = metadata.get("title", "")
|
||||
content_markdown = data.get("markdown")
|
||||
content_html = data.get("html")
|
||||
|
||||
# Choose content based on requested format
|
||||
chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
|
||||
|
||||
results.append({
|
||||
"url": metadata.get("sourceURL", url),
|
||||
"title": title,
|
||||
"content": chosen_content,
|
||||
"raw_content": chosen_content,
|
||||
"metadata": metadata
|
||||
})
|
||||
except Exception as scrape_err:
|
||||
results.append({
|
||||
"url": url,
|
||||
"title": "",
|
||||
"content": "",
|
||||
"raw_content": "",
|
||||
"error": str(scrape_err)
|
||||
})
|
||||
|
||||
response = {"results": results}
|
||||
|
||||
pages_extracted = len(response.get('results', []))
|
||||
print(f"✅ Extracted content from {pages_extracted} pages")
|
||||
@@ -440,7 +498,18 @@ async def web_extract_tool(
|
||||
content_length = len(result.get('raw_content', ''))
|
||||
print(f" 📝 {url} ({content_length} characters)")
|
||||
|
||||
result_json = json.dumps(response, indent=2)
|
||||
# Trim output to minimal fields per entry: title, content, error
|
||||
trimmed_results = [
|
||||
{
|
||||
"title": r.get("title", ""),
|
||||
"content": r.get("content", ""),
|
||||
"error": r.get("error")
|
||||
}
|
||||
for r in response.get("results", [])
|
||||
]
|
||||
trimmed_response = {"results": trimmed_results}
|
||||
|
||||
result_json = json.dumps(trimmed_response, indent=2)
|
||||
# Clean base64 images from extracted content
|
||||
cleaned_result = clean_base64_images(result_json)
|
||||
|
||||
@@ -476,7 +545,7 @@ async def web_crawl_tool(
|
||||
Crawl a website with specific instructions using available crawling API backend.
|
||||
|
||||
This function provides a generic interface for web crawling that can work
|
||||
with multiple backends. Currently uses Tavily but can be easily swapped.
|
||||
with multiple backends. Currently uses Firecrawl.
|
||||
|
||||
Args:
|
||||
url (str): The base URL to crawl (can include or exclude https://)
|
||||
@@ -516,13 +585,35 @@ async def web_crawl_tool(
|
||||
instructions_text = f" with instructions: '{instructions}'" if instructions else ""
|
||||
print(f"🕷️ Crawling {url}{instructions_text}")
|
||||
|
||||
# Use Tavily's crawl functionality
|
||||
response = tavily_client.crawl(
|
||||
url=url,
|
||||
limit=20, # Reasonable limit for most use cases
|
||||
instructions=instructions or "Get all available content",
|
||||
extract_depth=depth
|
||||
# Use Firecrawl's crawl functionality and normalize to a common shape
|
||||
# Firecrawl SDK returns the crawl results directly for synchronous crawl
|
||||
scrape_options = ScrapeOptions(formats=["markdown", "html"])
|
||||
crawl_result = firecrawl_app.crawl_url(
|
||||
url,
|
||||
limit=20,
|
||||
scrape_options=scrape_options,
|
||||
)
|
||||
|
||||
pages: List[Dict[str, Any]] = []
|
||||
if isinstance(crawl_result, dict):
|
||||
# Firecrawl returns {success, data: [ {markdown?, html?, metadata} ]}
|
||||
data_list = crawl_result.get("data", [])
|
||||
for item in data_list:
|
||||
metadata = item.get("metadata", {}) if isinstance(item, dict) else {}
|
||||
page_url = metadata.get("sourceURL", "Unknown URL")
|
||||
title = metadata.get("title", "")
|
||||
content_markdown = item.get("markdown") if isinstance(item, dict) else None
|
||||
content_html = item.get("html") if isinstance(item, dict) else None
|
||||
content = content_markdown or content_html or ""
|
||||
pages.append({
|
||||
"url": page_url,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"raw_content": content,
|
||||
"metadata": metadata
|
||||
})
|
||||
|
||||
response = {"results": pages}
|
||||
|
||||
pages_crawled = len(response.get('results', []))
|
||||
print(f"✅ Crawled {pages_crawled} pages")
|
||||
@@ -590,7 +681,18 @@ async def web_crawl_tool(
|
||||
content_length = len(result.get('content', ''))
|
||||
print(f" 🌐 {page_url} ({content_length} characters)")
|
||||
|
||||
result_json = json.dumps(response, indent=2)
|
||||
# Trim output to minimal fields per entry: title, content, error
|
||||
trimmed_results = [
|
||||
{
|
||||
"title": r.get("title", ""),
|
||||
"content": r.get("content", ""),
|
||||
"error": r.get("error")
|
||||
}
|
||||
for r in response.get("results", [])
|
||||
]
|
||||
trimmed_response = {"results": trimmed_results}
|
||||
|
||||
result_json = json.dumps(trimmed_response, indent=2)
|
||||
# Clean base64 images from crawled content
|
||||
cleaned_result = clean_base64_images(result_json)
|
||||
|
||||
@@ -615,14 +717,14 @@ async def web_crawl_tool(
|
||||
|
||||
|
||||
# Convenience function to check if API key is available
|
||||
def check_tavily_api_key() -> bool:
|
||||
def check_firecrawl_api_key() -> bool:
|
||||
"""
|
||||
Check if the Tavily API key is available in environment variables.
|
||||
Check if the Firecrawl API key is available in environment variables.
|
||||
|
||||
Returns:
|
||||
bool: True if API key is set, False otherwise
|
||||
"""
|
||||
return bool(os.getenv("TAVILY_API_KEY"))
|
||||
return bool(os.getenv("FIRECRAWL_API_KEY"))
|
||||
|
||||
|
||||
def check_nous_api_key() -> bool:
|
||||
@@ -670,15 +772,15 @@ if __name__ == "__main__":
|
||||
print("=" * 40)
|
||||
|
||||
# Check if API keys are available
|
||||
tavily_available = check_tavily_api_key()
|
||||
firecrawl_available = check_firecrawl_api_key()
|
||||
nous_available = check_nous_api_key()
|
||||
|
||||
if not tavily_available:
|
||||
print("❌ TAVILY_API_KEY environment variable not set")
|
||||
print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://tavily.com/")
|
||||
if not firecrawl_available:
|
||||
print("❌ FIRECRAWL_API_KEY environment variable not set")
|
||||
print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://firecrawl.dev/")
|
||||
else:
|
||||
print("✅ Tavily API key found")
|
||||
print("✅ Firecrawl API key found")
|
||||
|
||||
if not nous_available:
|
||||
print("❌ NOUS_API_KEY environment variable not set")
|
||||
@@ -688,7 +790,7 @@ if __name__ == "__main__":
|
||||
else:
|
||||
print("✅ Nous Research API key found")
|
||||
|
||||
if not tavily_available:
|
||||
if not firecrawl_available:
|
||||
exit(1)
|
||||
|
||||
print("🛠️ Web tools ready for use!")
|
||||
|
||||
Reference in New Issue
Block a user