update to firecrawl

Teknium
2025-08-21 08:12:24 -07:00
parent 96cff78335
commit 4ece87efb0
3 changed files with 145 additions and 43 deletions

View File

@@ -24,7 +24,7 @@ import asyncio
from typing import Dict, Any, List
# Import toolsets
-from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_tavily_api_key
+from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key
from terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION
from vision_tools import vision_analyze_tool, check_vision_requirements
from mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements
@@ -272,7 +272,7 @@ def get_all_tool_names() -> List[str]:
tool_names = []
# Web tools
-    if check_tavily_api_key():
+    if check_firecrawl_api_key():
tool_names.extend(["web_search", "web_extract", "web_crawl"])
# Terminal tools
@@ -395,7 +395,7 @@ def get_tool_definitions(
# Collect all available tools from each toolset
toolset_tools = {
"web_tools": get_web_tool_definitions() if check_tavily_api_key() else [],
"web_tools": get_web_tool_definitions() if check_firecrawl_api_key() else [],
"terminal_tools": get_terminal_tool_definitions() if check_hecate_requirements() else [],
"vision_tools": get_vision_tool_definitions() if check_vision_requirements() else [],
"moa_tools": get_moa_tool_definitions() if check_moa_requirements() else [],
@@ -687,10 +687,10 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
"""
toolsets = {
"web_tools": {
"available": check_tavily_api_key(),
"available": check_firecrawl_api_key(),
"tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"],
"description": "Web search, content extraction, and website crawling tools",
"requirements": ["TAVILY_API_KEY environment variable"]
"requirements": ["FIRECRAWL_API_KEY environment variable"]
},
"terminal_tools": {
"available": check_hecate_requirements(),
@@ -714,7 +714,7 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
"available": check_image_generation_requirements(),
"tools": ["image_generate_tool"],
"description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality",
"requirements": ["FAL_API_KEY environment variable", "fal-client package"]
"requirements": ["FAL_KEY environment variable", "fal-client package"]
}
# Future toolsets can be added here
}
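For reference, downstream code can enumerate this registry to decide which tools to expose. A minimal caller-side sketch, assuming only the dict shape shown in this hunk (the importing module name is hypothetical):

    # Sketch: report each toolset's readiness from get_available_toolsets().
    # "tools_registry" is a hypothetical name for the module shown above.
    from tools_registry import get_available_toolsets

    for name, info in get_available_toolsets().items():
        if info["available"]:
            print(f"{name}: ready ({len(info['tools'])} tools)")
        else:
            print(f"{name}: missing {', '.join(info['requirements'])}")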
@@ -729,7 +729,7 @@ def check_toolset_requirements() -> Dict[str, bool]:
Dict: Status of each toolset's requirements
"""
return {
"web_tools": check_tavily_api_key(),
"web_tools": check_firecrawl_api_key(),
"terminal_tools": check_hecate_requirements(),
"vision_tools": check_vision_requirements(),
"moa_tools": check_moa_requirements(),

View File

@@ -1,3 +1,3 @@
-tavily-python
+firecrawl-py
openai
fal-client
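Swapping tavily-python for firecrawl-py means existing environments need a dependency refresh and a new key; a minimal setup sketch, assuming a standard pip workflow:

    # Assumed setup steps after pulling this commit:
    #   pip install -r requirements.txt
    #   export FIRECRAWL_API_KEY='your-key-here'   # replaces TAVILY_API_KEY
    import os
    assert os.getenv("FIRECRAWL_API_KEY"), "FIRECRAWL_API_KEY must be set"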

View File

@@ -3,8 +3,8 @@
Standalone Web Tools Module
This module provides generic web tools that work with multiple backend providers.
-Currently uses Tavily as the backend, but the interface makes it easy to swap
-to other providers like Firecrawl without changing the function signatures.
+Currently uses Firecrawl as the backend, and the interface makes it easy to swap
+providers without changing the function signatures.
Available tools:
- web_search_tool: Search the web for information
@@ -12,8 +12,7 @@ Available tools:
- web_crawl_tool: Crawl websites with specific instructions
Backend compatibility:
-- Tavily: https://docs.tavily.com/
-- Firecrawl: https://docs.firecrawl.dev/features/search
+- Firecrawl: https://docs.firecrawl.dev/introduction
LLM Processing:
- Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction
@@ -49,11 +48,11 @@ import uuid
import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional
-from tavily import TavilyClient
+from firecrawl import FirecrawlApp, ScrapeOptions
from openai import AsyncOpenAI
-# Initialize Tavily client once at module level
-tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+# Initialize Firecrawl client once at module level
+firecrawl_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
# Initialize Nous Research API client for LLM processing (async)
nous_client = AsyncOpenAI(
@@ -250,7 +249,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
Search the web for information using available search API backend.
This function provides a generic interface for web search that can work
-    with multiple backends. Currently uses Tavily but can be easily swapped.
+    with multiple backends. Currently uses Firecrawl.
Note: Search results are already concise snippets, so no LLM processing is applied.
@@ -290,18 +289,36 @@ def web_search_tool(query: str, limit: int = 5) -> str:
try:
print(f"🔍 Searching the web for: '{query}' (limit: {limit})")
-        # Use Tavily's search functionality
-        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")
+        # Use Firecrawl's search functionality
+        # Firecrawl Search: search the web and get full content from results
+        # Docs: https://docs.firecrawl.dev/introduction
+        # Note: Firecrawl SDK supports search via app.search(query, limit=...)
+        response = firecrawl_app.search(query=query, limit=limit)
-        results_count = len(response.get('results', []))
+        # Determine results count and trim to minimal structure: { success, data: [{markdown}] }
+        results_list = []
+        success_flag = True
+        if isinstance(response, dict):
+            success_flag = bool(response.get("success", True))
+            if "data" in response and isinstance(response["data"], list):
+                results_list = response["data"]
+            elif "results" in response and isinstance(response["results"], list):
+                results_list = response["results"]
+        results_count = len(results_list)
print(f"✅ Found {results_count} results")
# Capture debug information
debug_call_data["results_count"] = results_count
debug_call_data["original_response_size"] = len(json.dumps(response))
-        result_json = json.dumps(response, indent=2)
-        # Clean base64 images from search results
+        # Build minimal response
+        minimal_data = []
+        for item in results_list:
+            if isinstance(item, dict) and ("markdown" in item):
+                minimal_data.append({"markdown": item.get("markdown", "")})
+        minimal_response = {"success": success_flag, "data": minimal_data}
+        result_json = json.dumps(minimal_response, indent=2)
cleaned_result = clean_base64_images(result_json)
debug_call_data["final_response_size"] = len(cleaned_result)
@@ -335,7 +352,7 @@ async def web_extract_tool(
Extract content from specific web pages using available extraction API backend.
This function provides a generic interface for web content extraction that
-    can work with multiple backends. Currently uses Tavily but can be easily swapped.
+    can work with multiple backends. Currently uses Firecrawl.
Args:
urls (List[str]): List of URLs to extract content from
@@ -371,8 +388,49 @@ async def web_extract_tool(
try:
print(f"📄 Extracting content from {len(urls)} URL(s)")
-        # Use Tavily's extract functionality
-        response = tavily_client.extract(urls=urls, format=format)
+        # Use Firecrawl's scrape functionality per URL and normalize to a common shape
+        results: List[Dict[str, Any]] = []
+        for url in urls:
+            try:
+                # Determine requested formats for Firecrawl
+                formats: List[str] = []
+                if format == "markdown":
+                    formats = ["markdown"]
+                elif format == "html":
+                    formats = ["html"]
+                else:
+                    # Default: request markdown for LLM-readiness and include html as backup
+                    formats = ["markdown", "html"]
+                scrape_result = firecrawl_app.scrape_url(url, formats=formats)
+                # Firecrawl returns {success, data: {markdown?, html?, metadata}}
+                data = scrape_result.get("data", {}) if isinstance(scrape_result, dict) else {}
+                metadata = data.get("metadata", {})
+                title = metadata.get("title", "")
+                content_markdown = data.get("markdown")
+                content_html = data.get("html")
+                # Choose content based on requested format
+                chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
+                results.append({
+                    "url": metadata.get("sourceURL", url),
+                    "title": title,
+                    "content": chosen_content,
+                    "raw_content": chosen_content,
+                    "metadata": metadata
+                })
+            except Exception as scrape_err:
+                results.append({
+                    "url": url,
+                    "title": "",
+                    "content": "",
+                    "raw_content": "",
+                    "error": str(scrape_err)
+                })
+        response = {"results": results}
pages_extracted = len(response.get('results', []))
print(f"✅ Extracted content from {pages_extracted} pages")
@@ -440,7 +498,18 @@ async def web_extract_tool(
content_length = len(result.get('raw_content', ''))
print(f" 📝 {url} ({content_length} characters)")
-        result_json = json.dumps(response, indent=2)
+        # Trim output to minimal fields per entry: title, content, error
+        trimmed_results = [
+            {
+                "title": r.get("title", ""),
+                "content": r.get("content", ""),
+                "error": r.get("error")
+            }
+            for r in response.get("results", [])
+        ]
+        trimmed_response = {"results": trimmed_results}
+        result_json = json.dumps(trimmed_response, indent=2)
# Clean base64 images from extracted content
cleaned_result = clean_base64_images(result_json)
@@ -476,7 +545,7 @@ async def web_crawl_tool(
Crawl a website with specific instructions using available crawling API backend.
This function provides a generic interface for web crawling that can work
-    with multiple backends. Currently uses Tavily but can be easily swapped.
+    with multiple backends. Currently uses Firecrawl.
Args:
url (str): The base URL to crawl (can include or exclude https://)
@@ -516,13 +585,35 @@ async def web_crawl_tool(
instructions_text = f" with instructions: '{instructions}'" if instructions else ""
print(f"🕷️ Crawling {url}{instructions_text}")
-        # Use Tavily's crawl functionality
-        response = tavily_client.crawl(
-            url=url,
-            limit=20,  # Reasonable limit for most use cases
-            instructions=instructions or "Get all available content",
-            extract_depth=depth
+        # Use Firecrawl's crawl functionality and normalize to a common shape
+        # Firecrawl SDK returns the crawl results directly for synchronous crawl
+        scrape_options = ScrapeOptions(formats=["markdown", "html"])
+        crawl_result = firecrawl_app.crawl_url(
+            url,
+            limit=20,
+            scrape_options=scrape_options,
)
+        pages: List[Dict[str, Any]] = []
+        if isinstance(crawl_result, dict):
+            # Firecrawl returns {success, data: [ {markdown?, html?, metadata} ]}
+            data_list = crawl_result.get("data", [])
+            for item in data_list:
+                metadata = item.get("metadata", {}) if isinstance(item, dict) else {}
+                page_url = metadata.get("sourceURL", "Unknown URL")
+                title = metadata.get("title", "")
+                content_markdown = item.get("markdown") if isinstance(item, dict) else None
+                content_html = item.get("html") if isinstance(item, dict) else None
+                content = content_markdown or content_html or ""
+                pages.append({
+                    "url": page_url,
+                    "title": title,
+                    "content": content,
+                    "raw_content": content,
+                    "metadata": metadata
+                })
+        response = {"results": pages}
pages_crawled = len(response.get('results', []))
print(f"✅ Crawled {pages_crawled} pages")
@@ -590,7 +681,18 @@ async def web_crawl_tool(
content_length = len(result.get('content', ''))
print(f" 🌐 {page_url} ({content_length} characters)")
-        result_json = json.dumps(response, indent=2)
+        # Trim output to minimal fields per entry: title, content, error
+        trimmed_results = [
+            {
+                "title": r.get("title", ""),
+                "content": r.get("content", ""),
+                "error": r.get("error")
+            }
+            for r in response.get("results", [])
+        ]
+        trimmed_response = {"results": trimmed_results}
+        result_json = json.dumps(trimmed_response, indent=2)
# Clean base64 images from crawled content
cleaned_result = clean_base64_images(result_json)
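This trim-to-{title, content, error} block is now duplicated verbatim in web_extract_tool and web_crawl_tool; a shared helper would keep the two in sync. A sketch, with a hypothetical name:

    def trim_results(response):
        # Hypothetical helper: reduce each result to the minimal fields the
        # two tools above both emit (title, content, error).
        return {"results": [
            {"title": r.get("title", ""), "content": r.get("content", ""), "error": r.get("error")}
            for r in response.get("results", [])
        ]}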
@@ -615,14 +717,14 @@ async def web_crawl_tool(
# Convenience function to check if API key is available
-def check_tavily_api_key() -> bool:
+def check_firecrawl_api_key() -> bool:
"""
-    Check if the Tavily API key is available in environment variables.
+    Check if the Firecrawl API key is available in environment variables.
Returns:
bool: True if API key is set, False otherwise
"""
return bool(os.getenv("TAVILY_API_KEY"))
return bool(os.getenv("FIRECRAWL_API_KEY"))
def check_nous_api_key() -> bool:
@@ -670,15 +772,15 @@ if __name__ == "__main__":
print("=" * 40)
# Check if API keys are available
-    tavily_available = check_tavily_api_key()
+    firecrawl_available = check_firecrawl_api_key()
nous_available = check_nous_api_key()
-    if not tavily_available:
-        print("TAVILY_API_KEY environment variable not set")
-        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
-        print("Get API key at: https://tavily.com/")
+    if not firecrawl_available:
+        print("FIRECRAWL_API_KEY environment variable not set")
+        print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'")
+        print("Get API key at: https://firecrawl.dev/")
else:
print("Tavily API key found")
print("Firecrawl API key found")
if not nous_available:
print("❌ NOUS_API_KEY environment variable not set")
@@ -688,7 +790,7 @@ if __name__ == "__main__":
else:
print("✅ Nous Research API key found")
-    if not tavily_available:
+    if not firecrawl_available:
exit(1)
print("🛠️ Web tools ready for use!")