diff --git a/model_tools.py b/model_tools.py index 6bde5dc7..c5aa0827 100644 --- a/model_tools.py +++ b/model_tools.py @@ -24,7 +24,7 @@ import asyncio from typing import Dict, Any, List # Import toolsets -from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_tavily_api_key +from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key from terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION from vision_tools import vision_analyze_tool, check_vision_requirements from mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements @@ -272,7 +272,7 @@ def get_all_tool_names() -> List[str]: tool_names = [] # Web tools - if check_tavily_api_key(): + if check_firecrawl_api_key(): tool_names.extend(["web_search", "web_extract", "web_crawl"]) # Terminal tools @@ -395,7 +395,7 @@ def get_tool_definitions( # Collect all available tools from each toolset toolset_tools = { - "web_tools": get_web_tool_definitions() if check_tavily_api_key() else [], + "web_tools": get_web_tool_definitions() if check_firecrawl_api_key() else [], "terminal_tools": get_terminal_tool_definitions() if check_hecate_requirements() else [], "vision_tools": get_vision_tool_definitions() if check_vision_requirements() else [], "moa_tools": get_moa_tool_definitions() if check_moa_requirements() else [], @@ -687,10 +687,10 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]: """ toolsets = { "web_tools": { - "available": check_tavily_api_key(), + "available": check_firecrawl_api_key(), "tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"], "description": "Web search, content extraction, and website crawling tools", - "requirements": ["TAVILY_API_KEY environment variable"] + "requirements": ["FIRECRAWL_API_KEY environment variable"] }, "terminal_tools": { "available": check_hecate_requirements(), @@ -714,7 +714,7 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]: "available": check_image_generation_requirements(), "tools": ["image_generate_tool"], "description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality", - "requirements": ["FAL_API_KEY environment variable", "fal-client package"] + "requirements": ["FAL_KEY environment variable", "fal-client package"] } # Future toolsets can be added here } @@ -729,7 +729,7 @@ def check_toolset_requirements() -> Dict[str, bool]: Dict: Status of each toolset's requirements """ return { - "web_tools": check_tavily_api_key(), + "web_tools": check_firecrawl_api_key(), "terminal_tools": check_hecate_requirements(), "vision_tools": check_vision_requirements(), "moa_tools": check_moa_requirements(), diff --git a/requirements.txt b/requirements.txt index 060677d9..1a12b584 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -tavily-python +firecrawl-py openai fal-client \ No newline at end of file diff --git a/web_tools.py b/web_tools.py index 38eb0b93..25c44f2d 100644 --- a/web_tools.py +++ b/web_tools.py @@ -3,8 +3,8 @@ Standalone Web Tools Module This module provides generic web tools that work with multiple backend providers. -Currently uses Tavily as the backend, but the interface makes it easy to swap -to other providers like Firecrawl without changing the function signatures. +Currently uses Firecrawl as the backend, and the interface makes it easy to swap +providers without changing the function signatures. Available tools: - web_search_tool: Search the web for information @@ -12,8 +12,7 @@ Available tools: - web_crawl_tool: Crawl websites with specific instructions Backend compatibility: -- Tavily: https://docs.tavily.com/ -- Firecrawl: https://docs.firecrawl.dev/features/search +- Firecrawl: https://docs.firecrawl.dev/introduction LLM Processing: - Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction @@ -49,11 +48,11 @@ import uuid import datetime from pathlib import Path from typing import List, Dict, Any, Optional -from tavily import TavilyClient +from firecrawl import FirecrawlApp, ScrapeOptions from openai import AsyncOpenAI -# Initialize Tavily client once at module level -tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY")) +# Initialize Firecrawl client once at module level +firecrawl_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) # Initialize Nous Research API client for LLM processing (async) nous_client = AsyncOpenAI( @@ -250,7 +249,7 @@ def web_search_tool(query: str, limit: int = 5) -> str: Search the web for information using available search API backend. This function provides a generic interface for web search that can work - with multiple backends. Currently uses Tavily but can be easily swapped. + with multiple backends. Currently uses Firecrawl. Note: Search results are already concise snippets, so no LLM processing is applied. @@ -290,18 +289,36 @@ def web_search_tool(query: str, limit: int = 5) -> str: try: print(f"🔍 Searching the web for: '{query}' (limit: {limit})") - # Use Tavily's search functionality - response = tavily_client.search(query=query, max_results=limit, search_depth="advanced") + # Use Firecrawl's search functionality + # Firecrawl Search: search the web and get full content from results + # Docs: https://docs.firecrawl.dev/introduction + # Note: Firecrawl SDK supports search via app.search(query, limit=...) + response = firecrawl_app.search(query=query, limit=limit) - results_count = len(response.get('results', [])) + # Determine results count and trim to minimal structure: { success, data: [{markdown}] } + results_list = [] + success_flag = True + if isinstance(response, dict): + success_flag = bool(response.get("success", True)) + if "data" in response and isinstance(response["data"], list): + results_list = response["data"] + elif "results" in response and isinstance(response["results"], list): + results_list = response["results"] + results_count = len(results_list) print(f"✅ Found {results_count} results") # Capture debug information debug_call_data["results_count"] = results_count debug_call_data["original_response_size"] = len(json.dumps(response)) - result_json = json.dumps(response, indent=2) - # Clean base64 images from search results + # Build minimal response + minimal_data = [] + for item in results_list: + if isinstance(item, dict) and ("markdown" in item): + minimal_data.append({"markdown": item.get("markdown", "")}) + minimal_response = {"success": success_flag, "data": minimal_data} + + result_json = json.dumps(minimal_response, indent=2) cleaned_result = clean_base64_images(result_json) debug_call_data["final_response_size"] = len(cleaned_result) @@ -335,7 +352,7 @@ async def web_extract_tool( Extract content from specific web pages using available extraction API backend. This function provides a generic interface for web content extraction that - can work with multiple backends. Currently uses Tavily but can be easily swapped. + can work with multiple backends. Currently uses Firecrawl. Args: urls (List[str]): List of URLs to extract content from @@ -371,8 +388,49 @@ async def web_extract_tool( try: print(f"📄 Extracting content from {len(urls)} URL(s)") - # Use Tavily's extract functionality - response = tavily_client.extract(urls=urls, format=format) + # Use Firecrawl's scrape functionality per URL and normalize to a common shape + results: List[Dict[str, Any]] = [] + for url in urls: + try: + # Determine requested formats for Firecrawl + formats: List[str] = [] + if format == "markdown": + formats = ["markdown"] + elif format == "html": + formats = ["html"] + else: + # Default: request markdown for LLM-readiness and include html as backup + formats = ["markdown", "html"] + + scrape_result = firecrawl_app.scrape_url(url, formats=formats) + + # Firecrawl returns {success, data: {markdown?, html?, metadata}} + data = scrape_result.get("data", {}) if isinstance(scrape_result, dict) else {} + metadata = data.get("metadata", {}) + title = metadata.get("title", "") + content_markdown = data.get("markdown") + content_html = data.get("html") + + # Choose content based on requested format + chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or "" + + results.append({ + "url": metadata.get("sourceURL", url), + "title": title, + "content": chosen_content, + "raw_content": chosen_content, + "metadata": metadata + }) + except Exception as scrape_err: + results.append({ + "url": url, + "title": "", + "content": "", + "raw_content": "", + "error": str(scrape_err) + }) + + response = {"results": results} pages_extracted = len(response.get('results', [])) print(f"✅ Extracted content from {pages_extracted} pages") @@ -440,7 +498,18 @@ async def web_extract_tool( content_length = len(result.get('raw_content', '')) print(f" 📝 {url} ({content_length} characters)") - result_json = json.dumps(response, indent=2) + # Trim output to minimal fields per entry: title, content, error + trimmed_results = [ + { + "title": r.get("title", ""), + "content": r.get("content", ""), + "error": r.get("error") + } + for r in response.get("results", []) + ] + trimmed_response = {"results": trimmed_results} + + result_json = json.dumps(trimmed_response, indent=2) # Clean base64 images from extracted content cleaned_result = clean_base64_images(result_json) @@ -476,7 +545,7 @@ async def web_crawl_tool( Crawl a website with specific instructions using available crawling API backend. This function provides a generic interface for web crawling that can work - with multiple backends. Currently uses Tavily but can be easily swapped. + with multiple backends. Currently uses Firecrawl. Args: url (str): The base URL to crawl (can include or exclude https://) @@ -516,13 +585,35 @@ async def web_crawl_tool( instructions_text = f" with instructions: '{instructions}'" if instructions else "" print(f"🕷️ Crawling {url}{instructions_text}") - # Use Tavily's crawl functionality - response = tavily_client.crawl( - url=url, - limit=20, # Reasonable limit for most use cases - instructions=instructions or "Get all available content", - extract_depth=depth + # Use Firecrawl's crawl functionality and normalize to a common shape + # Firecrawl SDK returns the crawl results directly for synchronous crawl + scrape_options = ScrapeOptions(formats=["markdown", "html"]) + crawl_result = firecrawl_app.crawl_url( + url, + limit=20, + scrape_options=scrape_options, ) + + pages: List[Dict[str, Any]] = [] + if isinstance(crawl_result, dict): + # Firecrawl returns {success, data: [ {markdown?, html?, metadata} ]} + data_list = crawl_result.get("data", []) + for item in data_list: + metadata = item.get("metadata", {}) if isinstance(item, dict) else {} + page_url = metadata.get("sourceURL", "Unknown URL") + title = metadata.get("title", "") + content_markdown = item.get("markdown") if isinstance(item, dict) else None + content_html = item.get("html") if isinstance(item, dict) else None + content = content_markdown or content_html or "" + pages.append({ + "url": page_url, + "title": title, + "content": content, + "raw_content": content, + "metadata": metadata + }) + + response = {"results": pages} pages_crawled = len(response.get('results', [])) print(f"✅ Crawled {pages_crawled} pages") @@ -590,7 +681,18 @@ async def web_crawl_tool( content_length = len(result.get('content', '')) print(f" 🌐 {page_url} ({content_length} characters)") - result_json = json.dumps(response, indent=2) + # Trim output to minimal fields per entry: title, content, error + trimmed_results = [ + { + "title": r.get("title", ""), + "content": r.get("content", ""), + "error": r.get("error") + } + for r in response.get("results", []) + ] + trimmed_response = {"results": trimmed_results} + + result_json = json.dumps(trimmed_response, indent=2) # Clean base64 images from crawled content cleaned_result = clean_base64_images(result_json) @@ -615,14 +717,14 @@ async def web_crawl_tool( # Convenience function to check if API key is available -def check_tavily_api_key() -> bool: +def check_firecrawl_api_key() -> bool: """ - Check if the Tavily API key is available in environment variables. + Check if the Firecrawl API key is available in environment variables. Returns: bool: True if API key is set, False otherwise """ - return bool(os.getenv("TAVILY_API_KEY")) + return bool(os.getenv("FIRECRAWL_API_KEY")) def check_nous_api_key() -> bool: @@ -670,15 +772,15 @@ if __name__ == "__main__": print("=" * 40) # Check if API keys are available - tavily_available = check_tavily_api_key() + firecrawl_available = check_firecrawl_api_key() nous_available = check_nous_api_key() - if not tavily_available: - print("❌ TAVILY_API_KEY environment variable not set") - print("Please set your API key: export TAVILY_API_KEY='your-key-here'") - print("Get API key at: https://tavily.com/") + if not firecrawl_available: + print("❌ FIRECRAWL_API_KEY environment variable not set") + print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'") + print("Get API key at: https://firecrawl.dev/") else: - print("✅ Tavily API key found") + print("✅ Firecrawl API key found") if not nous_available: print("❌ NOUS_API_KEY environment variable not set") @@ -688,7 +790,7 @@ if __name__ == "__main__": else: print("✅ Nous Research API key found") - if not tavily_available: + if not firecrawl_available: exit(1) print("🛠️ Web tools ready for use!")