initial commit
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
/venv/
|
||||
/_pycache/
|
||||
BIN
__pycache__/model_tools.cpython-310.pyc
Normal file
BIN
__pycache__/model_tools.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/web_tools.cpython-310.pyc
Normal file
BIN
__pycache__/web_tools.cpython-310.pyc
Normal file
Binary file not shown.
272
model_tools.py
Normal file
272
model_tools.py
Normal file
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Model Tools Module
|
||||
|
||||
This module constructs tool schemas and handlers for AI model API calls.
|
||||
It imports tools from various toolset modules and provides a unified interface
|
||||
for defining tools and executing function calls.
|
||||
|
||||
Currently supports:
|
||||
- Web tools (search, extract, crawl) from web_tools.py
|
||||
|
||||
Usage:
|
||||
from model_tools import get_tool_definitions, handle_function_call
|
||||
|
||||
# Get tool definitions for model API
|
||||
tools = get_tool_definitions()
|
||||
|
||||
# Handle function calls from model
|
||||
result = handle_function_call("web_search_tool", {"query": "Python", "limit": 3})
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# Import toolsets
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_tavily_api_key
|
||||
|
||||
def get_web_tool_definitions() -> List[Dict[str, Any]]:
    """
    Build OpenAI-compatible tool definitions for the web toolset.

    Returns:
        List[Dict]: Definitions for web_search_tool, web_extract_tool and
        web_crawl_tool, each wrapped in the {"type": "function", ...} envelope
        expected by the OpenAI chat-completions API.
    """

    def _wrap(name: str, description: str, properties: Dict[str, Any], required: List[str]) -> Dict[str, Any]:
        # Every tool shares the same envelope; only name/description/schema vary.
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
        }

    search_tool = _wrap(
        "web_search_tool",
        "Search the web for information on any topic. Returns relevant results with titles, URLs, content snippets, and answers. Uses advanced search depth for comprehensive results.",
        {
            "query": {
                "type": "string",
                "description": "The search query to look up on the web",
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of results to return (default: 5, max: 10)",
                "default": 5,
                "minimum": 1,
                "maximum": 10,
            },
        },
        ["query"],
    )

    extract_tool = _wrap(
        "web_extract_tool",
        "Extract and read the full content from specific web page URLs. Useful for getting detailed information from webpages found through search.",
        {
            "urls": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of URLs to extract content from (max 5 URLs per call)",
                "maxItems": 5,
            },
            "format": {
                "type": "string",
                "enum": ["markdown", "html"],
                "description": "Desired output format for extracted content (optional)",
            },
        },
        ["urls"],
    )

    crawl_tool = _wrap(
        "web_crawl_tool",
        "Crawl a website with specific instructions to find and extract targeted content. Uses AI to intelligently navigate and extract relevant information from across the site.",
        {
            "url": {
                "type": "string",
                "description": "The base URL to crawl (can include or exclude https://)",
            },
            "instructions": {
                "type": "string",
                "description": "Specific instructions for what to crawl/extract using AI intelligence (e.g., 'Find pricing information', 'Get documentation pages', 'Extract contact details')",
            },
            "depth": {
                "type": "string",
                "enum": ["basic", "advanced"],
                "description": "Depth of extraction - 'basic' for surface content, 'advanced' for deeper analysis (default: basic)",
                "default": "basic",
            },
        },
        ["url"],
    )

    return [search_tool, extract_tool, crawl_tool]
|
||||
|
||||
def get_tool_definitions() -> List[Dict[str, Any]]:
    """
    Aggregate tool definitions from every available toolset.

    Currently only the web toolset contributes definitions; additional
    toolsets can append theirs here without changing callers.

    Returns:
        List[Dict]: Complete list of all available tool definitions
    """
    # Start from the web toolset; copy so callers can extend freely.
    all_tools = list(get_web_tool_definitions())

    # Future toolsets can be added here:
    # all_tools.extend(get_file_tool_definitions())
    # all_tools.extend(get_code_tool_definitions())
    # all_tools.extend(get_database_tool_definitions())

    return all_tools
|
||||
|
||||
def handle_web_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
    """
    Handle function calls for web tools.

    Args:
        function_name (str): Name of the web function to call
        function_args (Dict): Arguments for the function, as decoded from the
            model's JSON tool-call payload (values may be loosely typed)

    Returns:
        str: Function result as JSON string; unknown names yield an
        {"error": ...} JSON object rather than raising
    """
    if function_name == "web_search_tool":
        query = function_args.get("query", "")
        # Models sometimes emit numeric arguments as strings; coerce before
        # clamping so a value like "3" does not raise TypeError.
        try:
            limit = int(function_args.get("limit", 5))
        except (TypeError, ValueError):
            limit = 5
        # Clamp to the 1-10 range advertised in the tool schema.
        limit = max(1, min(10, limit))
        return web_search_tool(query, limit)

    elif function_name == "web_extract_tool":
        urls = function_args.get("urls", [])
        # Limit URLs to prevent abuse; non-list payloads become empty.
        urls = urls[:5] if isinstance(urls, list) else []
        # Local renamed from `format` to avoid shadowing the builtin.
        output_format = function_args.get("format")
        return web_extract_tool(urls, output_format)

    elif function_name == "web_crawl_tool":
        url = function_args.get("url", "")
        instructions = function_args.get("instructions")
        depth = function_args.get("depth", "basic")
        return web_crawl_tool(url, instructions, depth)

    else:
        return json.dumps({"error": f"Unknown web function: {function_name}"})
|
||||
|
||||
def handle_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
    """
    Main function call dispatcher that routes calls to appropriate toolsets.

    Determines which toolset owns the requested function and forwards the
    call to that toolset's handler, so new toolsets can be plugged in
    without changing the calling interface.

    Args:
        function_name (str): Name of the function to call
        function_args (Dict): Arguments for the function

    Returns:
        str: Function result as JSON string

    Raises:
        None: Returns error as JSON string instead of raising exceptions
    """
    web_tool_names = {"web_search_tool", "web_extract_tool", "web_crawl_tool"}

    try:
        # Route web tools
        if function_name in web_tool_names:
            return handle_web_function_call(function_name, function_args)

        # Future toolsets can be routed here:
        # elif function_name in ["file_read_tool", "file_write_tool"]:
        #     return handle_file_function_call(function_name, function_args)
        # elif function_name in ["code_execute_tool", "code_analyze_tool"]:
        #     return handle_code_function_call(function_name, function_args)

        # Name not claimed by any toolset — report rather than raise.
        error_msg = f"Unknown function: {function_name}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})

    except Exception as e:
        error_msg = f"Error executing {function_name}: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
    """
    Describe every registered toolset and whether it is currently usable.

    Returns:
        Dict: Per-toolset metadata — availability flag, tool names,
        human-readable description, and setup requirements.
    """
    web_tools_info = {
        "available": check_tavily_api_key(),
        "tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"],
        "description": "Web search, content extraction, and website crawling tools",
        "requirements": ["TAVILY_API_KEY environment variable"],
    }

    # Future toolsets can be added here
    return {"web_tools": web_tools_info}
|
||||
|
||||
def check_toolset_requirements() -> Dict[str, bool]:
    """
    Check whether each toolset's runtime requirements are satisfied.

    Returns:
        Dict: Mapping of toolset name to a boolean requirement status.
    """
    status: Dict[str, bool] = {}
    # Web tools only need the Tavily API key to be present.
    status["web_tools"] = check_tavily_api_key()
    return status
|
||||
|
||||
if __name__ == "__main__":
    # Diagnostic demo shown when the module is executed directly.
    print("🛠️ Model Tools Module")
    print("=" * 40)

    # Report which toolsets have their requirements satisfied.
    requirements = check_toolset_requirements()
    print("📋 Toolset Requirements:")
    for toolset_name, is_available in requirements.items():
        marker = "✅" if is_available else "❌"
        availability = 'Available' if is_available else 'Missing requirements'
        print(f" {marker} {toolset_name}: {availability}")

    # List every tool definition with a truncated description.
    tools = get_tool_definitions()
    print(f"\n🔧 Available Tools ({len(tools)} total):")
    for tool in tools:
        fn = tool["function"]
        summary = fn["description"]
        if len(summary) > 80:
            summary = summary[:80] + '...'
        print(f" 📌 {fn['name']}: {summary}")

    # Per-toolset details, including unmet requirements.
    toolsets = get_available_toolsets()
    print("\n📦 Toolset Information:")
    for toolset_name, info in toolsets.items():
        marker = "✅" if info["available"] else "❌"
        print(f" {marker} {toolset_name}: {info['description']}")
        if not info["available"]:
            print(f" Requirements: {', '.join(info['requirements'])}")

    print("\n💡 Usage Example:")
    for example_line in (
        " from model_tools import get_tool_definitions, handle_function_call",
        " tools = get_tool_definitions()",
        " result = handle_function_call('web_search_tool', {'query': 'Python'})",
    ):
        print(example_line)
|
||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
tavily-python
|
||||
openai
|
||||
324
run_agent.py
Normal file
324
run_agent.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI Agent Runner with Tool Calling
|
||||
|
||||
This module provides a clean, standalone agent that can execute AI models
|
||||
with tool calling capabilities. It handles the conversation loop, tool execution,
|
||||
and response management.
|
||||
|
||||
Features:
|
||||
- Automatic tool calling loop until completion
|
||||
- Configurable model parameters
|
||||
- Error handling and recovery
|
||||
- Message history management
|
||||
- Support for multiple model providers
|
||||
|
||||
Usage:
|
||||
from run_agent import AIAgent
|
||||
|
||||
agent = AIAgent(base_url="http://localhost:30000/v1", model="claude-opus-4-20250514")
|
||||
response = agent.run_conversation("Tell me about the latest Python updates")
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
from openai import OpenAI
|
||||
|
||||
# Import our tool system
|
||||
from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
|
||||
|
||||
|
||||
class AIAgent:
    """
    AI Agent with tool calling capabilities.

    This class manages the conversation flow, tool execution, and response handling
    for AI models that support function calling.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: str = "gpt-4",
        max_iterations: int = 10,
        tool_delay: float = 1.0
    ):
        """
        Initialize the AI Agent.

        Args:
            base_url (str): Base URL for the model API (optional)
            api_key (str): API key for authentication (optional, uses env var if not provided)
            model (str): Model name to use (default: "gpt-4")
            max_iterations (int): Maximum number of tool calling iterations (default: 10)
            tool_delay (float): Delay between tool calls in seconds (default: 1.0)

        Raises:
            RuntimeError: If the OpenAI client cannot be constructed.
        """
        self.model = model
        self.max_iterations = max_iterations
        self.tool_delay = tool_delay

        # Initialize OpenAI client; only forward kwargs that were supplied.
        client_kwargs = {}
        if base_url:
            client_kwargs["base_url"] = base_url
        if api_key:
            client_kwargs["api_key"] = api_key
        else:
            # Fall back to the environment. "dummy-key" keeps the client
            # constructible for local servers that ignore authentication.
            client_kwargs["api_key"] = os.getenv("OPENAI_API_KEY", "dummy-key")

        try:
            self.client = OpenAI(**client_kwargs)
            print(f"🤖 AI Agent initialized with model: {self.model}")
            if base_url:
                print(f"🔗 Using custom base URL: {base_url}")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize OpenAI client: {e}")

        # Load tool definitions once; reused on every API call.
        self.tools = get_tool_definitions()
        print(f"🛠️ Loaded {len(self.tools)} tools")

        # Warn (but do not fail) when a toolset's requirements are missing.
        requirements = check_toolset_requirements()
        missing_reqs = [name for name, available in requirements.items() if not available]
        if missing_reqs:
            print(f"⚠️ Some tools may not work due to missing requirements: {missing_reqs}")

    def create_system_message(self, custom_system: Optional[str] = None) -> str:
        """
        Create the system message for the agent.

        Args:
            custom_system (str): Custom system message (optional); when given
                it is returned verbatim, replacing the built-in default

        Returns:
            str: System message content
        """
        if custom_system:
            return custom_system

        # Default prompt: encourages <thinking>-tagged reasoning and
        # describes the web research tools to the model.
        return (
            "You are an AI assistant that provides helpful responses. You may use extremely long chains of thought "
            "to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help "
            "come to a correct solution prior to answering. You should enclose your thoughts and internal monologue "
            "inside <thinking> tags.\n\n"
            "You are equipped with web research tools that allow you to search the web, extract content from web pages, "
            "and crawl websites. Use these tools to gather current information and provide accurate, well-researched responses. "
            "You can call multiple tools in parallel if they are not reliant on each other's results. You can also use "
            "sequential tool calls to build on data you've collected from previous tool calls. Continue using tools until "
            "you feel confident you have enough information to provide a comprehensive answer."
        )

    def run_conversation(
        self,
        user_message: str,
        system_message: Optional[str] = None,
        conversation_history: Optional[List[Dict[str, Any]]] = None
    ) -> Dict[str, Any]:
        """
        Run a complete conversation with tool calling until completion.

        Args:
            user_message (str): The user's message/question
            system_message (str): Custom system message (optional)
            conversation_history (List[Dict]): Previous conversation messages (optional)

        Returns:
            Dict: Keys "final_response" (str or None), "messages" (full
            message list), "api_calls" (int), and "completed" (bool).

        NOTE(review): when conversation_history is provided, it is mutated
        in place (system/user/tool messages are appended to it) and the same
        list object is returned under "messages" — callers reusing the list
        will observe the updates.
        """
        # Initialize conversation
        messages = conversation_history or []

        # Add system message if not already present
        if not messages or messages[0]["role"] != "system":
            messages.insert(0, {
                "role": "system",
                "content": self.create_system_message(system_message)
            })

        # Add user message
        messages.append({
            "role": "user",
            "content": user_message
        })

        print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")

        # Main conversation loop: keep calling the model, executing any tool
        # calls it requests, until it answers without tools or we hit the cap.
        api_call_count = 0
        final_response = None

        while api_call_count < self.max_iterations:
            api_call_count += 1
            print(f"\n🔄 Making API call #{api_call_count}...")

            try:
                # Make API call with tools
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    tools=self.tools if self.tools else None
                )

                assistant_message = response.choices[0].message

                # Echo a truncated preview of any assistant text.
                if assistant_message.content:
                    print(f"🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")

                # Check for tool calls
                if assistant_message.tool_calls:
                    print(f"🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")

                    # Record the assistant turn (with its tool calls) so the
                    # subsequent "tool" messages have matching tool_call_ids.
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content,
                        "tool_calls": [
                            {
                                "id": tool_call.id,
                                "type": tool_call.type,
                                "function": {
                                    "name": tool_call.function.name,
                                    "arguments": tool_call.function.arguments
                                }
                            }
                            for tool_call in assistant_message.tool_calls
                        ]
                    })

                    # Execute each tool call sequentially.
                    for i, tool_call in enumerate(assistant_message.tool_calls, 1):
                        function_name = tool_call.function.name

                        try:
                            function_args = json.loads(tool_call.function.arguments)
                        except json.JSONDecodeError as e:
                            # Malformed arguments: run the tool with no args
                            # rather than aborting the conversation.
                            print(f"❌ Invalid JSON in tool call arguments: {e}")
                            function_args = {}

                        print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())})")

                        # Execute the tool via the model_tools dispatcher.
                        function_result = handle_function_call(function_name, function_args)

                        # Feed the result back to the model as a "tool" message.
                        messages.append({
                            "role": "tool",
                            "content": function_result,
                            "tool_call_id": tool_call.id
                        })

                        print(f" ✅ Tool {i} completed")

                        # Optional pacing delay between tool calls (skipped
                        # after the last one).
                        if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
                            time.sleep(self.tool_delay)

                    # Loop again so the model can react to the tool results.
                    continue

                else:
                    # No tool calls - this is the final response
                    final_response = assistant_message.content or ""

                    # Add final assistant message
                    messages.append({
                        "role": "assistant",
                        "content": final_response
                    })

                    print(f"🎉 Conversation completed after {api_call_count} API call(s)")
                    break

            except Exception as e:
                error_msg = f"Error during API call #{api_call_count}: {str(e)}"
                print(f"❌ {error_msg}")

                # Record the error in-band and let the loop retry.
                messages.append({
                    "role": "assistant",
                    "content": f"I encountered an error: {error_msg}. Let me try a different approach."
                })

                # If we're near the limit, break to avoid infinite loops
                if api_call_count >= self.max_iterations - 1:
                    final_response = f"I apologize, but I encountered repeated errors: {error_msg}"
                    break

        # Handle max iterations reached
        if api_call_count >= self.max_iterations:
            print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Stopping to prevent infinite loop.")
            if final_response is None:
                final_response = "I've reached the maximum number of iterations. Here's what I found so far."

        return {
            "final_response": final_response,
            "messages": messages,
            "api_calls": api_call_count,
            "completed": final_response is not None
        }

    def chat(self, message: str) -> str:
        """
        Simple chat interface that returns just the final response.

        Args:
            message (str): User message

        Returns:
            str: Final assistant response
        """
        result = self.run_conversation(message)
        return result["final_response"]
|
||||
|
||||
|
||||
def main():
    """
    Entry point when the module is executed as a script.

    Builds an agent pointed at a remote endpoint, runs one example research
    conversation, and prints a summary of the result.
    """
    banner = "=" * 50

    print("🤖 AI Agent with Tool Calling")
    print(banner)

    # Initialize agent with local SGLang server (modify as needed)
    try:
        agent = AIAgent(
            base_url="https://api.anthropic.com/v1/",
            model="claude-opus-4-20250514"
        )
    except RuntimeError as init_error:
        print(f"❌ Failed to initialize agent: {init_error}")
        return

    # Example conversation
    user_query = (
        "Tell me about the latest developments in Python 3.12 and what new features "
        "developers should know about. Please search for current information."
    )

    print(f"\n📝 User Query: {user_query}")
    print("\n" + banner)

    # Run conversation
    result = agent.run_conversation(user_query)

    print("\n" + banner)
    print("📋 CONVERSATION SUMMARY")
    print(banner)
    print(f"✅ Completed: {result['completed']}")
    print(f"📞 API Calls: {result['api_calls']}")
    print(f"💬 Messages: {len(result['messages'])}")

    final = result['final_response']
    if final:
        print("\n🎯 FINAL RESPONSE:")
        print("-" * 30)
        print(final)

    print("\n👋 Agent execution completed!")


if __name__ == "__main__":
    main()
|
||||
0
terminal_tool.py
Normal file
0
terminal_tool.py
Normal file
265
web_tools.py
Normal file
265
web_tools.py
Normal file
@@ -0,0 +1,265 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone Web Tools Module
|
||||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Currently uses Tavily as the backend, but the interface makes it easy to swap
|
||||
to other providers like Firecrawl without changing the function signatures.
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
- web_extract_tool: Extract content from specific web pages
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
|
||||
Backend compatibility:
|
||||
- Tavily: https://docs.tavily.com/
|
||||
- Firecrawl: https://docs.firecrawl.dev/features/search
|
||||
|
||||
Usage:
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
|
||||
|
||||
# Search the web
|
||||
results = web_search_tool("Python machine learning libraries", limit=3)
|
||||
|
||||
# Extract content from URLs
|
||||
content = web_extract_tool(["https://example.com"], format="markdown")
|
||||
|
||||
# Crawl a website
|
||||
crawl_data = web_crawl_tool("example.com", "Find contact information")
|
||||
"""
|
||||
|
||||
#TODO: Search Capabilities over the scraped pages
|
||||
#TODO: Store the pages in something
|
||||
#TODO: Tool to see what pages are available/saved to search over
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import List
|
||||
from tavily import TavilyClient
|
||||
|
||||
# Initialize Tavily client once at module level
|
||||
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
||||
|
||||
|
||||
def clean_base64_images(text: str) -> str:
    """
    Replace inline base64-encoded images with a short placeholder.

    Handles both markdown-style image payloads wrapped in parentheses,
    e.g. (data:image/png;base64,...), and bare data URIs such as
    data:image/jpeg;base64,... — trimming them keeps tool output small.

    Args:
        text: The text content to clean

    Returns:
        Cleaned text with base64 images replaced with placeholders
    """
    placeholder = '[BASE64_IMAGE_REMOVED]'

    # Parenthesised payloads are handled first so the wrapping parens are
    # removed along with the data; the bare pattern then catches the rest.
    patterns = (
        r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)',
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+',
    )

    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, placeholder, cleaned)

    return cleaned
|
||||
|
||||
|
||||
def web_search_tool(query: str, limit: int = 5) -> str:
    """
    Search the web through the configured backend (currently Tavily).

    The signature is backend-agnostic so the provider can be swapped
    without touching callers.

    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to return (default: 5)

    Returns:
        str: JSON string with the backend response ("query" plus a
        "results" list of title/url/content/score entries), or a JSON
        object with an "error" key when the call fails.
    """
    try:
        print(f"🔍 Searching the web for: '{query}' (limit: {limit})")

        # "advanced" search depth yields richer, more comprehensive snippets.
        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")

        print(f"✅ Found {len(response.get('results', []))} results")

        # Strip inline base64 images so results stay token-friendly.
        return clean_base64_images(json.dumps(response, indent=2))

    except Exception as e:
        error_msg = f"Error searching web: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
def web_extract_tool(urls: List[str], format: str = None) -> str:
    """
    Extract page content from the given URLs via the configured backend.

    The signature is backend-agnostic (currently Tavily) so the provider
    can be swapped without touching callers.

    Args:
        urls (List[str]): List of URLs to extract content from
        format (str): Desired output format ("markdown" or "html", optional)

    Returns:
        str: JSON string with a "results" list (url/title/raw_content/
        content entries), or a JSON object with an "error" key on failure.
    """
    try:
        print(f"📄 Extracting content from {len(urls)} URL(s)")

        response = tavily_client.extract(urls=urls, format=format)

        extracted = response.get('results', [])
        print(f"✅ Extracted content from {len(extracted)} pages")

        # Per-page summary for debugging.
        for page in extracted:
            page_url = page.get('url', 'Unknown URL')
            size = len(page.get('raw_content', ''))
            print(f" 📝 {page_url} ({size} characters)")

        # Strip inline base64 images before returning.
        return clean_base64_images(json.dumps(response, indent=2))

    except Exception as e:
        error_msg = f"Error extracting content: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
    """
    Crawl a website via the configured backend (currently Tavily).

    The signature is backend-agnostic so the provider can be swapped
    without touching callers.

    Args:
        url (str): The base URL to crawl (can include or exclude https://)
        instructions (str): Instructions for what to crawl/extract (optional)
        depth (str): Extraction depth, "basic" or "advanced" (default: "basic")

    Returns:
        str: JSON string with a "results" list (url/title/content entries),
        or a JSON object with an "error" key when crawling fails.
    """
    try:
        suffix = f" with instructions: '{instructions}'" if instructions else ""
        print(f"🕷️ Crawling {url}{suffix}")

        response = tavily_client.crawl(
            url=url,
            limit=20,  # Reasonable limit for most use cases
            instructions=instructions or "Get all available content",
            extract_depth=depth
        )

        pages = response.get('results', [])
        print(f"✅ Crawled {len(pages)} pages")

        # Per-page summary for debugging.
        for page in pages:
            page_url = page.get('url', 'Unknown URL')
            size = len(page.get('content', ''))
            print(f" 🌐 {page_url} ({size} characters)")

        # Strip inline base64 images before returning.
        return clean_base64_images(json.dumps(response, indent=2))

    except Exception as e:
        error_msg = f"Error crawling website: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
# Convenience function to check if API key is available
|
||||
def check_tavily_api_key() -> bool:
    """
    Report whether a Tavily API key is configured.

    Returns:
        bool: True when TAVILY_API_KEY is set to a non-empty value in the
        environment, False otherwise.
    """
    api_key = os.getenv("TAVILY_API_KEY")
    return api_key is not None and api_key != ""
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test shown when the module is executed directly.
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)

    # Bail out early with setup instructions when the key is missing.
    if not check_tavily_api_key():
        for line in (
            "❌ TAVILY_API_KEY environment variable not set",
            "Please set your API key: export TAVILY_API_KEY='your-key-here'",
            "Get API key at: https://tavily.com/",
        ):
            print(line)
        exit(1)

    print("✅ Tavily API key found")
    print("🛠️ Web tools ready for use!")
    print("\nExample usage:")
    for example in (
        " from web_tools import web_search_tool, web_extract_tool, web_crawl_tool",
        " results = web_search_tool('Python tutorials')",
        " content = web_extract_tool(['https://example.com'])",
        " crawl_data = web_crawl_tool('example.com', 'Find documentation')",
    ):
        print(example)
|
||||
Reference in New Issue
Block a user