initial commit
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
/venv/
|
||||
/_pycache/
|
||||
BIN
__pycache__/model_tools.cpython-310.pyc
Normal file
BIN
__pycache__/model_tools.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/web_tools.cpython-310.pyc
Normal file
BIN
__pycache__/web_tools.cpython-310.pyc
Normal file
Binary file not shown.
272
model_tools.py
Normal file
272
model_tools.py
Normal file
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Model Tools Module
|
||||
|
||||
This module constructs tool schemas and handlers for AI model API calls.
|
||||
It imports tools from various toolset modules and provides a unified interface
|
||||
for defining tools and executing function calls.
|
||||
|
||||
Currently supports:
|
||||
- Web tools (search, extract, crawl) from web_tools.py
|
||||
|
||||
Usage:
|
||||
from model_tools import get_tool_definitions, handle_function_call
|
||||
|
||||
# Get tool definitions for model API
|
||||
tools = get_tool_definitions()
|
||||
|
||||
# Handle function calls from model
|
||||
result = handle_function_call("web_search_tool", {"query": "Python", "limit": 3})
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# Import toolsets
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_tavily_api_key
|
||||
|
||||
def get_web_tool_definitions() -> List[Dict[str, Any]]:
    """
    Build OpenAI-compatible tool definitions for the web toolset.

    Returns:
        List[Dict]: Definitions for web_search_tool, web_extract_tool and
        web_crawl_tool, each wrapped in the {"type": "function", ...} envelope
        expected by the OpenAI chat-completions API.
    """

    def _wrap(name: str, description: str, properties: Dict[str, Any], required: List[str]) -> Dict[str, Any]:
        # Every tool shares the same envelope; only name/description/schema vary.
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
        }

    search_tool = _wrap(
        "web_search_tool",
        "Search the web for information on any topic. Returns relevant results with titles, URLs, content snippets, and answers. Uses advanced search depth for comprehensive results.",
        {
            "query": {
                "type": "string",
                "description": "The search query to look up on the web",
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of results to return (default: 5, max: 10)",
                "default": 5,
                "minimum": 1,
                "maximum": 10,
            },
        },
        ["query"],
    )

    extract_tool = _wrap(
        "web_extract_tool",
        "Extract and read the full content from specific web page URLs. Useful for getting detailed information from webpages found through search.",
        {
            "urls": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of URLs to extract content from (max 5 URLs per call)",
                "maxItems": 5,
            },
            "format": {
                "type": "string",
                "enum": ["markdown", "html"],
                "description": "Desired output format for extracted content (optional)",
            },
        },
        ["urls"],
    )

    crawl_tool = _wrap(
        "web_crawl_tool",
        "Crawl a website with specific instructions to find and extract targeted content. Uses AI to intelligently navigate and extract relevant information from across the site.",
        {
            "url": {
                "type": "string",
                "description": "The base URL to crawl (can include or exclude https://)",
            },
            "instructions": {
                "type": "string",
                "description": "Specific instructions for what to crawl/extract using AI intelligence (e.g., 'Find pricing information', 'Get documentation pages', 'Extract contact details')",
            },
            "depth": {
                "type": "string",
                "enum": ["basic", "advanced"],
                "description": "Depth of extraction - 'basic' for surface content, 'advanced' for deeper analysis (default: basic)",
                "default": "basic",
            },
        },
        ["url"],
    )

    return [search_tool, extract_tool, crawl_tool]
|
||||
|
||||
def get_tool_definitions() -> List[Dict[str, Any]]:
    """
    Aggregate tool definitions from every available toolset.

    Currently only the web toolset contributes definitions; additional
    toolsets can append theirs here without changing callers.

    Returns:
        List[Dict]: Complete list of all available tool definitions
    """
    # Start from the web toolset; copy so callers can extend freely.
    all_tools = list(get_web_tool_definitions())

    # Future toolsets can be added here:
    # all_tools.extend(get_file_tool_definitions())
    # all_tools.extend(get_code_tool_definitions())
    # all_tools.extend(get_database_tool_definitions())

    return all_tools
|
||||
|
||||
def handle_web_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
    """
    Handle function calls for web tools.

    Args:
        function_name (str): Name of the web function to call
        function_args (Dict): Arguments for the function, as decoded from the
            model's JSON tool-call payload (values may be loosely typed)

    Returns:
        str: Function result as JSON string; unknown names yield an
        {"error": ...} JSON object rather than raising
    """
    if function_name == "web_search_tool":
        query = function_args.get("query", "")
        # Models sometimes emit numeric arguments as strings; coerce before
        # clamping so a value like "3" does not raise TypeError.
        try:
            limit = int(function_args.get("limit", 5))
        except (TypeError, ValueError):
            limit = 5
        # Clamp to the 1-10 range advertised in the tool schema.
        limit = max(1, min(10, limit))
        return web_search_tool(query, limit)

    elif function_name == "web_extract_tool":
        urls = function_args.get("urls", [])
        # Limit URLs to prevent abuse; non-list payloads become empty.
        urls = urls[:5] if isinstance(urls, list) else []
        # Local renamed from `format` to avoid shadowing the builtin.
        output_format = function_args.get("format")
        return web_extract_tool(urls, output_format)

    elif function_name == "web_crawl_tool":
        url = function_args.get("url", "")
        instructions = function_args.get("instructions")
        depth = function_args.get("depth", "basic")
        return web_crawl_tool(url, instructions, depth)

    else:
        return json.dumps({"error": f"Unknown web function: {function_name}"})
|
||||
|
||||
def handle_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
    """
    Main function call dispatcher that routes calls to appropriate toolsets.

    Determines which toolset owns the requested function and forwards the
    call to that toolset's handler, so new toolsets can be plugged in
    without changing the calling interface.

    Args:
        function_name (str): Name of the function to call
        function_args (Dict): Arguments for the function

    Returns:
        str: Function result as JSON string

    Raises:
        None: Returns error as JSON string instead of raising exceptions
    """
    web_tool_names = {"web_search_tool", "web_extract_tool", "web_crawl_tool"}

    try:
        # Route web tools
        if function_name in web_tool_names:
            return handle_web_function_call(function_name, function_args)

        # Future toolsets can be routed here:
        # elif function_name in ["file_read_tool", "file_write_tool"]:
        #     return handle_file_function_call(function_name, function_args)
        # elif function_name in ["code_execute_tool", "code_analyze_tool"]:
        #     return handle_code_function_call(function_name, function_args)

        # Name not claimed by any toolset — report rather than raise.
        error_msg = f"Unknown function: {function_name}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})

    except Exception as e:
        error_msg = f"Error executing {function_name}: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
    """
    Describe every registered toolset and whether it is currently usable.

    Returns:
        Dict: Per-toolset metadata — availability flag, tool names,
        human-readable description, and setup requirements.
    """
    web_tools_info = {
        "available": check_tavily_api_key(),
        "tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"],
        "description": "Web search, content extraction, and website crawling tools",
        "requirements": ["TAVILY_API_KEY environment variable"],
    }

    # Future toolsets can be added here
    return {"web_tools": web_tools_info}
|
||||
|
||||
def check_toolset_requirements() -> Dict[str, bool]:
    """
    Check whether each toolset's runtime requirements are satisfied.

    Returns:
        Dict: Mapping of toolset name to a boolean requirement status.
    """
    status: Dict[str, bool] = {}
    # Web tools only need the Tavily API key to be present.
    status["web_tools"] = check_tavily_api_key()
    return status
|
||||
|
||||
if __name__ == "__main__":
    # Diagnostic demo shown when the module is executed directly.
    print("🛠️ Model Tools Module")
    print("=" * 40)

    # Report which toolsets have their requirements satisfied.
    requirements = check_toolset_requirements()
    print("📋 Toolset Requirements:")
    for toolset_name, is_available in requirements.items():
        marker = "✅" if is_available else "❌"
        availability = 'Available' if is_available else 'Missing requirements'
        print(f" {marker} {toolset_name}: {availability}")

    # List every tool definition with a truncated description.
    tools = get_tool_definitions()
    print(f"\n🔧 Available Tools ({len(tools)} total):")
    for tool in tools:
        fn = tool["function"]
        summary = fn["description"]
        if len(summary) > 80:
            summary = summary[:80] + '...'
        print(f" 📌 {fn['name']}: {summary}")

    # Per-toolset details, including unmet requirements.
    toolsets = get_available_toolsets()
    print("\n📦 Toolset Information:")
    for toolset_name, info in toolsets.items():
        marker = "✅" if info["available"] else "❌"
        print(f" {marker} {toolset_name}: {info['description']}")
        if not info["available"]:
            print(f" Requirements: {', '.join(info['requirements'])}")

    print("\n💡 Usage Example:")
    for example_line in (
        " from model_tools import get_tool_definitions, handle_function_call",
        " tools = get_tool_definitions()",
        " result = handle_function_call('web_search_tool', {'query': 'Python'})",
    ):
        print(example_line)
|
||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
tavily-python
|
||||
openai
|
||||
324
run_agent.py
Normal file
324
run_agent.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI Agent Runner with Tool Calling
|
||||
|
||||
This module provides a clean, standalone agent that can execute AI models
|
||||
with tool calling capabilities. It handles the conversation loop, tool execution,
|
||||
and response management.
|
||||
|
||||
Features:
|
||||
- Automatic tool calling loop until completion
|
||||
- Configurable model parameters
|
||||
- Error handling and recovery
|
||||
- Message history management
|
||||
- Support for multiple model providers
|
||||
|
||||
Usage:
|
||||
from run_agent import AIAgent
|
||||
|
||||
agent = AIAgent(base_url="http://localhost:30000/v1", model="claude-opus-4-20250514")
|
||||
response = agent.run_conversation("Tell me about the latest Python updates")
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
from openai import OpenAI
|
||||
|
||||
# Import our tool system
|
||||
from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
|
||||
|
||||
|
||||
class AIAgent:
    """
    AI Agent with tool calling capabilities.

    This class manages the conversation flow, tool execution, and response handling
    for AI models that support function calling.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: str = "gpt-4",
        max_iterations: int = 10,
        tool_delay: float = 1.0
    ):
        """
        Initialize the AI Agent.

        Args:
            base_url (str): Base URL for the model API (optional)
            api_key (str): API key for authentication (optional, uses env var if not provided)
            model (str): Model name to use (default: "gpt-4")
            max_iterations (int): Maximum number of tool calling iterations (default: 10)
            tool_delay (float): Delay between tool calls in seconds (default: 1.0)

        Raises:
            RuntimeError: If the OpenAI client cannot be constructed.
        """
        self.model = model
        self.max_iterations = max_iterations
        self.tool_delay = tool_delay

        # Initialize OpenAI client; only forward kwargs that were supplied.
        client_kwargs = {}
        if base_url:
            client_kwargs["base_url"] = base_url
        if api_key:
            client_kwargs["api_key"] = api_key
        else:
            # Fall back to the environment. "dummy-key" keeps the client
            # constructible for local servers that ignore authentication.
            client_kwargs["api_key"] = os.getenv("OPENAI_API_KEY", "dummy-key")

        try:
            self.client = OpenAI(**client_kwargs)
            print(f"🤖 AI Agent initialized with model: {self.model}")
            if base_url:
                print(f"🔗 Using custom base URL: {base_url}")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize OpenAI client: {e}")

        # Load tool definitions once; reused on every API call.
        self.tools = get_tool_definitions()
        print(f"🛠️ Loaded {len(self.tools)} tools")

        # Warn (but do not fail) when a toolset's requirements are missing.
        requirements = check_toolset_requirements()
        missing_reqs = [name for name, available in requirements.items() if not available]
        if missing_reqs:
            print(f"⚠️ Some tools may not work due to missing requirements: {missing_reqs}")

    def create_system_message(self, custom_system: Optional[str] = None) -> str:
        """
        Create the system message for the agent.

        Args:
            custom_system (str): Custom system message (optional); when given
                it is returned verbatim, replacing the built-in default

        Returns:
            str: System message content
        """
        if custom_system:
            return custom_system

        # Default prompt: encourages <thinking>-tagged reasoning and
        # describes the web research tools to the model.
        return (
            "You are an AI assistant that provides helpful responses. You may use extremely long chains of thought "
            "to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help "
            "come to a correct solution prior to answering. You should enclose your thoughts and internal monologue "
            "inside <thinking> tags.\n\n"
            "You are equipped with web research tools that allow you to search the web, extract content from web pages, "
            "and crawl websites. Use these tools to gather current information and provide accurate, well-researched responses. "
            "You can call multiple tools in parallel if they are not reliant on each other's results. You can also use "
            "sequential tool calls to build on data you've collected from previous tool calls. Continue using tools until "
            "you feel confident you have enough information to provide a comprehensive answer."
        )

    def run_conversation(
        self,
        user_message: str,
        system_message: Optional[str] = None,
        conversation_history: Optional[List[Dict[str, Any]]] = None
    ) -> Dict[str, Any]:
        """
        Run a complete conversation with tool calling until completion.

        Args:
            user_message (str): The user's message/question
            system_message (str): Custom system message (optional)
            conversation_history (List[Dict]): Previous conversation messages (optional)

        Returns:
            Dict: Keys "final_response" (str or None), "messages" (full
            message list), "api_calls" (int), and "completed" (bool).

        NOTE(review): when conversation_history is provided, it is mutated
        in place (system/user/tool messages are appended to it) and the same
        list object is returned under "messages" — callers reusing the list
        will observe the updates.
        """
        # Initialize conversation
        messages = conversation_history or []

        # Add system message if not already present
        if not messages or messages[0]["role"] != "system":
            messages.insert(0, {
                "role": "system",
                "content": self.create_system_message(system_message)
            })

        # Add user message
        messages.append({
            "role": "user",
            "content": user_message
        })

        print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")

        # Main conversation loop: keep calling the model, executing any tool
        # calls it requests, until it answers without tools or we hit the cap.
        api_call_count = 0
        final_response = None

        while api_call_count < self.max_iterations:
            api_call_count += 1
            print(f"\n🔄 Making API call #{api_call_count}...")

            try:
                # Make API call with tools
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    tools=self.tools if self.tools else None
                )

                assistant_message = response.choices[0].message

                # Echo a truncated preview of any assistant text.
                if assistant_message.content:
                    print(f"🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")

                # Check for tool calls
                if assistant_message.tool_calls:
                    print(f"🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")

                    # Record the assistant turn (with its tool calls) so the
                    # subsequent "tool" messages have matching tool_call_ids.
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content,
                        "tool_calls": [
                            {
                                "id": tool_call.id,
                                "type": tool_call.type,
                                "function": {
                                    "name": tool_call.function.name,
                                    "arguments": tool_call.function.arguments
                                }
                            }
                            for tool_call in assistant_message.tool_calls
                        ]
                    })

                    # Execute each tool call sequentially.
                    for i, tool_call in enumerate(assistant_message.tool_calls, 1):
                        function_name = tool_call.function.name

                        try:
                            function_args = json.loads(tool_call.function.arguments)
                        except json.JSONDecodeError as e:
                            # Malformed arguments: run the tool with no args
                            # rather than aborting the conversation.
                            print(f"❌ Invalid JSON in tool call arguments: {e}")
                            function_args = {}

                        print(f" 📞 Tool {i}: {function_name}({list(function_args.keys())})")

                        # Execute the tool via the model_tools dispatcher.
                        function_result = handle_function_call(function_name, function_args)

                        # Feed the result back to the model as a "tool" message.
                        messages.append({
                            "role": "tool",
                            "content": function_result,
                            "tool_call_id": tool_call.id
                        })

                        print(f" ✅ Tool {i} completed")

                        # Optional pacing delay between tool calls (skipped
                        # after the last one).
                        if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
                            time.sleep(self.tool_delay)

                    # Loop again so the model can react to the tool results.
                    continue

                else:
                    # No tool calls - this is the final response
                    final_response = assistant_message.content or ""

                    # Add final assistant message
                    messages.append({
                        "role": "assistant",
                        "content": final_response
                    })

                    print(f"🎉 Conversation completed after {api_call_count} API call(s)")
                    break

            except Exception as e:
                error_msg = f"Error during API call #{api_call_count}: {str(e)}"
                print(f"❌ {error_msg}")

                # Record the error in-band and let the loop retry.
                messages.append({
                    "role": "assistant",
                    "content": f"I encountered an error: {error_msg}. Let me try a different approach."
                })

                # If we're near the limit, break to avoid infinite loops
                if api_call_count >= self.max_iterations - 1:
                    final_response = f"I apologize, but I encountered repeated errors: {error_msg}"
                    break

        # Handle max iterations reached
        if api_call_count >= self.max_iterations:
            print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Stopping to prevent infinite loop.")
            if final_response is None:
                final_response = "I've reached the maximum number of iterations. Here's what I found so far."

        return {
            "final_response": final_response,
            "messages": messages,
            "api_calls": api_call_count,
            "completed": final_response is not None
        }

    def chat(self, message: str) -> str:
        """
        Simple chat interface that returns just the final response.

        Args:
            message (str): User message

        Returns:
            str: Final assistant response
        """
        result = self.run_conversation(message)
        return result["final_response"]
|
||||
|
||||
|
||||
def main():
    """
    Entry point when the module is executed as a script.

    Builds an agent pointed at a remote endpoint, runs one example research
    conversation, and prints a summary of the result.
    """
    banner = "=" * 50

    print("🤖 AI Agent with Tool Calling")
    print(banner)

    # Initialize agent with local SGLang server (modify as needed)
    try:
        agent = AIAgent(
            base_url="https://api.anthropic.com/v1/",
            model="claude-opus-4-20250514"
        )
    except RuntimeError as init_error:
        print(f"❌ Failed to initialize agent: {init_error}")
        return

    # Example conversation
    user_query = (
        "Tell me about the latest developments in Python 3.12 and what new features "
        "developers should know about. Please search for current information."
    )

    print(f"\n📝 User Query: {user_query}")
    print("\n" + banner)

    # Run conversation
    result = agent.run_conversation(user_query)

    print("\n" + banner)
    print("📋 CONVERSATION SUMMARY")
    print(banner)
    print(f"✅ Completed: {result['completed']}")
    print(f"📞 API Calls: {result['api_calls']}")
    print(f"💬 Messages: {len(result['messages'])}")

    final = result['final_response']
    if final:
        print("\n🎯 FINAL RESPONSE:")
        print("-" * 30)
        print(final)

    print("\n👋 Agent execution completed!")


if __name__ == "__main__":
    main()
|
||||
0
terminal_tool.py
Normal file
0
terminal_tool.py
Normal file
265
web_tools.py
Normal file
265
web_tools.py
Normal file
@@ -0,0 +1,265 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone Web Tools Module
|
||||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Currently uses Tavily as the backend, but the interface makes it easy to swap
|
||||
to other providers like Firecrawl without changing the function signatures.
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
- web_extract_tool: Extract content from specific web pages
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
|
||||
Backend compatibility:
|
||||
- Tavily: https://docs.tavily.com/
|
||||
- Firecrawl: https://docs.firecrawl.dev/features/search
|
||||
|
||||
Usage:
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
|
||||
|
||||
# Search the web
|
||||
results = web_search_tool("Python machine learning libraries", limit=3)
|
||||
|
||||
# Extract content from URLs
|
||||
content = web_extract_tool(["https://example.com"], format="markdown")
|
||||
|
||||
# Crawl a website
|
||||
crawl_data = web_crawl_tool("example.com", "Find contact information")
|
||||
"""
|
||||
|
||||
#TODO: Search Capabilities over the scraped pages
|
||||
#TODO: Store the pages in something
|
||||
#TODO: Tool to see what pages are available/saved to search over
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import List
|
||||
from tavily import TavilyClient
|
||||
|
||||
# Initialize Tavily client once at module level
|
||||
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
||||
|
||||
|
||||
def clean_base64_images(text: str) -> str:
    """
    Replace inline base64-encoded images with a short placeholder.

    Handles both markdown-style image payloads wrapped in parentheses,
    e.g. (data:image/png;base64,...), and bare data URIs such as
    data:image/jpeg;base64,... — trimming them keeps tool output small.

    Args:
        text: The text content to clean

    Returns:
        Cleaned text with base64 images replaced with placeholders
    """
    placeholder = '[BASE64_IMAGE_REMOVED]'

    # Parenthesised payloads are handled first so the wrapping parens are
    # removed along with the data; the bare pattern then catches the rest.
    patterns = (
        r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)',
        r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+',
    )

    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, placeholder, cleaned)

    return cleaned
|
||||
|
||||
|
||||
def web_search_tool(query: str, limit: int = 5) -> str:
    """
    Search the web through the configured backend (currently Tavily).

    The signature is backend-agnostic so the provider can be swapped
    without touching callers.

    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to return (default: 5)

    Returns:
        str: JSON string with the backend response ("query" plus a
        "results" list of title/url/content/score entries), or a JSON
        object with an "error" key when the call fails.
    """
    try:
        print(f"🔍 Searching the web for: '{query}' (limit: {limit})")

        # "advanced" search depth yields richer, more comprehensive snippets.
        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")

        print(f"✅ Found {len(response.get('results', []))} results")

        # Strip inline base64 images so results stay token-friendly.
        return clean_base64_images(json.dumps(response, indent=2))

    except Exception as e:
        error_msg = f"Error searching web: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
def web_extract_tool(urls: List[str], format: str = None) -> str:
    """
    Extract page content from the given URLs via the configured backend.

    The signature is backend-agnostic (currently Tavily) so the provider
    can be swapped without touching callers.

    Args:
        urls (List[str]): List of URLs to extract content from
        format (str): Desired output format ("markdown" or "html", optional)

    Returns:
        str: JSON string with a "results" list (url/title/raw_content/
        content entries), or a JSON object with an "error" key on failure.
    """
    try:
        print(f"📄 Extracting content from {len(urls)} URL(s)")

        response = tavily_client.extract(urls=urls, format=format)

        extracted = response.get('results', [])
        print(f"✅ Extracted content from {len(extracted)} pages")

        # Per-page summary for debugging.
        for page in extracted:
            page_url = page.get('url', 'Unknown URL')
            size = len(page.get('raw_content', ''))
            print(f" 📝 {page_url} ({size} characters)")

        # Strip inline base64 images before returning.
        return clean_base64_images(json.dumps(response, indent=2))

    except Exception as e:
        error_msg = f"Error extracting content: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
    """
    Crawl a website via the configured backend (currently Tavily).

    The signature is backend-agnostic so the provider can be swapped
    without touching callers.

    Args:
        url (str): The base URL to crawl (can include or exclude https://)
        instructions (str): Instructions for what to crawl/extract (optional)
        depth (str): Extraction depth, "basic" or "advanced" (default: "basic")

    Returns:
        str: JSON string with a "results" list (url/title/content entries),
        or a JSON object with an "error" key when crawling fails.
    """
    try:
        suffix = f" with instructions: '{instructions}'" if instructions else ""
        print(f"🕷️ Crawling {url}{suffix}")

        response = tavily_client.crawl(
            url=url,
            limit=20,  # Reasonable limit for most use cases
            instructions=instructions or "Get all available content",
            extract_depth=depth
        )

        pages = response.get('results', [])
        print(f"✅ Crawled {len(pages)} pages")

        # Per-page summary for debugging.
        for page in pages:
            page_url = page.get('url', 'Unknown URL')
            size = len(page.get('content', ''))
            print(f" 🌐 {page_url} ({size} characters)")

        # Strip inline base64 images before returning.
        return clean_base64_images(json.dumps(response, indent=2))

    except Exception as e:
        error_msg = f"Error crawling website: {str(e)}"
        print(f"❌ {error_msg}")
        return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
# Convenience function to check if API key is available
|
||||
def check_tavily_api_key() -> bool:
    """
    Report whether a Tavily API key is configured.

    Returns:
        bool: True when TAVILY_API_KEY is set to a non-empty value in the
        environment, False otherwise.
    """
    api_key = os.getenv("TAVILY_API_KEY")
    return api_key is not None and api_key != ""
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test shown when the module is executed directly.
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)

    # Bail out early with setup instructions when the key is missing.
    if not check_tavily_api_key():
        for line in (
            "❌ TAVILY_API_KEY environment variable not set",
            "Please set your API key: export TAVILY_API_KEY='your-key-here'",
            "Get API key at: https://tavily.com/",
        ):
            print(line)
        exit(1)

    print("✅ Tavily API key found")
    print("🛠️ Web tools ready for use!")
    print("\nExample usage:")
    for example in (
        " from web_tools import web_search_tool, web_extract_tool, web_crawl_tool",
        " results = web_search_tool('Python tutorials')",
        " content = web_extract_tool(['https://example.com'])",
        " crawl_data = web_crawl_tool('example.com', 'Find documentation')",
    ):
        print(example)
|
||||
Reference in New Issue
Block a user