2025-10-01 23:29:25 +00:00
#!/usr/bin/env python3
"""
Vision Tools Module
This module provides vision analysis tools that work with image URLs .
2026-01-14 13:40:10 +00:00
Uses Gemini 3 Flash Preview via OpenRouter API for intelligent image understanding .
2025-10-01 23:29:25 +00:00
Available tools :
- vision_analyze_tool : Analyze images from URLs with custom prompts
Features :
2025-10-08 02:38:04 +00:00
- Downloads images from URLs and converts to base64 for API compatibility
2025-10-01 23:29:25 +00:00
- Comprehensive image description
- Context - aware analysis based on user queries
2025-10-08 02:38:04 +00:00
- Automatic temporary file cleanup
2025-10-01 23:29:25 +00:00
- Proper error handling and validation
- Debug logging support
Usage :
from vision_tools import vision_analyze_tool
import asyncio
# Analyze an image
result = await vision_analyze_tool (
image_url = " https://example.com/image.jpg " ,
user_prompt = " What architectural style is this building? "
)
"""
import json
2026-02-21 03:11:11 -08:00
import logging
2025-10-01 23:29:25 +00:00
import os
import asyncio
import uuid
2025-10-08 02:38:04 +00:00
import base64
2025-10-01 23:29:25 +00:00
from pathlib import Path
from typing import Dict , Any , Optional
2026-02-20 23:23:32 -08:00
import httpx
2026-02-21 03:53:24 -08:00
from tools . openrouter_client import get_async_client as _get_openrouter_client , check_api_key as check_openrouter_api_key
from tools . debug_helpers import DebugSession
2025-10-01 23:29:25 +00:00
2026-02-21 03:11:11 -08:00
logger = logging . getLogger ( __name__ )
2025-10-01 23:29:25 +00:00
# Configuration for vision processing
2026-01-14 13:40:10 +00:00
DEFAULT_VISION_MODEL = " google/gemini-3-flash-preview "
2025-10-01 23:29:25 +00:00
2026-02-21 03:53:24 -08:00
_debug = DebugSession ( " vision_tools " , env_var = " VISION_TOOLS_DEBUG " )
2025-10-01 23:29:25 +00:00
def _validate_image_url ( url : str ) - > bool :
"""
Basic validation of image URL format .
Args :
url ( str ) : The URL to validate
Returns :
bool : True if URL appears to be valid , False otherwise
"""
if not url or not isinstance ( url , str ) :
return False
# Check if it's a valid URL format
if not ( url . startswith ( ' http:// ' ) or url . startswith ( ' https:// ' ) ) :
return False
# Check for common image extensions (optional, as URLs may not have extensions)
image_extensions = [ ' .jpg ' , ' .jpeg ' , ' .png ' , ' .gif ' , ' .bmp ' , ' .webp ' , ' .svg ' ]
return True # Allow all HTTP/HTTPS URLs for flexibility
2026-01-18 10:11:59 +00:00
async def _download_image ( image_url : str , destination : Path , max_retries : int = 3 ) - > Path :
2025-10-08 02:38:04 +00:00
"""
2026-01-18 10:11:59 +00:00
Download an image from a URL to a local destination ( async ) with retry logic .
2025-10-08 02:38:04 +00:00
Args :
image_url ( str ) : The URL of the image to download
destination ( Path ) : The path where the image should be saved
2026-01-18 10:11:59 +00:00
max_retries ( int ) : Maximum number of retry attempts ( default : 3 )
2025-10-08 02:38:04 +00:00
Returns :
Path : The path to the downloaded image
Raises :
2026-01-18 10:11:59 +00:00
Exception : If download fails after all retries
2025-10-08 02:38:04 +00:00
"""
2026-01-18 10:11:59 +00:00
import asyncio
2025-10-08 02:38:04 +00:00
# Create parent directories if they don't exist
destination . parent . mkdir ( parents = True , exist_ok = True )
2026-01-18 10:11:59 +00:00
last_error = None
for attempt in range ( max_retries ) :
try :
# Download the image with appropriate headers using async httpx
2026-01-29 06:10:24 +00:00
# Enable follow_redirects to handle image CDNs that redirect (e.g., Imgur, Picsum)
async with httpx . AsyncClient ( timeout = 30.0 , follow_redirects = True ) as client :
2026-01-18 10:11:59 +00:00
response = await client . get (
image_url ,
2026-01-29 06:10:24 +00:00
headers = {
" User-Agent " : " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 " ,
" Accept " : " image/*,*/*;q=0.8 " ,
} ,
2026-01-18 10:11:59 +00:00
)
response . raise_for_status ( )
# Save the image content
destination . write_bytes ( response . content )
return destination
except Exception as e :
last_error = e
if attempt < max_retries - 1 :
wait_time = 2 * * ( attempt + 1 ) # 2s, 4s, 8s
2026-02-21 03:11:11 -08:00
logger . warning ( " Image download failed (attempt %s / %s ): %s " , attempt + 1 , max_retries , str ( e ) [ : 50 ] )
logger . warning ( " Retrying in %s s... " , wait_time )
2026-01-18 10:11:59 +00:00
await asyncio . sleep ( wait_time )
else :
2026-02-21 03:11:11 -08:00
logger . error ( " Image download failed after %s attempts: %s " , max_retries , str ( e ) [ : 100 ] )
2025-10-08 02:38:04 +00:00
2026-01-18 10:11:59 +00:00
raise last_error
2025-10-08 02:38:04 +00:00
def _determine_mime_type ( image_path : Path ) - > str :
"""
Determine the MIME type of an image based on its file extension .
Args :
image_path ( Path ) : Path to the image file
Returns :
str : The MIME type ( defaults to image / jpeg if unknown )
"""
extension = image_path . suffix . lower ( )
mime_types = {
' .jpg ' : ' image/jpeg ' ,
' .jpeg ' : ' image/jpeg ' ,
' .png ' : ' image/png ' ,
' .gif ' : ' image/gif ' ,
' .bmp ' : ' image/bmp ' ,
' .webp ' : ' image/webp ' ,
' .svg ' : ' image/svg+xml '
}
return mime_types . get ( extension , ' image/jpeg ' )
def _image_to_base64_data_url ( image_path : Path , mime_type : Optional [ str ] = None ) - > str :
"""
Convert an image file to a base64 - encoded data URL .
Args :
image_path ( Path ) : Path to the image file
mime_type ( Optional [ str ] ) : MIME type of the image ( auto - detected if None )
Returns :
str : Base64 - encoded data URL ( e . g . , " data:image/jpeg;base64,... " )
"""
# Read the image as bytes
data = image_path . read_bytes ( )
# Encode to base64
encoded = base64 . b64encode ( data ) . decode ( " ascii " )
# Determine MIME type
mime = mime_type or _determine_mime_type ( image_path )
# Create data URL
data_url = f " data: { mime } ;base64, { encoded } "
return data_url
2025-10-01 23:29:25 +00:00
async def vision_analyze_tool (
image_url : str ,
user_prompt : str ,
model : str = DEFAULT_VISION_MODEL
) - > str :
"""
2026-02-15 16:10:50 -08:00
Analyze an image from a URL or local file path using vision AI .
2025-10-01 23:29:25 +00:00
2026-02-15 16:10:50 -08:00
This tool accepts either an HTTP / HTTPS URL or a local file path . For URLs ,
it downloads the image first . In both cases , the image is converted to base64
and processed using Gemini 3 Flash Preview via OpenRouter API .
2025-10-08 02:38:04 +00:00
2025-10-01 23:29:25 +00:00
The user_prompt parameter is expected to be pre - formatted by the calling
function ( typically model_tools . py ) to include both full description
requests and specific questions .
Args :
2026-02-15 16:10:50 -08:00
image_url ( str ) : The URL or local file path of the image to analyze .
Accepts http : / / , https : / / URLs or absolute / relative file paths .
2025-10-01 23:29:25 +00:00
user_prompt ( str ) : The pre - formatted prompt for the vision model
2026-01-14 13:40:10 +00:00
model ( str ) : The vision model to use ( default : google / gemini - 3 - flash - preview )
2025-10-01 23:29:25 +00:00
Returns :
str : JSON string containing the analysis results with the following structure :
{
" success " : bool ,
" analysis " : str ( defaults to error message if None )
}
Raises :
2025-10-08 02:38:04 +00:00
Exception : If download fails , analysis fails , or API key is not set
Note :
2026-02-15 16:10:50 -08:00
- For URLs , temporary images are stored in . / temp_vision_images / and cleaned up
- For local file paths , the file is used directly and NOT deleted
2025-10-08 02:38:04 +00:00
- Supports common image formats ( JPEG , PNG , GIF , WebP , etc . )
2025-10-01 23:29:25 +00:00
"""
debug_call_data = {
" parameters " : {
" image_url " : image_url ,
2025-10-15 18:07:06 +00:00
" user_prompt " : user_prompt [ : 200 ] + " ... " if len ( user_prompt ) > 200 else user_prompt ,
2025-10-01 23:29:25 +00:00
" model " : model
} ,
" error " : None ,
" success " : False ,
" analysis_length " : 0 ,
2025-10-15 18:07:06 +00:00
" model_used " : model ,
" image_size_bytes " : 0
2025-10-01 23:29:25 +00:00
}
2025-10-08 02:38:04 +00:00
temp_image_path = None
2026-02-15 16:10:50 -08:00
# Track whether we should clean up the file after processing.
# Local files (e.g. from the image cache) should NOT be deleted.
should_cleanup = True
2025-10-08 02:38:04 +00:00
2025-10-01 23:29:25 +00:00
try :
2026-02-21 03:11:11 -08:00
logger . info ( " Analyzing image: %s " , image_url [ : 60 ] )
logger . info ( " User prompt: %s " , user_prompt [ : 100 ] )
2025-10-01 23:29:25 +00:00
# Check API key availability
2026-01-14 13:40:10 +00:00
if not os . getenv ( " OPENROUTER_API_KEY " ) :
raise ValueError ( " OPENROUTER_API_KEY environment variable not set " )
2025-10-01 23:29:25 +00:00
2026-02-15 16:10:50 -08:00
# Determine if this is a local file path or a remote URL
local_path = Path ( image_url )
if local_path . is_file ( ) :
# Local file path (e.g. from platform image cache) -- skip download
2026-02-21 03:11:11 -08:00
logger . info ( " Using local image file: %s " , image_url )
2026-02-15 16:10:50 -08:00
temp_image_path = local_path
should_cleanup = False # Don't delete cached/local files
elif _validate_image_url ( image_url ) :
# Remote URL -- download to a temporary location
2026-02-21 03:11:11 -08:00
logger . info ( " Downloading image from URL... " )
2026-02-15 16:10:50 -08:00
temp_dir = Path ( " ./temp_vision_images " )
temp_image_path = temp_dir / f " temp_image_ { uuid . uuid4 ( ) } .jpg "
await _download_image ( image_url , temp_image_path )
should_cleanup = True
else :
raise ValueError (
" Invalid image source. Provide an HTTP/HTTPS URL or a valid local file path. "
)
2025-10-15 18:07:06 +00:00
# Get image file size for logging
image_size_bytes = temp_image_path . stat ( ) . st_size
image_size_kb = image_size_bytes / 1024
2026-02-21 03:11:11 -08:00
logger . info ( " Image ready ( %.1f KB) " , image_size_kb )
2025-10-08 02:38:04 +00:00
# Convert image to base64 data URL
2026-02-21 03:11:11 -08:00
logger . info ( " Converting image to base64... " )
2025-10-08 02:38:04 +00:00
image_data_url = _image_to_base64_data_url ( temp_image_path )
2025-10-15 18:07:06 +00:00
# Calculate size in KB for better readability
data_size_kb = len ( image_data_url ) / 1024
2026-02-21 03:11:11 -08:00
logger . info ( " Image converted to base64 ( %.1f KB) " , data_size_kb )
2025-10-15 18:07:06 +00:00
debug_call_data [ " image_size_bytes " ] = image_size_bytes
2025-10-08 02:38:04 +00:00
2025-10-01 23:29:25 +00:00
# Use the prompt as provided (model_tools.py now handles full description formatting)
comprehensive_prompt = user_prompt
2025-10-08 02:38:04 +00:00
# Prepare the message with base64-encoded image
2025-10-01 23:29:25 +00:00
messages = [
{
" role " : " user " ,
" content " : [
{
" type " : " text " ,
" text " : comprehensive_prompt
} ,
{
" type " : " image_url " ,
" image_url " : {
2025-10-08 02:38:04 +00:00
" url " : image_data_url
2025-10-01 23:29:25 +00:00
}
}
]
}
]
2026-02-21 03:11:11 -08:00
logger . info ( " Processing image with %s ... " , model )
2025-10-01 23:29:25 +00:00
2026-01-14 13:40:10 +00:00
# Call the vision API with reasoning enabled
2026-01-29 19:59:59 +00:00
response = await _get_openrouter_client ( ) . chat . completions . create (
2025-10-01 23:29:25 +00:00
model = model ,
messages = messages ,
temperature = 0.1 , # Low temperature for consistent analysis
2026-01-14 13:40:10 +00:00
max_tokens = 2000 , # Generous limit for detailed analysis
extra_body = {
" reasoning " : {
" enabled " : True ,
" effort " : " xhigh "
}
}
2025-10-01 23:29:25 +00:00
)
# Extract the analysis
analysis = response . choices [ 0 ] . message . content . strip ( )
analysis_length = len ( analysis )
2026-02-21 03:11:11 -08:00
logger . info ( " Image analysis completed ( %s characters) " , analysis_length )
2025-10-01 23:29:25 +00:00
# Prepare successful response
result = {
" success " : True ,
" analysis " : analysis or " There was a problem with the request and the image could not be analyzed. "
}
debug_call_data [ " success " ] = True
debug_call_data [ " analysis_length " ] = analysis_length
# Log debug information
2026-02-21 03:53:24 -08:00
_debug . log_call ( " vision_analyze_tool " , debug_call_data )
_debug . save ( )
2025-10-01 23:29:25 +00:00
2025-11-05 03:47:17 +00:00
return json . dumps ( result , indent = 2 , ensure_ascii = False )
2025-10-01 23:29:25 +00:00
except Exception as e :
error_msg = f " Error analyzing image: { str ( e ) } "
2026-02-21 03:11:11 -08:00
logger . error ( " %s " , error_msg )
2025-10-01 23:29:25 +00:00
# Prepare error response
result = {
" success " : False ,
" analysis " : " There was a problem with the request and the image could not be analyzed. "
}
debug_call_data [ " error " ] = error_msg
2026-02-21 03:53:24 -08:00
_debug . log_call ( " vision_analyze_tool " , debug_call_data )
_debug . save ( )
2025-10-01 23:29:25 +00:00
2025-11-05 03:47:17 +00:00
return json . dumps ( result , indent = 2 , ensure_ascii = False )
2025-10-08 02:38:04 +00:00
finally :
2026-02-15 16:10:50 -08:00
# Clean up temporary image file (but NOT local/cached files)
if should_cleanup and temp_image_path and temp_image_path . exists ( ) :
2025-10-08 02:38:04 +00:00
try :
temp_image_path . unlink ( )
2026-02-21 03:11:11 -08:00
logger . debug ( " Cleaned up temporary image file " )
2025-10-08 02:38:04 +00:00
except Exception as cleanup_error :
2026-02-21 03:11:11 -08:00
logger . warning ( " Could not delete temporary file: %s " , cleanup_error )
2025-10-01 23:29:25 +00:00
def check_vision_requirements ( ) - > bool :
"""
Check if all requirements for vision tools are met .
Returns :
bool : True if requirements are met , False otherwise
"""
2026-01-14 13:40:10 +00:00
return check_openrouter_api_key ( )
2025-10-01 23:29:25 +00:00
def get_debug_session_info ( ) - > Dict [ str , Any ] :
"""
Get information about the current debug session .
Returns :
Dict [ str , Any ] : Dictionary containing debug session information
"""
2026-02-21 03:53:24 -08:00
return _debug . get_session_info ( )
2025-10-01 23:29:25 +00:00
if __name__ == " __main__ " :
"""
Simple test / demo when run directly
"""
print ( " 👁️ Vision Tools Module " )
print ( " = " * 40 )
# Check if API key is available
2026-01-14 13:40:10 +00:00
api_available = check_openrouter_api_key ( )
2025-10-01 23:29:25 +00:00
if not api_available :
2026-01-14 13:40:10 +00:00
print ( " ❌ OPENROUTER_API_KEY environment variable not set " )
print ( " Please set your API key: export OPENROUTER_API_KEY= ' your-key-here ' " )
print ( " Get API key at: https://openrouter.ai/ " )
2025-10-01 23:29:25 +00:00
exit ( 1 )
else :
2026-01-14 13:40:10 +00:00
print ( " ✅ OpenRouter API key found " )
2025-10-01 23:29:25 +00:00
print ( " 🛠️ Vision tools ready for use! " )
print ( f " 🧠 Using model: { DEFAULT_VISION_MODEL } " )
# Show debug mode status
2026-02-21 03:53:24 -08:00
if _debug . active :
print ( f " 🐛 Debug mode ENABLED - Session ID: { _debug . session_id } " )
print ( f " Debug logs will be saved to: ./logs/vision_tools_debug_ { _debug . session_id } .json " )
2025-10-01 23:29:25 +00:00
else :
print ( " 🐛 Debug mode disabled (set VISION_TOOLS_DEBUG=true to enable) " )
print ( " \n Basic usage: " )
print ( " from vision_tools import vision_analyze_tool " )
print ( " import asyncio " )
print ( " " )
print ( " async def main(): " )
print ( " result = await vision_analyze_tool( " )
print ( " image_url= ' https://example.com/image.jpg ' , " )
print ( " user_prompt= ' What do you see in this image? ' " )
print ( " ) " )
print ( " print(result) " )
print ( " asyncio.run(main()) " )
print ( " \n Example prompts: " )
print ( " - ' What architectural style is this building? ' " )
print ( " - ' Describe the emotions and mood in this image ' " )
print ( " - ' What text can you read in this image? ' " )
print ( " - ' Identify any safety hazards visible ' " )
print ( " - ' What products or brands are shown? ' " )
print ( " \n Debug mode: " )
print ( " # Enable debug logging " )
print ( " export VISION_TOOLS_DEBUG=true " )
print ( " # Debug logs capture all vision analysis calls and results " )
print ( " # Logs saved to: ./logs/vision_tools_debug_UUID.json " )
2026-02-21 20:22:33 -08:00
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
from tools . registry import registry
VISION_ANALYZE_SCHEMA = {
" name " : " vision_analyze " ,
" description " : " Analyze images using AI vision. Provides a comprehensive description and answers a specific question about the image content. " ,
" parameters " : {
" type " : " object " ,
" properties " : {
" image_url " : {
" type " : " string " ,
" description " : " Image URL (http/https) or local file path to analyze. "
} ,
" question " : {
" type " : " string " ,
" description " : " Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question. "
}
} ,
" required " : [ " image_url " , " question " ]
}
}
def _handle_vision_analyze ( args , * * kw ) :
image_url = args . get ( " image_url " , " " )
question = args . get ( " question " , " " )
full_prompt = f " Fully describe and explain everything about this image, then answer the following question: \n \n { question } "
return vision_analyze_tool ( image_url , full_prompt , " google/gemini-3-flash-preview " )
registry . register (
name = " vision_analyze " ,
toolset = " vision " ,
schema = VISION_ANALYZE_SCHEMA ,
handler = _handle_vision_analyze ,
check_fn = check_vision_requirements ,
requires_env = [ " OPENROUTER_API_KEY " ] ,
is_async = True ,
)