Update vision_tools.py to include image downloading and base64 conversion features.

Also exclude temporary image downloads in .gitignore.
This commit is contained in:
teknium
2025-10-08 02:38:04 +00:00
parent d36790de91
commit 8d256779d8
3 changed files with 124 additions and 7 deletions

View File

@@ -9,8 +9,10 @@ Available tools:
- vision_analyze_tool: Analyze images from URLs with custom prompts
Features:
- Downloads images from URLs and converts to base64 for API compatibility
- Comprehensive image description
- Context-aware analysis based on user queries
- Automatic temporary file cleanup
- Proper error handling and validation
- Debug logging support
@@ -30,6 +32,8 @@ import os
import asyncio
import uuid
import datetime
import base64
import requests
from pathlib import Path
from typing import Dict, Any, Optional
from openai import AsyncOpenAI
@@ -127,6 +131,85 @@ def _validate_image_url(url: str) -> bool:
return True # Allow all HTTP/HTTPS URLs for flexibility
def _download_image(image_url: str, destination: Path) -> Path:
    """
    Fetch the image at ``image_url`` and store its bytes at ``destination``.

    Missing parent directories of ``destination`` are created before the
    request is made. The request is a single HTTP GET with a 30-second
    timeout and a fixed ``User-Agent`` header.

    Args:
        image_url (str): Location of the image to fetch
        destination (Path): File path the response body is written to

    Returns:
        Path: ``destination`` itself, once the file has been written

    Raises:
        Exception: Whatever ``requests`` raises for a failed request or an
            error status (via ``raise_for_status()``), or any filesystem
            error from creating the directory / writing the file
    """
    # Make sure the target directory exists up front
    target_dir = destination.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    request_headers = {"User-Agent": "hermes-agent-vision/1.0"}
    reply = requests.get(image_url, timeout=30, headers=request_headers)

    # Bail out on error statuses before anything is written to disk
    reply.raise_for_status()

    destination.write_bytes(reply.content)
    return destination
def _determine_mime_type(image_path: Path) -> str:
"""
Determine the MIME type of an image based on its file extension.
Args:
image_path (Path): Path to the image file
Returns:
str: The MIME type (defaults to image/jpeg if unknown)
"""
extension = image_path.suffix.lower()
mime_types = {
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.bmp': 'image/bmp',
'.webp': 'image/webp',
'.svg': 'image/svg+xml'
}
return mime_types.get(extension, 'image/jpeg')
def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) -> str:
"""
Convert an image file to a base64-encoded data URL.
Args:
image_path (Path): Path to the image file
mime_type (Optional[str]): MIME type of the image (auto-detected if None)
Returns:
str: Base64-encoded data URL (e.g., "data:image/jpeg;base64,...")
"""
# Read the image as bytes
data = image_path.read_bytes()
# Encode to base64
encoded = base64.b64encode(data).decode("ascii")
# Determine MIME type
mime = mime_type or _determine_mime_type(image_path)
# Create data URL
data_url = f"data:{mime};base64,{encoded}"
return data_url
async def vision_analyze_tool(
image_url: str,
user_prompt: str,
@@ -135,13 +218,16 @@ async def vision_analyze_tool(
"""
Analyze an image from a URL using vision AI.
This tool processes images using Gemini Flash via Nous Research API.
This tool downloads images from URLs, converts them to base64, and processes
them using Gemini Flash via Nous Research API. The image is downloaded to a
temporary location and automatically cleaned up after processing.
The user_prompt parameter is expected to be pre-formatted by the calling
function (typically model_tools.py) to include both full description
requests and specific questions.
Args:
image_url (str): The URL of the image to analyze
image_url (str): The URL of the image to analyze (must be http:// or https://)
user_prompt (str): The pre-formatted prompt for the vision model
model (str): The vision model to use (default: gemini-2.5-flash)
@@ -153,7 +239,12 @@ async def vision_analyze_tool(
}
Raises:
Exception: If analysis fails or API key is not set
Exception: If download fails, analysis fails, or API key is not set
Note:
- Temporary images are stored in ./temp_vision_images/
- Images are automatically deleted after processing
- Supports common image formats (JPEG, PNG, GIF, WebP, etc.)
"""
debug_call_data = {
"parameters": {
@@ -167,6 +258,8 @@ async def vision_analyze_tool(
"model_used": model
}
temp_image_path = None
try:
print(f"🔍 Analyzing image from URL: {image_url[:60]}{'...' if len(image_url) > 60 else ''}")
print(f"📝 User prompt: {user_prompt[:100]}{'...' if len(user_prompt) > 100 else ''}")
@@ -179,10 +272,23 @@ async def vision_analyze_tool(
if not os.getenv("NOUS_API_KEY"):
raise ValueError("NOUS_API_KEY environment variable not set")
# Download the image to a temporary location
print(f"⬇️ Downloading image from URL...")
temp_dir = Path("./temp_vision_images")
temp_image_path = temp_dir / f"temp_image_{uuid.uuid4()}.jpg"
_download_image(image_url, temp_image_path)
print(f"✅ Image downloaded successfully")
# Convert image to base64 data URL
print(f"🔄 Converting image to base64...")
image_data_url = _image_to_base64_data_url(temp_image_path)
print(f"✅ Image converted to base64 ({len(image_data_url)} characters)")
# Use the prompt as provided (model_tools.py now handles full description formatting)
comprehensive_prompt = user_prompt
# Prepare the message with image URL format
# Prepare the message with base64-encoded image
messages = [
{
"role": "user",
@@ -194,7 +300,7 @@ async def vision_analyze_tool(
{
"type": "image_url",
"image_url": {
"url": image_url
"url": image_data_url
}
}
]
@@ -247,6 +353,15 @@ async def vision_analyze_tool(
_save_debug_log()
return json.dumps(result, indent=2)
finally:
# Clean up temporary image file
if temp_image_path and temp_image_path.exists():
try:
temp_image_path.unlink()
print(f"🧹 Cleaned up temporary image file")
except Exception as cleanup_error:
print(f"⚠️ Warning: Could not delete temporary file: {cleanup_error}")
def check_nous_api_key() -> bool: