#!/usr/bin/env python3
"""
Local Inference Bridge — Fast-path for low-entropy LLM tasks.

Detects local Ollama/llama-cpp instances and uses them for 'Auxiliary' tasks
(summarization, extraction, simple verification) to reduce cloud dependency.
"""

import logging
from typing import Dict, Optional

import requests

from tools.registry import registry, tool_error, tool_result

logger = logging.getLogger(__name__)

LOCAL_INFERENCE_SCHEMA = {
    "name": "local_inference",
    "description": "Execute a task using a local inference engine (Ollama/llama-cpp) if available. Ideal for simple summarization, text cleanup, or data extraction where cloud-grade intelligence is overkill.",
    "parameters": {
        "type": "object",
        "properties": {
            "prompt": {"type": "string", "description": "The task prompt."},
            "system": {"type": "string", "description": "Optional system instruction."},
            "engine": {"type": "string", "enum": ["auto", "ollama", "llama-cpp"], "default": "auto"},
        },
        "required": ["prompt"],
    },
}
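
# Illustrative only: the handler registered below receives the validated arguments
# as a plain dict shaped by this schema, e.g. {"prompt": "Summarize: ...", "engine": "auto"}.
# How a call reaches the handler depends on this project's registry dispatch.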


def detect_local_engine(preferred: str = "auto") -> Optional[Dict[str, str]]:
    """Detect a local inference engine, optionally restricted to one engine type."""
    # 1. Check Ollama (default port 11434).
    if preferred in ("auto", "ollama"):
        try:
            res = requests.get("http://localhost:11434/api/tags", timeout=1)
            if res.status_code == 200:
                return {"type": "ollama", "url": "http://localhost:11434"}
        except requests.RequestException:
            pass

    # 2. Check llama-cpp-python's OpenAI-compatible server (commonly on 8000 or 8080).
    if preferred in ("auto", "llama-cpp"):
        for port in (8000, 8080):
            try:
                res = requests.get(f"http://localhost:{port}/v1/models", timeout=1)
                if res.status_code == 200:
                    return {"type": "llama-cpp", "url": f"http://localhost:{port}"}
            except requests.RequestException:
                pass

    return None


def run_local_task(prompt: str, system: Optional[str] = None, engine: str = "auto"):
    """Execute inference on a detected local engine."""
    info = detect_local_engine(preferred=engine)
    if not info:
        return tool_error("No local inference engine (Ollama or llama-cpp) detected on localhost.")

    try:
        if info["type"] == "ollama":
            # Select the first available model, or default to gemma.
            models = requests.get(f"{info['url']}/api/tags", timeout=5).json().get("models", [])
            model_name = models[0]["name"] if models else "gemma"

            payload = {
                "model": model_name,
                "prompt": prompt,
                "stream": False,
            }
            if system:
                payload["system"] = system

            res = requests.post(f"{info['url']}/api/generate", json=payload, timeout=60)
            result = res.json().get("response", "")
            return tool_result(engine="Ollama", model=model_name, response=result)

        elif info["type"] == "llama-cpp":
            # llama-cpp-python exposes an OpenAI-compatible chat completions endpoint.
            payload = {
                "model": "local-model",
                "messages": [
                    {"role": "system", "content": system or "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
            }
            res = requests.post(f"{info['url']}/v1/chat/completions", json=payload, timeout=60)
            result = res.json()["choices"][0]["message"]["content"]
            return tool_result(engine="llama-cpp", response=result)

    except Exception as e:
        return tool_error(f"Local inference failed: {e}")


def _handle_local_inference(args, **kwargs):
    return run_local_task(
        prompt=args.get("prompt"),
        system=args.get("system"),
        engine=args.get("engine", "auto"),
    )


registry.register(
    name="local_inference",
    toolset="inference",
    schema=LOCAL_INFERENCE_SCHEMA,
    handler=_handle_local_inference,
    emoji="🏠",
)
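

# Minimal manual smoke test (a sketch, not part of the tool surface): running this
# module directly probes for a local engine and, if one is found, sends a trivial
# prompt through run_local_task. It assumes nothing beyond what is defined above;
# the exact shape of tool_result/tool_error comes from tools.registry.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    engine_info = detect_local_engine()
    if engine_info is None:
        logger.info("No local inference engine detected; nothing to test.")
    else:
        logger.info("Detected %s at %s", engine_info["type"], engine_info["url"])
        outcome = run_local_task(
            prompt="Reply with the single word: ready",
            system="You are a terse assistant.",
            engine="auto",
        )
        logger.info("Result: %s", outcome)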