feat: Local Inference Bridge — Bypassing cloud for local tasks
All checks were successful
Lint / lint (pull_request) Successful in 17s

This commit is contained in:
2026-04-22 03:01:37 +00:00
parent a2a40429bd
commit 5a0bdb556e

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Local Inference Bridge — Fast-path for low-entropy LLM tasks.
Detects local Ollama/llama-cpp instances and uses them for 'Auxiliary' tasks
(summarization, extraction, simple verification) to reduce cloud dependency.
"""
import json
import logging
import os
import requests
from typing import Dict, List, Optional, Any
from tools.registry import registry, tool_error, tool_result
logger = logging.getLogger(__name__)
# JSON-schema tool declaration consumed by the tool registry (see
# registry.register at the bottom of this module).  "engine" defaults to
# "auto", which probes for whichever local server responds first.
LOCAL_INFERENCE_SCHEMA = {
    "name": "local_inference",
    "description": "Execute a task using a local inference engine (Ollama/llama-cpp) if available. Ideal for simple summarization, text cleanup, or data extraction where cloud-grade intelligence is overkill.",
    "parameters": {
        "type": "object",
        "properties": {
            "prompt": {"type": "string", "description": "The task prompt."},
            "system": {"type": "string", "description": "Optional system instruction."},
            "engine": {"type": "string", "enum": ["auto", "ollama", "llama-cpp"], "default": "auto"}
        },
        # Only "prompt" is mandatory; "system" and "engine" are optional.
        "required": ["prompt"]
    }
}
def detect_local_engine() -> Optional[Dict[str, str]]:
    """Probe localhost for a running local inference engine.

    Checks, in order of preference:
      1. Ollama on its default port 11434 (``/api/tags``).
      2. A llama-cpp-python OpenAI-compatible server on ports 8000 / 8080
         (``/v1/models``).

    Returns:
        A dict with keys ``type`` ("ollama" or "llama-cpp") and ``url``
        (base URL of the server), or ``None`` if nothing responded.
    """
    # 1. Check Ollama (default port 11434).
    try:
        res = requests.get("http://localhost:11434/api/tags", timeout=1)
        if res.status_code == 200:
            return {"type": "ollama", "url": "http://localhost:11434"}
    # Catch only connection/timeout-style failures; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit during the probe.
    except requests.RequestException:
        pass
    # 2. Check llama-cpp-python (commonly on 8000 or 8080).
    for port in (8000, 8080):
        try:
            res = requests.get(f"http://localhost:{port}/v1/models", timeout=1)
            if res.status_code == 200:
                return {"type": "llama-cpp", "url": f"http://localhost:{port}"}
        except requests.RequestException:
            pass
    return None
def run_local_task(prompt: str, system: Optional[str] = None, engine: str = "auto"):
    """Execute inference on a detected local engine.

    Args:
        prompt: The user task prompt.
        system: Optional system instruction; passed through to the engine.
        engine: "auto" (use whatever is detected, the default) or an explicit
            engine name ("ollama" / "llama-cpp") to require.

    Returns:
        A ``tool_result(...)`` payload with the engine name and response text,
        or ``tool_error(...)`` if no engine is available or the call fails.
    """
    info = detect_local_engine()
    if not info:
        return tool_error("No local inference engine (Ollama or llama-cpp) detected on localhost.")
    # Honor an explicit engine request.  Previously this parameter was
    # accepted (and advertised in the schema) but silently ignored.
    if engine != "auto" and info["type"] != engine:
        return tool_error(f"Local inference failed: requested engine '{engine}' not detected; found '{info['type']}'.")
    try:
        if info["type"] == "ollama":
            # Select first available model or default to gemma.  The model
            # listing gets a short timeout so a wedged daemon cannot hang
            # the tool call (the original request had none).
            models = requests.get(f"{info['url']}/api/tags", timeout=5).json().get("models", [])
            model_name = models[0]["name"] if models else "gemma"
            payload = {
                "model": model_name,
                "prompt": prompt,
                "stream": False
            }
            if system:
                payload["system"] = system
            res = requests.post(f"{info['url']}/api/generate", json=payload, timeout=60)
            result = res.json().get("response", "")
            return tool_result(engine="Ollama", model=model_name, response=result)
        elif info["type"] == "llama-cpp":
            # llama-cpp-python exposes an OpenAI-compatible chat endpoint;
            # the model name is ignored by single-model servers.
            payload = {
                "model": "local-model",
                "messages": [
                    {"role": "system", "content": system or "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ]
            }
            res = requests.post(f"{info['url']}/v1/chat/completions", json=payload, timeout=60)
            result = res.json()["choices"][0]["message"]["content"]
            return tool_result(engine="llama-cpp", response=result)
    except Exception as e:
        return tool_error(f"Local inference failed: {str(e)}")
def _handle_local_inference(args, **kwargs):
    """Registry adapter: unpack the tool-call arguments dict and dispatch."""
    prompt = args.get("prompt")
    system = args.get("system")
    engine = args.get("engine", "auto")
    return run_local_task(prompt=prompt, system=system, engine=engine)
# Module side effect: register the tool with the shared registry on import.
registry.register(
    name="local_inference",
    toolset="inference",
    schema=LOCAL_INFERENCE_SCHEMA,
    handler=_handle_local_inference,
    emoji="🏠"
)