feat: Local Inference Bridge — Bypassing cloud for local tasks
All checks were successful
Lint / lint (pull_request) Successful in 17s

This commit is contained in:
2026-04-22 03:01:37 +00:00
parent a2a40429bd
commit 5a0bdb556e

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Local Inference Bridge — Fast-path for low-entropy LLM tasks.
Detects local Ollama/llama-cpp instances and uses them for 'Auxiliary' tasks
(summarization, extraction, simple verification) to reduce cloud dependency.
"""
import json
import logging
import os
import requests
from typing import Dict, List, Optional, Any
from tools.registry import registry, tool_error, tool_result
logger = logging.getLogger(__name__)
# JSON-schema tool declaration consumed by the tool registry (see
# registry.register at the bottom of this module).  "engine" defaults to
# "auto", which probes for whichever local server responds first.
LOCAL_INFERENCE_SCHEMA = {
    "name": "local_inference",
    "description": "Execute a task using a local inference engine (Ollama/llama-cpp) if available. Ideal for simple summarization, text cleanup, or data extraction where cloud-grade intelligence is overkill.",
    "parameters": {
        "type": "object",
        "properties": {
            "prompt": {"type": "string", "description": "The task prompt."},
            "system": {"type": "string", "description": "Optional system instruction."},
            "engine": {"type": "string", "enum": ["auto", "ollama", "llama-cpp"], "default": "auto"}
        },
        # Only "prompt" is mandatory; "system" and "engine" are optional.
        "required": ["prompt"]
    }
}
def detect_local_engine() -> Optional[Dict[str, str]]:
    """Probe localhost for a running local inference engine.

    Checks, in order of preference:
      1. Ollama on its default port 11434 (``/api/tags``).
      2. A llama-cpp-python OpenAI-compatible server on ports 8000 / 8080
         (``/v1/models``).

    Returns:
        A dict with keys ``type`` ("ollama" or "llama-cpp") and ``url``
        (base URL of the server), or ``None`` if nothing responded.
    """
    # 1. Check Ollama (default port 11434).
    try:
        res = requests.get("http://localhost:11434/api/tags", timeout=1)
        if res.status_code == 200:
            return {"type": "ollama", "url": "http://localhost:11434"}
    # Catch only connection/timeout-style failures; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit during the probe.
    except requests.RequestException:
        pass
    # 2. Check llama-cpp-python (commonly on 8000 or 8080).
    for port in (8000, 8080):
        try:
            res = requests.get(f"http://localhost:{port}/v1/models", timeout=1)
            if res.status_code == 200:
                return {"type": "llama-cpp", "url": f"http://localhost:{port}"}
        except requests.RequestException:
            pass
    return None
def run_local_task(prompt: str, system: Optional[str] = None, engine: str = "auto"):
    """Execute inference on a detected local engine.

    Args:
        prompt: The user task prompt.
        system: Optional system instruction; passed through to the engine.
        engine: "auto" (use whatever is detected, the default) or an explicit
            engine name ("ollama" / "llama-cpp") to require.

    Returns:
        A ``tool_result(...)`` payload with the engine name and response text,
        or ``tool_error(...)`` if no engine is available or the call fails.
    """
    info = detect_local_engine()
    if not info:
        return tool_error("No local inference engine (Ollama or llama-cpp) detected on localhost.")
    # Honor an explicit engine request.  Previously this parameter was
    # accepted (and advertised in the schema) but silently ignored.
    if engine != "auto" and info["type"] != engine:
        return tool_error(f"Local inference failed: requested engine '{engine}' not detected; found '{info['type']}'.")
    try:
        if info["type"] == "ollama":
            # Select first available model or default to gemma.  The model
            # listing gets a short timeout so a wedged daemon cannot hang
            # the tool call (the original request had none).
            models = requests.get(f"{info['url']}/api/tags", timeout=5).json().get("models", [])
            model_name = models[0]["name"] if models else "gemma"
            payload = {
                "model": model_name,
                "prompt": prompt,
                "stream": False
            }
            if system:
                payload["system"] = system
            res = requests.post(f"{info['url']}/api/generate", json=payload, timeout=60)
            result = res.json().get("response", "")
            return tool_result(engine="Ollama", model=model_name, response=result)
        elif info["type"] == "llama-cpp":
            # llama-cpp-python exposes an OpenAI-compatible chat endpoint;
            # the model name is ignored by single-model servers.
            payload = {
                "model": "local-model",
                "messages": [
                    {"role": "system", "content": system or "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ]
            }
            res = requests.post(f"{info['url']}/v1/chat/completions", json=payload, timeout=60)
            result = res.json()["choices"][0]["message"]["content"]
            return tool_result(engine="llama-cpp", response=result)
    except Exception as e:
        return tool_error(f"Local inference failed: {str(e)}")
def _handle_local_inference(args, **kwargs):
    """Registry adapter: unpack the tool-call arguments dict and dispatch."""
    prompt = args.get("prompt")
    system = args.get("system")
    engine = args.get("engine", "auto")
    return run_local_task(prompt=prompt, system=system, engine=engine)
# Module side effect: register the tool with the shared registry on import.
registry.register(
    name="local_inference",
    toolset="inference",
    schema=LOCAL_INFERENCE_SCHEMA,
    handler=_handle_local_inference,
    emoji="🏠"
)