#!/usr/bin/env python3
"""
Local Inference Bridge — Fast-path for low-entropy LLM tasks.

Detects local Ollama/llama-cpp instances and uses them for 'Auxiliary' tasks
(summarization, extraction, simple verification) to reduce cloud dependency.
"""
import logging
from typing import Dict, Optional

import requests

from tools.registry import registry, tool_error, tool_result

logger = logging.getLogger(__name__)

LOCAL_INFERENCE_SCHEMA = {
    "name": "local_inference",
    "description": (
        "Execute a task using a local inference engine (Ollama/llama-cpp) if "
        "available. Ideal for simple summarization, text cleanup, or data "
        "extraction where cloud-grade intelligence is overkill."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "prompt": {"type": "string", "description": "The task prompt."},
            "system": {"type": "string", "description": "Optional system instruction."},
            "engine": {"type": "string", "enum": ["auto", "ollama", "llama-cpp"], "default": "auto"},
        },
        "required": ["prompt"],
    },
}


def detect_local_engine(preferred: str = "auto") -> Optional[Dict[str, str]]:
    """Detect presence of local inference engines, honouring an explicit preference."""
    # 1. Check Ollama (default port 11434).
    if preferred in ("auto", "ollama"):
        try:
            res = requests.get("http://localhost:11434/api/tags", timeout=1)
            if res.status_code == 200:
                return {"type": "ollama", "url": "http://localhost:11434"}
        except requests.RequestException:
            pass

    # 2. Check llama-cpp-python's OpenAI-compatible server (commonly on 8000 or 8080).
    if preferred in ("auto", "llama-cpp"):
        for port in (8000, 8080):
            try:
                res = requests.get(f"http://localhost:{port}/v1/models", timeout=1)
                if res.status_code == 200:
                    return {"type": "llama-cpp", "url": f"http://localhost:{port}"}
            except requests.RequestException:
                pass

    return None


def run_local_task(prompt: str, system: Optional[str] = None, engine: str = "auto"):
    """Execute inference on a detected local engine."""
    info = detect_local_engine(engine)
    if not info:
        return tool_error("No local inference engine (Ollama or llama-cpp) detected on localhost.")

    try:
        if info["type"] == "ollama":
            # Select the first available model, falling back to "gemma".
            models = requests.get(f"{info['url']}/api/tags", timeout=5).json().get("models", [])
            model_name = models[0]["name"] if models else "gemma"
            payload = {"model": model_name, "prompt": prompt, "stream": False}
            if system:
                payload["system"] = system
            res = requests.post(f"{info['url']}/api/generate", json=payload, timeout=60)
            result = res.json().get("response", "")
            return tool_result(engine="Ollama", model=model_name, response=result)

        # llama-cpp exposes an OpenAI-compatible chat completions endpoint.
        payload = {
            "model": "local-model",
            "messages": [
                {"role": "system", "content": system or "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        }
        res = requests.post(f"{info['url']}/v1/chat/completions", json=payload, timeout=60)
        result = res.json()["choices"][0]["message"]["content"]
        return tool_result(engine="llama-cpp", response=result)
    except Exception as e:
        return tool_error(f"Local inference failed: {e}")


def _handle_local_inference(args, **kwargs):
    return run_local_task(
        prompt=args.get("prompt"),
        system=args.get("system"),
        engine=args.get("engine", "auto"),
    )


registry.register(
    name="local_inference",
    toolset="inference",
    schema=LOCAL_INFERENCE_SCHEMA,
    handler=_handle_local_inference,
    emoji="🏠",
)
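

# Usage sketch (not part of the registry path): probes for a local engine and
# runs a trivial prompt through it. Assumes an Ollama or llama-cpp server is
# already listening on localhost; the prompt text is purely illustrative.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    engine_info = detect_local_engine()
    if engine_info:
        logger.info("Detected local engine: %s at %s", engine_info["type"], engine_info["url"])
        print(run_local_task("Summarize in one sentence: local inference avoids cloud round-trips."))
    else:
        logger.info("No local inference engine detected; the cloud path would be used instead.")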