diff --git a/tools/local_inference_tool.py b/tools/local_inference_tool.py
new file mode 100644
index 000000000..237a24ea1
--- /dev/null
+++ b/tools/local_inference_tool.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Local Inference Bridge — Fast-path for low-entropy LLM tasks.
+
+Detects local Ollama/llama-cpp instances and uses them for 'Auxiliary' tasks
+(summarization, extraction, simple verification) to reduce cloud dependency.
+"""
+
+import logging
+import requests
+from typing import Dict, Optional
+from tools.registry import registry, tool_error, tool_result
+
+logger = logging.getLogger(__name__)
+
+LOCAL_INFERENCE_SCHEMA = {
+    "name": "local_inference",
+    "description": "Execute a task using a local inference engine (Ollama/llama-cpp) if available. Ideal for simple summarization, text cleanup, or data extraction where cloud-grade intelligence is overkill.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "prompt": {"type": "string", "description": "The task prompt."},
+            "system": {"type": "string", "description": "Optional system instruction."},
+            "engine": {"type": "string", "enum": ["auto", "ollama", "llama-cpp"], "default": "auto"}
+        },
+        "required": ["prompt"]
+    }
+}
+
+def detect_local_engine() -> Optional[Dict[str, str]]:
+    """Detect presence of local inference engines, preferring Ollama."""
+    # 1. Check Ollama (default port 11434)
+    try:
+        res = requests.get("http://localhost:11434/api/tags", timeout=1)
+        if res.status_code == 200:
+            return {"type": "ollama", "url": "http://localhost:11434"}
+    except requests.RequestException:
+        pass
+
+    # 2. Check llama-cpp-python's OpenAI-compatible server (commonly on 8000 or 8080)
+    for port in [8000, 8080]:
+        try:
+            res = requests.get(f"http://localhost:{port}/v1/models", timeout=1)
+            if res.status_code == 200:
+                return {"type": "llama-cpp", "url": f"http://localhost:{port}"}
+        except requests.RequestException:
+            pass
+
+    return None
+
+def run_local_task(prompt: str, system: Optional[str] = None, engine: str = "auto"):
+    """Execute inference on a detected local engine."""
+    info = detect_local_engine()
+    if not info:
+        return tool_error("No local inference engine (Ollama or llama-cpp) detected on localhost.")
+    if engine != "auto" and info["type"] != engine:
+        return tool_error(f"Requested engine '{engine}' was not detected; found '{info['type']}'.")
+
+    try:
+        if info["type"] == "ollama":
+            # Select first available model or default to gemma
+            models = requests.get(f"{info['url']}/api/tags", timeout=5).json().get("models", [])
+            model_name = models[0]["name"] if models else "gemma"
+
+            payload = {
+                "model": model_name,
+                "prompt": prompt,
+                "stream": False
+            }
+            if system:
+                payload["system"] = system
+
+            res = requests.post(f"{info['url']}/api/generate", json=payload, timeout=60)
+            result = res.json().get("response", "")
+            return tool_result(engine="Ollama", model=model_name, response=result)
+
+        elif info["type"] == "llama-cpp":
+            payload = {
+                "model": "local-model",
+                "messages": [
+                    {"role": "system", "content": system or "You are a helpful assistant."},
+                    {"role": "user", "content": prompt}
+                ]
+            }
+            res = requests.post(f"{info['url']}/v1/chat/completions", json=payload, timeout=60)
+            result = res.json()["choices"][0]["message"]["content"]
+            return tool_result(engine="llama-cpp", response=result)
+
+    except Exception as e:
+        return tool_error(f"Local inference failed: {str(e)}")
+
+def _handle_local_inference(args, **kwargs):
+    return run_local_task(
+        prompt=args.get("prompt"),
+        system=args.get("system"),
+        engine=args.get("engine", "auto")
+    )
+
+registry.register(
+    name="local_inference",
+    toolset="inference",
+    schema=LOCAL_INFERENCE_SCHEMA,
+    handler=_handle_local_inference,
+    emoji="🏠"
+)
+
\ No newline at end of file
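
A minimal smoke-test sketch for the new module, assuming an Ollama or llama-cpp server is already listening on localhost; the import path and call match the diff above, while the shape of the printed payload depends on how tool_result/tool_error in tools/registry format their output and is only assumed here:

# smoke_test_local_inference.py (illustrative only)
from tools.local_inference_tool import detect_local_engine, run_local_task

# Requires a running local engine (Ollama on :11434, or llama-cpp on :8000/:8080).
print("Detected engine:", detect_local_engine())

result = run_local_task(
    prompt="Summarize in one sentence: Ollama serves local models over an HTTP API.",
    system="Be concise.",
)
print(result)  # tool_result(...) payload on success, tool_error(...) payload otherwise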