Compare commits

...

1 Commit

Author SHA1 Message Date
Alexander Whitestone
12b34f6928 feat: Atlas Inference Engine provider integration (#674)
Some checks failed
Tests / test (pull_request) Failing after 34m45s
Contributor Attribution Check / check-attribution (pull_request) Failing after 34s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Nix / nix (ubuntu-latest) (pull_request) Failing after 5s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 39s
Tests / e2e (pull_request) Successful in 2m42s
Nix / nix (macos-latest) (pull_request) Has been cancelled
Atlas is a Rust+CUDA inference engine that is 3x faster than vLLM and ships as a
2.5GB image vs 20+GB. It exposes an OpenAI-compatible API at http://localhost:8888/v1.
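
Because the API is OpenAI-compatible, any OpenAI-style client can point at it
directly. A minimal sketch, assuming the openai Python package is installed and
an Atlas server is already serving the listed model (the model name is one of
the supported models below, used purely for illustration):

  from openai import OpenAI

  # Local Atlas server; a real API key is not required for a local deployment.
  client = OpenAI(base_url="http://localhost:8888/v1", api_key="atlas-local")

  resp = client.chat.completions.create(
      model="Sehyo/Qwen3.5-35B-A3B-NVFP4",  # any model the Atlas server has loaded
      messages=[{"role": "user", "content": "Say hello in one sentence."}],
      max_tokens=32,
  )
  print(resp.choices[0].message.content)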

New agent/atlas_provider.py (usage sketch after this list):
- AtlasProvider class with health_check(), list_models(),
  benchmark_inference(), get_provider_config()
- ATLAS_SUPPORTED_MODELS list (8 models as of alpha-2.8)
- get_atlas_config_hint() for config.yaml setup
- get_atlas_docker_command() for quick deployment
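
A minimal usage sketch of the new class, assuming a local Atlas server is
reachable at the default URL; it only uses the API listed above:

  from agent.atlas_provider import AtlasProvider, get_atlas_config_hint

  atlas = AtlasProvider()  # defaults to http://localhost:8888/v1

  status = atlas.health_check()
  if status["available"]:
      print(f"Atlas up in {status['latency_ms']} ms, models: {status['models']}")
  else:
      # Not running yet: print the config.yaml hint for setting up the provider.
      print(get_atlas_config_hint())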

Integration:
- 'atlas' added as provider alias in hermes_cli/auth.py
  (routes to 'custom' like ollama/vllm/lmstudio)
- Atlas documented in cli-config.yaml.example with
  provider config and docker quick-start

Config:
  provider: atlas
  base_url: http://localhost:8888/v1

Docker:
  docker run -d --gpus all --ipc=host -p 8888:8888
    avarok/atlas-gb10:alpha-2.8 serve <model> --speculative

Closes #674
2026-04-14 19:08:28 -04:00
3 changed files with 227 additions and 0 deletions

219
agent/atlas_provider.py Normal file

@@ -0,0 +1,219 @@
"""Atlas Inference Engine provider integration.
Atlas is a Rust+CUDA LLM inference engine that is 3x faster than vLLM.
It exposes an OpenAI-compatible API at http://localhost:8888/v1.
This module provides:
- Atlas provider configuration and validation
- Health check for Atlas server
- Model discovery via Atlas API
- Benchmark comparison utilities
Usage:
from agent.atlas_provider import AtlasProvider
atlas = AtlasProvider()
if atlas.is_available():
models = atlas.list_models()
"""
from __future__ import annotations
import json
import logging
import os
import time
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Default Atlas configuration
ATLAS_DEFAULT_BASE_URL = os.getenv("ATLAS_BASE_URL", "http://localhost:8888/v1")
ATLAS_DEFAULT_PORT = int(os.getenv("ATLAS_PORT", "8888"))
# Known Atlas-compatible models (as of alpha-2.8)
ATLAS_SUPPORTED_MODELS = [
"Sehyo/Qwen3.5-35B-A3B-NVFP4",
"Sehyo/Qwen3.5-122B-A10B-NVFP4",
"Sehyo/Qwen3-Next-80B-A3B-NVFP4",
"Sehyo/Qwen3-Coder-Next-FP8",
"Sehyo/Qwen3-VL-30B-NVFP4",
"Sehyo/Gemma-4-26B-NVFP4",
"Sehyo/Nemotron-3-Nano-30B-NVFP4",
"Sehyo/Mistral-Small-4-119B-NVFP4",
]
class AtlasProvider:
"""Atlas Inference Engine provider.
Wraps the Atlas OpenAI-compatible API with health checks,
model discovery, and configuration validation.
"""
def __init__(self, base_url: str = ""):
self.base_url = (base_url or ATLAS_DEFAULT_BASE_URL).rstrip("/")
self._api_url = self.base_url
if not self._api_url.endswith("/v1"):
self._api_url += "/v1"
def is_available(self) -> bool:
"""Check if Atlas server is running and responding."""
try:
import urllib.request
req = urllib.request.Request(f"{self._api_url}/models", method="GET")
with urllib.request.urlopen(req, timeout=5) as resp:
return resp.status == 200
except Exception:
return False
def list_models(self) -> List[Dict[str, Any]]:
"""List models available on the Atlas server."""
try:
import urllib.request
req = urllib.request.Request(f"{self._api_url}/models", method="GET")
with urllib.request.urlopen(req, timeout=10) as resp:
data = json.loads(resp.read())
return data.get("data", [])
except Exception as exc:
logger.warning("Atlas model list failed: %s", exc)
return []
def health_check(self) -> Dict[str, Any]:
"""Comprehensive health check of the Atlas server."""
result = {
"available": False,
"base_url": self.base_url,
"models": [],
"model_count": 0,
"latency_ms": 0,
"error": None,
}
t0 = time.monotonic()
try:
import urllib.request
req = urllib.request.Request(f"{self._api_url}/models", method="GET")
with urllib.request.urlopen(req, timeout=5) as resp:
result["latency_ms"] = int((time.monotonic() - t0) * 1000)
if resp.status == 200:
data = json.loads(resp.read())
models = data.get("data", [])
result["available"] = True
result["models"] = [m.get("id", "") for m in models]
result["model_count"] = len(models)
except Exception as exc:
result["latency_ms"] = int((time.monotonic() - t0) * 1000)
result["error"] = str(exc)
return result
def get_provider_config(self) -> Dict[str, Any]:
"""Return a provider config dict suitable for hermes config.yaml."""
return {
"name": "atlas",
"base_url": self._api_url,
"api_mode": "openai",
"description": "Atlas Inference Engine (Rust+CUDA, 3x faster than vLLM)",
}
def benchmark_inference(
self,
prompt: str = "Explain the theory of relativity in three sentences.",
model: str = "",
num_tokens: int = 100,
) -> Dict[str, Any]:
"""Run a quick inference benchmark against Atlas.
Returns timing metrics for comparison with vLLM or other backends.
"""
result = {
"provider": "atlas",
"model": model or "unknown",
"prompt_tokens": 0,
"completion_tokens": 0,
"total_time_ms": 0,
"tokens_per_second": 0.0,
"time_to_first_token_ms": 0,
"error": None,
}
try:
import urllib.request
messages = [{"role": "user", "content": prompt}]
body = {
"model": model or "",
"messages": messages,
"max_tokens": num_tokens,
"stream": False,
}
t0 = time.monotonic()
req = urllib.request.Request(
f"{self._api_url}/chat/completions",
data=json.dumps(body).encode(),
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=60) as resp:
elapsed = time.monotonic() - t0
data = json.loads(resp.read())
usage = data.get("usage", {})
result["prompt_tokens"] = usage.get("prompt_tokens", 0)
result["completion_tokens"] = usage.get("completion_tokens", 0)
result["total_time_ms"] = int(elapsed * 1000)
if elapsed > 0 and result["completion_tokens"] > 0:
result["tokens_per_second"] = round(
result["completion_tokens"] / elapsed, 1
)
except Exception as exc:
result["error"] = str(exc)
return result
def get_atlas_config_hint() -> str:
"""Return a config.yaml snippet for adding Atlas as a provider."""
return """# Atlas Inference Engine configuration
# Add to config.yaml under providers:
providers:
atlas:
base_url: http://localhost:8888/v1
api_mode: openai
# No API key needed for local Atlas
# Then set model:
model:
default: atlas/<model-name>
provider: atlas
# Or use as fallback:
fallback_model:
provider: atlas
model: Sehyo/Qwen3.5-35B-A3B-NVFP4
"""
def get_atlas_docker_command(
model: str = "Sehyo/Qwen3.5-35B-A3B-NVFP4",
port: int = 8888,
speculative: bool = True,
max_seq_len: int = 131072,
max_batch_size: int = 1,
) -> str:
"""Return the docker run command for Atlas."""
cmd = (
"docker run -d --gpus all --ipc=host "
f"-p {port}:8888 "
"-v ~/.cache/huggingface:/root/.cache/huggingface "
"avarok/atlas-gb10:alpha-2.8 serve "
f"{model} "
)
if speculative:
cmd += "--speculative --scheduling-policy slai "
cmd += f"--max-seq-len {max_seq_len} --max-batch-size {max_batch_size} "
cmd += "--max-prefill-tokens 0"
return cmd
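
A short sketch of the deployment and benchmark helpers defined above, assuming
an Atlas server is running and serving the first supported model:

from agent.atlas_provider import (
    ATLAS_SUPPORTED_MODELS,
    AtlasProvider,
    get_atlas_docker_command,
)

# Print the docker run command for the first supported model.
print(get_atlas_docker_command(model=ATLAS_SUPPORTED_MODELS[0], port=8888))

atlas = AtlasProvider()
if atlas.is_available():
    bench = atlas.benchmark_inference(model=ATLAS_SUPPORTED_MODELS[0], num_tokens=64)
    if bench["error"] is None:
        print(f"{bench['tokens_per_second']} tok/s in {bench['total_time_ms']} ms")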

7
cli-config.yaml.example

@@ -43,6 +43,13 @@ model:
# Set OLLAMA_API_KEY in .env — automatically picked up when base_url
# points to ollama.com.
#
# Atlas Inference Engine (Rust+CUDA, 3x faster than vLLM):
# provider: "atlas"
# base_url: "http://localhost:8888/v1"
# Start with: docker run -d --gpus all --ipc=host -p 8888:8888
# avarok/atlas-gb10:alpha-2.8 serve <model> --speculative
# See: agent/atlas_provider.py for full config.
#
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
provider: "auto"

1
hermes_cli/auth.py

@@ -924,6 +924,7 @@ def resolve_provider(
    # Local server aliases — route through the generic custom provider
    "lmstudio": "custom", "lm-studio": "custom", "lm_studio": "custom",
    "ollama": "custom", "vllm": "custom", "llamacpp": "custom",
    "atlas": "custom",
    "llama.cpp": "custom", "llama-cpp": "custom",
}
normalized = _PROVIDER_ALIASES.get(normalized, normalized)
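
For reference, a self-contained sketch of the alias normalization this hunk
extends; the dict contents come from the hunk, while the wrapper function and
its name are assumptions, since the rest of resolve_provider is not shown:

# Hypothetical standalone version of the alias lookup shown in the hunk above.
_PROVIDER_ALIASES = {
    "lmstudio": "custom", "lm-studio": "custom", "lm_studio": "custom",
    "ollama": "custom", "vllm": "custom", "llamacpp": "custom",
    "atlas": "custom",  # new: Atlas routes through the generic custom provider
    "llama.cpp": "custom", "llama-cpp": "custom",
}

def normalize_provider(name: str) -> str:
    """Map a user-supplied provider name to the provider that handles it."""
    normalized = name.strip().lower()
    return _PROVIDER_ALIASES.get(normalized, normalized)

assert normalize_provider("atlas") == "custom"          # served via the generic custom path
assert normalize_provider("anthropic") == "anthropic"   # unknown names pass through unchanged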