[claude] Implement three-tier metabolic LLM router (#966) (#1160)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
2026-03-23 19:45:56 +00:00
committed by rockachopa
parent 9e08e87312
commit 08d337e03d
3 changed files with 782 additions and 0 deletions


@@ -3,6 +3,14 @@
from .api import router
from .cascade import CascadeRouter, Provider, ProviderStatus, get_router
from .history import HealthHistoryStore, get_history_store
from .metabolic import (
DEFAULT_TIER_MODELS,
MetabolicRouter,
ModelTier,
build_prompt,
classify_complexity,
get_metabolic_router,
)
__all__ = [
"CascadeRouter",
@@ -12,4 +20,11 @@ __all__ = [
"router",
"HealthHistoryStore",
"get_history_store",
# Metabolic router
"MetabolicRouter",
"ModelTier",
"DEFAULT_TIER_MODELS",
"classify_complexity",
"build_prompt",
"get_metabolic_router",
]


@@ -0,0 +1,381 @@
"""Three-tier metabolic LLM router.
Routes queries to the cheapest-sufficient model tier, using MLX for all
inference on the Apple Silicon GPU:
T1 — Routine (Qwen3-8B Q6_K, ~45-55 tok/s): Simple navigation, basic choices.
T2 — Medium (Qwen3-14B Q5_K_M, ~20-28 tok/s): Dialogue, inventory management.
T3 — Complex (Qwen3-32B Q4_K_M, ~8-12 tok/s): Quest planning, stuck recovery.
Memory budget:
- T1+T2 always loaded (~8.5 GB combined)
- T3 loaded on demand (+20 GB) — game pauses during inference
Design notes:
- 70% of game ticks never reach the LLM (handled upstream by behavior trees)
- T3 pauses the game world before inference and unpauses after (graceful if no world)
- All inference via vllm-mlx / Ollama — local-first, no cloud for game ticks
References:
- Issue #966 — Three-Tier Metabolic LLM Router
- Issue #1063 — Best Local Uncensored Agent Model for M3 Max 36GB
- Issue #1075 — Claude Quota Monitor + Metabolic Protocol
"""
import asyncio
import logging
from enum import StrEnum
from typing import Any
logger = logging.getLogger(__name__)
class ModelTier(StrEnum):
"""Three metabolic model tiers ordered by cost and capability.
Tier selection is driven by classify_complexity(). The cheapest
sufficient tier is always chosen — T1 handles routine tasks, T2
handles dialogue and management, T3 handles planning and recovery.
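Members are a StrEnum, so they compare equal to their plain-string values
(e.g. ModelTier.T1_ROUTINE == "t1_routine") and serialise cleanly in logs
and JSON payloads such as the "tier" field returned by route().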
"""
T1_ROUTINE = "t1_routine" # Fast, cheap — Qwen3-8B, always loaded
T2_MEDIUM = "t2_medium" # Balanced — Qwen3-14B, always loaded
T3_COMPLEX = "t3_complex" # Deep — Qwen3-32B, loaded on demand, pauses game
# ── Classification vocabulary ────────────────────────────────────────────────
# T1: single-action navigation and binary-choice words
_T1_KEYWORDS = frozenset(
{
"go", "move", "walk", "run", "north", "south", "east", "west",
"up", "down", "left", "right", "yes", "no", "ok", "okay",
"open", "close", "take", "drop", "look", "pick", "use",
"wait", "rest", "save", "attack", "flee", "jump", "crouch",
}
)
# T3: planning, optimisation, or recovery signals
_T3_KEYWORDS = frozenset(
{
"plan", "strategy", "optimize", "optimise", "quest", "stuck",
"recover", "multi-step", "long-term", "negotiate", "persuade",
"faction", "reputation", "best", "optimal", "recommend",
"analyze", "analyse", "evaluate", "decide", "complex", "how do i",
"what should i do", "help me figure", "what is the best",
}
)
def classify_complexity(task: str, state: dict) -> ModelTier:
"""Classify a task to the cheapest-sufficient model tier.
Classification priority (highest wins):
1. T3 — any T3 keyword or phrase, a stuck indicator, or ``state["require_t3"]`` set to True
2. T1 — a short task (≤ 6 words) containing a T1 keyword, with no active quest, dialogue, or combat context
3. T2 — everything else (safe default)
Args:
task: Natural-language task description or player input.
state: Current game state dict. Recognised keys:
``stuck`` (bool), ``require_t3`` (bool), ``active_quests`` (list),
``dialogue_active`` (bool), ``combat_active`` (bool).
Returns:
ModelTier appropriate for the task.
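Illustrative examples (the expected tiers follow from the keyword sets and
rules above, not from recorded test output)::

    classify_complexity("Go north", {})                       # ModelTier.T1_ROUTINE
    classify_complexity("Plan the best route to Vivec", {})   # ModelTier.T3_COMPLEX
    classify_complexity("Ask the merchant about prices", {})  # ModelTier.T2_MEDIUM (default)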
"""
task_lower = task.lower()
words = set(task_lower.split())
# ── T3 signals ──────────────────────────────────────────────────────────
t3_keyword_hit = bool(words & _T3_KEYWORDS)
# Check multi-word T3 phrases
t3_phrase_hit = any(phrase in task_lower for phrase in _T3_KEYWORDS if " " in phrase)
is_stuck = bool(state.get("stuck", False))
explicit_t3 = bool(state.get("require_t3", False))
if t3_keyword_hit or t3_phrase_hit or is_stuck or explicit_t3:
logger.debug(
"classify_complexity → T3 (keywords=%s stuck=%s explicit=%s)",
t3_keyword_hit or t3_phrase_hit,
is_stuck,
explicit_t3,
)
return ModelTier.T3_COMPLEX
# ── T1 signals ──────────────────────────────────────────────────────────
t1_keyword_hit = bool(words & _T1_KEYWORDS)
task_short = len(task.split()) <= 6
no_active_context = (
not state.get("active_quests")
and not state.get("dialogue_active")
and not state.get("combat_active")
)
if t1_keyword_hit and task_short and no_active_context:
logger.debug("classify_complexity → T1 (keywords=%s short=%s)", t1_keyword_hit, task_short)
return ModelTier.T1_ROUTINE
# ── Default: T2 ─────────────────────────────────────────────────────────
logger.debug("classify_complexity → T2 (default)")
return ModelTier.T2_MEDIUM
def build_prompt(
state: dict,
ui_state: dict,
text: str,
visual_context: str | None = None,
) -> list[dict]:
"""Build an OpenAI-compatible messages list from game context.
Assembles a system message from structured game state and a user
message from the player's text input. This format is accepted by
CascadeRouter.complete() directly.
Args:
state: Current game state dict. Common keys:
``location`` (str), ``health`` (int/float),
``inventory`` (list), ``active_quests`` (list),
``stuck`` (bool).
ui_state: Current UI state dict. Common keys:
``dialogue_active`` (bool), ``dialogue_npc`` (str),
``menu_open`` (str), ``combat_active`` (bool).
text: Player text or task description (becomes user message).
visual_context: Optional free-text description of the current screen
or scene — from a vision model or rule-based extractor.
Returns:
List of message dicts: [{"role": "system", ...}, {"role": "user", ...}]
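Example (illustrative; the state values and NPC name are made-up sample
data, and the system content shown is abridged)::

    messages = build_prompt(
        state={"location": "Balmora", "health": 80},
        ui_state={"dialogue_active": True, "dialogue_npc": "Caius Cosades"},
        text="Ask about the Sixth House",
    )
    # messages[0] is the system message; its content ends with the lines
    # "Location: Balmora", "Health: 80", "In dialogue with: Caius Cosades".
    # messages[1] == {"role": "user", "content": "Ask about the Sixth House"}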
"""
context_lines: list[str] = []
location = state.get("location", "unknown")
context_lines.append(f"Location: {location}")
health = state.get("health")
if health is not None:
context_lines.append(f"Health: {health}")
inventory = state.get("inventory", [])
if inventory:
items = [i if isinstance(i, str) else i.get("name", str(i)) for i in inventory[:10]]
context_lines.append(f"Inventory: {', '.join(items)}")
active_quests = state.get("active_quests", [])
if active_quests:
names = [
q if isinstance(q, str) else q.get("name", str(q)) for q in active_quests[:5]
]
context_lines.append(f"Active quests: {', '.join(names)}")
if state.get("stuck"):
context_lines.append("Status: STUCK — need recovery strategy")
if ui_state.get("dialogue_active"):
npc = ui_state.get("dialogue_npc", "NPC")
context_lines.append(f"In dialogue with: {npc}")
if ui_state.get("menu_open"):
context_lines.append(f"Menu open: {ui_state['menu_open']}")
if ui_state.get("combat_active"):
context_lines.append("Status: IN COMBAT")
if visual_context:
context_lines.append(f"Scene: {visual_context}")
system_content = (
"You are Timmy, an AI game agent. "
"Respond with valid game commands only.\n\n"
+ "\n".join(context_lines)
)
return [
{"role": "system", "content": system_content},
{"role": "user", "content": text},
]
# ── Default model assignments ────────────────────────────────────────────────
# Overridable per deployment via MetabolicRouter(tier_models={...}).
# Model benchmarks (M3 Max 36 GB, issue #1063):
# Qwen3-8B Q6_K — 0.933 F1 tool calling, ~45-55 tok/s (~6 GB)
# Qwen3-14B Q5_K_M — 0.971 F1 tool calling, ~20-28 tok/s (~9.5 GB)
# Qwen3-32B Q4_K_M — highest quality, ~8-12 tok/s (~20 GB, on demand)
DEFAULT_TIER_MODELS: dict[ModelTier, str] = {
ModelTier.T1_ROUTINE: "qwen3:8b",
ModelTier.T2_MEDIUM: "qwen3:14b",
ModelTier.T3_COMPLEX: "qwen3:30b", # Closest Ollama tag to 32B Q4
}
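# Example override (illustrative): a deployment could point T3 at a different
# local tag, for example a true 32B build, assuming that tag has been pulled
# and fits within the memory budget:
#
#     MetabolicRouter(tier_models={ModelTier.T3_COMPLEX: "qwen3:32b"})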
class MetabolicRouter:
"""Routes LLM requests to the cheapest-sufficient model tier.
Wraps CascadeRouter with:
- Complexity classification via classify_complexity()
- Prompt assembly via build_prompt()
- T3 world-pause / world-unpause (graceful if no world adapter)
Usage::
router = MetabolicRouter()
# Simple route call — classification + prompt + inference in one step
result = await router.route(
task="Go north",
state={"location": "Balmora"},
ui_state={},
)
print(result["content"], result["tier"])
# Pre-classify if you need the tier for telemetry
tier = router.classify("Plan the best path to Vivec", game_state)
# Wire in world adapter for T3 pause/unpause
router.set_world(world_adapter)
"""
def __init__(
self,
cascade: Any | None = None,
tier_models: dict[ModelTier, str] | None = None,
) -> None:
"""Initialise the metabolic router.
Args:
cascade: CascadeRouter instance to use. If None, the
singleton returned by get_router() is used lazily.
tier_models: Override default model names per tier.
"""
self._cascade = cascade
self._tier_models: dict[ModelTier, str] = dict(DEFAULT_TIER_MODELS)
if tier_models:
self._tier_models.update(tier_models)
self._world: Any | None = None
def set_world(self, world: Any) -> None:
"""Wire in a world adapter for T3 pause / unpause support.
The adapter only needs to implement ``act(CommandInput)`` — the full
WorldInterface contract is not required. A missing or broken world
adapter degrades gracefully (logs a warning, inference continues).
Args:
world: Any object with an ``act(CommandInput)`` method.
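Example adapter stub (illustrative; ``send_to_game`` is a hypothetical hook
into the game process)::

    class PauseOnlyWorld:
        def act(self, cmd) -> None:
            # cmd is a CommandInput; only pause/unpause are handled here
            if cmd.action in ("pause", "unpause"):
                send_to_game(cmd.action)  # send_to_game is hypothetical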
"""
self._world = world
def _get_cascade(self) -> Any:
"""Return the CascadeRouter, creating the singleton if needed."""
if self._cascade is None:
from infrastructure.router.cascade import get_router
self._cascade = get_router()
return self._cascade
def classify(self, task: str, state: dict) -> ModelTier:
"""Classify task complexity. Delegates to classify_complexity()."""
return classify_complexity(task, state)
async def _pause_world(self) -> None:
"""Pause the game world before T3 inference (graceful degradation)."""
if self._world is None:
return
try:
from infrastructure.world.types import CommandInput
await asyncio.to_thread(self._world.act, CommandInput(action="pause"))
logger.debug("MetabolicRouter: world paused for T3 inference")
except Exception as exc:
logger.warning("world.pause() failed — continuing without pause: %s", exc)
async def _unpause_world(self) -> None:
"""Unpause the game world after T3 inference (always called, even on error)."""
if self._world is None:
return
try:
from infrastructure.world.types import CommandInput
await asyncio.to_thread(self._world.act, CommandInput(action="unpause"))
logger.debug("MetabolicRouter: world unpaused after T3 inference")
except Exception as exc:
logger.warning("world.unpause() failed — game may remain paused: %s", exc)
async def route(
self,
task: str,
state: dict,
ui_state: dict | None = None,
visual_context: str | None = None,
temperature: float = 0.3,
max_tokens: int | None = None,
) -> dict:
"""Route a task to the appropriate model tier and return the LLM response.
Selects the tier via classify_complexity(), assembles the prompt via
build_prompt(), and dispatches to CascadeRouter. For T3, the game
world is paused before inference and unpaused after (in a finally block).
Args:
task: Natural-language task description or player input.
state: Current game state dict.
ui_state: Current UI state dict (optional, defaults to {}).
visual_context: Optional screen/scene description from vision model.
temperature: Sampling temperature (default 0.3 for game commands).
max_tokens: Maximum tokens to generate.
Returns:
Dict with keys: ``content``, ``provider``, ``model``, ``tier``,
``latency_ms``, plus any extra keys from CascadeRouter.
Raises:
RuntimeError: If all providers fail (propagated from CascadeRouter).
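Illustrative return value (shape only; the provider, model, and latency
depend on which CascadeRouter backend actually served the request)::

    {
        "content": "go north",
        "provider": "ollama",
        "model": "qwen3:8b",
        "tier": ModelTier.T1_ROUTINE,
        "latency_ms": 412,
    }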
"""
ui_state = ui_state or {}
tier = self.classify(task, state)
model = self._tier_models[tier]
messages = build_prompt(state, ui_state, task, visual_context)
cascade = self._get_cascade()
logger.info(
"MetabolicRouter: tier=%s model=%s task=%r",
tier,
model,
task[:80],
)
if tier == ModelTier.T3_COMPLEX:
await self._pause_world()
try:
result = await cascade.complete(
messages=messages,
model=model,
temperature=temperature,
max_tokens=max_tokens,
)
finally:
await self._unpause_world()
else:
result = await cascade.complete(
messages=messages,
model=model,
temperature=temperature,
max_tokens=max_tokens,
)
result["tier"] = tier
return result
# ── Module-level singleton ────────────────────────────────────────────────────
_metabolic_router: MetabolicRouter | None = None
def get_metabolic_router() -> MetabolicRouter:
"""Get or create the MetabolicRouter singleton."""
global _metabolic_router
if _metabolic_router is None:
_metabolic_router = MetabolicRouter()
return _metabolic_router