[claude] Implement three-tier metabolic LLM router (#966) (#1160)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
2026-03-23 19:45:56 +00:00
committed by rockachopa
parent 9e08e87312
commit 08d337e03d
3 changed files with 782 additions and 0 deletions


@@ -3,6 +3,14 @@
from .api import router
from .cascade import CascadeRouter, Provider, ProviderStatus, get_router
from .history import HealthHistoryStore, get_history_store
from .metabolic import (
DEFAULT_TIER_MODELS,
MetabolicRouter,
ModelTier,
build_prompt,
classify_complexity,
get_metabolic_router,
)
__all__ = [
"CascadeRouter",
@@ -12,4 +20,11 @@ __all__ = [
"router",
"HealthHistoryStore",
"get_history_store",
# Metabolic router
"MetabolicRouter",
"ModelTier",
"DEFAULT_TIER_MODELS",
"classify_complexity",
"build_prompt",
"get_metabolic_router",
]


@@ -0,0 +1,381 @@
"""Three-tier metabolic LLM router.
Routes queries to the cheapest-sufficient model tier, using MLX for all
inference on the Apple Silicon GPU:
T1 — Routine (Qwen3-8B Q6_K, ~45-55 tok/s): Simple navigation, basic choices.
T2 — Medium (Qwen3-14B Q5_K_M, ~20-28 tok/s): Dialogue, inventory management.
T3 — Complex (Qwen3-32B Q4_K_M, ~8-12 tok/s): Quest planning, stuck recovery.
Memory budget:
- T1+T2 always loaded (~8.5 GB combined)
- T3 loaded on demand (+20 GB) — game pauses during inference
Design notes:
- 70% of game ticks never reach the LLM (handled upstream by behavior trees)
- T3 pauses the game world before inference and unpauses after (graceful if no world)
- All inference via vllm-mlx / Ollama — local-first, no cloud for game ticks
References:
- Issue #966 — Three-Tier Metabolic LLM Router
- Issue #1063 — Best Local Uncensored Agent Model for M3 Max 36GB
- Issue #1075 — Claude Quota Monitor + Metabolic Protocol
"""
import asyncio
import logging
from enum import StrEnum
from typing import Any
logger = logging.getLogger(__name__)
class ModelTier(StrEnum):
"""Three metabolic model tiers ordered by cost and capability.
Tier selection is driven by classify_complexity(). The cheapest
sufficient tier is always chosen — T1 handles routine tasks, T2
handles dialogue and management, T3 handles planning and recovery.
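Members are a StrEnum, so they compare equal to their plain-string values
(e.g. ModelTier.T1_ROUTINE == "t1_routine") and serialise cleanly in logs
and JSON payloads such as the "tier" field returned by route().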
"""
T1_ROUTINE = "t1_routine" # Fast, cheap — Qwen3-8B, always loaded
T2_MEDIUM = "t2_medium" # Balanced — Qwen3-14B, always loaded
T3_COMPLEX = "t3_complex" # Deep — Qwen3-32B, loaded on demand, pauses game
# ── Classification vocabulary ────────────────────────────────────────────────
# T1: single-action navigation and binary-choice words
_T1_KEYWORDS = frozenset(
{
"go", "move", "walk", "run", "north", "south", "east", "west",
"up", "down", "left", "right", "yes", "no", "ok", "okay",
"open", "close", "take", "drop", "look", "pick", "use",
"wait", "rest", "save", "attack", "flee", "jump", "crouch",
}
)
# T3: planning, optimisation, or recovery signals
_T3_KEYWORDS = frozenset(
{
"plan", "strategy", "optimize", "optimise", "quest", "stuck",
"recover", "multi-step", "long-term", "negotiate", "persuade",
"faction", "reputation", "best", "optimal", "recommend",
"analyze", "analyse", "evaluate", "decide", "complex", "how do i",
"what should i do", "help me figure", "what is the best",
}
)
def classify_complexity(task: str, state: dict) -> ModelTier:
"""Classify a task to the cheapest-sufficient model tier.
Classification priority (highest wins):
1. T3 — any T3 keyword or phrase, a stuck indicator, or ``state["require_t3"]`` set to True
2. T1 — a short task (≤ 6 words) containing a T1 keyword, with no active quest, dialogue, or combat context
3. T2 — everything else (safe default)
Args:
task: Natural-language task description or player input.
state: Current game state dict. Recognised keys:
``stuck`` (bool), ``require_t3`` (bool), ``active_quests`` (list),
``dialogue_active`` (bool), ``combat_active`` (bool).
Returns:
ModelTier appropriate for the task.
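Illustrative examples (the expected tiers follow from the keyword sets and
rules above, not from recorded test output)::

    classify_complexity("Go north", {})                       # ModelTier.T1_ROUTINE
    classify_complexity("Plan the best route to Vivec", {})   # ModelTier.T3_COMPLEX
    classify_complexity("Ask the merchant about prices", {})  # ModelTier.T2_MEDIUM (default)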
"""
task_lower = task.lower()
words = set(task_lower.split())
# ── T3 signals ──────────────────────────────────────────────────────────
t3_keyword_hit = bool(words & _T3_KEYWORDS)
# Check multi-word T3 phrases
t3_phrase_hit = any(phrase in task_lower for phrase in _T3_KEYWORDS if " " in phrase)
is_stuck = bool(state.get("stuck", False))
explicit_t3 = bool(state.get("require_t3", False))
if t3_keyword_hit or t3_phrase_hit or is_stuck or explicit_t3:
logger.debug(
"classify_complexity → T3 (keywords=%s stuck=%s explicit=%s)",
t3_keyword_hit or t3_phrase_hit,
is_stuck,
explicit_t3,
)
return ModelTier.T3_COMPLEX
# ── T1 signals ──────────────────────────────────────────────────────────
t1_keyword_hit = bool(words & _T1_KEYWORDS)
task_short = len(task.split()) <= 6
no_active_context = (
not state.get("active_quests")
and not state.get("dialogue_active")
and not state.get("combat_active")
)
if t1_keyword_hit and task_short and no_active_context:
logger.debug("classify_complexity → T1 (keywords=%s short=%s)", t1_keyword_hit, task_short)
return ModelTier.T1_ROUTINE
# ── Default: T2 ─────────────────────────────────────────────────────────
logger.debug("classify_complexity → T2 (default)")
return ModelTier.T2_MEDIUM
def build_prompt(
state: dict,
ui_state: dict,
text: str,
visual_context: str | None = None,
) -> list[dict]:
"""Build an OpenAI-compatible messages list from game context.
Assembles a system message from structured game state and a user
message from the player's text input. This format is accepted by
CascadeRouter.complete() directly.
Args:
state: Current game state dict. Common keys:
``location`` (str), ``health`` (int/float),
``inventory`` (list), ``active_quests`` (list),
``stuck`` (bool).
ui_state: Current UI state dict. Common keys:
``dialogue_active`` (bool), ``dialogue_npc`` (str),
``menu_open`` (str), ``combat_active`` (bool).
text: Player text or task description (becomes user message).
visual_context: Optional free-text description of the current screen
or scene — from a vision model or rule-based extractor.
Returns:
List of message dicts: [{"role": "system", ...}, {"role": "user", ...}]
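Example (illustrative; the state values and NPC name are made-up sample
data, and the system content shown is abridged)::

    messages = build_prompt(
        state={"location": "Balmora", "health": 80},
        ui_state={"dialogue_active": True, "dialogue_npc": "Caius Cosades"},
        text="Ask about the Sixth House",
    )
    # messages[0] is the system message; its content ends with the lines
    # "Location: Balmora", "Health: 80", "In dialogue with: Caius Cosades".
    # messages[1] == {"role": "user", "content": "Ask about the Sixth House"}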
"""
context_lines: list[str] = []
location = state.get("location", "unknown")
context_lines.append(f"Location: {location}")
health = state.get("health")
if health is not None:
context_lines.append(f"Health: {health}")
inventory = state.get("inventory", [])
if inventory:
items = [i if isinstance(i, str) else i.get("name", str(i)) for i in inventory[:10]]
context_lines.append(f"Inventory: {', '.join(items)}")
active_quests = state.get("active_quests", [])
if active_quests:
names = [
q if isinstance(q, str) else q.get("name", str(q)) for q in active_quests[:5]
]
context_lines.append(f"Active quests: {', '.join(names)}")
if state.get("stuck"):
context_lines.append("Status: STUCK — need recovery strategy")
if ui_state.get("dialogue_active"):
npc = ui_state.get("dialogue_npc", "NPC")
context_lines.append(f"In dialogue with: {npc}")
if ui_state.get("menu_open"):
context_lines.append(f"Menu open: {ui_state['menu_open']}")
if ui_state.get("combat_active"):
context_lines.append("Status: IN COMBAT")
if visual_context:
context_lines.append(f"Scene: {visual_context}")
system_content = (
"You are Timmy, an AI game agent. "
"Respond with valid game commands only.\n\n"
+ "\n".join(context_lines)
)
return [
{"role": "system", "content": system_content},
{"role": "user", "content": text},
]
# ── Default model assignments ────────────────────────────────────────────────
# Overridable per deployment via MetabolicRouter(tier_models={...}).
# Model benchmarks (M3 Max 36 GB, issue #1063):
# Qwen3-8B Q6_K — 0.933 F1 tool calling, ~45-55 tok/s (~6 GB)
# Qwen3-14B Q5_K_M — 0.971 F1 tool calling, ~20-28 tok/s (~9.5 GB)
# Qwen3-32B Q4_K_M — highest quality, ~8-12 tok/s (~20 GB, on demand)
DEFAULT_TIER_MODELS: dict[ModelTier, str] = {
ModelTier.T1_ROUTINE: "qwen3:8b",
ModelTier.T2_MEDIUM: "qwen3:14b",
ModelTier.T3_COMPLEX: "qwen3:30b", # Closest Ollama tag to 32B Q4
}
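# Example override (illustrative): a deployment could point T3 at a different
# local tag, for example a true 32B build, assuming that tag has been pulled
# and fits within the memory budget:
#
#     MetabolicRouter(tier_models={ModelTier.T3_COMPLEX: "qwen3:32b"})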
class MetabolicRouter:
"""Routes LLM requests to the cheapest-sufficient model tier.
Wraps CascadeRouter with:
- Complexity classification via classify_complexity()
- Prompt assembly via build_prompt()
- T3 world-pause / world-unpause (graceful if no world adapter)
Usage::
router = MetabolicRouter()
# Simple route call — classification + prompt + inference in one step
result = await router.route(
task="Go north",
state={"location": "Balmora"},
ui_state={},
)
print(result["content"], result["tier"])
# Pre-classify if you need the tier for telemetry
tier = router.classify("Plan the best path to Vivec", game_state)
# Wire in world adapter for T3 pause/unpause
router.set_world(world_adapter)
"""
def __init__(
self,
cascade: Any | None = None,
tier_models: dict[ModelTier, str] | None = None,
) -> None:
"""Initialise the metabolic router.
Args:
cascade: CascadeRouter instance to use. If None, the
singleton returned by get_router() is used lazily.
tier_models: Override default model names per tier.
"""
self._cascade = cascade
self._tier_models: dict[ModelTier, str] = dict(DEFAULT_TIER_MODELS)
if tier_models:
self._tier_models.update(tier_models)
self._world: Any | None = None
def set_world(self, world: Any) -> None:
"""Wire in a world adapter for T3 pause / unpause support.
The adapter only needs to implement ``act(CommandInput)`` — the full
WorldInterface contract is not required. A missing or broken world
adapter degrades gracefully (logs a warning, inference continues).
Args:
world: Any object with an ``act(CommandInput)`` method.
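Example adapter stub (illustrative; ``send_to_game`` is a hypothetical hook
into the game process)::

    class PauseOnlyWorld:
        def act(self, cmd) -> None:
            # cmd is a CommandInput; only pause/unpause are handled here
            if cmd.action in ("pause", "unpause"):
                send_to_game(cmd.action)  # send_to_game is hypothetical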
"""
self._world = world
def _get_cascade(self) -> Any:
"""Return the CascadeRouter, creating the singleton if needed."""
if self._cascade is None:
from infrastructure.router.cascade import get_router
self._cascade = get_router()
return self._cascade
def classify(self, task: str, state: dict) -> ModelTier:
"""Classify task complexity. Delegates to classify_complexity()."""
return classify_complexity(task, state)
async def _pause_world(self) -> None:
"""Pause the game world before T3 inference (graceful degradation)."""
if self._world is None:
return
try:
from infrastructure.world.types import CommandInput
await asyncio.to_thread(self._world.act, CommandInput(action="pause"))
logger.debug("MetabolicRouter: world paused for T3 inference")
except Exception as exc:
logger.warning("world.pause() failed — continuing without pause: %s", exc)
async def _unpause_world(self) -> None:
"""Unpause the game world after T3 inference (always called, even on error)."""
if self._world is None:
return
try:
from infrastructure.world.types import CommandInput
await asyncio.to_thread(self._world.act, CommandInput(action="unpause"))
logger.debug("MetabolicRouter: world unpaused after T3 inference")
except Exception as exc:
logger.warning("world.unpause() failed — game may remain paused: %s", exc)
async def route(
self,
task: str,
state: dict,
ui_state: dict | None = None,
visual_context: str | None = None,
temperature: float = 0.3,
max_tokens: int | None = None,
) -> dict:
"""Route a task to the appropriate model tier and return the LLM response.
Selects the tier via classify_complexity(), assembles the prompt via
build_prompt(), and dispatches to CascadeRouter. For T3, the game
world is paused before inference and unpaused after (in a finally block).
Args:
task: Natural-language task description or player input.
state: Current game state dict.
ui_state: Current UI state dict (optional, defaults to {}).
visual_context: Optional screen/scene description from vision model.
temperature: Sampling temperature (default 0.3 for game commands).
max_tokens: Maximum tokens to generate.
Returns:
Dict with keys: ``content``, ``provider``, ``model``, ``tier``,
``latency_ms``, plus any extra keys from CascadeRouter.
Raises:
RuntimeError: If all providers fail (propagated from CascadeRouter).
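Illustrative return value (shape only; the provider, model, and latency
depend on which CascadeRouter backend actually served the request)::

    {
        "content": "go north",
        "provider": "ollama",
        "model": "qwen3:8b",
        "tier": ModelTier.T1_ROUTINE,
        "latency_ms": 412,
    }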
"""
ui_state = ui_state or {}
tier = self.classify(task, state)
model = self._tier_models[tier]
messages = build_prompt(state, ui_state, task, visual_context)
cascade = self._get_cascade()
logger.info(
"MetabolicRouter: tier=%s model=%s task=%r",
tier,
model,
task[:80],
)
if tier == ModelTier.T3_COMPLEX:
await self._pause_world()
try:
result = await cascade.complete(
messages=messages,
model=model,
temperature=temperature,
max_tokens=max_tokens,
)
finally:
await self._unpause_world()
else:
result = await cascade.complete(
messages=messages,
model=model,
temperature=temperature,
max_tokens=max_tokens,
)
result["tier"] = tier
return result
# ── Module-level singleton ────────────────────────────────────────────────────
_metabolic_router: MetabolicRouter | None = None
def get_metabolic_router() -> MetabolicRouter:
"""Get or create the MetabolicRouter singleton."""
global _metabolic_router
if _metabolic_router is None:
_metabolic_router = MetabolicRouter()
return _metabolic_router