[claude] Qwen3 two-model routing via task complexity classifier (#1065) v2 (#1233)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit is contained in:
2026-03-23 22:58:21 +00:00
committed by Timmy Time
parent acc0df00cf
commit 1e1689f931
7 changed files with 635 additions and 6 deletions

View File

@@ -51,6 +51,13 @@ class Settings(BaseSettings):
# Set to 0 to use model defaults.
ollama_num_ctx: int = 32768
# Maximum models loaded simultaneously in Ollama — override with OLLAMA_MAX_LOADED_MODELS
# Set to 2 so Qwen3-8B and Qwen3-14B can stay hot concurrently (~17 GB combined).
# Requires Ollama ≥ 0.1.33. Export this to the Ollama process environment:
# OLLAMA_MAX_LOADED_MODELS=2 ollama serve
# or add it to your systemd/launchd unit before starting the harness.
ollama_max_loaded_models: int = 2
# Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
# as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:8b,qwen2.5:14b"
# Or edit config/providers.yaml → fallback_chains for the canonical source.

View File

@@ -2,6 +2,7 @@
from .api import router
from .cascade import CascadeRouter, Provider, ProviderStatus, get_router
from .classifier import TaskComplexity, classify_task
from .history import HealthHistoryStore, get_history_store
from .metabolic import (
DEFAULT_TIER_MODELS,
@@ -27,4 +28,7 @@ __all__ = [
"classify_complexity",
"build_prompt",
"get_metabolic_router",
# Classifier
"TaskComplexity",
"classify_task",
]

View File

@@ -593,6 +593,34 @@ class CascadeRouter:
"is_fallback_model": is_fallback_model,
}
def _get_model_for_complexity(
self, provider: Provider, complexity: "TaskComplexity"
) -> str | None:
"""Return the best model on *provider* for the given complexity tier.
Checks fallback chains first (routine / complex), then falls back to
any model with the matching capability tag, then the provider default.
"""
from infrastructure.router.classifier import TaskComplexity
chain_key = "routine" if complexity == TaskComplexity.SIMPLE else "complex"
# Walk the capability fallback chain — first model present on this provider wins
for model_name in self.config.fallback_chains.get(chain_key, []):
if any(m["name"] == model_name for m in provider.models):
return model_name
# Direct capability lookup — only return if a model explicitly has the tag
# (do not use get_model_with_capability here as it falls back to the default)
cap_model = next(
(m["name"] for m in provider.models if chain_key in m.get("capabilities", [])),
None,
)
if cap_model:
return cap_model
return None # Caller will use provider default
async def complete(
self,
messages: list[dict],
@@ -600,6 +628,7 @@ class CascadeRouter:
temperature: float = 0.7,
max_tokens: int | None = None,
cascade_tier: str | None = None,
complexity_hint: str | None = None,
) -> dict:
"""Complete a chat conversation with automatic failover.
@@ -608,33 +637,103 @@ class CascadeRouter:
- Falls back to vision-capable models when needed
- Supports image URLs, paths, and base64 encoding
Complexity-based routing (issue #1065):
- ``complexity_hint="simple"`` → routes to Qwen3-8B (low-latency)
- ``complexity_hint="complex"`` → routes to Qwen3-14B (quality)
- ``complexity_hint=None`` (default) → auto-classifies from messages
Args:
messages: List of message dicts with role and content
model: Preferred model (tries this first, then provider defaults)
model: Preferred model (tries this first; complexity routing is
skipped when an explicit model is given)
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
cascade_tier: If specified, filters providers by this tier.
- "frontier_required": Uses only Anthropic provider for top-tier models.
complexity_hint: "simple", "complex", or None (auto-detect).
Returns:
Dict with content, provider_used, and metrics
Dict with content, provider_used, model, latency_ms,
is_fallback_model, and complexity fields.
Raises:
RuntimeError: If all providers fail
"""
from infrastructure.router.classifier import TaskComplexity, classify_task
content_type = self._detect_content_type(messages)
if content_type != ContentType.TEXT:
logger.debug("Detected %s content, selecting appropriate model", content_type.value)
# Resolve task complexity ─────────────────────────────────────────────
# Skip complexity routing when caller explicitly specifies a model.
complexity: TaskComplexity | None = None
if model is None:
if complexity_hint is not None:
try:
complexity = TaskComplexity(complexity_hint.lower())
except ValueError:
logger.warning("Unknown complexity_hint %r, auto-classifying", complexity_hint)
complexity = classify_task(messages)
else:
complexity = classify_task(messages)
logger.debug("Task complexity: %s", complexity.value)
errors: list[str] = []
providers = self._filter_providers(cascade_tier)
for provider in providers:
result = await self._try_single_provider(
provider, messages, model, temperature, max_tokens, content_type, errors
if not self._is_provider_available(provider):
continue
# Metabolic protocol: skip cloud providers when quota is low
if provider.type in ("anthropic", "openai", "grok"):
if not self._quota_allows_cloud(provider):
logger.info(
"Metabolic protocol: skipping cloud provider %s (quota too low)",
provider.name,
)
continue
# Complexity-based model selection (only when no explicit model) ──
effective_model = model
if effective_model is None and complexity is not None:
effective_model = self._get_model_for_complexity(provider, complexity)
if effective_model:
logger.debug(
"Complexity routing [%s]: %s%s",
complexity.value,
provider.name,
effective_model,
)
selected_model, is_fallback_model = self._select_model(
provider, effective_model, content_type
)
if result is not None:
return result
try:
result = await self._attempt_with_retry(
provider,
messages,
selected_model,
temperature,
max_tokens,
content_type,
)
except RuntimeError as exc:
errors.append(str(exc))
self._record_failure(provider)
continue
self._record_success(provider, result.get("latency_ms", 0))
return {
"content": result["content"],
"provider": provider.name,
"model": result.get("model", selected_model or provider.get_default_model()),
"latency_ms": result.get("latency_ms", 0),
"is_fallback_model": is_fallback_model,
"complexity": complexity.value if complexity is not None else None,
}
raise RuntimeError(f"All providers failed: {'; '.join(errors)}")

View File

@@ -0,0 +1,166 @@
"""Task complexity classifier for Qwen3 dual-model routing.
Classifies incoming tasks as SIMPLE (route to Qwen3-8B for low-latency)
or COMPLEX (route to Qwen3-14B for quality-sensitive work).
Classification is fully heuristic — no LLM inference required.
"""
import re
from enum import Enum
class TaskComplexity(Enum):
"""Task complexity tier for model routing."""
SIMPLE = "simple" # Qwen3-8B Q6_K: routine, latency-sensitive
COMPLEX = "complex" # Qwen3-14B Q5_K_M: quality-sensitive, multi-step
# Keywords strongly associated with complex tasks
_COMPLEX_KEYWORDS: frozenset[str] = frozenset(
[
"plan",
"review",
"analyze",
"analyse",
"triage",
"refactor",
"design",
"architecture",
"implement",
"compare",
"debug",
"explain",
"prioritize",
"prioritise",
"strategy",
"optimize",
"optimise",
"evaluate",
"assess",
"brainstorm",
"outline",
"summarize",
"summarise",
"generate code",
"write a",
"write the",
"code review",
"pull request",
"multi-step",
"multi step",
"step by step",
"backlog prioriti",
"issue triage",
"root cause",
"how does",
"why does",
"what are the",
]
)
# Keywords strongly associated with simple/routine tasks
_SIMPLE_KEYWORDS: frozenset[str] = frozenset(
[
"status",
"list ",
"show ",
"what is",
"how many",
"ping",
"run ",
"execute ",
"ls ",
"cat ",
"ps ",
"fetch ",
"count ",
"tail ",
"head ",
"grep ",
"find file",
"read file",
"get ",
"query ",
"check ",
"yes",
"no",
"ok",
"done",
"thanks",
]
)
# Content longer than this is treated as complex regardless of keywords
_COMPLEX_CHAR_THRESHOLD = 500
# Short content defaults to simple
_SIMPLE_CHAR_THRESHOLD = 150
# More than this many messages suggests an ongoing complex conversation
_COMPLEX_CONVERSATION_DEPTH = 6
def classify_task(messages: list[dict]) -> TaskComplexity:
"""Classify task complexity from a list of messages.
Uses heuristic rules — no LLM call required. Errs toward COMPLEX
when uncertain so that quality is preserved.
Args:
messages: List of message dicts with ``role`` and ``content`` keys.
Returns:
TaskComplexity.SIMPLE or TaskComplexity.COMPLEX
"""
if not messages:
return TaskComplexity.SIMPLE
# Concatenate all user-turn content for analysis
user_content = " ".join(
msg.get("content", "")
for msg in messages
if msg.get("role") in ("user", "human")
and isinstance(msg.get("content"), str)
).lower().strip()
if not user_content:
return TaskComplexity.SIMPLE
# Complexity signals override everything -----------------------------------
# Explicit complex keywords
for kw in _COMPLEX_KEYWORDS:
if kw in user_content:
return TaskComplexity.COMPLEX
# Numbered / multi-step instruction list: "1. do this 2. do that"
if re.search(r"\b\d+\.\s+\w", user_content):
return TaskComplexity.COMPLEX
# Code blocks embedded in messages
if "```" in user_content:
return TaskComplexity.COMPLEX
# Long content → complex reasoning likely required
if len(user_content) > _COMPLEX_CHAR_THRESHOLD:
return TaskComplexity.COMPLEX
# Deep conversation → complex ongoing task
if len(messages) > _COMPLEX_CONVERSATION_DEPTH:
return TaskComplexity.COMPLEX
# Simplicity signals -------------------------------------------------------
# Explicit simple keywords
for kw in _SIMPLE_KEYWORDS:
if kw in user_content:
return TaskComplexity.SIMPLE
# Short single-sentence messages default to simple
if len(user_content) <= _SIMPLE_CHAR_THRESHOLD:
return TaskComplexity.SIMPLE
# When uncertain, prefer quality (complex model)
return TaskComplexity.COMPLEX