[claude] Three-tier model router: Local 8B / Hermes 70B / Cloud API cascade (#882) (#1297)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit is contained in:
2026-03-24 01:53:25 +00:00
committed by Timmy Time
parent a7ccfbddc9
commit 4f80cfcd58
6 changed files with 1326 additions and 0 deletions

View File

@@ -0,0 +1,178 @@
"""Tests for the cloud API budget tracker (issue #882)."""
import time
from unittest.mock import patch
import pytest
from infrastructure.models.budget import (
BudgetTracker,
SpendRecord,
estimate_cost_usd,
get_budget_tracker,
)
pytestmark = pytest.mark.unit
# ── estimate_cost_usd ─────────────────────────────────────────────────────────
class TestEstimateCostUsd:
def test_haiku_cheaper_than_sonnet(self):
haiku_cost = estimate_cost_usd("claude-haiku-4-5", 1000, 1000)
sonnet_cost = estimate_cost_usd("claude-sonnet-4-5", 1000, 1000)
assert haiku_cost < sonnet_cost
def test_zero_tokens_is_zero_cost(self):
assert estimate_cost_usd("gpt-4o", 0, 0) == 0.0
def test_unknown_model_uses_default(self):
cost = estimate_cost_usd("some-unknown-model-xyz", 1000, 1000)
assert cost > 0 # Uses conservative default, not zero
def test_versioned_model_name_matches(self):
# "claude-haiku-4-5-20251001" should match "haiku"
cost1 = estimate_cost_usd("claude-haiku-4-5-20251001", 1000, 0)
cost2 = estimate_cost_usd("claude-haiku-4-5", 1000, 0)
assert cost1 == cost2
def test_gpt4o_mini_cheaper_than_gpt4o(self):
mini = estimate_cost_usd("gpt-4o-mini", 1000, 1000)
full = estimate_cost_usd("gpt-4o", 1000, 1000)
assert mini < full
def test_returns_float(self):
assert isinstance(estimate_cost_usd("haiku", 100, 200), float)
# ── BudgetTracker ─────────────────────────────────────────────────────────────
class TestBudgetTrackerInit:
def test_creates_with_memory_db(self):
tracker = BudgetTracker(db_path=":memory:")
assert tracker._db_ok is True
def test_in_memory_fallback_empty_on_creation(self):
tracker = BudgetTracker(db_path=":memory:")
assert tracker._in_memory == []
def test_bad_path_uses_memory_fallback(self, tmp_path):
bad_path = str(tmp_path / "nonexistent" / "x" / "budget.db")
# Should not raise — just log and continue with memory fallback
# (actually will create parent dirs, so test with truly bad path)
tracker = BudgetTracker.__new__(BudgetTracker)
tracker._db_path = bad_path
tracker._lock = __import__("threading").Lock()
tracker._in_memory = []
tracker._db_ok = False
# Record to in-memory fallback
tracker._in_memory.append(
SpendRecord(time.time(), "test", "model", 100, 100, 0.001, "cloud")
)
assert len(tracker._in_memory) == 1
class TestBudgetTrackerRecordSpend:
def test_record_spend_returns_cost(self):
tracker = BudgetTracker(db_path=":memory:")
cost = tracker.record_spend("anthropic", "claude-haiku-4-5", 100, 200)
assert cost > 0
def test_record_spend_explicit_cost(self):
tracker = BudgetTracker(db_path=":memory:")
cost = tracker.record_spend("anthropic", "model", cost_usd=1.23)
assert cost == pytest.approx(1.23)
def test_record_spend_accumulates(self):
tracker = BudgetTracker(db_path=":memory:")
tracker.record_spend("openai", "gpt-4o", cost_usd=0.01)
tracker.record_spend("openai", "gpt-4o", cost_usd=0.02)
assert tracker.get_daily_spend() == pytest.approx(0.03, abs=1e-9)
def test_record_spend_with_tier_label(self):
tracker = BudgetTracker(db_path=":memory:")
cost = tracker.record_spend("anthropic", "haiku", tier="cloud_api")
assert cost >= 0
def test_monthly_spend_includes_daily(self):
tracker = BudgetTracker(db_path=":memory:")
tracker.record_spend("anthropic", "haiku", cost_usd=5.00)
assert tracker.get_monthly_spend() >= tracker.get_daily_spend()
class TestBudgetTrackerCloudAllowed:
def test_allowed_when_no_spend(self):
tracker = BudgetTracker(db_path=":memory:")
with (
patch.object(type(tracker._get_budget() if hasattr(tracker, "_get_budget") else tracker), "tier_cloud_daily_budget_usd", 5.0, create=True),
):
# Settings-based check — use real settings (5.0 default, 0 spent)
assert tracker.cloud_allowed() is True
def test_blocked_when_daily_limit_exceeded(self):
tracker = BudgetTracker(db_path=":memory:")
tracker.record_spend("anthropic", "haiku", cost_usd=999.0)
# With default daily limit of 5.0, 999 should block
assert tracker.cloud_allowed() is False
def test_allowed_when_daily_limit_zero(self):
tracker = BudgetTracker(db_path=":memory:")
tracker.record_spend("anthropic", "haiku", cost_usd=999.0)
with (
patch("infrastructure.models.budget.settings") as mock_settings,
):
mock_settings.tier_cloud_daily_budget_usd = 0 # disabled
mock_settings.tier_cloud_monthly_budget_usd = 0 # disabled
assert tracker.cloud_allowed() is True
def test_blocked_when_monthly_limit_exceeded(self):
tracker = BudgetTracker(db_path=":memory:")
tracker.record_spend("anthropic", "haiku", cost_usd=999.0)
with patch("infrastructure.models.budget.settings") as mock_settings:
mock_settings.tier_cloud_daily_budget_usd = 0 # daily disabled
mock_settings.tier_cloud_monthly_budget_usd = 10.0
assert tracker.cloud_allowed() is False
class TestBudgetTrackerSummary:
def test_summary_keys_present(self):
tracker = BudgetTracker(db_path=":memory:")
summary = tracker.get_summary()
assert "daily_usd" in summary
assert "monthly_usd" in summary
assert "daily_limit_usd" in summary
assert "monthly_limit_usd" in summary
assert "daily_ok" in summary
assert "monthly_ok" in summary
def test_summary_daily_ok_true_on_empty(self):
tracker = BudgetTracker(db_path=":memory:")
summary = tracker.get_summary()
assert summary["daily_ok"] is True
assert summary["monthly_ok"] is True
def test_summary_daily_ok_false_when_exceeded(self):
tracker = BudgetTracker(db_path=":memory:")
tracker.record_spend("openai", "gpt-4o", cost_usd=999.0)
summary = tracker.get_summary()
assert summary["daily_ok"] is False
# ── Singleton ─────────────────────────────────────────────────────────────────
class TestGetBudgetTrackerSingleton:
def test_returns_budget_tracker(self):
import infrastructure.models.budget as bmod
bmod._budget_tracker = None
tracker = get_budget_tracker()
assert isinstance(tracker, BudgetTracker)
def test_returns_same_instance(self):
import infrastructure.models.budget as bmod
bmod._budget_tracker = None
t1 = get_budget_tracker()
t2 = get_budget_tracker()
assert t1 is t2

View File

@@ -0,0 +1,380 @@
"""Tests for the tiered model router (issue #882).
Covers:
- classify_tier() for Tier-1/2/3 routing
- TieredModelRouter.route() with mocked CascadeRouter + BudgetTracker
- Auto-escalation from Tier-1 on low-quality responses
- Cloud-tier budget guard
- Acceptance criteria from the issue:
- "Walk to the next room" → LOCAL_FAST
- "Plan the optimal path to become Hortator" → LOCAL_HEAVY
"""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from infrastructure.models.router import (
TierLabel,
TieredModelRouter,
_is_low_quality,
classify_tier,
get_tiered_router,
)
pytestmark = pytest.mark.unit
# ── classify_tier ─────────────────────────────────────────────────────────────
class TestClassifyTier:
# ── Tier-1 (LOCAL_FAST) ────────────────────────────────────────────────
def test_simple_navigation_is_local_fast(self):
assert classify_tier("walk to the next room") == TierLabel.LOCAL_FAST
def test_go_north_is_local_fast(self):
assert classify_tier("go north") == TierLabel.LOCAL_FAST
def test_single_binary_choice_is_local_fast(self):
assert classify_tier("yes") == TierLabel.LOCAL_FAST
def test_open_door_is_local_fast(self):
assert classify_tier("open door") == TierLabel.LOCAL_FAST
def test_attack_is_local_fast(self):
assert classify_tier("attack", {}) == TierLabel.LOCAL_FAST
# ── Tier-2 (LOCAL_HEAVY) ───────────────────────────────────────────────
def test_quest_planning_is_local_heavy(self):
assert classify_tier("plan the optimal path to become Hortator") == TierLabel.LOCAL_HEAVY
def test_strategy_keyword_is_local_heavy(self):
assert classify_tier("what is the best strategy") == TierLabel.LOCAL_HEAVY
def test_stuck_state_escalates_to_local_heavy(self):
assert classify_tier("help me", {"stuck": True}) == TierLabel.LOCAL_HEAVY
def test_require_t2_flag_is_local_heavy(self):
assert classify_tier("go north", {"require_t2": True}) == TierLabel.LOCAL_HEAVY
def test_long_input_is_local_heavy(self):
long_task = "tell me about " + ("the dungeon " * 30)
assert classify_tier(long_task) == TierLabel.LOCAL_HEAVY
def test_active_quests_upgrades_to_local_heavy(self):
ctx = {"active_quests": ["Q1", "Q2", "Q3"]}
assert classify_tier("go north", ctx) == TierLabel.LOCAL_HEAVY
def test_dialogue_active_upgrades_to_local_heavy(self):
ctx = {"dialogue_active": True}
assert classify_tier("yes", ctx) == TierLabel.LOCAL_HEAVY
def test_analyze_is_local_heavy(self):
assert classify_tier("analyze the situation") == TierLabel.LOCAL_HEAVY
def test_optimize_is_local_heavy(self):
assert classify_tier("optimize my build") == TierLabel.LOCAL_HEAVY
def test_negotiate_is_local_heavy(self):
assert classify_tier("negotiate with the Camonna Tong") == TierLabel.LOCAL_HEAVY
def test_explain_is_local_heavy(self):
assert classify_tier("explain the faction system") == TierLabel.LOCAL_HEAVY
# ── Tier-3 (CLOUD_API) ─────────────────────────────────────────────────
def test_require_cloud_flag_is_cloud_api(self):
assert classify_tier("go north", {"require_cloud": True}) == TierLabel.CLOUD_API
def test_require_cloud_overrides_everything(self):
assert classify_tier("yes", {"require_cloud": True}) == TierLabel.CLOUD_API
# ── Edge cases ────────────────────────────────────────────────────────
def test_empty_task_defaults_to_local_heavy(self):
# Empty string → nothing classifies it as T1 or T3
assert classify_tier("") == TierLabel.LOCAL_HEAVY
def test_case_insensitive(self):
assert classify_tier("PLAN my route") == TierLabel.LOCAL_HEAVY
def test_combat_active_upgrades_t1_to_heavy(self):
ctx = {"combat_active": True}
# "attack" is T1 word, but combat context → should NOT be LOCAL_FAST
result = classify_tier("attack", ctx)
assert result != TierLabel.LOCAL_FAST
# ── _is_low_quality ───────────────────────────────────────────────────────────
class TestIsLowQuality:
def test_empty_is_low_quality(self):
assert _is_low_quality("", TierLabel.LOCAL_FAST) is True
def test_whitespace_only_is_low_quality(self):
assert _is_low_quality(" ", TierLabel.LOCAL_FAST) is True
def test_very_short_is_low_quality(self):
assert _is_low_quality("ok", TierLabel.LOCAL_FAST) is True
def test_idontknow_is_low_quality(self):
assert _is_low_quality("I don't know how to help with that.", TierLabel.LOCAL_FAST) is True
def test_not_sure_is_low_quality(self):
assert _is_low_quality("I'm not sure about this.", TierLabel.LOCAL_FAST) is True
def test_as_an_ai_is_low_quality(self):
assert _is_low_quality("As an AI, I cannot...", TierLabel.LOCAL_FAST) is True
def test_good_response_is_not_low_quality(self):
response = "You move north into the Vivec Canton. The Ordinators watch your approach."
assert _is_low_quality(response, TierLabel.LOCAL_FAST) is False
def test_t1_short_response_triggers_escalation(self):
# Less than _ESCALATION_MIN_CHARS for T1
assert _is_low_quality("OK, done.", TierLabel.LOCAL_FAST) is True
def test_borderline_ok_for_t2_not_t1(self):
# Between _LOW_QUALITY_MIN_CHARS (20) and _ESCALATION_MIN_CHARS (60)
# → low quality for T1 (escalation threshold), but acceptable for T2/T3
response = "Done. The item is retrieved." # 28 chars: ≥20, <60
assert _is_low_quality(response, TierLabel.LOCAL_FAST) is True
assert _is_low_quality(response, TierLabel.LOCAL_HEAVY) is False
# ── TieredModelRouter ─────────────────────────────────────────────────────────
_GOOD_CONTENT = (
"You move north through the doorway into the next room. "
"The stone walls glisten with moisture."
) # 90 chars — well above the escalation threshold
def _make_cascade_mock(content=_GOOD_CONTENT, model="llama3.1:8b"):
mock = MagicMock()
mock.complete = AsyncMock(
return_value={
"content": content,
"provider": "ollama-local",
"model": model,
"latency_ms": 150.0,
}
)
return mock
def _make_budget_mock(allowed=True):
mock = MagicMock()
mock.cloud_allowed = MagicMock(return_value=allowed)
mock.record_spend = MagicMock(return_value=0.001)
return mock
@pytest.mark.asyncio
class TestTieredModelRouterRoute:
async def test_route_returns_tier_in_result(self):
router = TieredModelRouter(cascade=_make_cascade_mock())
result = await router.route("go north")
assert "tier" in result
assert result["tier"] == TierLabel.LOCAL_FAST
async def test_acceptance_walk_to_room_is_local_fast(self):
"""Acceptance: 'Walk to the next room' → LOCAL_FAST."""
router = TieredModelRouter(cascade=_make_cascade_mock())
result = await router.route("Walk to the next room")
assert result["tier"] == TierLabel.LOCAL_FAST
async def test_acceptance_plan_hortator_is_local_heavy(self):
"""Acceptance: 'Plan the optimal path to become Hortator' → LOCAL_HEAVY."""
router = TieredModelRouter(
cascade=_make_cascade_mock(model="hermes3:70b"),
)
result = await router.route("Plan the optimal path to become Hortator")
assert result["tier"] == TierLabel.LOCAL_HEAVY
async def test_t1_low_quality_escalates_to_t2(self):
"""Failed Tier-1 response auto-escalates to Tier-2."""
call_models = []
cascade = MagicMock()
async def complete_side_effect(messages, model, temperature, max_tokens):
call_models.append(model)
# First call (T1) returns a low-quality response
if len(call_models) == 1:
return {
"content": "I don't know.",
"provider": "ollama",
"model": model,
"latency_ms": 50,
}
# Second call (T2) returns a good response
return {
"content": "You move to the northern passage, passing through the Dunmer stronghold.",
"provider": "ollama",
"model": model,
"latency_ms": 800,
}
cascade.complete = complete_side_effect
router = TieredModelRouter(cascade=cascade, auto_escalate=True)
result = await router.route("go north")
assert len(call_models) == 2, "Should have called twice (T1 escalated to T2)"
assert result["tier"] == TierLabel.LOCAL_HEAVY
async def test_auto_escalate_false_no_escalation(self):
"""With auto_escalate=False, low-quality T1 response is returned as-is."""
call_count = {"n": 0}
cascade = MagicMock()
async def complete_side_effect(**kwargs):
call_count["n"] += 1
return {
"content": "I don't know.",
"provider": "ollama",
"model": "llama3.1:8b",
"latency_ms": 50,
}
cascade.complete = AsyncMock(side_effect=complete_side_effect)
router = TieredModelRouter(cascade=cascade, auto_escalate=False)
result = await router.route("go north")
assert call_count["n"] == 1
assert result["tier"] == TierLabel.LOCAL_FAST
async def test_t2_failure_escalates_to_cloud(self):
"""Tier-2 failure escalates to Cloud API (when budget allows)."""
cascade = MagicMock()
call_models = []
async def complete_side_effect(messages, model, temperature, max_tokens):
call_models.append(model)
if "hermes3" in model or "70b" in model.lower():
raise RuntimeError("Tier-2 model unavailable")
return {
"content": "Cloud response here.",
"provider": "anthropic",
"model": model,
"latency_ms": 1200,
}
cascade.complete = complete_side_effect
budget = _make_budget_mock(allowed=True)
router = TieredModelRouter(cascade=cascade, budget_tracker=budget)
result = await router.route("plan my route", context={"require_t2": True})
assert result["tier"] == TierLabel.CLOUD_API
async def test_cloud_blocked_by_budget_raises(self):
"""Cloud tier blocked when budget is exhausted."""
cascade = MagicMock()
cascade.complete = AsyncMock(side_effect=RuntimeError("T2 fail"))
budget = _make_budget_mock(allowed=False)
router = TieredModelRouter(cascade=cascade, budget_tracker=budget)
with pytest.raises(RuntimeError, match="budget limit"):
await router.route("plan my route", context={"require_t2": True})
async def test_explicit_cloud_tier_uses_cloud_model(self):
cascade = _make_cascade_mock(model="claude-haiku-4-5")
budget = _make_budget_mock(allowed=True)
router = TieredModelRouter(cascade=cascade, budget_tracker=budget)
result = await router.route("go north", context={"require_cloud": True})
assert result["tier"] == TierLabel.CLOUD_API
async def test_cloud_spend_recorded_with_usage(self):
"""Cloud spend is recorded when the response includes usage info."""
cascade = MagicMock()
cascade.complete = AsyncMock(
return_value={
"content": "Cloud answer.",
"provider": "anthropic",
"model": "claude-haiku-4-5",
"latency_ms": 900,
"usage": {"prompt_tokens": 50, "completion_tokens": 100},
}
)
budget = _make_budget_mock(allowed=True)
router = TieredModelRouter(cascade=cascade, budget_tracker=budget)
result = await router.route("go north", context={"require_cloud": True})
budget.record_spend.assert_called_once()
assert "cost_usd" in result
async def test_cloud_spend_not_recorded_without_usage(self):
"""Cloud spend is not recorded when usage info is absent."""
cascade = MagicMock()
cascade.complete = AsyncMock(
return_value={
"content": "Cloud answer.",
"provider": "anthropic",
"model": "claude-haiku-4-5",
"latency_ms": 900,
# no "usage" key
}
)
budget = _make_budget_mock(allowed=True)
router = TieredModelRouter(cascade=cascade, budget_tracker=budget)
result = await router.route("go north", context={"require_cloud": True})
budget.record_spend.assert_not_called()
assert "cost_usd" not in result
async def test_custom_tier_models_respected(self):
cascade = _make_cascade_mock()
router = TieredModelRouter(
cascade=cascade,
tier_models={TierLabel.LOCAL_FAST: "llama3.2:3b"},
)
await router.route("go north")
call_kwargs = cascade.complete.call_args
assert call_kwargs.kwargs["model"] == "llama3.2:3b"
async def test_messages_override_used_when_provided(self):
cascade = _make_cascade_mock()
router = TieredModelRouter(cascade=cascade)
custom_msgs = [{"role": "user", "content": "custom message"}]
await router.route("go north", messages=custom_msgs)
call_kwargs = cascade.complete.call_args
assert call_kwargs.kwargs["messages"] == custom_msgs
async def test_temperature_forwarded(self):
cascade = _make_cascade_mock()
router = TieredModelRouter(cascade=cascade)
await router.route("go north", temperature=0.7)
call_kwargs = cascade.complete.call_args
assert call_kwargs.kwargs["temperature"] == 0.7
async def test_max_tokens_forwarded(self):
cascade = _make_cascade_mock()
router = TieredModelRouter(cascade=cascade)
await router.route("go north", max_tokens=128)
call_kwargs = cascade.complete.call_args
assert call_kwargs.kwargs["max_tokens"] == 128
class TestTieredModelRouterClassify:
def test_classify_delegates_to_classify_tier(self):
router = TieredModelRouter(cascade=MagicMock())
assert router.classify("go north") == classify_tier("go north")
assert router.classify("plan the quest") == classify_tier("plan the quest")
class TestGetTieredRouterSingleton:
def test_returns_tiered_router_instance(self):
import infrastructure.models.router as rmod
rmod._tiered_router = None
router = get_tiered_router()
assert isinstance(router, TieredModelRouter)
def test_singleton_returns_same_instance(self):
import infrastructure.models.router as rmod
rmod._tiered_router = None
r1 = get_tiered_router()
r2 = get_tiered_router()
assert r1 is r2