From d417ba2a4802767db738f52ff2c8d41fa967acad Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 17 Mar 2026 03:44:44 -0700 Subject: [PATCH] feat: add route-aware pricing estimates (#1695) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Salvaged from PR #1563 by @kshitijk4poor. Cherry-picked with authorship preserved. - Route-aware pricing architecture replacing static MODEL_PRICING + heuristics - Canonical usage normalization (Anthropic/OpenAI/Codex API shapes) - Cache-aware billing (separate cache_read/cache_write rates) - Cost status tracking (estimated/included/unknown/actual) - OpenRouter live pricing via models API - Schema migration v4→v5 with billing metadata columns - Removed speculative forward-looking entries - Removed cost display from CLI status bar - Threaded OpenRouter metadata pre-warm Co-authored-by: kshitij <82637225+kshitijk4poor@users.noreply.github.com> --- agent/insights.py | 104 ++- agent/usage_pricing.py | 662 +++++++++++++++--- cli.py | 95 ++- ...16-pricing-accuracy-architecture-design.md | 608 ++++++++++++++++ gateway/run.py | 7 + gateway/session.py | 43 +- hermes_state.py | 82 ++- run_agent.py | 92 ++- tests/agent/test_usage_pricing.py | 101 +++ tests/gateway/test_session.py | 12 +- tests/gateway/test_status_command.py | 7 + tests/test_cli_status_bar.py | 25 +- tests/test_hermes_state.py | 6 +- tests/test_insights.py | 104 ++- 14 files changed, 1694 insertions(+), 254 deletions(-) create mode 100644 docs/plans/2026-03-16-pricing-accuracy-architecture-design.md create mode 100644 tests/agent/test_usage_pricing.py diff --git a/agent/insights.py b/agent/insights.py index 8fc55e043..64a37f11b 100644 --- a/agent/insights.py +++ b/agent/insights.py @@ -22,14 +22,21 @@ from collections import Counter, defaultdict from datetime import datetime from typing import Any, Dict, List -from agent.usage_pricing import DEFAULT_PRICING, estimate_cost_usd, 
format_duration_compact, get_pricing, has_known_pricing +from agent.usage_pricing import ( + CanonicalUsage, + DEFAULT_PRICING, + estimate_usage_cost, + format_duration_compact, + get_pricing, + has_known_pricing, +) _DEFAULT_PRICING = DEFAULT_PRICING -def _has_known_pricing(model_name: str) -> bool: +def _has_known_pricing(model_name: str, provider: str = None, base_url: str = None) -> bool: """Check if a model has known pricing (vs unknown/custom endpoint).""" - return has_known_pricing(model_name) + return has_known_pricing(model_name, provider=provider, base_url=base_url) def _get_pricing(model_name: str) -> Dict[str, float]: @@ -41,9 +48,43 @@ def _get_pricing(model_name: str) -> Dict[str, float]: return get_pricing(model_name) -def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate the USD cost for a given model and token counts.""" - return estimate_cost_usd(model, input_tokens, output_tokens) +def _estimate_cost( + session_or_model: Dict[str, Any] | str, + input_tokens: int = 0, + output_tokens: int = 0, + *, + cache_read_tokens: int = 0, + cache_write_tokens: int = 0, + provider: str = None, + base_url: str = None, +) -> tuple[float, str]: + """Estimate the USD cost for a session row or a model/token tuple.""" + if isinstance(session_or_model, dict): + session = session_or_model + model = session.get("model") or "" + usage = CanonicalUsage( + input_tokens=session.get("input_tokens") or 0, + output_tokens=session.get("output_tokens") or 0, + cache_read_tokens=session.get("cache_read_tokens") or 0, + cache_write_tokens=session.get("cache_write_tokens") or 0, + ) + provider = session.get("billing_provider") + base_url = session.get("billing_base_url") + else: + model = session_or_model or "" + usage = CanonicalUsage( + input_tokens=input_tokens, + output_tokens=output_tokens, + cache_read_tokens=cache_read_tokens, + cache_write_tokens=cache_write_tokens, + ) + result = estimate_usage_cost( + model, + usage, + 
provider=provider, + base_url=base_url, + ) + return float(result.amount_usd or 0.0), result.status def _format_duration(seconds: float) -> str: @@ -135,7 +176,10 @@ class InsightsEngine: # Columns we actually need (skip system_prompt, model_config blobs) _SESSION_COLS = ("id, source, model, started_at, ended_at, " - "message_count, tool_call_count, input_tokens, output_tokens") + "message_count, tool_call_count, input_tokens, output_tokens, " + "cache_read_tokens, cache_write_tokens, billing_provider, " + "billing_base_url, billing_mode, estimated_cost_usd, " + "actual_cost_usd, cost_status, cost_source") def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]: """Fetch sessions within the time window.""" @@ -287,21 +331,30 @@ class InsightsEngine: """Compute high-level overview statistics.""" total_input = sum(s.get("input_tokens") or 0 for s in sessions) total_output = sum(s.get("output_tokens") or 0 for s in sessions) - total_tokens = total_input + total_output + total_cache_read = sum(s.get("cache_read_tokens") or 0 for s in sessions) + total_cache_write = sum(s.get("cache_write_tokens") or 0 for s in sessions) + total_tokens = total_input + total_output + total_cache_read + total_cache_write total_tool_calls = sum(s.get("tool_call_count") or 0 for s in sessions) total_messages = sum(s.get("message_count") or 0 for s in sessions) # Cost estimation (weighted by model) total_cost = 0.0 + actual_cost = 0.0 models_with_pricing = set() models_without_pricing = set() + unknown_cost_sessions = 0 + included_cost_sessions = 0 for s in sessions: model = s.get("model") or "" - inp = s.get("input_tokens") or 0 - out = s.get("output_tokens") or 0 - total_cost += _estimate_cost(model, inp, out) + estimated, status = _estimate_cost(s) + total_cost += estimated + actual_cost += s.get("actual_cost_usd") or 0.0 display = model.split("/")[-1] if "/" in model else (model or "unknown") - if _has_known_pricing(model): + if status == "included": + 
included_cost_sessions += 1 + elif status == "unknown": + unknown_cost_sessions += 1 + if _has_known_pricing(model, s.get("billing_provider"), s.get("billing_base_url")): models_with_pricing.add(display) else: models_without_pricing.add(display) @@ -328,8 +381,11 @@ class InsightsEngine: "total_tool_calls": total_tool_calls, "total_input_tokens": total_input, "total_output_tokens": total_output, + "total_cache_read_tokens": total_cache_read, + "total_cache_write_tokens": total_cache_write, "total_tokens": total_tokens, "estimated_cost": total_cost, + "actual_cost": actual_cost, "total_hours": total_hours, "avg_session_duration": avg_duration, "avg_messages_per_session": total_messages / len(sessions) if sessions else 0, @@ -341,12 +397,15 @@ class InsightsEngine: "date_range_end": date_range_end, "models_with_pricing": sorted(models_with_pricing), "models_without_pricing": sorted(models_without_pricing), + "unknown_cost_sessions": unknown_cost_sessions, + "included_cost_sessions": included_cost_sessions, } def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]: """Break down usage by model.""" model_data = defaultdict(lambda: { "sessions": 0, "input_tokens": 0, "output_tokens": 0, + "cache_read_tokens": 0, "cache_write_tokens": 0, "total_tokens": 0, "tool_calls": 0, "cost": 0.0, }) @@ -358,12 +417,18 @@ class InsightsEngine: d["sessions"] += 1 inp = s.get("input_tokens") or 0 out = s.get("output_tokens") or 0 + cache_read = s.get("cache_read_tokens") or 0 + cache_write = s.get("cache_write_tokens") or 0 d["input_tokens"] += inp d["output_tokens"] += out - d["total_tokens"] += inp + out + d["cache_read_tokens"] += cache_read + d["cache_write_tokens"] += cache_write + d["total_tokens"] += inp + out + cache_read + cache_write d["tool_calls"] += s.get("tool_call_count") or 0 - d["cost"] += _estimate_cost(model, inp, out) - d["has_pricing"] = _has_known_pricing(model) + estimate, status = _estimate_cost(s) + d["cost"] += estimate + d["has_pricing"] = 
_has_known_pricing(model, s.get("billing_provider"), s.get("billing_base_url")) + d["cost_status"] = status result = [ {"model": model, **data} @@ -377,7 +442,8 @@ class InsightsEngine: """Break down usage by platform/source.""" platform_data = defaultdict(lambda: { "sessions": 0, "messages": 0, "input_tokens": 0, - "output_tokens": 0, "total_tokens": 0, "tool_calls": 0, + "output_tokens": 0, "cache_read_tokens": 0, + "cache_write_tokens": 0, "total_tokens": 0, "tool_calls": 0, }) for s in sessions: @@ -387,9 +453,13 @@ class InsightsEngine: d["messages"] += s.get("message_count") or 0 inp = s.get("input_tokens") or 0 out = s.get("output_tokens") or 0 + cache_read = s.get("cache_read_tokens") or 0 + cache_write = s.get("cache_write_tokens") or 0 d["input_tokens"] += inp d["output_tokens"] += out - d["total_tokens"] += inp + out + d["cache_read_tokens"] += cache_read + d["cache_write_tokens"] += cache_write + d["total_tokens"] += inp + out + cache_read + cache_write d["tool_calls"] += s.get("tool_call_count") or 0 result = [ diff --git a/agent/usage_pricing.py b/agent/usage_pricing.py index 5bfba25d4..29e7df254 100644 --- a/agent/usage_pricing.py +++ b/agent/usage_pricing.py @@ -1,101 +1,593 @@ from __future__ import annotations +from dataclasses import dataclass +from datetime import datetime, timezone from decimal import Decimal -from typing import Dict +from typing import Any, Dict, Literal, Optional - -MODEL_PRICING = { - "gpt-4o": {"input": 2.50, "output": 10.00}, - "gpt-4o-mini": {"input": 0.15, "output": 0.60}, - "gpt-4.1": {"input": 2.00, "output": 8.00}, - "gpt-4.1-mini": {"input": 0.40, "output": 1.60}, - "gpt-4.1-nano": {"input": 0.10, "output": 0.40}, - "gpt-4.5-preview": {"input": 75.00, "output": 150.00}, - "gpt-5": {"input": 10.00, "output": 30.00}, - "gpt-5.4": {"input": 10.00, "output": 30.00}, - "o3": {"input": 10.00, "output": 40.00}, - "o3-mini": {"input": 1.10, "output": 4.40}, - "o4-mini": {"input": 1.10, "output": 4.40}, - 
"claude-opus-4-20250514": {"input": 15.00, "output": 75.00}, - "claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00}, - "claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00}, - "claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00}, - "claude-3-opus-20240229": {"input": 15.00, "output": 75.00}, - "claude-3-haiku-20240307": {"input": 0.25, "output": 1.25}, - "deepseek-chat": {"input": 0.14, "output": 0.28}, - "deepseek-reasoner": {"input": 0.55, "output": 2.19}, - "gemini-2.5-pro": {"input": 1.25, "output": 10.00}, - "gemini-2.5-flash": {"input": 0.15, "output": 0.60}, - "gemini-2.0-flash": {"input": 0.10, "output": 0.40}, - "llama-4-maverick": {"input": 0.50, "output": 0.70}, - "llama-4-scout": {"input": 0.20, "output": 0.30}, - "glm-5": {"input": 0.0, "output": 0.0}, - "glm-4.7": {"input": 0.0, "output": 0.0}, - "glm-4.5": {"input": 0.0, "output": 0.0}, - "glm-4.5-flash": {"input": 0.0, "output": 0.0}, - "kimi-k2.5": {"input": 0.0, "output": 0.0}, - "kimi-k2-thinking": {"input": 0.0, "output": 0.0}, - "kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0}, - "kimi-k2-0905-preview": {"input": 0.0, "output": 0.0}, - "MiniMax-M2.5": {"input": 0.0, "output": 0.0}, - "MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0}, - "MiniMax-M2.1": {"input": 0.0, "output": 0.0}, -} +from agent.model_metadata import fetch_model_metadata DEFAULT_PRICING = {"input": 0.0, "output": 0.0} +_ZERO = Decimal("0") +_ONE_MILLION = Decimal("1000000") -def get_pricing(model_name: str) -> Dict[str, float]: - if not model_name: - return DEFAULT_PRICING - - bare = model_name.split("/")[-1].lower() - if bare in MODEL_PRICING: - return MODEL_PRICING[bare] - - best_match = None - best_len = 0 - for key, price in MODEL_PRICING.items(): - if bare.startswith(key) and len(key) > best_len: - best_match = price - best_len = len(key) - if best_match: - return best_match - - if "opus" in bare: - return {"input": 15.00, "output": 75.00} - if "sonnet" in bare: - return {"input": 
3.00, "output": 15.00} - if "haiku" in bare: - return {"input": 0.80, "output": 4.00} - if "gpt-4o-mini" in bare: - return {"input": 0.15, "output": 0.60} - if "gpt-4o" in bare: - return {"input": 2.50, "output": 10.00} - if "gpt-5" in bare: - return {"input": 10.00, "output": 30.00} - if "deepseek" in bare: - return {"input": 0.14, "output": 0.28} - if "gemini" in bare: - return {"input": 0.15, "output": 0.60} - - return DEFAULT_PRICING +CostStatus = Literal["actual", "estimated", "included", "unknown"] +CostSource = Literal[ + "provider_cost_api", + "provider_generation_api", + "provider_models_api", + "official_docs_snapshot", + "user_override", + "custom_contract", + "none", +] -def has_known_pricing(model_name: str) -> bool: - pricing = get_pricing(model_name) - return pricing is not DEFAULT_PRICING and any( - float(value) > 0 for value in pricing.values() +@dataclass(frozen=True) +class CanonicalUsage: + input_tokens: int = 0 + output_tokens: int = 0 + cache_read_tokens: int = 0 + cache_write_tokens: int = 0 + reasoning_tokens: int = 0 + request_count: int = 1 + raw_usage: Optional[dict[str, Any]] = None + + @property + def prompt_tokens(self) -> int: + return self.input_tokens + self.cache_read_tokens + self.cache_write_tokens + + @property + def total_tokens(self) -> int: + return self.prompt_tokens + self.output_tokens + + +@dataclass(frozen=True) +class BillingRoute: + provider: str + model: str + base_url: str = "" + billing_mode: str = "unknown" + + +@dataclass(frozen=True) +class PricingEntry: + input_cost_per_million: Optional[Decimal] = None + output_cost_per_million: Optional[Decimal] = None + cache_read_cost_per_million: Optional[Decimal] = None + cache_write_cost_per_million: Optional[Decimal] = None + request_cost: Optional[Decimal] = None + source: CostSource = "none" + source_url: Optional[str] = None + pricing_version: Optional[str] = None + fetched_at: Optional[datetime] = None + + +@dataclass(frozen=True) +class CostResult: + amount_usd: 
Optional[Decimal] + status: CostStatus + source: CostSource + label: str + fetched_at: Optional[datetime] = None + pricing_version: Optional[str] = None + notes: tuple[str, ...] = () + + +_UTC_NOW = lambda: datetime.now(timezone.utc) + + +# Official docs snapshot entries. Models whose published pricing and cache +# semantics are stable enough to encode exactly. +_OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = { + ( + "anthropic", + "claude-opus-4-20250514", + ): PricingEntry( + input_cost_per_million=Decimal("15.00"), + output_cost_per_million=Decimal("75.00"), + cache_read_cost_per_million=Decimal("1.50"), + cache_write_cost_per_million=Decimal("18.75"), + source="official_docs_snapshot", + source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching", + pricing_version="anthropic-prompt-caching-2026-03-16", + ), + ( + "anthropic", + "claude-sonnet-4-20250514", + ): PricingEntry( + input_cost_per_million=Decimal("3.00"), + output_cost_per_million=Decimal("15.00"), + cache_read_cost_per_million=Decimal("0.30"), + cache_write_cost_per_million=Decimal("3.75"), + source="official_docs_snapshot", + source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching", + pricing_version="anthropic-prompt-caching-2026-03-16", + ), + # OpenAI + ( + "openai", + "gpt-4o", + ): PricingEntry( + input_cost_per_million=Decimal("2.50"), + output_cost_per_million=Decimal("10.00"), + cache_read_cost_per_million=Decimal("1.25"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + ( + "openai", + "gpt-4o-mini", + ): PricingEntry( + input_cost_per_million=Decimal("0.15"), + output_cost_per_million=Decimal("0.60"), + cache_read_cost_per_million=Decimal("0.075"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + ( + "openai", + "gpt-4.1", + ): PricingEntry( + 
input_cost_per_million=Decimal("2.00"), + output_cost_per_million=Decimal("8.00"), + cache_read_cost_per_million=Decimal("0.50"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + ( + "openai", + "gpt-4.1-mini", + ): PricingEntry( + input_cost_per_million=Decimal("0.40"), + output_cost_per_million=Decimal("1.60"), + cache_read_cost_per_million=Decimal("0.10"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + ( + "openai", + "gpt-4.1-nano", + ): PricingEntry( + input_cost_per_million=Decimal("0.10"), + output_cost_per_million=Decimal("0.40"), + cache_read_cost_per_million=Decimal("0.025"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + ( + "openai", + "o3", + ): PricingEntry( + input_cost_per_million=Decimal("10.00"), + output_cost_per_million=Decimal("40.00"), + cache_read_cost_per_million=Decimal("2.50"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + ( + "openai", + "o3-mini", + ): PricingEntry( + input_cost_per_million=Decimal("1.10"), + output_cost_per_million=Decimal("4.40"), + cache_read_cost_per_million=Decimal("0.55"), + source="official_docs_snapshot", + source_url="https://openai.com/api/pricing/", + pricing_version="openai-pricing-2026-03-16", + ), + # Anthropic older models (pre-4.6 generation) + ( + "anthropic", + "claude-3-5-sonnet-20241022", + ): PricingEntry( + input_cost_per_million=Decimal("3.00"), + output_cost_per_million=Decimal("15.00"), + cache_read_cost_per_million=Decimal("0.30"), + cache_write_cost_per_million=Decimal("3.75"), + source="official_docs_snapshot", + source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching", + 
pricing_version="anthropic-pricing-2026-03-16", + ), + ( + "anthropic", + "claude-3-5-haiku-20241022", + ): PricingEntry( + input_cost_per_million=Decimal("0.80"), + output_cost_per_million=Decimal("4.00"), + cache_read_cost_per_million=Decimal("0.08"), + cache_write_cost_per_million=Decimal("1.00"), + source="official_docs_snapshot", + source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching", + pricing_version="anthropic-pricing-2026-03-16", + ), + ( + "anthropic", + "claude-3-opus-20240229", + ): PricingEntry( + input_cost_per_million=Decimal("15.00"), + output_cost_per_million=Decimal("75.00"), + cache_read_cost_per_million=Decimal("1.50"), + cache_write_cost_per_million=Decimal("18.75"), + source="official_docs_snapshot", + source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching", + pricing_version="anthropic-pricing-2026-03-16", + ), + ( + "anthropic", + "claude-3-haiku-20240307", + ): PricingEntry( + input_cost_per_million=Decimal("0.25"), + output_cost_per_million=Decimal("1.25"), + cache_read_cost_per_million=Decimal("0.03"), + cache_write_cost_per_million=Decimal("0.30"), + source="official_docs_snapshot", + source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching", + pricing_version="anthropic-pricing-2026-03-16", + ), + # DeepSeek + ( + "deepseek", + "deepseek-chat", + ): PricingEntry( + input_cost_per_million=Decimal("0.14"), + output_cost_per_million=Decimal("0.28"), + source="official_docs_snapshot", + source_url="https://api-docs.deepseek.com/quick_start/pricing", + pricing_version="deepseek-pricing-2026-03-16", + ), + ( + "deepseek", + "deepseek-reasoner", + ): PricingEntry( + input_cost_per_million=Decimal("0.55"), + output_cost_per_million=Decimal("2.19"), + source="official_docs_snapshot", + source_url="https://api-docs.deepseek.com/quick_start/pricing", + pricing_version="deepseek-pricing-2026-03-16", + ), + # Google Gemini + ( + "google", + "gemini-2.5-pro", + ): 
PricingEntry( + input_cost_per_million=Decimal("1.25"), + output_cost_per_million=Decimal("10.00"), + source="official_docs_snapshot", + source_url="https://ai.google.dev/pricing", + pricing_version="google-pricing-2026-03-16", + ), + ( + "google", + "gemini-2.5-flash", + ): PricingEntry( + input_cost_per_million=Decimal("0.15"), + output_cost_per_million=Decimal("0.60"), + source="official_docs_snapshot", + source_url="https://ai.google.dev/pricing", + pricing_version="google-pricing-2026-03-16", + ), + ( + "google", + "gemini-2.0-flash", + ): PricingEntry( + input_cost_per_million=Decimal("0.10"), + output_cost_per_million=Decimal("0.40"), + source="official_docs_snapshot", + source_url="https://ai.google.dev/pricing", + pricing_version="google-pricing-2026-03-16", + ), +} + + +def _to_decimal(value: Any) -> Optional[Decimal]: + if value is None: + return None + try: + return Decimal(str(value)) + except Exception: + return None + + +def _to_int(value: Any) -> int: + try: + return int(value or 0) + except Exception: + return 0 + + +def resolve_billing_route( + model_name: str, + provider: Optional[str] = None, + base_url: Optional[str] = None, +) -> BillingRoute: + provider_name = (provider or "").strip().lower() + base = (base_url or "").strip().lower() + model = (model_name or "").strip() + if not provider_name and "/" in model: + inferred_provider, bare_model = model.split("/", 1) + if inferred_provider in {"anthropic", "openai", "google"}: + provider_name = inferred_provider + model = bare_model + + if provider_name == "openai-codex": + return BillingRoute(provider="openai-codex", model=model, base_url=base_url or "", billing_mode="subscription_included") + if provider_name == "openrouter" or "openrouter.ai" in base: + return BillingRoute(provider="openrouter", model=model, base_url=base_url or "", billing_mode="official_models_api") + if provider_name == "anthropic": + return BillingRoute(provider="anthropic", model=model.split("/")[-1], base_url=base_url or 
"", billing_mode="official_docs_snapshot") + if provider_name == "openai": + return BillingRoute(provider="openai", model=model.split("/")[-1], base_url=base_url or "", billing_mode="official_docs_snapshot") + if provider_name in {"custom", "local"} or (base and "localhost" in base): + return BillingRoute(provider=provider_name or "custom", model=model, base_url=base_url or "", billing_mode="unknown") + return BillingRoute(provider=provider_name or "unknown", model=model.split("/")[-1] if model else "", base_url=base_url or "", billing_mode="unknown") + + +def _lookup_official_docs_pricing(route: BillingRoute) -> Optional[PricingEntry]: + return _OFFICIAL_DOCS_PRICING.get((route.provider, route.model.lower())) + + +def _openrouter_pricing_entry(route: BillingRoute) -> Optional[PricingEntry]: + metadata = fetch_model_metadata() + model_id = route.model + if model_id not in metadata: + return None + pricing = metadata[model_id].get("pricing") or {} + prompt = _to_decimal(pricing.get("prompt")) + completion = _to_decimal(pricing.get("completion")) + request = _to_decimal(pricing.get("request")) + cache_read = _to_decimal( + pricing.get("cache_read") + or pricing.get("cached_prompt") + or pricing.get("input_cache_read") + ) + cache_write = _to_decimal( + pricing.get("cache_write") + or pricing.get("cache_creation") + or pricing.get("input_cache_write") + ) + if prompt is None and completion is None and request is None: + return None + def _per_token_to_per_million(value: Optional[Decimal]) -> Optional[Decimal]: + if value is None: + return None + return value * _ONE_MILLION + + return PricingEntry( + input_cost_per_million=_per_token_to_per_million(prompt), + output_cost_per_million=_per_token_to_per_million(completion), + cache_read_cost_per_million=_per_token_to_per_million(cache_read), + cache_write_cost_per_million=_per_token_to_per_million(cache_write), + request_cost=request, + source="provider_models_api", + 
source_url="https://openrouter.ai/docs/api/api-reference/models/get-models", + pricing_version="openrouter-models-api", + fetched_at=_UTC_NOW(), ) -def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float: - pricing = get_pricing(model) - total = ( - Decimal(input_tokens) * Decimal(str(pricing["input"])) - + Decimal(output_tokens) * Decimal(str(pricing["output"])) - ) / Decimal("1000000") - return float(total) +def get_pricing_entry( + model_name: str, + provider: Optional[str] = None, + base_url: Optional[str] = None, +) -> Optional[PricingEntry]: + route = resolve_billing_route(model_name, provider=provider, base_url=base_url) + if route.billing_mode == "subscription_included": + return PricingEntry( + input_cost_per_million=_ZERO, + output_cost_per_million=_ZERO, + cache_read_cost_per_million=_ZERO, + cache_write_cost_per_million=_ZERO, + source="none", + pricing_version="included-route", + ) + if route.provider == "openrouter": + return _openrouter_pricing_entry(route) + return _lookup_official_docs_pricing(route) + + +def normalize_usage( + response_usage: Any, + *, + provider: Optional[str] = None, + api_mode: Optional[str] = None, +) -> CanonicalUsage: + """Normalize raw API response usage into canonical token buckets. + + Handles three API shapes: + - Anthropic: input_tokens/output_tokens/cache_read_input_tokens/cache_creation_input_tokens + - Codex Responses: input_tokens includes cache tokens; input_tokens_details.cached_tokens separates them + - OpenAI Chat Completions: prompt_tokens includes cache tokens; prompt_tokens_details.cached_tokens separates them + + In both Codex and OpenAI modes, input_tokens is derived by subtracting cache + tokens from the total — the API contract is that input/prompt totals include + cached tokens and the details object breaks them out. 
+ """ + if not response_usage: + return CanonicalUsage() + + provider_name = (provider or "").strip().lower() + mode = (api_mode or "").strip().lower() + + if mode == "anthropic_messages" or provider_name == "anthropic": + input_tokens = _to_int(getattr(response_usage, "input_tokens", 0)) + output_tokens = _to_int(getattr(response_usage, "output_tokens", 0)) + cache_read_tokens = _to_int(getattr(response_usage, "cache_read_input_tokens", 0)) + cache_write_tokens = _to_int(getattr(response_usage, "cache_creation_input_tokens", 0)) + elif mode == "codex_responses": + input_total = _to_int(getattr(response_usage, "input_tokens", 0)) + output_tokens = _to_int(getattr(response_usage, "output_tokens", 0)) + details = getattr(response_usage, "input_tokens_details", None) + cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0) + cache_write_tokens = _to_int( + getattr(details, "cache_creation_tokens", 0) if details else 0 + ) + input_tokens = max(0, input_total - cache_read_tokens - cache_write_tokens) + else: + prompt_total = _to_int(getattr(response_usage, "prompt_tokens", 0)) + output_tokens = _to_int(getattr(response_usage, "completion_tokens", 0)) + details = getattr(response_usage, "prompt_tokens_details", None) + cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0) + cache_write_tokens = _to_int( + getattr(details, "cache_write_tokens", 0) if details else 0 + ) + input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens) + + reasoning_tokens = 0 + output_details = getattr(response_usage, "output_tokens_details", None) + if output_details: + reasoning_tokens = _to_int(getattr(output_details, "reasoning_tokens", 0)) + + return CanonicalUsage( + input_tokens=input_tokens, + output_tokens=output_tokens, + cache_read_tokens=cache_read_tokens, + cache_write_tokens=cache_write_tokens, + reasoning_tokens=reasoning_tokens, + ) + + +def estimate_usage_cost( + model_name: str, + usage: 
CanonicalUsage, + *, + provider: Optional[str] = None, + base_url: Optional[str] = None, +) -> CostResult: + route = resolve_billing_route(model_name, provider=provider, base_url=base_url) + if route.billing_mode == "subscription_included": + return CostResult( + amount_usd=_ZERO, + status="included", + source="none", + label="included", + pricing_version="included-route", + ) + + entry = get_pricing_entry(model_name, provider=provider, base_url=base_url) + if not entry: + return CostResult(amount_usd=None, status="unknown", source="none", label="n/a") + + notes: list[str] = [] + amount = _ZERO + + if usage.input_tokens and entry.input_cost_per_million is None: + return CostResult(amount_usd=None, status="unknown", source=entry.source, label="n/a") + if usage.output_tokens and entry.output_cost_per_million is None: + return CostResult(amount_usd=None, status="unknown", source=entry.source, label="n/a") + if usage.cache_read_tokens: + if entry.cache_read_cost_per_million is None: + return CostResult( + amount_usd=None, + status="unknown", + source=entry.source, + label="n/a", + notes=("cache-read pricing unavailable for route",), + ) + if usage.cache_write_tokens: + if entry.cache_write_cost_per_million is None: + return CostResult( + amount_usd=None, + status="unknown", + source=entry.source, + label="n/a", + notes=("cache-write pricing unavailable for route",), + ) + + if entry.input_cost_per_million is not None: + amount += Decimal(usage.input_tokens) * entry.input_cost_per_million / _ONE_MILLION + if entry.output_cost_per_million is not None: + amount += Decimal(usage.output_tokens) * entry.output_cost_per_million / _ONE_MILLION + if entry.cache_read_cost_per_million is not None: + amount += Decimal(usage.cache_read_tokens) * entry.cache_read_cost_per_million / _ONE_MILLION + if entry.cache_write_cost_per_million is not None: + amount += Decimal(usage.cache_write_tokens) * entry.cache_write_cost_per_million / _ONE_MILLION + if entry.request_cost is not None and 
usage.request_count: + amount += Decimal(usage.request_count) * entry.request_cost + + status: CostStatus = "estimated" + label = f"~${amount:.2f}" + if entry.source == "none" and amount == _ZERO: + status = "included" + label = "included" + + if route.provider == "openrouter": + notes.append("OpenRouter cost is estimated from the models API until reconciled.") + + return CostResult( + amount_usd=amount, + status=status, + source=entry.source, + label=label, + fetched_at=entry.fetched_at, + pricing_version=entry.pricing_version, + notes=tuple(notes), + ) + + +def has_known_pricing( + model_name: str, + provider: Optional[str] = None, + base_url: Optional[str] = None, +) -> bool: + """Check whether we have pricing data for this model+route. + + Uses direct lookup instead of routing through the full estimation + pipeline — avoids creating dummy usage objects just to check status. + """ + route = resolve_billing_route(model_name, provider=provider, base_url=base_url) + if route.billing_mode == "subscription_included": + return True + entry = get_pricing_entry(model_name, provider=provider, base_url=base_url) + return entry is not None + + +def get_pricing( + model_name: str, + provider: Optional[str] = None, + base_url: Optional[str] = None, +) -> Dict[str, float]: + """Backward-compatible thin wrapper for legacy callers. + + Returns only non-cache input/output fields when a pricing entry exists. + Unknown routes return zeroes. + """ + entry = get_pricing_entry(model_name, provider=provider, base_url=base_url) + if not entry: + return {"input": 0.0, "output": 0.0} + return { + "input": float(entry.input_cost_per_million or _ZERO), + "output": float(entry.output_cost_per_million or _ZERO), + } + + +def estimate_cost_usd( + model: str, + input_tokens: int, + output_tokens: int, + *, + provider: Optional[str] = None, + base_url: Optional[str] = None, +) -> float: + """Backward-compatible helper for legacy callers. + + This uses non-cached input/output only. 
New code should call + `estimate_usage_cost()` with canonical usage buckets. + """ + result = estimate_usage_cost( + model, + CanonicalUsage(input_tokens=input_tokens, output_tokens=output_tokens), + provider=provider, + base_url=base_url, + ) + return float(result.amount_usd or _ZERO) def format_duration_compact(seconds: float) -> str: diff --git a/cli.py b/cli.py index 2b0c4ad82..af194069d 100755 --- a/cli.py +++ b/cli.py @@ -58,7 +58,12 @@ except (ImportError, AttributeError): import threading import queue -from agent.usage_pricing import estimate_cost_usd, format_duration_compact, format_token_count_compact, has_known_pricing +from agent.usage_pricing import ( + CanonicalUsage, + estimate_usage_cost, + format_duration_compact, + format_token_count_compact, +) from hermes_cli.banner import _format_context_length _COMMAND_SPINNER_FRAMES = ("⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏") @@ -212,7 +217,7 @@ def load_cli_config() -> Dict[str, Any]: "resume_display": "full", "show_reasoning": False, "streaming": False, - "show_cost": False, + "skin": "default", "theme_mode": "auto", }, @@ -1034,8 +1039,7 @@ class HermesCLI: self.bell_on_complete = CLI_CONFIG["display"].get("bell_on_complete", False) # show_reasoning: display model thinking/reasoning before the response self.show_reasoning = CLI_CONFIG["display"].get("show_reasoning", False) - # show_cost: display $ cost in the status bar (off by default) - self.show_cost = CLI_CONFIG["display"].get("show_cost", False) + self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose") # streaming: stream tokens to the terminal as they arrive (display.streaming in config.yaml) @@ -1260,12 +1264,14 @@ class HermesCLI: "context_tokens": 0, "context_length": None, "context_percent": None, + "session_input_tokens": 0, + "session_output_tokens": 0, + "session_cache_read_tokens": 0, + "session_cache_write_tokens": 0, "session_prompt_tokens": 0, "session_completion_tokens": 0, 
"session_total_tokens": 0, "session_api_calls": 0, - "session_cost": 0.0, - "pricing_known": has_known_pricing(model_name), "compressions": 0, } @@ -1273,15 +1279,14 @@ class HermesCLI: if not agent: return snapshot + snapshot["session_input_tokens"] = getattr(agent, "session_input_tokens", 0) or 0 + snapshot["session_output_tokens"] = getattr(agent, "session_output_tokens", 0) or 0 + snapshot["session_cache_read_tokens"] = getattr(agent, "session_cache_read_tokens", 0) or 0 + snapshot["session_cache_write_tokens"] = getattr(agent, "session_cache_write_tokens", 0) or 0 snapshot["session_prompt_tokens"] = getattr(agent, "session_prompt_tokens", 0) or 0 snapshot["session_completion_tokens"] = getattr(agent, "session_completion_tokens", 0) or 0 snapshot["session_total_tokens"] = getattr(agent, "session_total_tokens", 0) or 0 snapshot["session_api_calls"] = getattr(agent, "session_api_calls", 0) or 0 - snapshot["session_cost"] = estimate_cost_usd( - model_name, - snapshot["session_prompt_tokens"], - snapshot["session_completion_tokens"], - ) compressor = getattr(agent, "context_compressor", None) if compressor: @@ -1302,19 +1307,11 @@ class HermesCLI: percent = snapshot["context_percent"] percent_label = f"{percent}%" if percent is not None else "--" duration_label = snapshot["duration"] - show_cost = getattr(self, "show_cost", False) - - if show_cost: - cost_label = f"${snapshot['session_cost']:.2f}" if snapshot["pricing_known"] else "cost n/a" - else: - cost_label = None if width < 52: return f"⚕ {snapshot['model_short']} · {duration_label}" if width < 76: parts = [f"⚕ {snapshot['model_short']}", percent_label] - if cost_label: - parts.append(cost_label) parts.append(duration_label) return " · ".join(parts) @@ -1326,8 +1323,6 @@ class HermesCLI: context_label = "ctx --" parts = [f"⚕ {snapshot['model_short']}", context_label, percent_label] - if cost_label: - parts.append(cost_label) parts.append(duration_label) return " │ ".join(parts) except Exception: @@ -1338,12 
+1333,6 @@ class HermesCLI: snapshot = self._get_status_bar_snapshot() width = shutil.get_terminal_size((80, 24)).columns duration_label = snapshot["duration"] - show_cost = getattr(self, "show_cost", False) - - if show_cost: - cost_label = f"${snapshot['session_cost']:.2f}" if snapshot["pricing_known"] else "cost n/a" - else: - cost_label = None if width < 52: return [ @@ -1363,11 +1352,6 @@ class HermesCLI: ("class:status-bar-dim", " · "), (self._status_bar_context_style(percent), percent_label), ] - if cost_label: - frags.extend([ - ("class:status-bar-dim", " · "), - ("class:status-bar-dim", cost_label), - ]) frags.extend([ ("class:status-bar-dim", " · "), ("class:status-bar-dim", duration_label), @@ -1393,11 +1377,6 @@ class HermesCLI: ("class:status-bar-dim", " "), (bar_style, percent_label), ] - if cost_label: - frags.extend([ - ("class:status-bar-dim", " │ "), - ("class:status-bar-dim", cost_label), - ]) frags.extend([ ("class:status-bar-dim", " │ "), ("class:status-bar-dim", duration_label), @@ -4250,6 +4229,10 @@ class HermesCLI: return agent = self.agent + input_tokens = getattr(agent, "session_input_tokens", 0) or 0 + output_tokens = getattr(agent, "session_output_tokens", 0) or 0 + cache_read_tokens = getattr(agent, "session_cache_read_tokens", 0) or 0 + cache_write_tokens = getattr(agent, "session_cache_write_tokens", 0) or 0 prompt = agent.session_prompt_tokens completion = agent.session_completion_tokens total = agent.session_total_tokens @@ -4267,33 +4250,45 @@ class HermesCLI: compressions = compressor.compression_count msg_count = len(self.conversation_history) - cost = estimate_cost_usd(agent.model, prompt, completion) - prompt_cost = estimate_cost_usd(agent.model, prompt, 0) - completion_cost = estimate_cost_usd(agent.model, 0, completion) - pricing_known = has_known_pricing(agent.model) + cost_result = estimate_usage_cost( + agent.model, + CanonicalUsage( + input_tokens=input_tokens, + output_tokens=output_tokens, + 
cache_read_tokens=cache_read_tokens, + cache_write_tokens=cache_write_tokens, + ), + provider=getattr(agent, "provider", None), + base_url=getattr(agent, "base_url", None), + ) elapsed = format_duration_compact((datetime.now() - self.session_start).total_seconds()) print(f" 📊 Session Token Usage") print(f" {'─' * 40}") print(f" Model: {agent.model}") - print(f" Prompt tokens (input): {prompt:>10,}") - print(f" Completion tokens (output): {completion:>9,}") + print(f" Input tokens: {input_tokens:>10,}") + print(f" Cache read tokens: {cache_read_tokens:>10,}") + print(f" Cache write tokens: {cache_write_tokens:>10,}") + print(f" Output tokens: {output_tokens:>10,}") + print(f" Prompt tokens (total): {prompt:>10,}") + print(f" Completion tokens: {completion:>10,}") print(f" Total tokens: {total:>10,}") print(f" API calls: {calls:>10,}") print(f" Session duration: {elapsed:>10}") - if pricing_known: - print(f" Input cost: ${prompt_cost:>10.4f}") - print(f" Output cost: ${completion_cost:>10.4f}") - print(f" Total cost: ${cost:>10.4f}") + print(f" Cost status: {cost_result.status:>10}") + print(f" Cost source: {cost_result.source:>10}") + if cost_result.amount_usd is not None: + prefix = "~" if cost_result.status == "estimated" else "" + print(f" Total cost: {prefix}${float(cost_result.amount_usd):>10.4f}") + elif cost_result.status == "included": + print(f" Total cost: {'included':>10}") else: - print(f" Input cost: {'n/a':>10}") - print(f" Output cost: {'n/a':>10}") print(f" Total cost: {'n/a':>10}") print(f" {'─' * 40}") print(f" Current context: {last_prompt:,} / {ctx_len:,} ({pct:.0f}%)") print(f" Messages: {msg_count}") print(f" Compressions: {compressions}") - if not pricing_known: + if cost_result.status == "unknown": print(f" Note: Pricing unknown for {agent.model}") if self.verbose: diff --git a/docs/plans/2026-03-16-pricing-accuracy-architecture-design.md b/docs/plans/2026-03-16-pricing-accuracy-architecture-design.md new file mode 100644 index 
000000000..a75f14ff5 --- /dev/null +++ b/docs/plans/2026-03-16-pricing-accuracy-architecture-design.md @@ -0,0 +1,608 @@ +# Pricing Accuracy Architecture + +Date: 2026-03-16 + +## Goal + +Hermes should only show dollar costs when they are backed by an official source for the user's actual billing path. + +This design replaces the current static, heuristic pricing flow in: + +- `run_agent.py` +- `agent/usage_pricing.py` +- `agent/insights.py` +- `cli.py` + +with a provider-aware pricing system that: + +- handles cache billing correctly +- distinguishes `actual` vs `estimated` vs `included` vs `unknown` +- reconciles post-hoc costs when providers expose authoritative billing data +- supports direct providers, OpenRouter, subscriptions, enterprise pricing, and custom endpoints + +## Problems In The Current Design + +Current Hermes behavior has four structural issues: + +1. It stores only `prompt_tokens` and `completion_tokens`, which is insufficient for providers that bill cache reads and cache writes separately. +2. It uses a static model price table and fuzzy heuristics, which can drift from current official pricing. +3. It assumes public API list pricing matches the user's real billing path. +4. It has no distinction between live estimates and reconciled billed cost. + +## Design Principles + +1. Normalize usage before pricing. +2. Never fold cached tokens into plain input cost. +3. Track certainty explicitly. +4. Treat the billing path as part of the model identity. +5. Prefer official machine-readable sources over scraped docs. +6. Use post-hoc provider cost APIs when available. +7. Show `n/a` rather than inventing precision. + +## High-Level Architecture + +The new system has four layers: + +1. `usage_normalization` + Converts raw provider usage into a canonical usage record. +2. `pricing_source_resolution` + Determines the billing path, source of truth, and applicable pricing source. +3. 
`cost_estimation_and_reconciliation` + Produces an immediate estimate when possible, then replaces or annotates it with actual billed cost later. +4. `presentation` + `/usage`, `/insights`, and the status bar display cost with certainty metadata. + +## Canonical Usage Record + +Add a canonical usage model that every provider path maps into before any pricing math happens. + +Suggested structure: + +```python +@dataclass +class CanonicalUsage: + provider: str + billing_provider: str + model: str + billing_route: str + + input_tokens: int = 0 + output_tokens: int = 0 + cache_read_tokens: int = 0 + cache_write_tokens: int = 0 + reasoning_tokens: int = 0 + request_count: int = 1 + + raw_usage: dict[str, Any] | None = None + raw_usage_fields: dict[str, str] | None = None + computed_fields: set[str] | None = None + + provider_request_id: str | None = None + provider_generation_id: str | None = None + provider_response_id: str | None = None +``` + +Rules: + +- `input_tokens` means non-cached input only. +- `cache_read_tokens` and `cache_write_tokens` are never merged into `input_tokens`. +- `output_tokens` excludes cache metrics. +- `reasoning_tokens` is telemetry unless a provider officially bills it separately. + +This is the same normalization pattern used by `opencode`, extended with provenance and reconciliation ids. 
+ +## Provider Normalization Rules + +### OpenAI Direct + +Source usage fields: + +- `prompt_tokens` +- `completion_tokens` +- `prompt_tokens_details.cached_tokens` + +Normalization: + +- `cache_read_tokens = cached_tokens` +- `input_tokens = prompt_tokens - cached_tokens` +- `cache_write_tokens = 0` unless OpenAI exposes it in the relevant route +- `output_tokens = completion_tokens` + +### Anthropic Direct + +Source usage fields: + +- `input_tokens` +- `output_tokens` +- `cache_read_input_tokens` +- `cache_creation_input_tokens` + +Normalization: + +- `input_tokens = input_tokens` +- `output_tokens = output_tokens` +- `cache_read_tokens = cache_read_input_tokens` +- `cache_write_tokens = cache_creation_input_tokens` + +### OpenRouter + +Estimate-time usage normalization should use the response usage payload with the same rules as the underlying provider when possible. + +Reconciliation-time records should also store: + +- OpenRouter generation id +- native token fields when available +- `total_cost` +- `cache_discount` +- `upstream_inference_cost` +- `is_byok` + +### Gemini / Vertex + +Use official Gemini or Vertex usage fields where available. + +If cached content tokens are exposed: + +- map them to `cache_read_tokens` + +If a route exposes no cache creation metric: + +- store `cache_write_tokens = 0` +- preserve the raw usage payload for later extension + +### DeepSeek And Other Direct Providers + +Normalize only the fields that are officially exposed. + +If a provider does not expose cache buckets: + +- do not infer them unless the provider explicitly documents how to derive them + +### Subscription / Included-Cost Routes + +These still use the canonical usage model. + +Tokens are tracked normally. Cost depends on billing mode, not on whether usage exists. + +## Billing Route Model + +Hermes must stop keying pricing solely by `model`. 
+ +Introduce a billing route descriptor: + +```python +@dataclass +class BillingRoute: + provider: str + base_url: str | None + model: str + billing_mode: str + organization_hint: str | None = None +``` + +`billing_mode` values: + +- `official_cost_api` +- `official_generation_api` +- `official_models_api` +- `official_docs_snapshot` +- `subscription_included` +- `user_override` +- `custom_contract` +- `unknown` + +Examples: + +- OpenAI direct API with Costs API access: `official_cost_api` +- Anthropic direct API with Usage & Cost API access: `official_cost_api` +- OpenRouter request before reconciliation: `official_models_api` +- OpenRouter request after generation lookup: `official_generation_api` +- GitHub Copilot style subscription route: `subscription_included` +- local OpenAI-compatible server: `unknown` +- enterprise contract with configured rates: `custom_contract` + +## Cost Status Model + +Every displayed cost should have: + +```python +@dataclass +class CostResult: + amount_usd: Decimal | None + status: Literal["actual", "estimated", "included", "unknown"] + source: Literal[ + "provider_cost_api", + "provider_generation_api", + "provider_models_api", + "official_docs_snapshot", + "user_override", + "custom_contract", + "none", + ] + label: str + fetched_at: datetime | None + pricing_version: str | None + notes: list[str] +``` + +Presentation rules: + +- `actual`: show dollar amount as final +- `estimated`: show dollar amount with estimate labeling +- `included`: show `included` or `$0.00 (included)` depending on UX choice +- `unknown`: show `n/a` + +## Official Source Hierarchy + +Resolve cost using this order: + +1. Request-level or account-level official billed cost +2. Official machine-readable model pricing +3. Official docs snapshot +4. User override or custom contract +5. Unknown + +The system must never skip to a lower level if a higher-confidence source exists for the current billing route. 
+ +## Provider-Specific Truth Rules + +### OpenAI Direct + +Preferred truth: + +1. Costs API for reconciled spend +2. Official pricing page for live estimate + +### Anthropic Direct + +Preferred truth: + +1. Usage & Cost API for reconciled spend +2. Official pricing docs for live estimate + +### OpenRouter + +Preferred truth: + +1. `GET /api/v1/generation` for reconciled `total_cost` +2. `GET /api/v1/models` pricing for live estimate + +Do not use underlying provider public pricing as the source of truth for OpenRouter billing. + +### Gemini / Vertex + +Preferred truth: + +1. official billing export or billing API for reconciled spend when available for the route +2. official pricing docs for estimate + +### DeepSeek + +Preferred truth: + +1. official machine-readable cost source if available in the future +2. official pricing docs snapshot today + +### Subscription-Included Routes + +Preferred truth: + +1. explicit route config marking the model as included in subscription + +These should display `included`, not an API list-price estimate. + +### Custom Endpoint / Local Model + +Preferred truth: + +1. user override +2. custom contract config +3. unknown + +These should default to `unknown`. + +## Pricing Catalog + +Replace the current `MODEL_PRICING` dict with a richer pricing catalog. 
+ +Suggested record: + +```python +@dataclass +class PricingEntry: + provider: str + route_pattern: str + model_pattern: str + + input_cost_per_million: Decimal | None = None + output_cost_per_million: Decimal | None = None + cache_read_cost_per_million: Decimal | None = None + cache_write_cost_per_million: Decimal | None = None + request_cost: Decimal | None = None + image_cost: Decimal | None = None + + source: str = "official_docs_snapshot" + source_url: str | None = None + fetched_at: datetime | None = None + pricing_version: str | None = None +``` + +The catalog should be route-aware: + +- `openai:gpt-5` +- `anthropic:claude-opus-4-6` +- `openrouter:anthropic/claude-opus-4.6` +- `copilot:gpt-4o` + +This avoids conflating direct-provider billing with aggregator billing. + +## Pricing Sync Architecture + +Introduce a pricing sync subsystem instead of manually maintaining a single hardcoded table. + +Suggested modules: + +- `agent/pricing/catalog.py` +- `agent/pricing/sources.py` +- `agent/pricing/sync.py` +- `agent/pricing/reconcile.py` +- `agent/pricing/types.py` + +### Sync Sources + +- OpenRouter models API +- official provider docs snapshots where no API exists +- user overrides from config + +### Sync Output + +Cache pricing entries locally with: + +- source URL +- fetch timestamp +- version/hash +- confidence/source type + +### Sync Frequency + +- startup warm cache +- background refresh every 6 to 24 hours depending on source +- manual `hermes pricing sync` + +## Reconciliation Architecture + +Live requests may produce only an estimate initially. Hermes should reconcile them later when a provider exposes actual billed cost. + +Suggested flow: + +1. Agent call completes. +2. Hermes stores canonical usage plus reconciliation ids. +3. Hermes computes an immediate estimate if a pricing source exists. +4. A reconciliation worker fetches actual cost when supported. +5. Session and message records are updated with `actual` cost. 
+ +This can run: + +- inline for cheap lookups +- asynchronously for delayed provider accounting + +## Persistence Changes + +Session storage should stop storing only aggregate prompt/completion totals. + +Add fields for both usage and cost certainty: + +- `input_tokens` +- `output_tokens` +- `cache_read_tokens` +- `cache_write_tokens` +- `reasoning_tokens` +- `estimated_cost_usd` +- `actual_cost_usd` +- `cost_status` +- `cost_source` +- `pricing_version` +- `billing_provider` +- `billing_mode` + +If schema expansion is too large for one PR, add a new pricing events table: + +```text +session_cost_events + id + session_id + request_id + provider + model + billing_mode + input_tokens + output_tokens + cache_read_tokens + cache_write_tokens + estimated_cost_usd + actual_cost_usd + cost_status + cost_source + pricing_version + created_at + updated_at +``` + +## Hermes Touchpoints + +### `run_agent.py` + +Current responsibility: + +- parse raw provider usage +- update session token counters + +New responsibility: + +- build `CanonicalUsage` +- update canonical counters +- store reconciliation ids +- emit usage event to pricing subsystem + +### `agent/usage_pricing.py` + +Current responsibility: + +- static lookup table +- direct cost arithmetic + +New responsibility: + +- move or replace with pricing catalog facade +- no fuzzy model-family heuristics +- no direct pricing without billing-route context + +### `cli.py` + +Current responsibility: + +- compute session cost directly from prompt/completion totals + +New responsibility: + +- display `CostResult` +- show status badges: + - `actual` + - `estimated` + - `included` + - `n/a` + +### `agent/insights.py` + +Current responsibility: + +- recompute historical estimates from static pricing + +New responsibility: + +- aggregate stored pricing events +- prefer actual cost over estimate +- surface estimates only when reconciliation is unavailable + +## UX Rules + +### Status Bar + +Show one of: + +- `$1.42` +- `~$1.42` +- 
`included` +- `cost n/a` + +Where: + +- `$1.42` means `actual` +- `~$1.42` means `estimated` +- `included` means subscription-backed or explicitly zero-cost route +- `cost n/a` means unknown + +### `/usage` + +Show: + +- token buckets +- estimated cost +- actual cost if available +- cost status +- pricing source + +### `/insights` + +Aggregate: + +- actual cost totals +- estimated-only totals +- unknown-cost sessions count +- included-cost sessions count + +## Config And Overrides + +Add user-configurable pricing overrides in config: + +```yaml +pricing: + mode: hybrid + sync_on_startup: true + sync_interval_hours: 12 + overrides: + - provider: openrouter + model: anthropic/claude-opus-4.6 + billing_mode: custom_contract + input_cost_per_million: 4.25 + output_cost_per_million: 22.0 + cache_read_cost_per_million: 0.5 + cache_write_cost_per_million: 6.0 + included_routes: + - provider: copilot + model: "*" + - provider: codex-subscription + model: "*" +``` + +Overrides must win over catalog defaults for the matching billing route. 
+ +## Rollout Plan + +### Phase 1 + +- add canonical usage model +- split cache token buckets in `run_agent.py` +- stop pricing cache-inflated prompt totals +- preserve current UI with improved backend math + +### Phase 2 + +- add route-aware pricing catalog +- integrate OpenRouter models API sync +- add `estimated` vs `included` vs `unknown` + +### Phase 3 + +- add reconciliation for OpenRouter generation cost +- add actual cost persistence +- update `/insights` to prefer actual cost + +### Phase 4 + +- add direct OpenAI and Anthropic reconciliation paths +- add user overrides and contract pricing +- add pricing sync CLI command + +## Testing Strategy + +Add tests for: + +- OpenAI cached token subtraction +- Anthropic cache read/write separation +- OpenRouter estimated vs actual reconciliation +- subscription-backed models showing `included` +- custom endpoints showing `n/a` +- override precedence +- stale catalog fallback behavior + +Current tests that assume heuristic pricing should be replaced with route-aware expectations. + +## Non-Goals + +- exact enterprise billing reconstruction without an official source or user override +- backfilling perfect historical cost for old sessions that lack cache bucket data +- scraping arbitrary provider web pages at request time + +## Recommendation + +Do not expand the existing `MODEL_PRICING` dict. + +That path cannot satisfy the product requirement. Hermes should instead migrate to: + +- canonical usage normalization +- route-aware pricing sources +- estimate-then-reconcile cost lifecycle +- explicit certainty states in the UI + +This is the minimum architecture that makes the statement "Hermes pricing is backed by official sources where possible, and otherwise clearly labeled" defensible. 
diff --git a/gateway/run.py b/gateway/run.py index 3791dd6b3..c1fc7a627 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2089,8 +2089,15 @@ class GatewayRunner: session_entry.session_key, input_tokens=agent_result.get("input_tokens", 0), output_tokens=agent_result.get("output_tokens", 0), + cache_read_tokens=agent_result.get("cache_read_tokens", 0), + cache_write_tokens=agent_result.get("cache_write_tokens", 0), last_prompt_tokens=agent_result.get("last_prompt_tokens", 0), model=agent_result.get("model"), + estimated_cost_usd=agent_result.get("estimated_cost_usd"), + cost_status=agent_result.get("cost_status"), + cost_source=agent_result.get("cost_source"), + provider=agent_result.get("provider"), + base_url=agent_result.get("base_url"), ) # Auto voice reply: send TTS audio before the text response diff --git a/gateway/session.py b/gateway/session.py index d0bf0cfe4..e58b6d689 100644 --- a/gateway/session.py +++ b/gateway/session.py @@ -343,7 +343,11 @@ class SessionEntry: # Token tracking input_tokens: int = 0 output_tokens: int = 0 + cache_read_tokens: int = 0 + cache_write_tokens: int = 0 total_tokens: int = 0 + estimated_cost_usd: float = 0.0 + cost_status: str = "unknown" # Last API-reported prompt tokens (for accurate compression pre-check) last_prompt_tokens: int = 0 @@ -363,8 +367,12 @@ class SessionEntry: "chat_type": self.chat_type, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, + "cache_read_tokens": self.cache_read_tokens, + "cache_write_tokens": self.cache_write_tokens, "total_tokens": self.total_tokens, "last_prompt_tokens": self.last_prompt_tokens, + "estimated_cost_usd": self.estimated_cost_usd, + "cost_status": self.cost_status, } if self.origin: result["origin"] = self.origin.to_dict() @@ -394,8 +402,12 @@ class SessionEntry: chat_type=data.get("chat_type", "dm"), input_tokens=data.get("input_tokens", 0), output_tokens=data.get("output_tokens", 0), + cache_read_tokens=data.get("cache_read_tokens", 0), + 
cache_write_tokens=data.get("cache_write_tokens", 0), total_tokens=data.get("total_tokens", 0), last_prompt_tokens=data.get("last_prompt_tokens", 0), + estimated_cost_usd=data.get("estimated_cost_usd", 0.0), + cost_status=data.get("cost_status", "unknown"), ) @@ -696,8 +708,15 @@ class SessionStore: session_key: str, input_tokens: int = 0, output_tokens: int = 0, + cache_read_tokens: int = 0, + cache_write_tokens: int = 0, last_prompt_tokens: int = None, model: str = None, + estimated_cost_usd: Optional[float] = None, + cost_status: Optional[str] = None, + cost_source: Optional[str] = None, + provider: Optional[str] = None, + base_url: Optional[str] = None, ) -> None: """Update a session's metadata after an interaction.""" self._ensure_loaded() @@ -707,15 +726,35 @@ class SessionStore: entry.updated_at = datetime.now() entry.input_tokens += input_tokens entry.output_tokens += output_tokens + entry.cache_read_tokens += cache_read_tokens + entry.cache_write_tokens += cache_write_tokens if last_prompt_tokens is not None: entry.last_prompt_tokens = last_prompt_tokens - entry.total_tokens = entry.input_tokens + entry.output_tokens + if estimated_cost_usd is not None: + entry.estimated_cost_usd += estimated_cost_usd + if cost_status: + entry.cost_status = cost_status + entry.total_tokens = ( + entry.input_tokens + + entry.output_tokens + + entry.cache_read_tokens + + entry.cache_write_tokens + ) self._save() if self._db: try: self._db.update_token_counts( - entry.session_id, input_tokens, output_tokens, + entry.session_id, + input_tokens=input_tokens, + output_tokens=output_tokens, + cache_read_tokens=cache_read_tokens, + cache_write_tokens=cache_write_tokens, + estimated_cost_usd=estimated_cost_usd, + cost_status=cost_status, + cost_source=cost_source, + billing_provider=provider, + billing_base_url=base_url, model=model, ) except Exception as e: diff --git a/hermes_state.py b/hermes_state.py index d0237a5bb..e990cbbc8 100644 --- a/hermes_state.py +++ b/hermes_state.py 
@@ -26,7 +26,7 @@ from typing import Dict, Any, List, Optional DEFAULT_DB_PATH = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "state.db" -SCHEMA_VERSION = 4 +SCHEMA_VERSION = 5 SCHEMA_SQL = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -48,6 +48,17 @@ CREATE TABLE IF NOT EXISTS sessions ( tool_call_count INTEGER DEFAULT 0, input_tokens INTEGER DEFAULT 0, output_tokens INTEGER DEFAULT 0, + cache_read_tokens INTEGER DEFAULT 0, + cache_write_tokens INTEGER DEFAULT 0, + reasoning_tokens INTEGER DEFAULT 0, + billing_provider TEXT, + billing_base_url TEXT, + billing_mode TEXT, + estimated_cost_usd REAL, + actual_cost_usd REAL, + cost_status TEXT, + cost_source TEXT, + pricing_version TEXT, title TEXT, FOREIGN KEY (parent_session_id) REFERENCES sessions(id) ); @@ -154,6 +165,26 @@ class SessionDB: except sqlite3.OperationalError: pass # Index already exists cursor.execute("UPDATE schema_version SET version = 4") + if current_version < 5: + new_columns = [ + ("cache_read_tokens", "INTEGER DEFAULT 0"), + ("cache_write_tokens", "INTEGER DEFAULT 0"), + ("reasoning_tokens", "INTEGER DEFAULT 0"), + ("billing_provider", "TEXT"), + ("billing_base_url", "TEXT"), + ("billing_mode", "TEXT"), + ("estimated_cost_usd", "REAL"), + ("actual_cost_usd", "REAL"), + ("cost_status", "TEXT"), + ("cost_source", "TEXT"), + ("pricing_version", "TEXT"), + ] + for name, column_type in new_columns: + try: + cursor.execute(f"ALTER TABLE sessions ADD COLUMN {name} {column_type}") + except sqlite3.OperationalError: + pass + cursor.execute("UPDATE schema_version SET version = 5") # Unique title index — always ensure it exists (safe to run after migrations # since the title column is guaranteed to exist at this point) @@ -233,8 +264,22 @@ class SessionDB: self._conn.commit() def update_token_counts( - self, session_id: str, input_tokens: int = 0, output_tokens: int = 0, + self, + session_id: str, + input_tokens: int = 0, + output_tokens: int = 0, model: str = None, + cache_read_tokens: 
int = 0, + cache_write_tokens: int = 0, + reasoning_tokens: int = 0, + estimated_cost_usd: Optional[float] = None, + actual_cost_usd: Optional[float] = None, + cost_status: Optional[str] = None, + cost_source: Optional[str] = None, + pricing_version: Optional[str] = None, + billing_provider: Optional[str] = None, + billing_base_url: Optional[str] = None, + billing_mode: Optional[str] = None, ) -> None: """Increment token counters and backfill model if not already set.""" with self._lock: @@ -242,9 +287,40 @@ class SessionDB: """UPDATE sessions SET input_tokens = input_tokens + ?, output_tokens = output_tokens + ?, + cache_read_tokens = cache_read_tokens + ?, + cache_write_tokens = cache_write_tokens + ?, + reasoning_tokens = reasoning_tokens + ?, + estimated_cost_usd = COALESCE(estimated_cost_usd, 0) + COALESCE(?, 0), + actual_cost_usd = CASE + WHEN ? IS NULL THEN actual_cost_usd + ELSE COALESCE(actual_cost_usd, 0) + ? + END, + cost_status = COALESCE(?, cost_status), + cost_source = COALESCE(?, cost_source), + pricing_version = COALESCE(?, pricing_version), + billing_provider = COALESCE(billing_provider, ?), + billing_base_url = COALESCE(billing_base_url, ?), + billing_mode = COALESCE(billing_mode, ?), model = COALESCE(model, ?) 
WHERE id = ?""", - (input_tokens, output_tokens, model, session_id), + ( + input_tokens, + output_tokens, + cache_read_tokens, + cache_write_tokens, + reasoning_tokens, + estimated_cost_usd, + actual_cost_usd, + actual_cost_usd, + cost_status, + cost_source, + pricing_version, + billing_provider, + billing_base_url, + billing_mode, + model, + session_id, + ), ) self._conn.commit() diff --git a/run_agent.py b/run_agent.py index e8bf35c47..b17c95eb2 100644 --- a/run_agent.py +++ b/run_agent.py @@ -86,6 +86,7 @@ from agent.model_metadata import ( from agent.context_compressor import ContextCompressor from agent.prompt_caching import apply_anthropic_cache_control from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt +from agent.usage_pricing import estimate_usage_cost, normalize_usage from agent.display import ( KawaiiSpinner, build_tool_preview as _build_tool_preview, get_cute_tool_message as _get_cute_tool_message_impl, @@ -391,6 +392,15 @@ class AIAgent: else: self.api_mode = "chat_completions" + # Pre-warm OpenRouter model metadata cache in a background thread. + # fetch_model_metadata() is cached for 1 hour; this avoids a blocking + # HTTP request on the first API response when pricing is estimated. 
+ if self.provider == "openrouter" or "openrouter" in self.base_url.lower(): + threading.Thread( + target=lambda: fetch_model_metadata(), + daemon=True, + ).start() + self.tool_progress_callback = tool_progress_callback self.thinking_callback = thinking_callback self.reasoning_callback = reasoning_callback @@ -850,6 +860,14 @@ class AIAgent: self.session_completion_tokens = 0 self.session_total_tokens = 0 self.session_api_calls = 0 + self.session_input_tokens = 0 + self.session_output_tokens = 0 + self.session_cache_read_tokens = 0 + self.session_cache_write_tokens = 0 + self.session_reasoning_tokens = 0 + self.session_estimated_cost_usd = 0.0 + self.session_cost_status = "unknown" + self.session_cost_source = "none" if not self.quiet_mode: if compression_enabled: @@ -5272,26 +5290,14 @@ class AIAgent: # Track actual token usage from response for context management if hasattr(response, 'usage') and response.usage: - if self.api_mode in ("codex_responses", "anthropic_messages"): - prompt_tokens = getattr(response.usage, 'input_tokens', 0) or 0 - if self.api_mode == "anthropic_messages": - # Anthropic splits input into cache_read + cache_creation - # + non-cached input_tokens. Without adding the cached - # portions, the context bar shows only the tiny non-cached - # portion (e.g. 3 tokens) instead of the real total (~18K). - # Other providers (OpenAI/Codex) already include cached - # tokens in their input_tokens/prompt_tokens field. 
- prompt_tokens += getattr(response.usage, 'cache_read_input_tokens', 0) or 0 - prompt_tokens += getattr(response.usage, 'cache_creation_input_tokens', 0) or 0 - completion_tokens = getattr(response.usage, 'output_tokens', 0) or 0 - total_tokens = ( - getattr(response.usage, 'total_tokens', None) - or (prompt_tokens + completion_tokens) - ) - else: - prompt_tokens = getattr(response.usage, 'prompt_tokens', 0) or 0 - completion_tokens = getattr(response.usage, 'completion_tokens', 0) or 0 - total_tokens = getattr(response.usage, 'total_tokens', 0) or 0 + canonical_usage = normalize_usage( + response.usage, + provider=self.provider, + api_mode=self.api_mode, + ) + prompt_tokens = canonical_usage.prompt_tokens + completion_tokens = canonical_usage.output_tokens + total_tokens = canonical_usage.total_tokens usage_dict = { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, @@ -5310,6 +5316,22 @@ class AIAgent: self.session_completion_tokens += completion_tokens self.session_total_tokens += total_tokens self.session_api_calls += 1 + self.session_input_tokens += canonical_usage.input_tokens + self.session_output_tokens += canonical_usage.output_tokens + self.session_cache_read_tokens += canonical_usage.cache_read_tokens + self.session_cache_write_tokens += canonical_usage.cache_write_tokens + self.session_reasoning_tokens += canonical_usage.reasoning_tokens + + cost_result = estimate_usage_cost( + self.model, + canonical_usage, + provider=self.provider, + base_url=self.base_url, + ) + if cost_result.amount_usd is not None: + self.session_estimated_cost_usd += float(cost_result.amount_usd) + self.session_cost_status = cost_result.status + self.session_cost_source = cost_result.source # Persist token counts to session DB for /insights. 
# Gateway sessions persist via session_store.update_session() @@ -5320,8 +5342,19 @@ class AIAgent: try: self._session_db.update_token_counts( self.session_id, - input_tokens=prompt_tokens, - output_tokens=completion_tokens, + input_tokens=canonical_usage.input_tokens, + output_tokens=canonical_usage.output_tokens, + cache_read_tokens=canonical_usage.cache_read_tokens, + cache_write_tokens=canonical_usage.cache_write_tokens, + reasoning_tokens=canonical_usage.reasoning_tokens, + estimated_cost_usd=float(cost_result.amount_usd) + if cost_result.amount_usd is not None else None, + cost_status=cost_result.status, + cost_source=cost_result.source, + billing_provider=self.provider, + billing_base_url=self.base_url, + billing_mode="subscription_included" + if cost_result.status == "included" else None, model=self.model, ) except Exception: @@ -6242,6 +6275,21 @@ class AIAgent: "partial": False, # True only when stopped due to invalid tool calls "interrupted": interrupted, "response_previewed": getattr(self, "_response_was_previewed", False), + "model": self.model, + "provider": self.provider, + "base_url": self.base_url, + "input_tokens": self.session_input_tokens, + "output_tokens": self.session_output_tokens, + "cache_read_tokens": self.session_cache_read_tokens, + "cache_write_tokens": self.session_cache_write_tokens, + "reasoning_tokens": self.session_reasoning_tokens, + "prompt_tokens": self.session_prompt_tokens, + "completion_tokens": self.session_completion_tokens, + "total_tokens": self.session_total_tokens, + "last_prompt_tokens": getattr(self.context_compressor, "last_prompt_tokens", 0) or 0, + "estimated_cost_usd": self.session_estimated_cost_usd, + "cost_status": self.session_cost_status, + "cost_source": self.session_cost_source, } self._response_was_previewed = False diff --git a/tests/agent/test_usage_pricing.py b/tests/agent/test_usage_pricing.py new file mode 100644 index 000000000..6d972dfa7 --- /dev/null +++ b/tests/agent/test_usage_pricing.py @@ -0,0 
+1,101 @@ +from types import SimpleNamespace + +from agent.usage_pricing import ( + CanonicalUsage, + estimate_usage_cost, + get_pricing_entry, + normalize_usage, +) + + +def test_normalize_usage_anthropic_keeps_cache_buckets_separate(): + usage = SimpleNamespace( + input_tokens=1000, + output_tokens=500, + cache_read_input_tokens=2000, + cache_creation_input_tokens=400, + ) + + normalized = normalize_usage(usage, provider="anthropic", api_mode="anthropic_messages") + + assert normalized.input_tokens == 1000 + assert normalized.output_tokens == 500 + assert normalized.cache_read_tokens == 2000 + assert normalized.cache_write_tokens == 400 + assert normalized.prompt_tokens == 3400 + + +def test_normalize_usage_openai_subtracts_cached_prompt_tokens(): + usage = SimpleNamespace( + prompt_tokens=3000, + completion_tokens=700, + prompt_tokens_details=SimpleNamespace(cached_tokens=1800), + ) + + normalized = normalize_usage(usage, provider="openai", api_mode="chat_completions") + + assert normalized.input_tokens == 1200 + assert normalized.cache_read_tokens == 1800 + assert normalized.output_tokens == 700 + + +def test_openrouter_models_api_pricing_is_converted_from_per_token_to_per_million(monkeypatch): + monkeypatch.setattr( + "agent.usage_pricing.fetch_model_metadata", + lambda: { + "anthropic/claude-opus-4.6": { + "pricing": { + "prompt": "0.000005", + "completion": "0.000025", + "input_cache_read": "0.0000005", + "input_cache_write": "0.00000625", + } + } + }, + ) + + entry = get_pricing_entry( + "anthropic/claude-opus-4.6", + provider="openrouter", + base_url="https://openrouter.ai/api/v1", + ) + + assert float(entry.input_cost_per_million) == 5.0 + assert float(entry.output_cost_per_million) == 25.0 + assert float(entry.cache_read_cost_per_million) == 0.5 + assert float(entry.cache_write_cost_per_million) == 6.25 + + +def test_estimate_usage_cost_marks_subscription_routes_included(): + result = estimate_usage_cost( + "gpt-5.3-codex", + 
CanonicalUsage(input_tokens=1000, output_tokens=500), + provider="openai-codex", + base_url="https://chatgpt.com/backend-api/codex", + ) + + assert result.status == "included" + assert float(result.amount_usd) == 0.0 + + +def test_estimate_usage_cost_refuses_cache_pricing_without_official_cache_rate(monkeypatch): + monkeypatch.setattr( + "agent.usage_pricing.fetch_model_metadata", + lambda: { + "google/gemini-2.5-pro": { + "pricing": { + "prompt": "0.00000125", + "completion": "0.00001", + } + } + }, + ) + + result = estimate_usage_cost( + "google/gemini-2.5-pro", + CanonicalUsage(input_tokens=1000, output_tokens=500, cache_read_tokens=100), + provider="openrouter", + base_url="https://openrouter.ai/api/v1", + ) + + assert result.status == "unknown" diff --git a/tests/gateway/test_session.py b/tests/gateway/test_session.py index e29a9583d..afe436870 100644 --- a/tests/gateway/test_session.py +++ b/tests/gateway/test_session.py @@ -703,5 +703,15 @@ class TestLastPromptTokens: store.update_session("k1", model="openai/gpt-5.4") store._db.update_token_counts.assert_called_once_with( - "s1", 0, 0, model="openai/gpt-5.4" + "s1", + input_tokens=0, + output_tokens=0, + cache_read_tokens=0, + cache_write_tokens=0, + estimated_cost_usd=None, + cost_status=None, + cost_source=None, + billing_provider=None, + billing_base_url=None, + model="openai/gpt-5.4", ) diff --git a/tests/gateway/test_status_command.py b/tests/gateway/test_status_command.py index 1c22543f7..1378ff1cb 100644 --- a/tests/gateway/test_status_command.py +++ b/tests/gateway/test_status_command.py @@ -128,6 +128,13 @@ async def test_handle_message_persists_agent_token_counts(monkeypatch): session_entry.session_key, input_tokens=120, output_tokens=45, + cache_read_tokens=0, + cache_write_tokens=0, last_prompt_tokens=80, model="openai/test-model", + estimated_cost_usd=None, + cost_status=None, + cost_source=None, + provider=None, + base_url=None, ) diff --git a/tests/test_cli_status_bar.py 
b/tests/test_cli_status_bar.py index 4e281ffa8..c1dd4b35b 100644 --- a/tests/test_cli_status_bar.py +++ b/tests/test_cli_status_bar.py @@ -16,6 +16,10 @@ def _make_cli(model: str = "anthropic/claude-sonnet-4-20250514"): def _attach_agent( cli_obj, *, + input_tokens: int | None = None, + output_tokens: int | None = None, + cache_read_tokens: int = 0, + cache_write_tokens: int = 0, prompt_tokens: int, completion_tokens: int, total_tokens: int, @@ -26,6 +30,12 @@ def _attach_agent( ): cli_obj.agent = SimpleNamespace( model=cli_obj.model, + provider="anthropic" if cli_obj.model.startswith("anthropic/") else None, + base_url="", + session_input_tokens=input_tokens if input_tokens is not None else prompt_tokens, + session_output_tokens=output_tokens if output_tokens is not None else completion_tokens, + session_cache_read_tokens=cache_read_tokens, + session_cache_write_tokens=cache_write_tokens, session_prompt_tokens=prompt_tokens, session_completion_tokens=completion_tokens, session_total_tokens=total_tokens, @@ -68,20 +78,19 @@ class TestCLIStatusBar: assert "$0.06" not in text # cost hidden by default assert "15m" in text - def test_build_status_bar_text_shows_cost_when_enabled(self): + def test_build_status_bar_text_no_cost_in_status_bar(self): cli_obj = _attach_agent( _make_cli(), prompt_tokens=10000, - completion_tokens=2400, - total_tokens=12400, + completion_tokens=5000, + total_tokens=15000, api_calls=7, - context_tokens=12400, + context_tokens=50000, context_length=200_000, ) - cli_obj.show_cost = True text = cli_obj._build_status_bar_text(width=120) - assert "$" in text # cost is shown when enabled + assert "$" not in text # cost is never shown in status bar def test_build_status_bar_text_collapses_for_narrow_terminal(self): cli_obj = _attach_agent( @@ -128,8 +137,8 @@ class TestCLIUsageReport: output = capsys.readouterr().out assert "Model:" in output - assert "Input cost:" in output - assert "Output cost:" in output + assert "Cost status:" in output + assert 
"Cost source:" in output assert "Total cost:" in output assert "$" in output assert "0.064" in output diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index d77247936..01d9c37ca 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -657,7 +657,7 @@ class TestSchemaInit: def test_schema_version(self, db): cursor = db._conn.execute("SELECT version FROM schema_version") version = cursor.fetchone()[0] - assert version == 4 + assert version == 5 def test_title_column_exists(self, db): """Verify the title column was created in the sessions table.""" @@ -713,12 +713,12 @@ class TestSchemaInit: conn.commit() conn.close() - # Open with SessionDB — should migrate to v4 + # Open with SessionDB — should migrate to v5 migrated_db = SessionDB(db_path=db_path) # Verify migration cursor = migrated_db._conn.execute("SELECT version FROM schema_version") - assert cursor.fetchone()[0] == 4 + assert cursor.fetchone()[0] == 5 # Verify title column exists and is NULL for existing sessions session = migrated_db.get_session("existing") diff --git a/tests/test_insights.py b/tests/test_insights.py index 6f6280a1d..af4f59829 100644 --- a/tests/test_insights.py +++ b/tests/test_insights.py @@ -123,28 +123,16 @@ def populated_db(db): # ========================================================================= class TestPricing: - def test_exact_match(self): - pricing = _get_pricing("gpt-4o") - assert pricing["input"] == 2.50 - assert pricing["output"] == 10.00 - def test_provider_prefix_stripped(self): pricing = _get_pricing("anthropic/claude-sonnet-4-20250514") assert pricing["input"] == 3.00 assert pricing["output"] == 15.00 - def test_prefix_match(self): - pricing = _get_pricing("claude-3-5-sonnet-20241022") - assert pricing["input"] == 3.00 - - def test_keyword_heuristic_opus(self): + def test_unknown_models_do_not_use_heuristics(self): pricing = _get_pricing("some-new-opus-model") - assert pricing["input"] == 15.00 - assert pricing["output"] == 
75.00 - - def test_keyword_heuristic_haiku(self): + assert pricing == _DEFAULT_PRICING pricing = _get_pricing("anthropic/claude-haiku-future") - assert pricing["input"] == 0.80 + assert pricing == _DEFAULT_PRICING def test_unknown_model_returns_zero_cost(self): """Unknown/custom models should NOT have fabricated costs.""" @@ -168,40 +156,12 @@ class TestPricing: pricing = _get_pricing("") assert pricing == _DEFAULT_PRICING - def test_deepseek_heuristic(self): - pricing = _get_pricing("deepseek-v3") - assert pricing["input"] == 0.14 - - def test_gemini_heuristic(self): - pricing = _get_pricing("gemini-3.0-ultra") - assert pricing["input"] == 0.15 - - def test_dated_model_gpt4o_mini(self): - """gpt-4o-mini-2024-07-18 should match gpt-4o-mini, NOT gpt-4o.""" - pricing = _get_pricing("gpt-4o-mini-2024-07-18") - assert pricing["input"] == 0.15 # gpt-4o-mini price, not gpt-4o's 2.50 - - def test_dated_model_o3_mini(self): - """o3-mini-2025-01-31 should match o3-mini, NOT o3.""" - pricing = _get_pricing("o3-mini-2025-01-31") - assert pricing["input"] == 1.10 # o3-mini price, not o3's 10.00 - - def test_dated_model_gpt41_mini(self): - """gpt-4.1-mini-2025-04-14 should match gpt-4.1-mini, NOT gpt-4.1.""" - pricing = _get_pricing("gpt-4.1-mini-2025-04-14") - assert pricing["input"] == 0.40 # gpt-4.1-mini, not gpt-4.1's 2.00 - - def test_dated_model_gpt41_nano(self): - """gpt-4.1-nano-2025-04-14 should match gpt-4.1-nano, NOT gpt-4.1.""" - pricing = _get_pricing("gpt-4.1-nano-2025-04-14") - assert pricing["input"] == 0.10 # gpt-4.1-nano, not gpt-4.1's 2.00 - class TestHasKnownPricing: def test_known_commercial_model(self): - assert _has_known_pricing("gpt-4o") is True + assert _has_known_pricing("gpt-4o", provider="openai") is True assert _has_known_pricing("anthropic/claude-sonnet-4-20250514") is True - assert _has_known_pricing("deepseek-chat") is True + assert _has_known_pricing("gpt-4.1", provider="openai") is True def test_unknown_custom_model(self): assert 
_has_known_pricing("FP16_Hermes_4.5") is False @@ -210,26 +170,39 @@ class TestHasKnownPricing: assert _has_known_pricing("") is False assert _has_known_pricing(None) is False - def test_heuristic_matched_models(self): - """Models matched by keyword heuristics should be considered known.""" - assert _has_known_pricing("some-opus-model") is True - assert _has_known_pricing("future-sonnet-v2") is True + def test_heuristic_matched_models_are_not_considered_known(self): + assert _has_known_pricing("some-opus-model") is False + assert _has_known_pricing("future-sonnet-v2") is False class TestEstimateCost: def test_basic_cost(self): - # gpt-4o: 2.50/M input, 10.00/M output - cost = _estimate_cost("gpt-4o", 1_000_000, 1_000_000) - assert cost == pytest.approx(12.50, abs=0.01) + cost, status = _estimate_cost( + "anthropic/claude-sonnet-4-20250514", + 1_000_000, + 1_000_000, + provider="anthropic", + ) + assert status == "estimated" + assert cost == pytest.approx(18.0, abs=0.01) def test_zero_tokens(self): - cost = _estimate_cost("gpt-4o", 0, 0) + cost, status = _estimate_cost("gpt-4o", 0, 0, provider="openai") + assert status == "estimated" assert cost == 0.0 - def test_small_usage(self): - cost = _estimate_cost("gpt-4o", 1000, 500) - # 1000 * 2.50/1M + 500 * 10.00/1M = 0.0025 + 0.005 = 0.0075 - assert cost == pytest.approx(0.0075, abs=0.0001) + def test_cache_aware_usage(self): + cost, status = _estimate_cost( + "anthropic/claude-sonnet-4-20250514", + 1000, + 500, + cache_read_tokens=2000, + cache_write_tokens=400, + provider="anthropic", + ) + assert status == "estimated" + expected = (1000 * 3.0 + 500 * 15.0 + 2000 * 0.30 + 400 * 3.75) / 1_000_000 + assert cost == pytest.approx(expected, abs=0.0001) # ========================================================================= @@ -660,8 +633,13 @@ class TestEdgeCases: def test_mixed_commercial_and_custom_models(self, db): """Mix of commercial and custom models: only commercial ones get costs.""" - 
db.create_session(session_id="s1", source="cli", model="gpt-4o")
-        db.update_token_counts("s1", input_tokens=10000, output_tokens=5000)
+        db.create_session(session_id="s1", source="cli", model="anthropic/claude-sonnet-4-20250514")
+        db.update_token_counts(
+            "s1",
+            input_tokens=10000,
+            output_tokens=5000,
+            billing_provider="anthropic",
+        )
         db.create_session(session_id="s2", source="cli", model="my-local-llama")
         db.update_token_counts("s2", input_tokens=10000, output_tokens=5000)
         db._conn.commit()
@@ -672,13 +650,13 @@
-        # Cost should only come from gpt-4o, not from the custom model
+        # Cost should only come from the Claude model, not from the custom model
         overview = report["overview"]
         assert overview["estimated_cost"] > 0
-        assert "gpt-4o" in overview["models_with_pricing"]  # list now, not set
+        assert "claude-sonnet-4-20250514" in overview["models_with_pricing"]  # list now, not set
         assert "my-local-llama" in overview["models_without_pricing"]
 
         # Verify individual model entries
-        gpt = next(m for m in report["models"] if m["model"] == "gpt-4o")
-        assert gpt["has_pricing"] is True
-        assert gpt["cost"] > 0
+        claude = next(m for m in report["models"] if m["model"] == "claude-sonnet-4-20250514")
+        assert claude["has_pricing"] is True
+        assert claude["cost"] > 0
 
         llama = next(m for m in report["models"] if m["model"] == "my-local-llama")
         assert llama["has_pricing"] is False