feat: add route-aware pricing estimates (#1695)

Salvaged from PR #1563 by @kshitijk4poor. Cherry-picked with authorship preserved.

- Route-aware pricing architecture replacing static MODEL_PRICING + heuristics
- Canonical usage normalization (Anthropic/OpenAI/Codex API shapes)
- Cache-aware billing (separate cache_read/cache_write rates)
- Cost status tracking (estimated/included/unknown/actual)
- OpenRouter live pricing via models API
- Schema migration v4→v5 with billing metadata columns
- Removed speculative forward-looking entries
- Removed cost display from CLI status bar
- Threaded OpenRouter metadata pre-warm

Co-authored-by: kshitij <82637225+kshitijk4poor@users.noreply.github.com>
This commit is contained in:
Teknium
2026-03-17 03:44:44 -07:00
committed by GitHub
parent d9b9987ad3
commit d417ba2a48
14 changed files with 1694 additions and 254 deletions

View File

@@ -22,14 +22,21 @@ from collections import Counter, defaultdict
from datetime import datetime
from typing import Any, Dict, List
from agent.usage_pricing import DEFAULT_PRICING, estimate_cost_usd, format_duration_compact, get_pricing, has_known_pricing
from agent.usage_pricing import (
CanonicalUsage,
DEFAULT_PRICING,
estimate_usage_cost,
format_duration_compact,
get_pricing,
has_known_pricing,
)
_DEFAULT_PRICING = DEFAULT_PRICING
def _has_known_pricing(model_name: str) -> bool:
def _has_known_pricing(model_name: str, provider: str = None, base_url: str = None) -> bool:
"""Check if a model has known pricing (vs unknown/custom endpoint)."""
return has_known_pricing(model_name)
return has_known_pricing(model_name, provider=provider, base_url=base_url)
def _get_pricing(model_name: str) -> Dict[str, float]:
@@ -41,9 +48,43 @@ def _get_pricing(model_name: str) -> Dict[str, float]:
return get_pricing(model_name)
def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
"""Estimate the USD cost for a given model and token counts."""
return estimate_cost_usd(model, input_tokens, output_tokens)
def _estimate_cost(
session_or_model: Dict[str, Any] | str,
input_tokens: int = 0,
output_tokens: int = 0,
*,
cache_read_tokens: int = 0,
cache_write_tokens: int = 0,
provider: str = None,
base_url: str = None,
) -> tuple[float, str]:
"""Estimate the USD cost for a session row or a model/token tuple."""
if isinstance(session_or_model, dict):
session = session_or_model
model = session.get("model") or ""
usage = CanonicalUsage(
input_tokens=session.get("input_tokens") or 0,
output_tokens=session.get("output_tokens") or 0,
cache_read_tokens=session.get("cache_read_tokens") or 0,
cache_write_tokens=session.get("cache_write_tokens") or 0,
)
provider = session.get("billing_provider")
base_url = session.get("billing_base_url")
else:
model = session_or_model or ""
usage = CanonicalUsage(
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
)
result = estimate_usage_cost(
model,
usage,
provider=provider,
base_url=base_url,
)
return float(result.amount_usd or 0.0), result.status
def _format_duration(seconds: float) -> str:
@@ -135,7 +176,10 @@ class InsightsEngine:
# Columns we actually need (skip system_prompt, model_config blobs)
_SESSION_COLS = ("id, source, model, started_at, ended_at, "
"message_count, tool_call_count, input_tokens, output_tokens")
"message_count, tool_call_count, input_tokens, output_tokens, "
"cache_read_tokens, cache_write_tokens, billing_provider, "
"billing_base_url, billing_mode, estimated_cost_usd, "
"actual_cost_usd, cost_status, cost_source")
def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]:
"""Fetch sessions within the time window."""
@@ -287,21 +331,30 @@ class InsightsEngine:
"""Compute high-level overview statistics."""
total_input = sum(s.get("input_tokens") or 0 for s in sessions)
total_output = sum(s.get("output_tokens") or 0 for s in sessions)
total_tokens = total_input + total_output
total_cache_read = sum(s.get("cache_read_tokens") or 0 for s in sessions)
total_cache_write = sum(s.get("cache_write_tokens") or 0 for s in sessions)
total_tokens = total_input + total_output + total_cache_read + total_cache_write
total_tool_calls = sum(s.get("tool_call_count") or 0 for s in sessions)
total_messages = sum(s.get("message_count") or 0 for s in sessions)
# Cost estimation (weighted by model)
total_cost = 0.0
actual_cost = 0.0
models_with_pricing = set()
models_without_pricing = set()
unknown_cost_sessions = 0
included_cost_sessions = 0
for s in sessions:
model = s.get("model") or ""
inp = s.get("input_tokens") or 0
out = s.get("output_tokens") or 0
total_cost += _estimate_cost(model, inp, out)
estimated, status = _estimate_cost(s)
total_cost += estimated
actual_cost += s.get("actual_cost_usd") or 0.0
display = model.split("/")[-1] if "/" in model else (model or "unknown")
if _has_known_pricing(model):
if status == "included":
included_cost_sessions += 1
elif status == "unknown":
unknown_cost_sessions += 1
if _has_known_pricing(model, s.get("billing_provider"), s.get("billing_base_url")):
models_with_pricing.add(display)
else:
models_without_pricing.add(display)
@@ -328,8 +381,11 @@ class InsightsEngine:
"total_tool_calls": total_tool_calls,
"total_input_tokens": total_input,
"total_output_tokens": total_output,
"total_cache_read_tokens": total_cache_read,
"total_cache_write_tokens": total_cache_write,
"total_tokens": total_tokens,
"estimated_cost": total_cost,
"actual_cost": actual_cost,
"total_hours": total_hours,
"avg_session_duration": avg_duration,
"avg_messages_per_session": total_messages / len(sessions) if sessions else 0,
@@ -341,12 +397,15 @@ class InsightsEngine:
"date_range_end": date_range_end,
"models_with_pricing": sorted(models_with_pricing),
"models_without_pricing": sorted(models_without_pricing),
"unknown_cost_sessions": unknown_cost_sessions,
"included_cost_sessions": included_cost_sessions,
}
def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]:
"""Break down usage by model."""
model_data = defaultdict(lambda: {
"sessions": 0, "input_tokens": 0, "output_tokens": 0,
"cache_read_tokens": 0, "cache_write_tokens": 0,
"total_tokens": 0, "tool_calls": 0, "cost": 0.0,
})
@@ -358,12 +417,18 @@ class InsightsEngine:
d["sessions"] += 1
inp = s.get("input_tokens") or 0
out = s.get("output_tokens") or 0
cache_read = s.get("cache_read_tokens") or 0
cache_write = s.get("cache_write_tokens") or 0
d["input_tokens"] += inp
d["output_tokens"] += out
d["total_tokens"] += inp + out
d["cache_read_tokens"] += cache_read
d["cache_write_tokens"] += cache_write
d["total_tokens"] += inp + out + cache_read + cache_write
d["tool_calls"] += s.get("tool_call_count") or 0
d["cost"] += _estimate_cost(model, inp, out)
d["has_pricing"] = _has_known_pricing(model)
estimate, status = _estimate_cost(s)
d["cost"] += estimate
d["has_pricing"] = _has_known_pricing(model, s.get("billing_provider"), s.get("billing_base_url"))
d["cost_status"] = status
result = [
{"model": model, **data}
@@ -377,7 +442,8 @@ class InsightsEngine:
"""Break down usage by platform/source."""
platform_data = defaultdict(lambda: {
"sessions": 0, "messages": 0, "input_tokens": 0,
"output_tokens": 0, "total_tokens": 0, "tool_calls": 0,
"output_tokens": 0, "cache_read_tokens": 0,
"cache_write_tokens": 0, "total_tokens": 0, "tool_calls": 0,
})
for s in sessions:
@@ -387,9 +453,13 @@ class InsightsEngine:
d["messages"] += s.get("message_count") or 0
inp = s.get("input_tokens") or 0
out = s.get("output_tokens") or 0
cache_read = s.get("cache_read_tokens") or 0
cache_write = s.get("cache_write_tokens") or 0
d["input_tokens"] += inp
d["output_tokens"] += out
d["total_tokens"] += inp + out
d["cache_read_tokens"] += cache_read
d["cache_write_tokens"] += cache_write
d["total_tokens"] += inp + out + cache_read + cache_write
d["tool_calls"] += s.get("tool_call_count") or 0
result = [

View File

@@ -1,101 +1,593 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from decimal import Decimal
from typing import Dict
from typing import Any, Dict, Literal, Optional
MODEL_PRICING = {
"gpt-4o": {"input": 2.50, "output": 10.00},
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
"gpt-4.1": {"input": 2.00, "output": 8.00},
"gpt-4.1-mini": {"input": 0.40, "output": 1.60},
"gpt-4.1-nano": {"input": 0.10, "output": 0.40},
"gpt-4.5-preview": {"input": 75.00, "output": 150.00},
"gpt-5": {"input": 10.00, "output": 30.00},
"gpt-5.4": {"input": 10.00, "output": 30.00},
"o3": {"input": 10.00, "output": 40.00},
"o3-mini": {"input": 1.10, "output": 4.40},
"o4-mini": {"input": 1.10, "output": 4.40},
"claude-opus-4-20250514": {"input": 15.00, "output": 75.00},
"claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
"claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
"claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00},
"claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
"claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
"deepseek-chat": {"input": 0.14, "output": 0.28},
"deepseek-reasoner": {"input": 0.55, "output": 2.19},
"gemini-2.5-pro": {"input": 1.25, "output": 10.00},
"gemini-2.5-flash": {"input": 0.15, "output": 0.60},
"gemini-2.0-flash": {"input": 0.10, "output": 0.40},
"llama-4-maverick": {"input": 0.50, "output": 0.70},
"llama-4-scout": {"input": 0.20, "output": 0.30},
"glm-5": {"input": 0.0, "output": 0.0},
"glm-4.7": {"input": 0.0, "output": 0.0},
"glm-4.5": {"input": 0.0, "output": 0.0},
"glm-4.5-flash": {"input": 0.0, "output": 0.0},
"kimi-k2.5": {"input": 0.0, "output": 0.0},
"kimi-k2-thinking": {"input": 0.0, "output": 0.0},
"kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0},
"kimi-k2-0905-preview": {"input": 0.0, "output": 0.0},
"MiniMax-M2.5": {"input": 0.0, "output": 0.0},
"MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0},
"MiniMax-M2.1": {"input": 0.0, "output": 0.0},
}
from agent.model_metadata import fetch_model_metadata
DEFAULT_PRICING = {"input": 0.0, "output": 0.0}
_ZERO = Decimal("0")
_ONE_MILLION = Decimal("1000000")
def get_pricing(model_name: str) -> Dict[str, float]:
if not model_name:
return DEFAULT_PRICING
bare = model_name.split("/")[-1].lower()
if bare in MODEL_PRICING:
return MODEL_PRICING[bare]
best_match = None
best_len = 0
for key, price in MODEL_PRICING.items():
if bare.startswith(key) and len(key) > best_len:
best_match = price
best_len = len(key)
if best_match:
return best_match
if "opus" in bare:
return {"input": 15.00, "output": 75.00}
if "sonnet" in bare:
return {"input": 3.00, "output": 15.00}
if "haiku" in bare:
return {"input": 0.80, "output": 4.00}
if "gpt-4o-mini" in bare:
return {"input": 0.15, "output": 0.60}
if "gpt-4o" in bare:
return {"input": 2.50, "output": 10.00}
if "gpt-5" in bare:
return {"input": 10.00, "output": 30.00}
if "deepseek" in bare:
return {"input": 0.14, "output": 0.28}
if "gemini" in bare:
return {"input": 0.15, "output": 0.60}
return DEFAULT_PRICING
CostStatus = Literal["actual", "estimated", "included", "unknown"]
CostSource = Literal[
"provider_cost_api",
"provider_generation_api",
"provider_models_api",
"official_docs_snapshot",
"user_override",
"custom_contract",
"none",
]
def has_known_pricing(model_name: str) -> bool:
pricing = get_pricing(model_name)
return pricing is not DEFAULT_PRICING and any(
float(value) > 0 for value in pricing.values()
@dataclass(frozen=True)
class CanonicalUsage:
input_tokens: int = 0
output_tokens: int = 0
cache_read_tokens: int = 0
cache_write_tokens: int = 0
reasoning_tokens: int = 0
request_count: int = 1
raw_usage: Optional[dict[str, Any]] = None
@property
def prompt_tokens(self) -> int:
return self.input_tokens + self.cache_read_tokens + self.cache_write_tokens
@property
def total_tokens(self) -> int:
return self.prompt_tokens + self.output_tokens
@dataclass(frozen=True)
class BillingRoute:
provider: str
model: str
base_url: str = ""
billing_mode: str = "unknown"
@dataclass(frozen=True)
class PricingEntry:
input_cost_per_million: Optional[Decimal] = None
output_cost_per_million: Optional[Decimal] = None
cache_read_cost_per_million: Optional[Decimal] = None
cache_write_cost_per_million: Optional[Decimal] = None
request_cost: Optional[Decimal] = None
source: CostSource = "none"
source_url: Optional[str] = None
pricing_version: Optional[str] = None
fetched_at: Optional[datetime] = None
@dataclass(frozen=True)
class CostResult:
amount_usd: Optional[Decimal]
status: CostStatus
source: CostSource
label: str
fetched_at: Optional[datetime] = None
pricing_version: Optional[str] = None
notes: tuple[str, ...] = ()
_UTC_NOW = lambda: datetime.now(timezone.utc)
# Official docs snapshot entries. Models whose published pricing and cache
# semantics are stable enough to encode exactly.
_OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
(
"anthropic",
"claude-opus-4-20250514",
): PricingEntry(
input_cost_per_million=Decimal("15.00"),
output_cost_per_million=Decimal("75.00"),
cache_read_cost_per_million=Decimal("1.50"),
cache_write_cost_per_million=Decimal("18.75"),
source="official_docs_snapshot",
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
pricing_version="anthropic-prompt-caching-2026-03-16",
),
(
"anthropic",
"claude-sonnet-4-20250514",
): PricingEntry(
input_cost_per_million=Decimal("3.00"),
output_cost_per_million=Decimal("15.00"),
cache_read_cost_per_million=Decimal("0.30"),
cache_write_cost_per_million=Decimal("3.75"),
source="official_docs_snapshot",
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
pricing_version="anthropic-prompt-caching-2026-03-16",
),
# OpenAI
(
"openai",
"gpt-4o",
): PricingEntry(
input_cost_per_million=Decimal("2.50"),
output_cost_per_million=Decimal("10.00"),
cache_read_cost_per_million=Decimal("1.25"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
(
"openai",
"gpt-4o-mini",
): PricingEntry(
input_cost_per_million=Decimal("0.15"),
output_cost_per_million=Decimal("0.60"),
cache_read_cost_per_million=Decimal("0.075"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
(
"openai",
"gpt-4.1",
): PricingEntry(
input_cost_per_million=Decimal("2.00"),
output_cost_per_million=Decimal("8.00"),
cache_read_cost_per_million=Decimal("0.50"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
(
"openai",
"gpt-4.1-mini",
): PricingEntry(
input_cost_per_million=Decimal("0.40"),
output_cost_per_million=Decimal("1.60"),
cache_read_cost_per_million=Decimal("0.10"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
(
"openai",
"gpt-4.1-nano",
): PricingEntry(
input_cost_per_million=Decimal("0.10"),
output_cost_per_million=Decimal("0.40"),
cache_read_cost_per_million=Decimal("0.025"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
(
"openai",
"o3",
): PricingEntry(
input_cost_per_million=Decimal("10.00"),
output_cost_per_million=Decimal("40.00"),
cache_read_cost_per_million=Decimal("2.50"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
(
"openai",
"o3-mini",
): PricingEntry(
input_cost_per_million=Decimal("1.10"),
output_cost_per_million=Decimal("4.40"),
cache_read_cost_per_million=Decimal("0.55"),
source="official_docs_snapshot",
source_url="https://openai.com/api/pricing/",
pricing_version="openai-pricing-2026-03-16",
),
# Anthropic older models (pre-4.6 generation)
(
"anthropic",
"claude-3-5-sonnet-20241022",
): PricingEntry(
input_cost_per_million=Decimal("3.00"),
output_cost_per_million=Decimal("15.00"),
cache_read_cost_per_million=Decimal("0.30"),
cache_write_cost_per_million=Decimal("3.75"),
source="official_docs_snapshot",
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
pricing_version="anthropic-pricing-2026-03-16",
),
(
"anthropic",
"claude-3-5-haiku-20241022",
): PricingEntry(
input_cost_per_million=Decimal("0.80"),
output_cost_per_million=Decimal("4.00"),
cache_read_cost_per_million=Decimal("0.08"),
cache_write_cost_per_million=Decimal("1.00"),
source="official_docs_snapshot",
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
pricing_version="anthropic-pricing-2026-03-16",
),
(
"anthropic",
"claude-3-opus-20240229",
): PricingEntry(
input_cost_per_million=Decimal("15.00"),
output_cost_per_million=Decimal("75.00"),
cache_read_cost_per_million=Decimal("1.50"),
cache_write_cost_per_million=Decimal("18.75"),
source="official_docs_snapshot",
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
pricing_version="anthropic-pricing-2026-03-16",
),
(
"anthropic",
"claude-3-haiku-20240307",
): PricingEntry(
input_cost_per_million=Decimal("0.25"),
output_cost_per_million=Decimal("1.25"),
cache_read_cost_per_million=Decimal("0.03"),
cache_write_cost_per_million=Decimal("0.30"),
source="official_docs_snapshot",
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
pricing_version="anthropic-pricing-2026-03-16",
),
# DeepSeek
(
"deepseek",
"deepseek-chat",
): PricingEntry(
input_cost_per_million=Decimal("0.14"),
output_cost_per_million=Decimal("0.28"),
source="official_docs_snapshot",
source_url="https://api-docs.deepseek.com/quick_start/pricing",
pricing_version="deepseek-pricing-2026-03-16",
),
(
"deepseek",
"deepseek-reasoner",
): PricingEntry(
input_cost_per_million=Decimal("0.55"),
output_cost_per_million=Decimal("2.19"),
source="official_docs_snapshot",
source_url="https://api-docs.deepseek.com/quick_start/pricing",
pricing_version="deepseek-pricing-2026-03-16",
),
# Google Gemini
(
"google",
"gemini-2.5-pro",
): PricingEntry(
input_cost_per_million=Decimal("1.25"),
output_cost_per_million=Decimal("10.00"),
source="official_docs_snapshot",
source_url="https://ai.google.dev/pricing",
pricing_version="google-pricing-2026-03-16",
),
(
"google",
"gemini-2.5-flash",
): PricingEntry(
input_cost_per_million=Decimal("0.15"),
output_cost_per_million=Decimal("0.60"),
source="official_docs_snapshot",
source_url="https://ai.google.dev/pricing",
pricing_version="google-pricing-2026-03-16",
),
(
"google",
"gemini-2.0-flash",
): PricingEntry(
input_cost_per_million=Decimal("0.10"),
output_cost_per_million=Decimal("0.40"),
source="official_docs_snapshot",
source_url="https://ai.google.dev/pricing",
pricing_version="google-pricing-2026-03-16",
),
}
def _to_decimal(value: Any) -> Optional[Decimal]:
if value is None:
return None
try:
return Decimal(str(value))
except Exception:
return None
def _to_int(value: Any) -> int:
try:
return int(value or 0)
except Exception:
return 0
def resolve_billing_route(
model_name: str,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> BillingRoute:
provider_name = (provider or "").strip().lower()
base = (base_url or "").strip().lower()
model = (model_name or "").strip()
if not provider_name and "/" in model:
inferred_provider, bare_model = model.split("/", 1)
if inferred_provider in {"anthropic", "openai", "google"}:
provider_name = inferred_provider
model = bare_model
if provider_name == "openai-codex":
return BillingRoute(provider="openai-codex", model=model, base_url=base_url or "", billing_mode="subscription_included")
if provider_name == "openrouter" or "openrouter.ai" in base:
return BillingRoute(provider="openrouter", model=model, base_url=base_url or "", billing_mode="official_models_api")
if provider_name == "anthropic":
return BillingRoute(provider="anthropic", model=model.split("/")[-1], base_url=base_url or "", billing_mode="official_docs_snapshot")
if provider_name == "openai":
return BillingRoute(provider="openai", model=model.split("/")[-1], base_url=base_url or "", billing_mode="official_docs_snapshot")
if provider_name in {"custom", "local"} or (base and "localhost" in base):
return BillingRoute(provider=provider_name or "custom", model=model, base_url=base_url or "", billing_mode="unknown")
return BillingRoute(provider=provider_name or "unknown", model=model.split("/")[-1] if model else "", base_url=base_url or "", billing_mode="unknown")
def _lookup_official_docs_pricing(route: BillingRoute) -> Optional[PricingEntry]:
return _OFFICIAL_DOCS_PRICING.get((route.provider, route.model.lower()))
def _openrouter_pricing_entry(route: BillingRoute) -> Optional[PricingEntry]:
metadata = fetch_model_metadata()
model_id = route.model
if model_id not in metadata:
return None
pricing = metadata[model_id].get("pricing") or {}
prompt = _to_decimal(pricing.get("prompt"))
completion = _to_decimal(pricing.get("completion"))
request = _to_decimal(pricing.get("request"))
cache_read = _to_decimal(
pricing.get("cache_read")
or pricing.get("cached_prompt")
or pricing.get("input_cache_read")
)
cache_write = _to_decimal(
pricing.get("cache_write")
or pricing.get("cache_creation")
or pricing.get("input_cache_write")
)
if prompt is None and completion is None and request is None:
return None
def _per_token_to_per_million(value: Optional[Decimal]) -> Optional[Decimal]:
if value is None:
return None
return value * _ONE_MILLION
return PricingEntry(
input_cost_per_million=_per_token_to_per_million(prompt),
output_cost_per_million=_per_token_to_per_million(completion),
cache_read_cost_per_million=_per_token_to_per_million(cache_read),
cache_write_cost_per_million=_per_token_to_per_million(cache_write),
request_cost=request,
source="provider_models_api",
source_url="https://openrouter.ai/docs/api/api-reference/models/get-models",
pricing_version="openrouter-models-api",
fetched_at=_UTC_NOW(),
)
def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
pricing = get_pricing(model)
total = (
Decimal(input_tokens) * Decimal(str(pricing["input"]))
+ Decimal(output_tokens) * Decimal(str(pricing["output"]))
) / Decimal("1000000")
return float(total)
def get_pricing_entry(
model_name: str,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> Optional[PricingEntry]:
route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
if route.billing_mode == "subscription_included":
return PricingEntry(
input_cost_per_million=_ZERO,
output_cost_per_million=_ZERO,
cache_read_cost_per_million=_ZERO,
cache_write_cost_per_million=_ZERO,
source="none",
pricing_version="included-route",
)
if route.provider == "openrouter":
return _openrouter_pricing_entry(route)
return _lookup_official_docs_pricing(route)
def normalize_usage(
response_usage: Any,
*,
provider: Optional[str] = None,
api_mode: Optional[str] = None,
) -> CanonicalUsage:
"""Normalize raw API response usage into canonical token buckets.
Handles three API shapes:
- Anthropic: input_tokens/output_tokens/cache_read_input_tokens/cache_creation_input_tokens
- Codex Responses: input_tokens includes cache tokens; input_tokens_details.cached_tokens separates them
- OpenAI Chat Completions: prompt_tokens includes cache tokens; prompt_tokens_details.cached_tokens separates them
In both Codex and OpenAI modes, input_tokens is derived by subtracting cache
tokens from the total — the API contract is that input/prompt totals include
cached tokens and the details object breaks them out.
"""
if not response_usage:
return CanonicalUsage()
provider_name = (provider or "").strip().lower()
mode = (api_mode or "").strip().lower()
if mode == "anthropic_messages" or provider_name == "anthropic":
input_tokens = _to_int(getattr(response_usage, "input_tokens", 0))
output_tokens = _to_int(getattr(response_usage, "output_tokens", 0))
cache_read_tokens = _to_int(getattr(response_usage, "cache_read_input_tokens", 0))
cache_write_tokens = _to_int(getattr(response_usage, "cache_creation_input_tokens", 0))
elif mode == "codex_responses":
input_total = _to_int(getattr(response_usage, "input_tokens", 0))
output_tokens = _to_int(getattr(response_usage, "output_tokens", 0))
details = getattr(response_usage, "input_tokens_details", None)
cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
cache_write_tokens = _to_int(
getattr(details, "cache_creation_tokens", 0) if details else 0
)
input_tokens = max(0, input_total - cache_read_tokens - cache_write_tokens)
else:
prompt_total = _to_int(getattr(response_usage, "prompt_tokens", 0))
output_tokens = _to_int(getattr(response_usage, "completion_tokens", 0))
details = getattr(response_usage, "prompt_tokens_details", None)
cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
cache_write_tokens = _to_int(
getattr(details, "cache_write_tokens", 0) if details else 0
)
input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens)
reasoning_tokens = 0
output_details = getattr(response_usage, "output_tokens_details", None)
if output_details:
reasoning_tokens = _to_int(getattr(output_details, "reasoning_tokens", 0))
return CanonicalUsage(
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
reasoning_tokens=reasoning_tokens,
)
def estimate_usage_cost(
model_name: str,
usage: CanonicalUsage,
*,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> CostResult:
route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
if route.billing_mode == "subscription_included":
return CostResult(
amount_usd=_ZERO,
status="included",
source="none",
label="included",
pricing_version="included-route",
)
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url)
if not entry:
return CostResult(amount_usd=None, status="unknown", source="none", label="n/a")
notes: list[str] = []
amount = _ZERO
if usage.input_tokens and entry.input_cost_per_million is None:
return CostResult(amount_usd=None, status="unknown", source=entry.source, label="n/a")
if usage.output_tokens and entry.output_cost_per_million is None:
return CostResult(amount_usd=None, status="unknown", source=entry.source, label="n/a")
if usage.cache_read_tokens:
if entry.cache_read_cost_per_million is None:
return CostResult(
amount_usd=None,
status="unknown",
source=entry.source,
label="n/a",
notes=("cache-read pricing unavailable for route",),
)
if usage.cache_write_tokens:
if entry.cache_write_cost_per_million is None:
return CostResult(
amount_usd=None,
status="unknown",
source=entry.source,
label="n/a",
notes=("cache-write pricing unavailable for route",),
)
if entry.input_cost_per_million is not None:
amount += Decimal(usage.input_tokens) * entry.input_cost_per_million / _ONE_MILLION
if entry.output_cost_per_million is not None:
amount += Decimal(usage.output_tokens) * entry.output_cost_per_million / _ONE_MILLION
if entry.cache_read_cost_per_million is not None:
amount += Decimal(usage.cache_read_tokens) * entry.cache_read_cost_per_million / _ONE_MILLION
if entry.cache_write_cost_per_million is not None:
amount += Decimal(usage.cache_write_tokens) * entry.cache_write_cost_per_million / _ONE_MILLION
if entry.request_cost is not None and usage.request_count:
amount += Decimal(usage.request_count) * entry.request_cost
status: CostStatus = "estimated"
label = f"~${amount:.2f}"
if entry.source == "none" and amount == _ZERO:
status = "included"
label = "included"
if route.provider == "openrouter":
notes.append("OpenRouter cost is estimated from the models API until reconciled.")
return CostResult(
amount_usd=amount,
status=status,
source=entry.source,
label=label,
fetched_at=entry.fetched_at,
pricing_version=entry.pricing_version,
notes=tuple(notes),
)
def has_known_pricing(
model_name: str,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> bool:
"""Check whether we have pricing data for this model+route.
Uses direct lookup instead of routing through the full estimation
pipeline — avoids creating dummy usage objects just to check status.
"""
route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
if route.billing_mode == "subscription_included":
return True
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url)
return entry is not None
def get_pricing(
model_name: str,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> Dict[str, float]:
"""Backward-compatible thin wrapper for legacy callers.
Returns only non-cache input/output fields when a pricing entry exists.
Unknown routes return zeroes.
"""
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url)
if not entry:
return {"input": 0.0, "output": 0.0}
return {
"input": float(entry.input_cost_per_million or _ZERO),
"output": float(entry.output_cost_per_million or _ZERO),
}
def estimate_cost_usd(
model: str,
input_tokens: int,
output_tokens: int,
*,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> float:
"""Backward-compatible helper for legacy callers.
This uses non-cached input/output only. New code should call
`estimate_usage_cost()` with canonical usage buckets.
"""
result = estimate_usage_cost(
model,
CanonicalUsage(input_tokens=input_tokens, output_tokens=output_tokens),
provider=provider,
base_url=base_url,
)
return float(result.amount_usd or _ZERO)
def format_duration_compact(seconds: float) -> str:

95
cli.py
View File

@@ -58,7 +58,12 @@ except (ImportError, AttributeError):
import threading
import queue
from agent.usage_pricing import estimate_cost_usd, format_duration_compact, format_token_count_compact, has_known_pricing
from agent.usage_pricing import (
CanonicalUsage,
estimate_usage_cost,
format_duration_compact,
format_token_count_compact,
)
from hermes_cli.banner import _format_context_length
_COMMAND_SPINNER_FRAMES = ("", "", "", "", "", "", "", "", "", "")
@@ -212,7 +217,7 @@ def load_cli_config() -> Dict[str, Any]:
"resume_display": "full",
"show_reasoning": False,
"streaming": False,
"show_cost": False,
"skin": "default",
"theme_mode": "auto",
},
@@ -1034,8 +1039,7 @@ class HermesCLI:
self.bell_on_complete = CLI_CONFIG["display"].get("bell_on_complete", False)
# show_reasoning: display model thinking/reasoning before the response
self.show_reasoning = CLI_CONFIG["display"].get("show_reasoning", False)
# show_cost: display $ cost in the status bar (off by default)
self.show_cost = CLI_CONFIG["display"].get("show_cost", False)
self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose")
# streaming: stream tokens to the terminal as they arrive (display.streaming in config.yaml)
@@ -1260,12 +1264,14 @@ class HermesCLI:
"context_tokens": 0,
"context_length": None,
"context_percent": None,
"session_input_tokens": 0,
"session_output_tokens": 0,
"session_cache_read_tokens": 0,
"session_cache_write_tokens": 0,
"session_prompt_tokens": 0,
"session_completion_tokens": 0,
"session_total_tokens": 0,
"session_api_calls": 0,
"session_cost": 0.0,
"pricing_known": has_known_pricing(model_name),
"compressions": 0,
}
@@ -1273,15 +1279,14 @@ class HermesCLI:
if not agent:
return snapshot
snapshot["session_input_tokens"] = getattr(agent, "session_input_tokens", 0) or 0
snapshot["session_output_tokens"] = getattr(agent, "session_output_tokens", 0) or 0
snapshot["session_cache_read_tokens"] = getattr(agent, "session_cache_read_tokens", 0) or 0
snapshot["session_cache_write_tokens"] = getattr(agent, "session_cache_write_tokens", 0) or 0
snapshot["session_prompt_tokens"] = getattr(agent, "session_prompt_tokens", 0) or 0
snapshot["session_completion_tokens"] = getattr(agent, "session_completion_tokens", 0) or 0
snapshot["session_total_tokens"] = getattr(agent, "session_total_tokens", 0) or 0
snapshot["session_api_calls"] = getattr(agent, "session_api_calls", 0) or 0
snapshot["session_cost"] = estimate_cost_usd(
model_name,
snapshot["session_prompt_tokens"],
snapshot["session_completion_tokens"],
)
compressor = getattr(agent, "context_compressor", None)
if compressor:
@@ -1302,19 +1307,11 @@ class HermesCLI:
percent = snapshot["context_percent"]
percent_label = f"{percent}%" if percent is not None else "--"
duration_label = snapshot["duration"]
show_cost = getattr(self, "show_cost", False)
if show_cost:
cost_label = f"${snapshot['session_cost']:.2f}" if snapshot["pricing_known"] else "cost n/a"
else:
cost_label = None
if width < 52:
return f"{snapshot['model_short']} · {duration_label}"
if width < 76:
parts = [f"{snapshot['model_short']}", percent_label]
if cost_label:
parts.append(cost_label)
parts.append(duration_label)
return " · ".join(parts)
@@ -1326,8 +1323,6 @@ class HermesCLI:
context_label = "ctx --"
parts = [f"{snapshot['model_short']}", context_label, percent_label]
if cost_label:
parts.append(cost_label)
parts.append(duration_label)
return "".join(parts)
except Exception:
@@ -1338,12 +1333,6 @@ class HermesCLI:
snapshot = self._get_status_bar_snapshot()
width = shutil.get_terminal_size((80, 24)).columns
duration_label = snapshot["duration"]
show_cost = getattr(self, "show_cost", False)
if show_cost:
cost_label = f"${snapshot['session_cost']:.2f}" if snapshot["pricing_known"] else "cost n/a"
else:
cost_label = None
if width < 52:
return [
@@ -1363,11 +1352,6 @@ class HermesCLI:
("class:status-bar-dim", " · "),
(self._status_bar_context_style(percent), percent_label),
]
if cost_label:
frags.extend([
("class:status-bar-dim", " · "),
("class:status-bar-dim", cost_label),
])
frags.extend([
("class:status-bar-dim", " · "),
("class:status-bar-dim", duration_label),
@@ -1393,11 +1377,6 @@ class HermesCLI:
("class:status-bar-dim", " "),
(bar_style, percent_label),
]
if cost_label:
frags.extend([
("class:status-bar-dim", ""),
("class:status-bar-dim", cost_label),
])
frags.extend([
("class:status-bar-dim", ""),
("class:status-bar-dim", duration_label),
@@ -4250,6 +4229,10 @@ class HermesCLI:
return
agent = self.agent
input_tokens = getattr(agent, "session_input_tokens", 0) or 0
output_tokens = getattr(agent, "session_output_tokens", 0) or 0
cache_read_tokens = getattr(agent, "session_cache_read_tokens", 0) or 0
cache_write_tokens = getattr(agent, "session_cache_write_tokens", 0) or 0
prompt = agent.session_prompt_tokens
completion = agent.session_completion_tokens
total = agent.session_total_tokens
@@ -4267,33 +4250,45 @@ class HermesCLI:
compressions = compressor.compression_count
msg_count = len(self.conversation_history)
cost = estimate_cost_usd(agent.model, prompt, completion)
prompt_cost = estimate_cost_usd(agent.model, prompt, 0)
completion_cost = estimate_cost_usd(agent.model, 0, completion)
pricing_known = has_known_pricing(agent.model)
cost_result = estimate_usage_cost(
agent.model,
CanonicalUsage(
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
),
provider=getattr(agent, "provider", None),
base_url=getattr(agent, "base_url", None),
)
elapsed = format_duration_compact((datetime.now() - self.session_start).total_seconds())
print(f" 📊 Session Token Usage")
print(f" {'' * 40}")
print(f" Model: {agent.model}")
print(f" Prompt tokens (input): {prompt:>10,}")
print(f" Completion tokens (output): {completion:>9,}")
print(f" Input tokens: {input_tokens:>10,}")
print(f" Cache read tokens: {cache_read_tokens:>10,}")
print(f" Cache write tokens: {cache_write_tokens:>10,}")
print(f" Output tokens: {output_tokens:>10,}")
print(f" Prompt tokens (total): {prompt:>10,}")
print(f" Completion tokens: {completion:>10,}")
print(f" Total tokens: {total:>10,}")
print(f" API calls: {calls:>10,}")
print(f" Session duration: {elapsed:>10}")
if pricing_known:
print(f" Input cost: ${prompt_cost:>10.4f}")
print(f" Output cost: ${completion_cost:>10.4f}")
print(f" Total cost: ${cost:>10.4f}")
print(f" Cost status: {cost_result.status:>10}")
print(f" Cost source: {cost_result.source:>10}")
if cost_result.amount_usd is not None:
prefix = "~" if cost_result.status == "estimated" else ""
print(f" Total cost: {prefix}${float(cost_result.amount_usd):>10.4f}")
elif cost_result.status == "included":
print(f" Total cost: {'included':>10}")
else:
print(f" Input cost: {'n/a':>10}")
print(f" Output cost: {'n/a':>10}")
print(f" Total cost: {'n/a':>10}")
print(f" {'' * 40}")
print(f" Current context: {last_prompt:,} / {ctx_len:,} ({pct:.0f}%)")
print(f" Messages: {msg_count}")
print(f" Compressions: {compressions}")
if not pricing_known:
if cost_result.status == "unknown":
print(f" Note: Pricing unknown for {agent.model}")
if self.verbose:

View File

@@ -0,0 +1,608 @@
# Pricing Accuracy Architecture
Date: 2026-03-16
## Goal
Hermes should only show dollar costs when they are backed by an official source for the user's actual billing path.
This design replaces the current static, heuristic pricing flow in:
- `run_agent.py`
- `agent/usage_pricing.py`
- `agent/insights.py`
- `cli.py`
with a provider-aware pricing system that:
- handles cache billing correctly
- distinguishes `actual` vs `estimated` vs `included` vs `unknown`
- reconciles post-hoc costs when providers expose authoritative billing data
- supports direct providers, OpenRouter, subscriptions, enterprise pricing, and custom endpoints
## Problems In The Current Design
Current Hermes behavior has four structural issues:
1. It stores only `prompt_tokens` and `completion_tokens`, which is insufficient for providers that bill cache reads and cache writes separately.
2. It uses a static model price table and fuzzy heuristics, which can drift from current official pricing.
3. It assumes public API list pricing matches the user's real billing path.
4. It has no distinction between live estimates and reconciled billed cost.
## Design Principles
1. Normalize usage before pricing.
2. Never fold cached tokens into plain input cost.
3. Track certainty explicitly.
4. Treat the billing path as part of the model identity.
5. Prefer official machine-readable sources over scraped docs.
6. Use post-hoc provider cost APIs when available.
7. Show `n/a` rather than inventing precision.
## High-Level Architecture
The new system has four layers:
1. `usage_normalization`
Converts raw provider usage into a canonical usage record.
2. `pricing_source_resolution`
Determines the billing path, source of truth, and applicable pricing source.
3. `cost_estimation_and_reconciliation`
Produces an immediate estimate when possible, then replaces or annotates it with actual billed cost later.
4. `presentation`
`/usage`, `/insights`, and the status bar display cost with certainty metadata.
## Canonical Usage Record
Add a canonical usage model that every provider path maps into before any pricing math happens.
Suggested structure:
```python
@dataclass
class CanonicalUsage:
provider: str
billing_provider: str
model: str
billing_route: str
input_tokens: int = 0
output_tokens: int = 0
cache_read_tokens: int = 0
cache_write_tokens: int = 0
reasoning_tokens: int = 0
request_count: int = 1
raw_usage: dict[str, Any] | None = None
raw_usage_fields: dict[str, str] | None = None
computed_fields: set[str] | None = None
provider_request_id: str | None = None
provider_generation_id: str | None = None
provider_response_id: str | None = None
```
Rules:
- `input_tokens` means non-cached input only.
- `cache_read_tokens` and `cache_write_tokens` are never merged into `input_tokens`.
- `output_tokens` excludes cache metrics.
- `reasoning_tokens` is telemetry unless a provider officially bills it separately.
This is the same normalization pattern used by `opencode`, extended with provenance and reconciliation ids.
## Provider Normalization Rules
### OpenAI Direct
Source usage fields:
- `prompt_tokens`
- `completion_tokens`
- `prompt_tokens_details.cached_tokens`
Normalization:
- `cache_read_tokens = cached_tokens`
- `input_tokens = prompt_tokens - cached_tokens`
- `cache_write_tokens = 0` unless OpenAI exposes it in the relevant route
- `output_tokens = completion_tokens`
### Anthropic Direct
Source usage fields:
- `input_tokens`
- `output_tokens`
- `cache_read_input_tokens`
- `cache_creation_input_tokens`
Normalization:
- `input_tokens = input_tokens`
- `output_tokens = output_tokens`
- `cache_read_tokens = cache_read_input_tokens`
- `cache_write_tokens = cache_creation_input_tokens`
### OpenRouter
Estimate-time usage normalization should use the response usage payload with the same rules as the underlying provider when possible.
Reconciliation-time records should also store:
- OpenRouter generation id
- native token fields when available
- `total_cost`
- `cache_discount`
- `upstream_inference_cost`
- `is_byok`
### Gemini / Vertex
Use official Gemini or Vertex usage fields where available.
If cached content tokens are exposed:
- map them to `cache_read_tokens`
If a route exposes no cache creation metric:
- store `cache_write_tokens = 0`
- preserve the raw usage payload for later extension
### DeepSeek And Other Direct Providers
Normalize only the fields that are officially exposed.
If a provider does not expose cache buckets:
- do not infer them unless the provider explicitly documents how to derive them
### Subscription / Included-Cost Routes
These still use the canonical usage model.
Tokens are tracked normally. Cost depends on billing mode, not on whether usage exists.
## Billing Route Model
Hermes must stop keying pricing solely by `model`.
Introduce a billing route descriptor:
```python
@dataclass
class BillingRoute:
provider: str
base_url: str | None
model: str
billing_mode: str
organization_hint: str | None = None
```
`billing_mode` values:
- `official_cost_api`
- `official_generation_api`
- `official_models_api`
- `official_docs_snapshot`
- `subscription_included`
- `user_override`
- `custom_contract`
- `unknown`
Examples:
- OpenAI direct API with Costs API access: `official_cost_api`
- Anthropic direct API with Usage & Cost API access: `official_cost_api`
- OpenRouter request before reconciliation: `official_models_api`
- OpenRouter request after generation lookup: `official_generation_api`
- GitHub Copilot style subscription route: `subscription_included`
- local OpenAI-compatible server: `unknown`
- enterprise contract with configured rates: `custom_contract`
## Cost Status Model
Every displayed cost should have:
```python
@dataclass
class CostResult:
amount_usd: Decimal | None
status: Literal["actual", "estimated", "included", "unknown"]
source: Literal[
"provider_cost_api",
"provider_generation_api",
"provider_models_api",
"official_docs_snapshot",
"user_override",
"custom_contract",
"none",
]
label: str
fetched_at: datetime | None
pricing_version: str | None
notes: list[str]
```
Presentation rules:
- `actual`: show dollar amount as final
- `estimated`: show dollar amount with estimate labeling
- `included`: show `included` or `$0.00 (included)` depending on UX choice
- `unknown`: show `n/a`
## Official Source Hierarchy
Resolve cost using this order:
1. Request-level or account-level official billed cost
2. Official machine-readable model pricing
3. Official docs snapshot
4. User override or custom contract
5. Unknown
The system must never skip to a lower level if a higher-confidence source exists for the current billing route.
## Provider-Specific Truth Rules
### OpenAI Direct
Preferred truth:
1. Costs API for reconciled spend
2. Official pricing page for live estimate
### Anthropic Direct
Preferred truth:
1. Usage & Cost API for reconciled spend
2. Official pricing docs for live estimate
### OpenRouter
Preferred truth:
1. `GET /api/v1/generation` for reconciled `total_cost`
2. `GET /api/v1/models` pricing for live estimate
Do not use underlying provider public pricing as the source of truth for OpenRouter billing.
### Gemini / Vertex
Preferred truth:
1. official billing export or billing API for reconciled spend when available for the route
2. official pricing docs for estimate
### DeepSeek
Preferred truth:
1. official machine-readable cost source if available in the future
2. official pricing docs snapshot today
### Subscription-Included Routes
Preferred truth:
1. explicit route config marking the model as included in subscription
These should display `included`, not an API list-price estimate.
### Custom Endpoint / Local Model
Preferred truth:
1. user override
2. custom contract config
3. unknown
These should default to `unknown`.
## Pricing Catalog
Replace the current `MODEL_PRICING` dict with a richer pricing catalog.
Suggested record:
```python
@dataclass
class PricingEntry:
provider: str
route_pattern: str
model_pattern: str
input_cost_per_million: Decimal | None = None
output_cost_per_million: Decimal | None = None
cache_read_cost_per_million: Decimal | None = None
cache_write_cost_per_million: Decimal | None = None
request_cost: Decimal | None = None
image_cost: Decimal | None = None
source: str = "official_docs_snapshot"
source_url: str | None = None
fetched_at: datetime | None = None
pricing_version: str | None = None
```
The catalog should be route-aware:
- `openai:gpt-5`
- `anthropic:claude-opus-4-6`
- `openrouter:anthropic/claude-opus-4.6`
- `copilot:gpt-4o`
This avoids conflating direct-provider billing with aggregator billing.
## Pricing Sync Architecture
Introduce a pricing sync subsystem instead of manually maintaining a single hardcoded table.
Suggested modules:
- `agent/pricing/catalog.py`
- `agent/pricing/sources.py`
- `agent/pricing/sync.py`
- `agent/pricing/reconcile.py`
- `agent/pricing/types.py`
### Sync Sources
- OpenRouter models API
- official provider docs snapshots where no API exists
- user overrides from config
### Sync Output
Cache pricing entries locally with:
- source URL
- fetch timestamp
- version/hash
- confidence/source type
### Sync Frequency
- startup warm cache
- background refresh every 6 to 24 hours depending on source
- manual `hermes pricing sync`
## Reconciliation Architecture
Live requests may produce only an estimate initially. Hermes should reconcile them later when a provider exposes actual billed cost.
Suggested flow:
1. Agent call completes.
2. Hermes stores canonical usage plus reconciliation ids.
3. Hermes computes an immediate estimate if a pricing source exists.
4. A reconciliation worker fetches actual cost when supported.
5. Session and message records are updated with `actual` cost.
This can run:
- inline for cheap lookups
- asynchronously for delayed provider accounting
## Persistence Changes
Session storage should stop storing only aggregate prompt/completion totals.
Add fields for both usage and cost certainty:
- `input_tokens`
- `output_tokens`
- `cache_read_tokens`
- `cache_write_tokens`
- `reasoning_tokens`
- `estimated_cost_usd`
- `actual_cost_usd`
- `cost_status`
- `cost_source`
- `pricing_version`
- `billing_provider`
- `billing_mode`
If schema expansion is too large for one PR, add a new pricing events table:
```text
session_cost_events
id
session_id
request_id
provider
model
billing_mode
input_tokens
output_tokens
cache_read_tokens
cache_write_tokens
estimated_cost_usd
actual_cost_usd
cost_status
cost_source
pricing_version
created_at
updated_at
```
## Hermes Touchpoints
### `run_agent.py`
Current responsibility:
- parse raw provider usage
- update session token counters
New responsibility:
- build `CanonicalUsage`
- update canonical counters
- store reconciliation ids
- emit usage event to pricing subsystem
### `agent/usage_pricing.py`
Current responsibility:
- static lookup table
- direct cost arithmetic
New responsibility:
- move or replace with pricing catalog facade
- no fuzzy model-family heuristics
- no direct pricing without billing-route context
### `cli.py`
Current responsibility:
- compute session cost directly from prompt/completion totals
New responsibility:
- display `CostResult`
- show status badges:
- `actual`
- `estimated`
- `included`
- `n/a`
### `agent/insights.py`
Current responsibility:
- recompute historical estimates from static pricing
New responsibility:
- aggregate stored pricing events
- prefer actual cost over estimate
- surface estimates only when reconciliation is unavailable
## UX Rules
### Status Bar
Show one of:
- `$1.42`
- `~$1.42`
- `included`
- `cost n/a`
Where:
- `$1.42` means `actual`
- `~$1.42` means `estimated`
- `included` means subscription-backed or explicitly zero-cost route
- `cost n/a` means unknown
### `/usage`
Show:
- token buckets
- estimated cost
- actual cost if available
- cost status
- pricing source
### `/insights`
Aggregate:
- actual cost totals
- estimated-only totals
- unknown-cost sessions count
- included-cost sessions count
## Config And Overrides
Add user-configurable pricing overrides in config:
```yaml
pricing:
mode: hybrid
sync_on_startup: true
sync_interval_hours: 12
overrides:
- provider: openrouter
model: anthropic/claude-opus-4.6
billing_mode: custom_contract
input_cost_per_million: 4.25
output_cost_per_million: 22.0
cache_read_cost_per_million: 0.5
cache_write_cost_per_million: 6.0
included_routes:
- provider: copilot
model: "*"
- provider: codex-subscription
model: "*"
```
Overrides must win over catalog defaults for the matching billing route.
## Rollout Plan
### Phase 1
- add canonical usage model
- split cache token buckets in `run_agent.py`
- stop pricing cache-inflated prompt totals
- preserve current UI with improved backend math
### Phase 2
- add route-aware pricing catalog
- integrate OpenRouter models API sync
- add `estimated` vs `included` vs `unknown`
### Phase 3
- add reconciliation for OpenRouter generation cost
- add actual cost persistence
- update `/insights` to prefer actual cost
### Phase 4
- add direct OpenAI and Anthropic reconciliation paths
- add user overrides and contract pricing
- add pricing sync CLI command
## Testing Strategy
Add tests for:
- OpenAI cached token subtraction
- Anthropic cache read/write separation
- OpenRouter estimated vs actual reconciliation
- subscription-backed models showing `included`
- custom endpoints showing `n/a`
- override precedence
- stale catalog fallback behavior
Current tests that assume heuristic pricing should be replaced with route-aware expectations.
## Non-Goals
- exact enterprise billing reconstruction without an official source or user override
- backfilling perfect historical cost for old sessions that lack cache bucket data
- scraping arbitrary provider web pages at request time
## Recommendation
Do not expand the existing `MODEL_PRICING` dict.
That path cannot satisfy the product requirement. Hermes should instead migrate to:
- canonical usage normalization
- route-aware pricing sources
- estimate-then-reconcile cost lifecycle
- explicit certainty states in the UI
This is the minimum architecture that makes the statement "Hermes pricing is backed by official sources where possible, and otherwise clearly labeled" defensible.

View File

@@ -2089,8 +2089,15 @@ class GatewayRunner:
session_entry.session_key,
input_tokens=agent_result.get("input_tokens", 0),
output_tokens=agent_result.get("output_tokens", 0),
cache_read_tokens=agent_result.get("cache_read_tokens", 0),
cache_write_tokens=agent_result.get("cache_write_tokens", 0),
last_prompt_tokens=agent_result.get("last_prompt_tokens", 0),
model=agent_result.get("model"),
estimated_cost_usd=agent_result.get("estimated_cost_usd"),
cost_status=agent_result.get("cost_status"),
cost_source=agent_result.get("cost_source"),
provider=agent_result.get("provider"),
base_url=agent_result.get("base_url"),
)
# Auto voice reply: send TTS audio before the text response

View File

@@ -343,7 +343,11 @@ class SessionEntry:
# Token tracking
input_tokens: int = 0
output_tokens: int = 0
cache_read_tokens: int = 0
cache_write_tokens: int = 0
total_tokens: int = 0
estimated_cost_usd: float = 0.0
cost_status: str = "unknown"
# Last API-reported prompt tokens (for accurate compression pre-check)
last_prompt_tokens: int = 0
@@ -363,8 +367,12 @@ class SessionEntry:
"chat_type": self.chat_type,
"input_tokens": self.input_tokens,
"output_tokens": self.output_tokens,
"cache_read_tokens": self.cache_read_tokens,
"cache_write_tokens": self.cache_write_tokens,
"total_tokens": self.total_tokens,
"last_prompt_tokens": self.last_prompt_tokens,
"estimated_cost_usd": self.estimated_cost_usd,
"cost_status": self.cost_status,
}
if self.origin:
result["origin"] = self.origin.to_dict()
@@ -394,8 +402,12 @@ class SessionEntry:
chat_type=data.get("chat_type", "dm"),
input_tokens=data.get("input_tokens", 0),
output_tokens=data.get("output_tokens", 0),
cache_read_tokens=data.get("cache_read_tokens", 0),
cache_write_tokens=data.get("cache_write_tokens", 0),
total_tokens=data.get("total_tokens", 0),
last_prompt_tokens=data.get("last_prompt_tokens", 0),
estimated_cost_usd=data.get("estimated_cost_usd", 0.0),
cost_status=data.get("cost_status", "unknown"),
)
@@ -696,8 +708,15 @@ class SessionStore:
session_key: str,
input_tokens: int = 0,
output_tokens: int = 0,
cache_read_tokens: int = 0,
cache_write_tokens: int = 0,
last_prompt_tokens: int = None,
model: str = None,
estimated_cost_usd: Optional[float] = None,
cost_status: Optional[str] = None,
cost_source: Optional[str] = None,
provider: Optional[str] = None,
base_url: Optional[str] = None,
) -> None:
"""Update a session's metadata after an interaction."""
self._ensure_loaded()
@@ -707,15 +726,35 @@ class SessionStore:
entry.updated_at = datetime.now()
entry.input_tokens += input_tokens
entry.output_tokens += output_tokens
entry.cache_read_tokens += cache_read_tokens
entry.cache_write_tokens += cache_write_tokens
if last_prompt_tokens is not None:
entry.last_prompt_tokens = last_prompt_tokens
entry.total_tokens = entry.input_tokens + entry.output_tokens
if estimated_cost_usd is not None:
entry.estimated_cost_usd += estimated_cost_usd
if cost_status:
entry.cost_status = cost_status
entry.total_tokens = (
entry.input_tokens
+ entry.output_tokens
+ entry.cache_read_tokens
+ entry.cache_write_tokens
)
self._save()
if self._db:
try:
self._db.update_token_counts(
entry.session_id, input_tokens, output_tokens,
entry.session_id,
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
estimated_cost_usd=estimated_cost_usd,
cost_status=cost_status,
cost_source=cost_source,
billing_provider=provider,
billing_base_url=base_url,
model=model,
)
except Exception as e:

View File

@@ -26,7 +26,7 @@ from typing import Dict, Any, List, Optional
DEFAULT_DB_PATH = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "state.db"
SCHEMA_VERSION = 4
SCHEMA_VERSION = 5
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS schema_version (
@@ -48,6 +48,17 @@ CREATE TABLE IF NOT EXISTS sessions (
tool_call_count INTEGER DEFAULT 0,
input_tokens INTEGER DEFAULT 0,
output_tokens INTEGER DEFAULT 0,
cache_read_tokens INTEGER DEFAULT 0,
cache_write_tokens INTEGER DEFAULT 0,
reasoning_tokens INTEGER DEFAULT 0,
billing_provider TEXT,
billing_base_url TEXT,
billing_mode TEXT,
estimated_cost_usd REAL,
actual_cost_usd REAL,
cost_status TEXT,
cost_source TEXT,
pricing_version TEXT,
title TEXT,
FOREIGN KEY (parent_session_id) REFERENCES sessions(id)
);
@@ -154,6 +165,26 @@ class SessionDB:
except sqlite3.OperationalError:
pass # Index already exists
cursor.execute("UPDATE schema_version SET version = 4")
if current_version < 5:
new_columns = [
("cache_read_tokens", "INTEGER DEFAULT 0"),
("cache_write_tokens", "INTEGER DEFAULT 0"),
("reasoning_tokens", "INTEGER DEFAULT 0"),
("billing_provider", "TEXT"),
("billing_base_url", "TEXT"),
("billing_mode", "TEXT"),
("estimated_cost_usd", "REAL"),
("actual_cost_usd", "REAL"),
("cost_status", "TEXT"),
("cost_source", "TEXT"),
("pricing_version", "TEXT"),
]
for name, column_type in new_columns:
try:
cursor.execute(f"ALTER TABLE sessions ADD COLUMN {name} {column_type}")
except sqlite3.OperationalError:
pass
cursor.execute("UPDATE schema_version SET version = 5")
# Unique title index — always ensure it exists (safe to run after migrations
# since the title column is guaranteed to exist at this point)
@@ -233,8 +264,22 @@ class SessionDB:
self._conn.commit()
def update_token_counts(
self, session_id: str, input_tokens: int = 0, output_tokens: int = 0,
self,
session_id: str,
input_tokens: int = 0,
output_tokens: int = 0,
model: str = None,
cache_read_tokens: int = 0,
cache_write_tokens: int = 0,
reasoning_tokens: int = 0,
estimated_cost_usd: Optional[float] = None,
actual_cost_usd: Optional[float] = None,
cost_status: Optional[str] = None,
cost_source: Optional[str] = None,
pricing_version: Optional[str] = None,
billing_provider: Optional[str] = None,
billing_base_url: Optional[str] = None,
billing_mode: Optional[str] = None,
) -> None:
"""Increment token counters and backfill model if not already set."""
with self._lock:
@@ -242,9 +287,40 @@ class SessionDB:
"""UPDATE sessions SET
input_tokens = input_tokens + ?,
output_tokens = output_tokens + ?,
cache_read_tokens = cache_read_tokens + ?,
cache_write_tokens = cache_write_tokens + ?,
reasoning_tokens = reasoning_tokens + ?,
estimated_cost_usd = COALESCE(estimated_cost_usd, 0) + COALESCE(?, 0),
actual_cost_usd = CASE
WHEN ? IS NULL THEN actual_cost_usd
ELSE COALESCE(actual_cost_usd, 0) + ?
END,
cost_status = COALESCE(?, cost_status),
cost_source = COALESCE(?, cost_source),
pricing_version = COALESCE(?, pricing_version),
billing_provider = COALESCE(billing_provider, ?),
billing_base_url = COALESCE(billing_base_url, ?),
billing_mode = COALESCE(billing_mode, ?),
model = COALESCE(model, ?)
WHERE id = ?""",
(input_tokens, output_tokens, model, session_id),
(
input_tokens,
output_tokens,
cache_read_tokens,
cache_write_tokens,
reasoning_tokens,
estimated_cost_usd,
actual_cost_usd,
actual_cost_usd,
cost_status,
cost_source,
pricing_version,
billing_provider,
billing_base_url,
billing_mode,
model,
session_id,
),
)
self._conn.commit()

View File

@@ -86,6 +86,7 @@ from agent.model_metadata import (
from agent.context_compressor import ContextCompressor
from agent.prompt_caching import apply_anthropic_cache_control
from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt
from agent.usage_pricing import estimate_usage_cost, normalize_usage
from agent.display import (
KawaiiSpinner, build_tool_preview as _build_tool_preview,
get_cute_tool_message as _get_cute_tool_message_impl,
@@ -391,6 +392,15 @@ class AIAgent:
else:
self.api_mode = "chat_completions"
# Pre-warm OpenRouter model metadata cache in a background thread.
# fetch_model_metadata() is cached for 1 hour; this avoids a blocking
# HTTP request on the first API response when pricing is estimated.
if self.provider == "openrouter" or "openrouter" in self.base_url.lower():
threading.Thread(
target=lambda: fetch_model_metadata(),
daemon=True,
).start()
self.tool_progress_callback = tool_progress_callback
self.thinking_callback = thinking_callback
self.reasoning_callback = reasoning_callback
@@ -850,6 +860,14 @@ class AIAgent:
self.session_completion_tokens = 0
self.session_total_tokens = 0
self.session_api_calls = 0
self.session_input_tokens = 0
self.session_output_tokens = 0
self.session_cache_read_tokens = 0
self.session_cache_write_tokens = 0
self.session_reasoning_tokens = 0
self.session_estimated_cost_usd = 0.0
self.session_cost_status = "unknown"
self.session_cost_source = "none"
if not self.quiet_mode:
if compression_enabled:
@@ -5272,26 +5290,14 @@ class AIAgent:
# Track actual token usage from response for context management
if hasattr(response, 'usage') and response.usage:
if self.api_mode in ("codex_responses", "anthropic_messages"):
prompt_tokens = getattr(response.usage, 'input_tokens', 0) or 0
if self.api_mode == "anthropic_messages":
# Anthropic splits input into cache_read + cache_creation
# + non-cached input_tokens. Without adding the cached
# portions, the context bar shows only the tiny non-cached
# portion (e.g. 3 tokens) instead of the real total (~18K).
# Other providers (OpenAI/Codex) already include cached
# tokens in their input_tokens/prompt_tokens field.
prompt_tokens += getattr(response.usage, 'cache_read_input_tokens', 0) or 0
prompt_tokens += getattr(response.usage, 'cache_creation_input_tokens', 0) or 0
completion_tokens = getattr(response.usage, 'output_tokens', 0) or 0
total_tokens = (
getattr(response.usage, 'total_tokens', None)
or (prompt_tokens + completion_tokens)
)
else:
prompt_tokens = getattr(response.usage, 'prompt_tokens', 0) or 0
completion_tokens = getattr(response.usage, 'completion_tokens', 0) or 0
total_tokens = getattr(response.usage, 'total_tokens', 0) or 0
canonical_usage = normalize_usage(
response.usage,
provider=self.provider,
api_mode=self.api_mode,
)
prompt_tokens = canonical_usage.prompt_tokens
completion_tokens = canonical_usage.output_tokens
total_tokens = canonical_usage.total_tokens
usage_dict = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
@@ -5310,6 +5316,22 @@ class AIAgent:
self.session_completion_tokens += completion_tokens
self.session_total_tokens += total_tokens
self.session_api_calls += 1
self.session_input_tokens += canonical_usage.input_tokens
self.session_output_tokens += canonical_usage.output_tokens
self.session_cache_read_tokens += canonical_usage.cache_read_tokens
self.session_cache_write_tokens += canonical_usage.cache_write_tokens
self.session_reasoning_tokens += canonical_usage.reasoning_tokens
cost_result = estimate_usage_cost(
self.model,
canonical_usage,
provider=self.provider,
base_url=self.base_url,
)
if cost_result.amount_usd is not None:
self.session_estimated_cost_usd += float(cost_result.amount_usd)
self.session_cost_status = cost_result.status
self.session_cost_source = cost_result.source
# Persist token counts to session DB for /insights.
# Gateway sessions persist via session_store.update_session()
@@ -5320,8 +5342,19 @@ class AIAgent:
try:
self._session_db.update_token_counts(
self.session_id,
input_tokens=prompt_tokens,
output_tokens=completion_tokens,
input_tokens=canonical_usage.input_tokens,
output_tokens=canonical_usage.output_tokens,
cache_read_tokens=canonical_usage.cache_read_tokens,
cache_write_tokens=canonical_usage.cache_write_tokens,
reasoning_tokens=canonical_usage.reasoning_tokens,
estimated_cost_usd=float(cost_result.amount_usd)
if cost_result.amount_usd is not None else None,
cost_status=cost_result.status,
cost_source=cost_result.source,
billing_provider=self.provider,
billing_base_url=self.base_url,
billing_mode="subscription_included"
if cost_result.status == "included" else None,
model=self.model,
)
except Exception:
@@ -6242,6 +6275,21 @@ class AIAgent:
"partial": False, # True only when stopped due to invalid tool calls
"interrupted": interrupted,
"response_previewed": getattr(self, "_response_was_previewed", False),
"model": self.model,
"provider": self.provider,
"base_url": self.base_url,
"input_tokens": self.session_input_tokens,
"output_tokens": self.session_output_tokens,
"cache_read_tokens": self.session_cache_read_tokens,
"cache_write_tokens": self.session_cache_write_tokens,
"reasoning_tokens": self.session_reasoning_tokens,
"prompt_tokens": self.session_prompt_tokens,
"completion_tokens": self.session_completion_tokens,
"total_tokens": self.session_total_tokens,
"last_prompt_tokens": getattr(self.context_compressor, "last_prompt_tokens", 0) or 0,
"estimated_cost_usd": self.session_estimated_cost_usd,
"cost_status": self.session_cost_status,
"cost_source": self.session_cost_source,
}
self._response_was_previewed = False

View File

@@ -0,0 +1,101 @@
from types import SimpleNamespace
from agent.usage_pricing import (
CanonicalUsage,
estimate_usage_cost,
get_pricing_entry,
normalize_usage,
)
def test_normalize_usage_anthropic_keeps_cache_buckets_separate():
usage = SimpleNamespace(
input_tokens=1000,
output_tokens=500,
cache_read_input_tokens=2000,
cache_creation_input_tokens=400,
)
normalized = normalize_usage(usage, provider="anthropic", api_mode="anthropic_messages")
assert normalized.input_tokens == 1000
assert normalized.output_tokens == 500
assert normalized.cache_read_tokens == 2000
assert normalized.cache_write_tokens == 400
assert normalized.prompt_tokens == 3400
def test_normalize_usage_openai_subtracts_cached_prompt_tokens():
usage = SimpleNamespace(
prompt_tokens=3000,
completion_tokens=700,
prompt_tokens_details=SimpleNamespace(cached_tokens=1800),
)
normalized = normalize_usage(usage, provider="openai", api_mode="chat_completions")
assert normalized.input_tokens == 1200
assert normalized.cache_read_tokens == 1800
assert normalized.output_tokens == 700
def test_openrouter_models_api_pricing_is_converted_from_per_token_to_per_million(monkeypatch):
monkeypatch.setattr(
"agent.usage_pricing.fetch_model_metadata",
lambda: {
"anthropic/claude-opus-4.6": {
"pricing": {
"prompt": "0.000005",
"completion": "0.000025",
"input_cache_read": "0.0000005",
"input_cache_write": "0.00000625",
}
}
},
)
entry = get_pricing_entry(
"anthropic/claude-opus-4.6",
provider="openrouter",
base_url="https://openrouter.ai/api/v1",
)
assert float(entry.input_cost_per_million) == 5.0
assert float(entry.output_cost_per_million) == 25.0
assert float(entry.cache_read_cost_per_million) == 0.5
assert float(entry.cache_write_cost_per_million) == 6.25
def test_estimate_usage_cost_marks_subscription_routes_included():
result = estimate_usage_cost(
"gpt-5.3-codex",
CanonicalUsage(input_tokens=1000, output_tokens=500),
provider="openai-codex",
base_url="https://chatgpt.com/backend-api/codex",
)
assert result.status == "included"
assert float(result.amount_usd) == 0.0
def test_estimate_usage_cost_refuses_cache_pricing_without_official_cache_rate(monkeypatch):
monkeypatch.setattr(
"agent.usage_pricing.fetch_model_metadata",
lambda: {
"google/gemini-2.5-pro": {
"pricing": {
"prompt": "0.00000125",
"completion": "0.00001",
}
}
},
)
result = estimate_usage_cost(
"google/gemini-2.5-pro",
CanonicalUsage(input_tokens=1000, output_tokens=500, cache_read_tokens=100),
provider="openrouter",
base_url="https://openrouter.ai/api/v1",
)
assert result.status == "unknown"

View File

@@ -703,5 +703,15 @@ class TestLastPromptTokens:
store.update_session("k1", model="openai/gpt-5.4")
store._db.update_token_counts.assert_called_once_with(
"s1", 0, 0, model="openai/gpt-5.4"
"s1",
input_tokens=0,
output_tokens=0,
cache_read_tokens=0,
cache_write_tokens=0,
estimated_cost_usd=None,
cost_status=None,
cost_source=None,
billing_provider=None,
billing_base_url=None,
model="openai/gpt-5.4",
)

View File

@@ -128,6 +128,13 @@ async def test_handle_message_persists_agent_token_counts(monkeypatch):
session_entry.session_key,
input_tokens=120,
output_tokens=45,
cache_read_tokens=0,
cache_write_tokens=0,
last_prompt_tokens=80,
model="openai/test-model",
estimated_cost_usd=None,
cost_status=None,
cost_source=None,
provider=None,
base_url=None,
)

View File

@@ -16,6 +16,10 @@ def _make_cli(model: str = "anthropic/claude-sonnet-4-20250514"):
def _attach_agent(
cli_obj,
*,
input_tokens: int | None = None,
output_tokens: int | None = None,
cache_read_tokens: int = 0,
cache_write_tokens: int = 0,
prompt_tokens: int,
completion_tokens: int,
total_tokens: int,
@@ -26,6 +30,12 @@ def _attach_agent(
):
cli_obj.agent = SimpleNamespace(
model=cli_obj.model,
provider="anthropic" if cli_obj.model.startswith("anthropic/") else None,
base_url="",
session_input_tokens=input_tokens if input_tokens is not None else prompt_tokens,
session_output_tokens=output_tokens if output_tokens is not None else completion_tokens,
session_cache_read_tokens=cache_read_tokens,
session_cache_write_tokens=cache_write_tokens,
session_prompt_tokens=prompt_tokens,
session_completion_tokens=completion_tokens,
session_total_tokens=total_tokens,
@@ -68,20 +78,19 @@ class TestCLIStatusBar:
assert "$0.06" not in text # cost hidden by default
assert "15m" in text
def test_build_status_bar_text_shows_cost_when_enabled(self):
def test_build_status_bar_text_no_cost_in_status_bar(self):
cli_obj = _attach_agent(
_make_cli(),
prompt_tokens=10000,
completion_tokens=2400,
total_tokens=12400,
completion_tokens=5000,
total_tokens=15000,
api_calls=7,
context_tokens=12400,
context_tokens=50000,
context_length=200_000,
)
cli_obj.show_cost = True
text = cli_obj._build_status_bar_text(width=120)
assert "$" in text # cost is shown when enabled
assert "$" not in text # cost is never shown in status bar
def test_build_status_bar_text_collapses_for_narrow_terminal(self):
cli_obj = _attach_agent(
@@ -128,8 +137,8 @@ class TestCLIUsageReport:
output = capsys.readouterr().out
assert "Model:" in output
assert "Input cost:" in output
assert "Output cost:" in output
assert "Cost status:" in output
assert "Cost source:" in output
assert "Total cost:" in output
assert "$" in output
assert "0.064" in output

View File

@@ -657,7 +657,7 @@ class TestSchemaInit:
def test_schema_version(self, db):
cursor = db._conn.execute("SELECT version FROM schema_version")
version = cursor.fetchone()[0]
assert version == 4
assert version == 5
def test_title_column_exists(self, db):
"""Verify the title column was created in the sessions table."""
@@ -713,12 +713,12 @@ class TestSchemaInit:
conn.commit()
conn.close()
# Open with SessionDB — should migrate to v4
# Open with SessionDB — should migrate to v5
migrated_db = SessionDB(db_path=db_path)
# Verify migration
cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
assert cursor.fetchone()[0] == 4
assert cursor.fetchone()[0] == 5
# Verify title column exists and is NULL for existing sessions
session = migrated_db.get_session("existing")

View File

@@ -123,28 +123,16 @@ def populated_db(db):
# =========================================================================
class TestPricing:
def test_exact_match(self):
pricing = _get_pricing("gpt-4o")
assert pricing["input"] == 2.50
assert pricing["output"] == 10.00
def test_provider_prefix_stripped(self):
pricing = _get_pricing("anthropic/claude-sonnet-4-20250514")
assert pricing["input"] == 3.00
assert pricing["output"] == 15.00
def test_prefix_match(self):
pricing = _get_pricing("claude-3-5-sonnet-20241022")
assert pricing["input"] == 3.00
def test_keyword_heuristic_opus(self):
def test_unknown_models_do_not_use_heuristics(self):
pricing = _get_pricing("some-new-opus-model")
assert pricing["input"] == 15.00
assert pricing["output"] == 75.00
def test_keyword_heuristic_haiku(self):
assert pricing == _DEFAULT_PRICING
pricing = _get_pricing("anthropic/claude-haiku-future")
assert pricing["input"] == 0.80
assert pricing == _DEFAULT_PRICING
def test_unknown_model_returns_zero_cost(self):
"""Unknown/custom models should NOT have fabricated costs."""
@@ -168,40 +156,12 @@ class TestPricing:
pricing = _get_pricing("")
assert pricing == _DEFAULT_PRICING
def test_deepseek_heuristic(self):
pricing = _get_pricing("deepseek-v3")
assert pricing["input"] == 0.14
def test_gemini_heuristic(self):
pricing = _get_pricing("gemini-3.0-ultra")
assert pricing["input"] == 0.15
def test_dated_model_gpt4o_mini(self):
"""gpt-4o-mini-2024-07-18 should match gpt-4o-mini, NOT gpt-4o."""
pricing = _get_pricing("gpt-4o-mini-2024-07-18")
assert pricing["input"] == 0.15 # gpt-4o-mini price, not gpt-4o's 2.50
def test_dated_model_o3_mini(self):
"""o3-mini-2025-01-31 should match o3-mini, NOT o3."""
pricing = _get_pricing("o3-mini-2025-01-31")
assert pricing["input"] == 1.10 # o3-mini price, not o3's 10.00
def test_dated_model_gpt41_mini(self):
"""gpt-4.1-mini-2025-04-14 should match gpt-4.1-mini, NOT gpt-4.1."""
pricing = _get_pricing("gpt-4.1-mini-2025-04-14")
assert pricing["input"] == 0.40 # gpt-4.1-mini, not gpt-4.1's 2.00
def test_dated_model_gpt41_nano(self):
"""gpt-4.1-nano-2025-04-14 should match gpt-4.1-nano, NOT gpt-4.1."""
pricing = _get_pricing("gpt-4.1-nano-2025-04-14")
assert pricing["input"] == 0.10 # gpt-4.1-nano, not gpt-4.1's 2.00
class TestHasKnownPricing:
def test_known_commercial_model(self):
assert _has_known_pricing("gpt-4o") is True
assert _has_known_pricing("gpt-4o", provider="openai") is True
assert _has_known_pricing("anthropic/claude-sonnet-4-20250514") is True
assert _has_known_pricing("deepseek-chat") is True
assert _has_known_pricing("gpt-4.1", provider="openai") is True
def test_unknown_custom_model(self):
assert _has_known_pricing("FP16_Hermes_4.5") is False
@@ -210,26 +170,39 @@ class TestHasKnownPricing:
assert _has_known_pricing("") is False
assert _has_known_pricing(None) is False
def test_heuristic_matched_models(self):
"""Models matched by keyword heuristics should be considered known."""
assert _has_known_pricing("some-opus-model") is True
assert _has_known_pricing("future-sonnet-v2") is True
def test_heuristic_matched_models_are_not_considered_known(self):
assert _has_known_pricing("some-opus-model") is False
assert _has_known_pricing("future-sonnet-v2") is False
class TestEstimateCost:
def test_basic_cost(self):
# gpt-4o: 2.50/M input, 10.00/M output
cost = _estimate_cost("gpt-4o", 1_000_000, 1_000_000)
assert cost == pytest.approx(12.50, abs=0.01)
cost, status = _estimate_cost(
"anthropic/claude-sonnet-4-20250514",
1_000_000,
1_000_000,
provider="anthropic",
)
assert status == "estimated"
assert cost == pytest.approx(18.0, abs=0.01)
def test_zero_tokens(self):
cost = _estimate_cost("gpt-4o", 0, 0)
cost, status = _estimate_cost("gpt-4o", 0, 0, provider="openai")
assert status == "estimated"
assert cost == 0.0
def test_small_usage(self):
cost = _estimate_cost("gpt-4o", 1000, 500)
# 1000 * 2.50/1M + 500 * 10.00/1M = 0.0025 + 0.005 = 0.0075
assert cost == pytest.approx(0.0075, abs=0.0001)
def test_cache_aware_usage(self):
cost, status = _estimate_cost(
"anthropic/claude-sonnet-4-20250514",
1000,
500,
cache_read_tokens=2000,
cache_write_tokens=400,
provider="anthropic",
)
assert status == "estimated"
expected = (1000 * 3.0 + 500 * 15.0 + 2000 * 0.30 + 400 * 3.75) / 1_000_000
assert cost == pytest.approx(expected, abs=0.0001)
# =========================================================================
@@ -660,8 +633,13 @@ class TestEdgeCases:
def test_mixed_commercial_and_custom_models(self, db):
"""Mix of commercial and custom models: only commercial ones get costs."""
db.create_session(session_id="s1", source="cli", model="gpt-4o")
db.update_token_counts("s1", input_tokens=10000, output_tokens=5000)
db.create_session(session_id="s1", source="cli", model="anthropic/claude-sonnet-4-20250514")
db.update_token_counts(
"s1",
input_tokens=10000,
output_tokens=5000,
billing_provider="anthropic",
)
db.create_session(session_id="s2", source="cli", model="my-local-llama")
db.update_token_counts("s2", input_tokens=10000, output_tokens=5000)
db._conn.commit()
@@ -672,13 +650,13 @@ class TestEdgeCases:
# Cost should only come from gpt-4o, not from the custom model
overview = report["overview"]
assert overview["estimated_cost"] > 0
assert "gpt-4o" in overview["models_with_pricing"] # list now, not set
assert "claude-sonnet-4-20250514" in overview["models_with_pricing"] # list now, not set
assert "my-local-llama" in overview["models_without_pricing"]
# Verify individual model entries
gpt = next(m for m in report["models"] if m["model"] == "gpt-4o")
assert gpt["has_pricing"] is True
assert gpt["cost"] > 0
claude = next(m for m in report["models"] if m["model"] == "claude-sonnet-4-20250514")
assert claude["has_pricing"] is True
assert claude["cost"] > 0
llama = next(m for m in report["models"] if m["model"] == "my-local-llama")
assert llama["has_pricing"] is False