merge: resolve conflicts with origin/main
This commit is contained in:
@@ -963,8 +963,12 @@ def convert_messages_to_anthropic(
|
||||
elif isinstance(prev_blocks, str) and isinstance(curr_blocks, str):
|
||||
fixed[-1]["content"] = prev_blocks + "\n" + curr_blocks
|
||||
else:
|
||||
# Keep the later message
|
||||
fixed[-1] = m
|
||||
# Mixed types — normalize both to list and merge
|
||||
if isinstance(prev_blocks, str):
|
||||
prev_blocks = [{"type": "text", "text": prev_blocks}]
|
||||
if isinstance(curr_blocks, str):
|
||||
curr_blocks = [{"type": "text", "text": curr_blocks}]
|
||||
fixed[-1]["content"] = prev_blocks + curr_blocks
|
||||
else:
|
||||
fixed.append(m)
|
||||
result = fixed
|
||||
@@ -1049,7 +1053,8 @@ def build_anthropic_kwargs(
|
||||
elif tool_choice == "required":
|
||||
kwargs["tool_choice"] = {"type": "any"}
|
||||
elif tool_choice == "none":
|
||||
pass # Don't send tool_choice — Anthropic will use tools if needed
|
||||
# Anthropic has no tool_choice "none" — omit tools entirely to prevent use
|
||||
kwargs.pop("tools", None)
|
||||
elif isinstance(tool_choice, str):
|
||||
# Specific tool name
|
||||
kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}
|
||||
|
||||
@@ -39,6 +39,7 @@ custom OpenAI-compatible endpoint without touching the main model settings.
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
@@ -705,6 +706,8 @@ def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[st
|
||||
|
||||
def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
"""Full auto-detection chain: OpenRouter → Nous → custom → Codex → API-key → None."""
|
||||
global auxiliary_is_nous
|
||||
auxiliary_is_nous = False # Reset — _try_nous() will set True if it wins
|
||||
for try_fn in (_try_openrouter, _try_nous, _try_custom_endpoint,
|
||||
_try_codex, _resolve_api_key_provider):
|
||||
client, model = try_fn()
|
||||
@@ -1171,6 +1174,7 @@ def auxiliary_max_tokens_param(value: int) -> dict:
|
||||
|
||||
# Client cache: (provider, async_mode, base_url, api_key) -> (client, default_model)
|
||||
_client_cache: Dict[tuple, tuple] = {}
|
||||
_client_cache_lock = threading.Lock()
|
||||
|
||||
|
||||
def _get_cached_client(
|
||||
@@ -1182,9 +1186,11 @@ def _get_cached_client(
|
||||
) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Get or create a cached client for the given provider."""
|
||||
cache_key = (provider, async_mode, base_url or "", api_key or "")
|
||||
if cache_key in _client_cache:
|
||||
cached_client, cached_default = _client_cache[cache_key]
|
||||
return cached_client, model or cached_default
|
||||
with _client_cache_lock:
|
||||
if cache_key in _client_cache:
|
||||
cached_client, cached_default = _client_cache[cache_key]
|
||||
return cached_client, model or cached_default
|
||||
# Build outside the lock
|
||||
client, default_model = resolve_provider_client(
|
||||
provider,
|
||||
model,
|
||||
@@ -1193,7 +1199,11 @@ def _get_cached_client(
|
||||
explicit_api_key=api_key,
|
||||
)
|
||||
if client is not None:
|
||||
_client_cache[cache_key] = (client, default_model)
|
||||
with _client_cache_lock:
|
||||
if cache_key not in _client_cache:
|
||||
_client_cache[cache_key] = (client, default_model)
|
||||
else:
|
||||
client, default_model = _client_cache[cache_key]
|
||||
return client, model or default_model
|
||||
|
||||
|
||||
|
||||
@@ -313,7 +313,19 @@ Write only the summary body. Do not include any preamble or prefix; the system w
|
||||
|
||||
if summary:
|
||||
last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
|
||||
summary_role = "user" if last_head_role in ("assistant", "tool") else "assistant"
|
||||
first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user"
|
||||
# Pick a role that avoids consecutive same-role with both neighbors.
|
||||
# Priority: avoid colliding with head (already committed), then tail.
|
||||
if last_head_role in ("assistant", "tool"):
|
||||
summary_role = "user"
|
||||
else:
|
||||
summary_role = "assistant"
|
||||
# If the chosen role collides with the tail AND flipping wouldn't
|
||||
# collide with the head, flip it.
|
||||
if summary_role == first_tail_role:
|
||||
flipped = "assistant" if summary_role == "user" else "user"
|
||||
if flipped != last_head_role:
|
||||
summary_role = flipped
|
||||
compressed.append({"role": summary_role, "content": summary})
|
||||
else:
|
||||
if not self.quiet_mode:
|
||||
|
||||
@@ -22,14 +22,21 @@ from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from agent.usage_pricing import DEFAULT_PRICING, estimate_cost_usd, format_duration_compact, get_pricing, has_known_pricing
|
||||
from agent.usage_pricing import (
|
||||
CanonicalUsage,
|
||||
DEFAULT_PRICING,
|
||||
estimate_usage_cost,
|
||||
format_duration_compact,
|
||||
get_pricing,
|
||||
has_known_pricing,
|
||||
)
|
||||
|
||||
_DEFAULT_PRICING = DEFAULT_PRICING
|
||||
|
||||
|
||||
def _has_known_pricing(model_name: str) -> bool:
|
||||
def _has_known_pricing(model_name: str, provider: str = None, base_url: str = None) -> bool:
|
||||
"""Check if a model has known pricing (vs unknown/custom endpoint)."""
|
||||
return has_known_pricing(model_name)
|
||||
return has_known_pricing(model_name, provider=provider, base_url=base_url)
|
||||
|
||||
|
||||
def _get_pricing(model_name: str) -> Dict[str, float]:
|
||||
@@ -41,9 +48,43 @@ def _get_pricing(model_name: str) -> Dict[str, float]:
|
||||
return get_pricing(model_name)
|
||||
|
||||
|
||||
def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
|
||||
"""Estimate the USD cost for a given model and token counts."""
|
||||
return estimate_cost_usd(model, input_tokens, output_tokens)
|
||||
def _estimate_cost(
|
||||
session_or_model: Dict[str, Any] | str,
|
||||
input_tokens: int = 0,
|
||||
output_tokens: int = 0,
|
||||
*,
|
||||
cache_read_tokens: int = 0,
|
||||
cache_write_tokens: int = 0,
|
||||
provider: str = None,
|
||||
base_url: str = None,
|
||||
) -> tuple[float, str]:
|
||||
"""Estimate the USD cost for a session row or a model/token tuple."""
|
||||
if isinstance(session_or_model, dict):
|
||||
session = session_or_model
|
||||
model = session.get("model") or ""
|
||||
usage = CanonicalUsage(
|
||||
input_tokens=session.get("input_tokens") or 0,
|
||||
output_tokens=session.get("output_tokens") or 0,
|
||||
cache_read_tokens=session.get("cache_read_tokens") or 0,
|
||||
cache_write_tokens=session.get("cache_write_tokens") or 0,
|
||||
)
|
||||
provider = session.get("billing_provider")
|
||||
base_url = session.get("billing_base_url")
|
||||
else:
|
||||
model = session_or_model or ""
|
||||
usage = CanonicalUsage(
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cache_read_tokens=cache_read_tokens,
|
||||
cache_write_tokens=cache_write_tokens,
|
||||
)
|
||||
result = estimate_usage_cost(
|
||||
model,
|
||||
usage,
|
||||
provider=provider,
|
||||
base_url=base_url,
|
||||
)
|
||||
return float(result.amount_usd or 0.0), result.status
|
||||
|
||||
|
||||
def _format_duration(seconds: float) -> str:
|
||||
@@ -135,7 +176,10 @@ class InsightsEngine:
|
||||
|
||||
# Columns we actually need (skip system_prompt, model_config blobs)
|
||||
_SESSION_COLS = ("id, source, model, started_at, ended_at, "
|
||||
"message_count, tool_call_count, input_tokens, output_tokens")
|
||||
"message_count, tool_call_count, input_tokens, output_tokens, "
|
||||
"cache_read_tokens, cache_write_tokens, billing_provider, "
|
||||
"billing_base_url, billing_mode, estimated_cost_usd, "
|
||||
"actual_cost_usd, cost_status, cost_source")
|
||||
|
||||
def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]:
|
||||
"""Fetch sessions within the time window."""
|
||||
@@ -287,21 +331,30 @@ class InsightsEngine:
|
||||
"""Compute high-level overview statistics."""
|
||||
total_input = sum(s.get("input_tokens") or 0 for s in sessions)
|
||||
total_output = sum(s.get("output_tokens") or 0 for s in sessions)
|
||||
total_tokens = total_input + total_output
|
||||
total_cache_read = sum(s.get("cache_read_tokens") or 0 for s in sessions)
|
||||
total_cache_write = sum(s.get("cache_write_tokens") or 0 for s in sessions)
|
||||
total_tokens = total_input + total_output + total_cache_read + total_cache_write
|
||||
total_tool_calls = sum(s.get("tool_call_count") or 0 for s in sessions)
|
||||
total_messages = sum(s.get("message_count") or 0 for s in sessions)
|
||||
|
||||
# Cost estimation (weighted by model)
|
||||
total_cost = 0.0
|
||||
actual_cost = 0.0
|
||||
models_with_pricing = set()
|
||||
models_without_pricing = set()
|
||||
unknown_cost_sessions = 0
|
||||
included_cost_sessions = 0
|
||||
for s in sessions:
|
||||
model = s.get("model") or ""
|
||||
inp = s.get("input_tokens") or 0
|
||||
out = s.get("output_tokens") or 0
|
||||
total_cost += _estimate_cost(model, inp, out)
|
||||
estimated, status = _estimate_cost(s)
|
||||
total_cost += estimated
|
||||
actual_cost += s.get("actual_cost_usd") or 0.0
|
||||
display = model.split("/")[-1] if "/" in model else (model or "unknown")
|
||||
if _has_known_pricing(model):
|
||||
if status == "included":
|
||||
included_cost_sessions += 1
|
||||
elif status == "unknown":
|
||||
unknown_cost_sessions += 1
|
||||
if _has_known_pricing(model, s.get("billing_provider"), s.get("billing_base_url")):
|
||||
models_with_pricing.add(display)
|
||||
else:
|
||||
models_without_pricing.add(display)
|
||||
@@ -328,8 +381,11 @@ class InsightsEngine:
|
||||
"total_tool_calls": total_tool_calls,
|
||||
"total_input_tokens": total_input,
|
||||
"total_output_tokens": total_output,
|
||||
"total_cache_read_tokens": total_cache_read,
|
||||
"total_cache_write_tokens": total_cache_write,
|
||||
"total_tokens": total_tokens,
|
||||
"estimated_cost": total_cost,
|
||||
"actual_cost": actual_cost,
|
||||
"total_hours": total_hours,
|
||||
"avg_session_duration": avg_duration,
|
||||
"avg_messages_per_session": total_messages / len(sessions) if sessions else 0,
|
||||
@@ -341,12 +397,15 @@ class InsightsEngine:
|
||||
"date_range_end": date_range_end,
|
||||
"models_with_pricing": sorted(models_with_pricing),
|
||||
"models_without_pricing": sorted(models_without_pricing),
|
||||
"unknown_cost_sessions": unknown_cost_sessions,
|
||||
"included_cost_sessions": included_cost_sessions,
|
||||
}
|
||||
|
||||
def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]:
|
||||
"""Break down usage by model."""
|
||||
model_data = defaultdict(lambda: {
|
||||
"sessions": 0, "input_tokens": 0, "output_tokens": 0,
|
||||
"cache_read_tokens": 0, "cache_write_tokens": 0,
|
||||
"total_tokens": 0, "tool_calls": 0, "cost": 0.0,
|
||||
})
|
||||
|
||||
@@ -358,12 +417,18 @@ class InsightsEngine:
|
||||
d["sessions"] += 1
|
||||
inp = s.get("input_tokens") or 0
|
||||
out = s.get("output_tokens") or 0
|
||||
cache_read = s.get("cache_read_tokens") or 0
|
||||
cache_write = s.get("cache_write_tokens") or 0
|
||||
d["input_tokens"] += inp
|
||||
d["output_tokens"] += out
|
||||
d["total_tokens"] += inp + out
|
||||
d["cache_read_tokens"] += cache_read
|
||||
d["cache_write_tokens"] += cache_write
|
||||
d["total_tokens"] += inp + out + cache_read + cache_write
|
||||
d["tool_calls"] += s.get("tool_call_count") or 0
|
||||
d["cost"] += _estimate_cost(model, inp, out)
|
||||
d["has_pricing"] = _has_known_pricing(model)
|
||||
estimate, status = _estimate_cost(s)
|
||||
d["cost"] += estimate
|
||||
d["has_pricing"] = _has_known_pricing(model, s.get("billing_provider"), s.get("billing_base_url"))
|
||||
d["cost_status"] = status
|
||||
|
||||
result = [
|
||||
{"model": model, **data}
|
||||
@@ -377,7 +442,8 @@ class InsightsEngine:
|
||||
"""Break down usage by platform/source."""
|
||||
platform_data = defaultdict(lambda: {
|
||||
"sessions": 0, "messages": 0, "input_tokens": 0,
|
||||
"output_tokens": 0, "total_tokens": 0, "tool_calls": 0,
|
||||
"output_tokens": 0, "cache_read_tokens": 0,
|
||||
"cache_write_tokens": 0, "total_tokens": 0, "tool_calls": 0,
|
||||
})
|
||||
|
||||
for s in sessions:
|
||||
@@ -387,9 +453,13 @@ class InsightsEngine:
|
||||
d["messages"] += s.get("message_count") or 0
|
||||
inp = s.get("input_tokens") or 0
|
||||
out = s.get("output_tokens") or 0
|
||||
cache_read = s.get("cache_read_tokens") or 0
|
||||
cache_write = s.get("cache_write_tokens") or 0
|
||||
d["input_tokens"] += inp
|
||||
d["output_tokens"] += out
|
||||
d["total_tokens"] += inp + out
|
||||
d["cache_read_tokens"] += cache_read
|
||||
d["cache_write_tokens"] += cache_write
|
||||
d["total_tokens"] += inp + out + cache_read + cache_write
|
||||
d["tool_calls"] += s.get("tool_call_count") or 0
|
||||
|
||||
result = [
|
||||
|
||||
@@ -266,8 +266,10 @@ def get_model_context_length(model: str, base_url: str = "") -> int:
|
||||
if model in metadata:
|
||||
return metadata[model].get("context_length", 128000)
|
||||
|
||||
# 3. Hardcoded defaults (fuzzy match)
|
||||
for default_model, length in DEFAULT_CONTEXT_LENGTHS.items():
|
||||
# 3. Hardcoded defaults (fuzzy match — longest key first for specificity)
|
||||
for default_model, length in sorted(
|
||||
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
|
||||
):
|
||||
if default_model in model or model in default_model:
|
||||
return length
|
||||
|
||||
|
||||
@@ -212,16 +212,15 @@ PLATFORM_HINTS = {
|
||||
"the scheduled destination, put it directly in your final response. Use "
|
||||
"send_message only for additional or different targets."
|
||||
),
|
||||
"sms": (
|
||||
"You are communicating via SMS text messaging. Keep responses concise "
|
||||
"and plain text only -- no markdown, no formatting. SMS has a 1600 "
|
||||
"character limit per message (10 segments). Longer replies are split "
|
||||
"across multiple messages. Be brief and direct."
|
||||
),
|
||||
"cli": (
|
||||
"You are a CLI AI Agent. Try not to use markdown but simple text "
|
||||
"renderable inside a terminal."
|
||||
),
|
||||
"sms": (
|
||||
"You are communicating via SMS. Keep responses concise and use plain text "
|
||||
"only — no markdown, no formatting. SMS messages are limited to ~1600 "
|
||||
"characters, so be brief and direct."
|
||||
),
|
||||
}
|
||||
|
||||
CONTEXT_FILE_MAX_CHARS = 20_000
|
||||
|
||||
@@ -1,101 +1,593 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import Dict
|
||||
from typing import Any, Dict, Literal, Optional
|
||||
|
||||
|
||||
MODEL_PRICING = {
|
||||
"gpt-4o": {"input": 2.50, "output": 10.00},
|
||||
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
|
||||
"gpt-4.1": {"input": 2.00, "output": 8.00},
|
||||
"gpt-4.1-mini": {"input": 0.40, "output": 1.60},
|
||||
"gpt-4.1-nano": {"input": 0.10, "output": 0.40},
|
||||
"gpt-4.5-preview": {"input": 75.00, "output": 150.00},
|
||||
"gpt-5": {"input": 10.00, "output": 30.00},
|
||||
"gpt-5.4": {"input": 10.00, "output": 30.00},
|
||||
"o3": {"input": 10.00, "output": 40.00},
|
||||
"o3-mini": {"input": 1.10, "output": 4.40},
|
||||
"o4-mini": {"input": 1.10, "output": 4.40},
|
||||
"claude-opus-4-20250514": {"input": 15.00, "output": 75.00},
|
||||
"claude-sonnet-4-20250514": {"input": 3.00, "output": 15.00},
|
||||
"claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
|
||||
"claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00},
|
||||
"claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
|
||||
"claude-3-haiku-20240307": {"input": 0.25, "output": 1.25},
|
||||
"deepseek-chat": {"input": 0.14, "output": 0.28},
|
||||
"deepseek-reasoner": {"input": 0.55, "output": 2.19},
|
||||
"gemini-2.5-pro": {"input": 1.25, "output": 10.00},
|
||||
"gemini-2.5-flash": {"input": 0.15, "output": 0.60},
|
||||
"gemini-2.0-flash": {"input": 0.10, "output": 0.40},
|
||||
"llama-4-maverick": {"input": 0.50, "output": 0.70},
|
||||
"llama-4-scout": {"input": 0.20, "output": 0.30},
|
||||
"glm-5": {"input": 0.0, "output": 0.0},
|
||||
"glm-4.7": {"input": 0.0, "output": 0.0},
|
||||
"glm-4.5": {"input": 0.0, "output": 0.0},
|
||||
"glm-4.5-flash": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2.5": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2-thinking": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2-turbo-preview": {"input": 0.0, "output": 0.0},
|
||||
"kimi-k2-0905-preview": {"input": 0.0, "output": 0.0},
|
||||
"MiniMax-M2.5": {"input": 0.0, "output": 0.0},
|
||||
"MiniMax-M2.5-highspeed": {"input": 0.0, "output": 0.0},
|
||||
"MiniMax-M2.1": {"input": 0.0, "output": 0.0},
|
||||
}
|
||||
from agent.model_metadata import fetch_model_metadata
|
||||
|
||||
DEFAULT_PRICING = {"input": 0.0, "output": 0.0}
|
||||
|
||||
_ZERO = Decimal("0")
|
||||
_ONE_MILLION = Decimal("1000000")
|
||||
|
||||
def get_pricing(model_name: str) -> Dict[str, float]:
|
||||
if not model_name:
|
||||
return DEFAULT_PRICING
|
||||
|
||||
bare = model_name.split("/")[-1].lower()
|
||||
if bare in MODEL_PRICING:
|
||||
return MODEL_PRICING[bare]
|
||||
|
||||
best_match = None
|
||||
best_len = 0
|
||||
for key, price in MODEL_PRICING.items():
|
||||
if bare.startswith(key) and len(key) > best_len:
|
||||
best_match = price
|
||||
best_len = len(key)
|
||||
if best_match:
|
||||
return best_match
|
||||
|
||||
if "opus" in bare:
|
||||
return {"input": 15.00, "output": 75.00}
|
||||
if "sonnet" in bare:
|
||||
return {"input": 3.00, "output": 15.00}
|
||||
if "haiku" in bare:
|
||||
return {"input": 0.80, "output": 4.00}
|
||||
if "gpt-4o-mini" in bare:
|
||||
return {"input": 0.15, "output": 0.60}
|
||||
if "gpt-4o" in bare:
|
||||
return {"input": 2.50, "output": 10.00}
|
||||
if "gpt-5" in bare:
|
||||
return {"input": 10.00, "output": 30.00}
|
||||
if "deepseek" in bare:
|
||||
return {"input": 0.14, "output": 0.28}
|
||||
if "gemini" in bare:
|
||||
return {"input": 0.15, "output": 0.60}
|
||||
|
||||
return DEFAULT_PRICING
|
||||
CostStatus = Literal["actual", "estimated", "included", "unknown"]
|
||||
CostSource = Literal[
|
||||
"provider_cost_api",
|
||||
"provider_generation_api",
|
||||
"provider_models_api",
|
||||
"official_docs_snapshot",
|
||||
"user_override",
|
||||
"custom_contract",
|
||||
"none",
|
||||
]
|
||||
|
||||
|
||||
def has_known_pricing(model_name: str) -> bool:
|
||||
pricing = get_pricing(model_name)
|
||||
return pricing is not DEFAULT_PRICING and any(
|
||||
float(value) > 0 for value in pricing.values()
|
||||
@dataclass(frozen=True)
|
||||
class CanonicalUsage:
|
||||
input_tokens: int = 0
|
||||
output_tokens: int = 0
|
||||
cache_read_tokens: int = 0
|
||||
cache_write_tokens: int = 0
|
||||
reasoning_tokens: int = 0
|
||||
request_count: int = 1
|
||||
raw_usage: Optional[dict[str, Any]] = None
|
||||
|
||||
@property
|
||||
def prompt_tokens(self) -> int:
|
||||
return self.input_tokens + self.cache_read_tokens + self.cache_write_tokens
|
||||
|
||||
@property
|
||||
def total_tokens(self) -> int:
|
||||
return self.prompt_tokens + self.output_tokens
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BillingRoute:
|
||||
provider: str
|
||||
model: str
|
||||
base_url: str = ""
|
||||
billing_mode: str = "unknown"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PricingEntry:
|
||||
input_cost_per_million: Optional[Decimal] = None
|
||||
output_cost_per_million: Optional[Decimal] = None
|
||||
cache_read_cost_per_million: Optional[Decimal] = None
|
||||
cache_write_cost_per_million: Optional[Decimal] = None
|
||||
request_cost: Optional[Decimal] = None
|
||||
source: CostSource = "none"
|
||||
source_url: Optional[str] = None
|
||||
pricing_version: Optional[str] = None
|
||||
fetched_at: Optional[datetime] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CostResult:
|
||||
amount_usd: Optional[Decimal]
|
||||
status: CostStatus
|
||||
source: CostSource
|
||||
label: str
|
||||
fetched_at: Optional[datetime] = None
|
||||
pricing_version: Optional[str] = None
|
||||
notes: tuple[str, ...] = ()
|
||||
|
||||
|
||||
_UTC_NOW = lambda: datetime.now(timezone.utc)
|
||||
|
||||
|
||||
# Official docs snapshot entries. Models whose published pricing and cache
|
||||
# semantics are stable enough to encode exactly.
|
||||
_OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
|
||||
(
|
||||
"anthropic",
|
||||
"claude-opus-4-20250514",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("15.00"),
|
||||
output_cost_per_million=Decimal("75.00"),
|
||||
cache_read_cost_per_million=Decimal("1.50"),
|
||||
cache_write_cost_per_million=Decimal("18.75"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
|
||||
pricing_version="anthropic-prompt-caching-2026-03-16",
|
||||
),
|
||||
(
|
||||
"anthropic",
|
||||
"claude-sonnet-4-20250514",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("3.00"),
|
||||
output_cost_per_million=Decimal("15.00"),
|
||||
cache_read_cost_per_million=Decimal("0.30"),
|
||||
cache_write_cost_per_million=Decimal("3.75"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
|
||||
pricing_version="anthropic-prompt-caching-2026-03-16",
|
||||
),
|
||||
# OpenAI
|
||||
(
|
||||
"openai",
|
||||
"gpt-4o",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("2.50"),
|
||||
output_cost_per_million=Decimal("10.00"),
|
||||
cache_read_cost_per_million=Decimal("1.25"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"openai",
|
||||
"gpt-4o-mini",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.15"),
|
||||
output_cost_per_million=Decimal("0.60"),
|
||||
cache_read_cost_per_million=Decimal("0.075"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"openai",
|
||||
"gpt-4.1",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("2.00"),
|
||||
output_cost_per_million=Decimal("8.00"),
|
||||
cache_read_cost_per_million=Decimal("0.50"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"openai",
|
||||
"gpt-4.1-mini",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.40"),
|
||||
output_cost_per_million=Decimal("1.60"),
|
||||
cache_read_cost_per_million=Decimal("0.10"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"openai",
|
||||
"gpt-4.1-nano",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.10"),
|
||||
output_cost_per_million=Decimal("0.40"),
|
||||
cache_read_cost_per_million=Decimal("0.025"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"openai",
|
||||
"o3",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("10.00"),
|
||||
output_cost_per_million=Decimal("40.00"),
|
||||
cache_read_cost_per_million=Decimal("2.50"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"openai",
|
||||
"o3-mini",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("1.10"),
|
||||
output_cost_per_million=Decimal("4.40"),
|
||||
cache_read_cost_per_million=Decimal("0.55"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://openai.com/api/pricing/",
|
||||
pricing_version="openai-pricing-2026-03-16",
|
||||
),
|
||||
# Anthropic older models (pre-4.6 generation)
|
||||
(
|
||||
"anthropic",
|
||||
"claude-3-5-sonnet-20241022",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("3.00"),
|
||||
output_cost_per_million=Decimal("15.00"),
|
||||
cache_read_cost_per_million=Decimal("0.30"),
|
||||
cache_write_cost_per_million=Decimal("3.75"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
|
||||
pricing_version="anthropic-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"anthropic",
|
||||
"claude-3-5-haiku-20241022",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.80"),
|
||||
output_cost_per_million=Decimal("4.00"),
|
||||
cache_read_cost_per_million=Decimal("0.08"),
|
||||
cache_write_cost_per_million=Decimal("1.00"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
|
||||
pricing_version="anthropic-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"anthropic",
|
||||
"claude-3-opus-20240229",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("15.00"),
|
||||
output_cost_per_million=Decimal("75.00"),
|
||||
cache_read_cost_per_million=Decimal("1.50"),
|
||||
cache_write_cost_per_million=Decimal("18.75"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
|
||||
pricing_version="anthropic-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"anthropic",
|
||||
"claude-3-haiku-20240307",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.25"),
|
||||
output_cost_per_million=Decimal("1.25"),
|
||||
cache_read_cost_per_million=Decimal("0.03"),
|
||||
cache_write_cost_per_million=Decimal("0.30"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
|
||||
pricing_version="anthropic-pricing-2026-03-16",
|
||||
),
|
||||
# DeepSeek
|
||||
(
|
||||
"deepseek",
|
||||
"deepseek-chat",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.14"),
|
||||
output_cost_per_million=Decimal("0.28"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://api-docs.deepseek.com/quick_start/pricing",
|
||||
pricing_version="deepseek-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"deepseek",
|
||||
"deepseek-reasoner",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.55"),
|
||||
output_cost_per_million=Decimal("2.19"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://api-docs.deepseek.com/quick_start/pricing",
|
||||
pricing_version="deepseek-pricing-2026-03-16",
|
||||
),
|
||||
# Google Gemini
|
||||
(
|
||||
"google",
|
||||
"gemini-2.5-pro",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("1.25"),
|
||||
output_cost_per_million=Decimal("10.00"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://ai.google.dev/pricing",
|
||||
pricing_version="google-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"google",
|
||||
"gemini-2.5-flash",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.15"),
|
||||
output_cost_per_million=Decimal("0.60"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://ai.google.dev/pricing",
|
||||
pricing_version="google-pricing-2026-03-16",
|
||||
),
|
||||
(
|
||||
"google",
|
||||
"gemini-2.0-flash",
|
||||
): PricingEntry(
|
||||
input_cost_per_million=Decimal("0.10"),
|
||||
output_cost_per_million=Decimal("0.40"),
|
||||
source="official_docs_snapshot",
|
||||
source_url="https://ai.google.dev/pricing",
|
||||
pricing_version="google-pricing-2026-03-16",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _to_decimal(value: Any) -> Optional[Decimal]:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
return Decimal(str(value))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _to_int(value: Any) -> int:
|
||||
try:
|
||||
return int(value or 0)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def resolve_billing_route(
|
||||
model_name: str,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
) -> BillingRoute:
|
||||
provider_name = (provider or "").strip().lower()
|
||||
base = (base_url or "").strip().lower()
|
||||
model = (model_name or "").strip()
|
||||
if not provider_name and "/" in model:
|
||||
inferred_provider, bare_model = model.split("/", 1)
|
||||
if inferred_provider in {"anthropic", "openai", "google"}:
|
||||
provider_name = inferred_provider
|
||||
model = bare_model
|
||||
|
||||
if provider_name == "openai-codex":
|
||||
return BillingRoute(provider="openai-codex", model=model, base_url=base_url or "", billing_mode="subscription_included")
|
||||
if provider_name == "openrouter" or "openrouter.ai" in base:
|
||||
return BillingRoute(provider="openrouter", model=model, base_url=base_url or "", billing_mode="official_models_api")
|
||||
if provider_name == "anthropic":
|
||||
return BillingRoute(provider="anthropic", model=model.split("/")[-1], base_url=base_url or "", billing_mode="official_docs_snapshot")
|
||||
if provider_name == "openai":
|
||||
return BillingRoute(provider="openai", model=model.split("/")[-1], base_url=base_url or "", billing_mode="official_docs_snapshot")
|
||||
if provider_name in {"custom", "local"} or (base and "localhost" in base):
|
||||
return BillingRoute(provider=provider_name or "custom", model=model, base_url=base_url or "", billing_mode="unknown")
|
||||
return BillingRoute(provider=provider_name or "unknown", model=model.split("/")[-1] if model else "", base_url=base_url or "", billing_mode="unknown")
|
||||
|
||||
|
||||
def _lookup_official_docs_pricing(route: BillingRoute) -> Optional[PricingEntry]:
|
||||
return _OFFICIAL_DOCS_PRICING.get((route.provider, route.model.lower()))
|
||||
|
||||
|
||||
def _openrouter_pricing_entry(route: BillingRoute) -> Optional[PricingEntry]:
|
||||
metadata = fetch_model_metadata()
|
||||
model_id = route.model
|
||||
if model_id not in metadata:
|
||||
return None
|
||||
pricing = metadata[model_id].get("pricing") or {}
|
||||
prompt = _to_decimal(pricing.get("prompt"))
|
||||
completion = _to_decimal(pricing.get("completion"))
|
||||
request = _to_decimal(pricing.get("request"))
|
||||
cache_read = _to_decimal(
|
||||
pricing.get("cache_read")
|
||||
or pricing.get("cached_prompt")
|
||||
or pricing.get("input_cache_read")
|
||||
)
|
||||
cache_write = _to_decimal(
|
||||
pricing.get("cache_write")
|
||||
or pricing.get("cache_creation")
|
||||
or pricing.get("input_cache_write")
|
||||
)
|
||||
if prompt is None and completion is None and request is None:
|
||||
return None
|
||||
def _per_token_to_per_million(value: Optional[Decimal]) -> Optional[Decimal]:
|
||||
if value is None:
|
||||
return None
|
||||
return value * _ONE_MILLION
|
||||
|
||||
return PricingEntry(
|
||||
input_cost_per_million=_per_token_to_per_million(prompt),
|
||||
output_cost_per_million=_per_token_to_per_million(completion),
|
||||
cache_read_cost_per_million=_per_token_to_per_million(cache_read),
|
||||
cache_write_cost_per_million=_per_token_to_per_million(cache_write),
|
||||
request_cost=request,
|
||||
source="provider_models_api",
|
||||
source_url="https://openrouter.ai/docs/api/api-reference/models/get-models",
|
||||
pricing_version="openrouter-models-api",
|
||||
fetched_at=_UTC_NOW(),
|
||||
)
|
||||
|
||||
|
||||
def estimate_cost_usd(model: str, input_tokens: int, output_tokens: int) -> float:
|
||||
pricing = get_pricing(model)
|
||||
total = (
|
||||
Decimal(input_tokens) * Decimal(str(pricing["input"]))
|
||||
+ Decimal(output_tokens) * Decimal(str(pricing["output"]))
|
||||
) / Decimal("1000000")
|
||||
return float(total)
|
||||
def get_pricing_entry(
|
||||
model_name: str,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
) -> Optional[PricingEntry]:
|
||||
route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
|
||||
if route.billing_mode == "subscription_included":
|
||||
return PricingEntry(
|
||||
input_cost_per_million=_ZERO,
|
||||
output_cost_per_million=_ZERO,
|
||||
cache_read_cost_per_million=_ZERO,
|
||||
cache_write_cost_per_million=_ZERO,
|
||||
source="none",
|
||||
pricing_version="included-route",
|
||||
)
|
||||
if route.provider == "openrouter":
|
||||
return _openrouter_pricing_entry(route)
|
||||
return _lookup_official_docs_pricing(route)
|
||||
|
||||
|
||||
def normalize_usage(
|
||||
response_usage: Any,
|
||||
*,
|
||||
provider: Optional[str] = None,
|
||||
api_mode: Optional[str] = None,
|
||||
) -> CanonicalUsage:
|
||||
"""Normalize raw API response usage into canonical token buckets.
|
||||
|
||||
Handles three API shapes:
|
||||
- Anthropic: input_tokens/output_tokens/cache_read_input_tokens/cache_creation_input_tokens
|
||||
- Codex Responses: input_tokens includes cache tokens; input_tokens_details.cached_tokens separates them
|
||||
- OpenAI Chat Completions: prompt_tokens includes cache tokens; prompt_tokens_details.cached_tokens separates them
|
||||
|
||||
In both Codex and OpenAI modes, input_tokens is derived by subtracting cache
|
||||
tokens from the total — the API contract is that input/prompt totals include
|
||||
cached tokens and the details object breaks them out.
|
||||
"""
|
||||
if not response_usage:
|
||||
return CanonicalUsage()
|
||||
|
||||
provider_name = (provider or "").strip().lower()
|
||||
mode = (api_mode or "").strip().lower()
|
||||
|
||||
if mode == "anthropic_messages" or provider_name == "anthropic":
|
||||
input_tokens = _to_int(getattr(response_usage, "input_tokens", 0))
|
||||
output_tokens = _to_int(getattr(response_usage, "output_tokens", 0))
|
||||
cache_read_tokens = _to_int(getattr(response_usage, "cache_read_input_tokens", 0))
|
||||
cache_write_tokens = _to_int(getattr(response_usage, "cache_creation_input_tokens", 0))
|
||||
elif mode == "codex_responses":
|
||||
input_total = _to_int(getattr(response_usage, "input_tokens", 0))
|
||||
output_tokens = _to_int(getattr(response_usage, "output_tokens", 0))
|
||||
details = getattr(response_usage, "input_tokens_details", None)
|
||||
cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
|
||||
cache_write_tokens = _to_int(
|
||||
getattr(details, "cache_creation_tokens", 0) if details else 0
|
||||
)
|
||||
input_tokens = max(0, input_total - cache_read_tokens - cache_write_tokens)
|
||||
else:
|
||||
prompt_total = _to_int(getattr(response_usage, "prompt_tokens", 0))
|
||||
output_tokens = _to_int(getattr(response_usage, "completion_tokens", 0))
|
||||
details = getattr(response_usage, "prompt_tokens_details", None)
|
||||
cache_read_tokens = _to_int(getattr(details, "cached_tokens", 0) if details else 0)
|
||||
cache_write_tokens = _to_int(
|
||||
getattr(details, "cache_write_tokens", 0) if details else 0
|
||||
)
|
||||
input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens)
|
||||
|
||||
reasoning_tokens = 0
|
||||
output_details = getattr(response_usage, "output_tokens_details", None)
|
||||
if output_details:
|
||||
reasoning_tokens = _to_int(getattr(output_details, "reasoning_tokens", 0))
|
||||
|
||||
return CanonicalUsage(
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cache_read_tokens=cache_read_tokens,
|
||||
cache_write_tokens=cache_write_tokens,
|
||||
reasoning_tokens=reasoning_tokens,
|
||||
)
|
||||
|
||||
|
||||
def estimate_usage_cost(
|
||||
model_name: str,
|
||||
usage: CanonicalUsage,
|
||||
*,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
) -> CostResult:
|
||||
route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
|
||||
if route.billing_mode == "subscription_included":
|
||||
return CostResult(
|
||||
amount_usd=_ZERO,
|
||||
status="included",
|
||||
source="none",
|
||||
label="included",
|
||||
pricing_version="included-route",
|
||||
)
|
||||
|
||||
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url)
|
||||
if not entry:
|
||||
return CostResult(amount_usd=None, status="unknown", source="none", label="n/a")
|
||||
|
||||
notes: list[str] = []
|
||||
amount = _ZERO
|
||||
|
||||
if usage.input_tokens and entry.input_cost_per_million is None:
|
||||
return CostResult(amount_usd=None, status="unknown", source=entry.source, label="n/a")
|
||||
if usage.output_tokens and entry.output_cost_per_million is None:
|
||||
return CostResult(amount_usd=None, status="unknown", source=entry.source, label="n/a")
|
||||
if usage.cache_read_tokens:
|
||||
if entry.cache_read_cost_per_million is None:
|
||||
return CostResult(
|
||||
amount_usd=None,
|
||||
status="unknown",
|
||||
source=entry.source,
|
||||
label="n/a",
|
||||
notes=("cache-read pricing unavailable for route",),
|
||||
)
|
||||
if usage.cache_write_tokens:
|
||||
if entry.cache_write_cost_per_million is None:
|
||||
return CostResult(
|
||||
amount_usd=None,
|
||||
status="unknown",
|
||||
source=entry.source,
|
||||
label="n/a",
|
||||
notes=("cache-write pricing unavailable for route",),
|
||||
)
|
||||
|
||||
if entry.input_cost_per_million is not None:
|
||||
amount += Decimal(usage.input_tokens) * entry.input_cost_per_million / _ONE_MILLION
|
||||
if entry.output_cost_per_million is not None:
|
||||
amount += Decimal(usage.output_tokens) * entry.output_cost_per_million / _ONE_MILLION
|
||||
if entry.cache_read_cost_per_million is not None:
|
||||
amount += Decimal(usage.cache_read_tokens) * entry.cache_read_cost_per_million / _ONE_MILLION
|
||||
if entry.cache_write_cost_per_million is not None:
|
||||
amount += Decimal(usage.cache_write_tokens) * entry.cache_write_cost_per_million / _ONE_MILLION
|
||||
if entry.request_cost is not None and usage.request_count:
|
||||
amount += Decimal(usage.request_count) * entry.request_cost
|
||||
|
||||
status: CostStatus = "estimated"
|
||||
label = f"~${amount:.2f}"
|
||||
if entry.source == "none" and amount == _ZERO:
|
||||
status = "included"
|
||||
label = "included"
|
||||
|
||||
if route.provider == "openrouter":
|
||||
notes.append("OpenRouter cost is estimated from the models API until reconciled.")
|
||||
|
||||
return CostResult(
|
||||
amount_usd=amount,
|
||||
status=status,
|
||||
source=entry.source,
|
||||
label=label,
|
||||
fetched_at=entry.fetched_at,
|
||||
pricing_version=entry.pricing_version,
|
||||
notes=tuple(notes),
|
||||
)
|
||||
|
||||
|
||||
def has_known_pricing(
|
||||
model_name: str,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Check whether we have pricing data for this model+route.
|
||||
|
||||
Uses direct lookup instead of routing through the full estimation
|
||||
pipeline — avoids creating dummy usage objects just to check status.
|
||||
"""
|
||||
route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
|
||||
if route.billing_mode == "subscription_included":
|
||||
return True
|
||||
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url)
|
||||
return entry is not None
|
||||
|
||||
|
||||
def get_pricing(
|
||||
model_name: str,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
) -> Dict[str, float]:
|
||||
"""Backward-compatible thin wrapper for legacy callers.
|
||||
|
||||
Returns only non-cache input/output fields when a pricing entry exists.
|
||||
Unknown routes return zeroes.
|
||||
"""
|
||||
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url)
|
||||
if not entry:
|
||||
return {"input": 0.0, "output": 0.0}
|
||||
return {
|
||||
"input": float(entry.input_cost_per_million or _ZERO),
|
||||
"output": float(entry.output_cost_per_million or _ZERO),
|
||||
}
|
||||
|
||||
|
||||
def estimate_cost_usd(
|
||||
model: str,
|
||||
input_tokens: int,
|
||||
output_tokens: int,
|
||||
*,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
) -> float:
|
||||
"""Backward-compatible helper for legacy callers.
|
||||
|
||||
This uses non-cached input/output only. New code should call
|
||||
`estimate_usage_cost()` with canonical usage buckets.
|
||||
"""
|
||||
result = estimate_usage_cost(
|
||||
model,
|
||||
CanonicalUsage(input_tokens=input_tokens, output_tokens=output_tokens),
|
||||
provider=provider,
|
||||
base_url=base_url,
|
||||
)
|
||||
return float(result.amount_usd or _ZERO)
|
||||
|
||||
|
||||
def format_duration_compact(seconds: float) -> str:
|
||||
|
||||
Reference in New Issue
Block a user