140 lines
4.9 KiB
Python
140 lines
4.9 KiB
Python
from __future__ import annotations
|
|
|
|
import math
|
|
from datetime import datetime, timezone
|
|
|
|
# Per-model pricing in USD per one million tokens, keyed by model identifier.
# summarize_session_rows() averages the "input" and "output" rates to price
# cloud traffic; is_local_model() treats an entry whose two rates are both
# zero as a local model.
COST_TABLE = {
    model_name: {"input": input_rate, "output": output_rate}
    for model_name, input_rate, output_rate in (
        ("claude-opus-4-6", 15.0, 75.0),
        ("claude-sonnet-4-6", 3.0, 15.0),
        ("claude-sonnet-4-20250514", 3.0, 15.0),
        ("claude-haiku-4-20250414", 0.25, 1.25),
        ("hermes4:14b", 0.0, 0.0),
        ("hermes3:8b", 0.0, 0.0),
        ("hermes3:latest", 0.0, 0.0),
        ("qwen3:30b", 0.0, 0.0),
    )
}
|
|
|
|
|
|
def estimate_tokens_from_chars(char_count: int) -> int:
    """Approximate a token count from a character count.

    The estimate is one token per four characters, rounded up. A zero or
    negative ``char_count`` yields 0.
    """
    return 0 if char_count <= 0 else math.ceil(char_count / 4)
|
|
|
|
|
|
|
|
def build_local_metric_record(
    *,
    prompt: str,
    response: str,
    model: str,
    caller: str,
    session_id: str | None,
    latency_s: float,
    success: bool,
    error: str | None = None,
) -> dict:
    """Build one metrics record describing a single model call.

    Token counts are estimates derived from the character lengths of
    ``prompt`` and ``response`` via ``estimate_tokens_from_chars``.

    Args:
        prompt: Text sent to the model.
        response: Text returned by the model.
        model: Model identifier stored in the record.
        caller: Caller label stored in the record.
        session_id: Session identifier, or ``None``.
        latency_s: Call duration in seconds; stored rounded to 3 places.
        success: Whether the call succeeded.
        error: Optional error text.

    Returns:
        A dict with a UTC ISO-8601 ``timestamp``, the passed-through fields,
        the prompt/response lengths, the estimated input/output tokens and
        ``tokens_per_second`` (combined input + output tokens divided by
        ``latency_s``, rounded to 2 places; ``None`` when ``latency_s`` is
        not positive).
    """
    prompt_chars = len(prompt)
    response_chars = len(response)
    est_in = estimate_tokens_from_chars(prompt_chars)
    est_out = estimate_tokens_from_chars(response_chars)

    # Throughput is only meaningful for a positive duration.
    if latency_s > 0:
        throughput = round((est_in + est_out) / latency_s, 2)
    else:
        throughput = None

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "caller": caller,
        "prompt_len": prompt_chars,
        "response_len": response_chars,
        "session_id": session_id,
        "latency_s": round(latency_s, 3),
        "est_input_tokens": est_in,
        "est_output_tokens": est_out,
        "tokens_per_second": throughput,
        "success": success,
        "error": error,
    }
|
|
|
|
|
|
|
|
def summarize_local_metrics(records: list[dict]) -> dict:
    """Aggregate per-call metric records into overall and grouped totals.

    Args:
        records: Metric dicts. The keys read are ``success``,
            ``est_input_tokens``, ``est_output_tokens``, ``latency_s``,
            ``tokens_per_second``, ``caller`` and ``model``. Missing or
            ``None`` token counts are counted as 0.

    Returns:
        A dict with ``total_calls``, ``successful_calls``, ``failed_calls``,
        ``input_tokens``, ``output_tokens``, ``total_tokens``,
        ``avg_latency_s`` and ``avg_tokens_per_second`` (each rounded to 2
        places, or ``None`` when no record contributes a value), plus
        ``by_caller`` and ``by_model`` breakdowns holding ``calls``,
        ``successful_calls``, ``failed_calls`` and ``total_tokens`` per key.
        Records whose caller or model is missing, ``None`` or empty are
        grouped under ``"unknown"``.
    """
    successful_calls = 0
    input_tokens = 0
    output_tokens = 0
    latencies: list[float] = []
    throughputs: list[float] = []
    by_caller: dict[str, dict] = {}
    by_model: dict[str, dict] = {}

    for record in records:
        succeeded = bool(record.get("success"))
        record_input = int(record.get("est_input_tokens", 0) or 0)
        record_output = int(record.get("est_output_tokens", 0) or 0)

        if succeeded:
            successful_calls += 1
        input_tokens += record_input
        output_tokens += record_output

        # A latency of 0 still counts toward the average; only absent/None
        # latencies are skipped.
        latency = record.get("latency_s")
        if latency is not None:
            latencies.append(float(latency or 0))

        # Records without a usable (non-zero) throughput are left out of the
        # throughput average entirely.
        throughput = record.get("tokens_per_second")
        if throughput:
            throughputs.append(float(throughput))

        # `or "unknown"` (rather than a .get default) also covers an explicit
        # None value, so the breakdown keys are always strings.
        caller = record.get("caller") or "unknown"
        model = record.get("model") or "unknown"
        outcome_key = "successful_calls" if succeeded else "failed_calls"
        for key, table in ((caller, by_caller), (model, by_model)):
            bucket = table.setdefault(
                key,
                {"calls": 0, "successful_calls": 0, "failed_calls": 0, "total_tokens": 0},
            )
            bucket["calls"] += 1
            bucket["total_tokens"] += record_input + record_output
            bucket[outcome_key] += 1

    total_calls = len(records)
    return {
        "total_calls": total_calls,
        "successful_calls": successful_calls,
        "failed_calls": total_calls - successful_calls,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": input_tokens + output_tokens,
        "avg_latency_s": round(sum(latencies) / len(latencies), 2) if latencies else None,
        "avg_tokens_per_second": round(sum(throughputs) / len(throughputs), 2) if throughputs else None,
        "by_caller": by_caller,
        "by_model": by_model,
    }
|
|
|
|
|
|
|
|
def is_local_model(model: str | None) -> bool:
    """Report whether ``model`` is classified as a local model.

    A falsy ``model`` (``None`` or empty) is never local. Otherwise the model
    is local when its ``COST_TABLE`` entry lists both the input and output
    rate as zero, or, failing that, when its name contains ``":"`` but
    neither ``"/"`` nor ``"claude"``.
    """
    if not model:
        return False

    rates = COST_TABLE.get(model, {})
    # The default of 1 makes a missing rate count as non-zero, so models
    # absent from the table fall through to the name check below.
    priced_at_zero = rates.get("input", 1) == 0 and rates.get("output", 1) == 0
    if priced_at_zero:
        return True

    has_tag_separator = ":" in model
    return has_tag_separator and not ("/" in model or "claude" in model)
|
|
|
|
|
|
|
|
def summarize_session_rows(rows: list[tuple]) -> dict:
    """Roll session rows up into local versus cloud totals.

    Args:
        rows: 5-tuples of ``(model, source, sessions, messages, tool_calls)``.
            Only ``model``, ``sessions`` and ``messages`` are used; falsy
            counts are treated as 0.

    Returns:
        A dict with ``total_sessions``, ``local_sessions``,
        ``cloud_sessions``, ``local_est_tokens``, ``cloud_est_tokens`` and
        ``cloud_est_cost_usd`` (rounded to 4 places).
    """
    session_counts = {"local": 0, "cloud": 0}
    token_counts = {"local": 0, "cloud": 0}
    cloud_cost = 0.0

    for model, _source, raw_sessions, raw_messages, _tool_calls in rows:
        session_count = int(raw_sessions or 0)
        # Flat estimate of 500 tokens per message.
        tokens = int(raw_messages or 0) * 500

        if is_local_model(model):
            session_counts["local"] += session_count
            token_counts["local"] += tokens
            continue

        session_counts["cloud"] += session_count
        token_counts["cloud"] += tokens
        # Models missing from COST_TABLE fall back to 5.0 / 15.0; the cost
        # uses the mean of the input and output rates per million tokens.
        rates = COST_TABLE.get(model, {"input": 5.0, "output": 15.0})
        cloud_cost += (tokens / 1_000_000) * ((rates["input"] + rates["output"]) / 2)

    return {
        "total_sessions": session_counts["local"] + session_counts["cloud"],
        "local_sessions": session_counts["local"],
        "cloud_sessions": session_counts["cloud"],
        "local_est_tokens": token_counts["local"],
        "cloud_est_tokens": token_counts["cloud"],
        "cloud_est_cost_usd": round(cloud_cost, 4),
    }
|