feat: add local-vs-cloud token and throughput metrics #85

Merged
Timmy merged 1 commits from feat/issue-76-local-cloud-metrics into main 2026-03-28 14:24:12 +00:00
5 changed files with 312 additions and 47 deletions

View File

@@ -9,6 +9,7 @@ Usage:
import json import json
import os import os
import sqlite3
import subprocess import subprocess
import sys import sys
import time import time
@@ -16,6 +17,12 @@ import urllib.request
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from pathlib import Path from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from metrics_helpers import summarize_local_metrics, summarize_session_rows
HERMES_HOME = Path.home() / ".hermes" HERMES_HOME = Path.home() / ".hermes"
TIMMY_HOME = Path.home() / ".timmy" TIMMY_HOME = Path.home() / ".timmy"
METRICS_DIR = TIMMY_HOME / "metrics" METRICS_DIR = TIMMY_HOME / "metrics"
@@ -60,6 +67,30 @@ def get_hermes_sessions():
return [] return []
def get_session_rows(hours=24):
state_db = HERMES_HOME / "state.db"
if not state_db.exists():
return []
cutoff = time.time() - (hours * 3600)
try:
conn = sqlite3.connect(str(state_db))
rows = conn.execute(
"""
SELECT model, source, COUNT(*) as sessions,
SUM(message_count) as msgs,
SUM(tool_call_count) as tools
FROM sessions
WHERE started_at > ? AND model IS NOT NULL AND model != ''
GROUP BY model, source
""",
(cutoff,),
).fetchall()
conn.close()
return rows
except Exception:
return []
def get_heartbeat_ticks(date_str=None): def get_heartbeat_ticks(date_str=None):
if not date_str: if not date_str:
date_str = datetime.now().strftime("%Y%m%d") date_str = datetime.now().strftime("%Y%m%d")
@@ -130,6 +161,9 @@ def render(hours=24):
ticks = get_heartbeat_ticks() ticks = get_heartbeat_ticks()
metrics = get_local_metrics(hours) metrics = get_local_metrics(hours)
sessions = get_hermes_sessions() sessions = get_hermes_sessions()
session_rows = get_session_rows(hours)
local_summary = summarize_local_metrics(metrics)
session_summary = summarize_session_rows(session_rows)
loaded_names = {m.get("name", "") for m in loaded} loaded_names = {m.get("name", "") for m in loaded}
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -159,28 +193,18 @@ def render(hours=24):
print(f"\n {BOLD}LOCAL INFERENCE ({len(metrics)} calls, last {hours}h){RST}") print(f"\n {BOLD}LOCAL INFERENCE ({len(metrics)} calls, last {hours}h){RST}")
print(f" {DIM}{'-' * 55}{RST}") print(f" {DIM}{'-' * 55}{RST}")
if metrics: if metrics:
by_caller = {} print(f" Tokens: {local_summary['input_tokens']} in | {local_summary['output_tokens']} out | {local_summary['total_tokens']} total")
for r in metrics: if local_summary.get('avg_latency_s') is not None:
caller = r.get("caller", "unknown") print(f" Avg latency: {local_summary['avg_latency_s']:.2f}s")
if caller not in by_caller: if local_summary.get('avg_tokens_per_second') is not None:
by_caller[caller] = {"count": 0, "success": 0, "errors": 0} print(f" Avg throughput: {GREEN}{local_summary['avg_tokens_per_second']:.2f} tok/s{RST}")
by_caller[caller]["count"] += 1 for caller, stats in sorted(local_summary['by_caller'].items()):
if r.get("success"): err = f" {RED}err:{stats['failed_calls']}{RST}" if stats['failed_calls'] else ""
by_caller[caller]["success"] += 1 print(f" {caller:25s} calls:{stats['calls']:4d} tokens:{stats['total_tokens']:5d} {GREEN}ok:{stats['successful_calls']}{RST}{err}")
else:
by_caller[caller]["errors"] += 1
for caller, stats in by_caller.items():
err = f" {RED}err:{stats['errors']}{RST}" if stats["errors"] else ""
print(f" {caller:25s} calls:{stats['count']:4d} "
f"{GREEN}ok:{stats['success']}{RST}{err}")
by_model = {}
for r in metrics:
model = r.get("model", "unknown")
by_model[model] = by_model.get(model, 0) + 1
print(f"\n {DIM}Models used:{RST}") print(f"\n {DIM}Models used:{RST}")
for model, count in sorted(by_model.items(), key=lambda x: -x[1]): for model, stats in sorted(local_summary['by_model'].items(), key=lambda x: -x[1]['calls']):
print(f" {model:30s} {count} calls") print(f" {model:30s} {stats['calls']} calls {stats['total_tokens']} tok")
else: else:
print(f" {DIM}(no local calls recorded yet){RST}") print(f" {DIM}(no local calls recorded yet){RST}")
@@ -211,15 +235,18 @@ def render(hours=24):
else: else:
print(f" {DIM}(no ticks today){RST}") print(f" {DIM}(no ticks today){RST}")
# ── HERMES SESSIONS ── # ── HERMES SESSIONS / SOVEREIGNTY LOAD ──
local_sessions = [s for s in sessions local_sessions = [s for s in sessions if "localhost:11434" in str(s.get("base_url", ""))]
if "localhost:11434" in str(s.get("base_url", ""))]
cloud_sessions = [s for s in sessions if s not in local_sessions] cloud_sessions = [s for s in sessions if s not in local_sessions]
print(f"\n {BOLD}HERMES SESSIONS{RST}") print(f"\n {BOLD}HERMES SESSIONS / SOVEREIGNTY LOAD{RST}")
print(f" {DIM}{'-' * 55}{RST}") print(f" {DIM}{'-' * 55}{RST}")
print(f" Total: {len(sessions)} | " print(f" Session cache: {len(sessions)} total | {GREEN}{len(local_sessions)} local{RST} | {YELLOW}{len(cloud_sessions)} cloud{RST}")
f"{GREEN}Local: {len(local_sessions)}{RST} | " if session_rows:
f"{YELLOW}Cloud: {len(cloud_sessions)}{RST}") print(f" Session DB: {session_summary['total_sessions']} total | {GREEN}{session_summary['local_sessions']} local{RST} | {YELLOW}{session_summary['cloud_sessions']} cloud{RST}")
print(f" Token est: {GREEN}{session_summary['local_est_tokens']} local{RST} | {YELLOW}{session_summary['cloud_est_tokens']} cloud{RST}")
print(f" Est cloud cost: ${session_summary['cloud_est_cost_usd']:.4f}")
else:
print(f" {DIM}(no session-db stats available){RST}")
# ── ACTIVE LOOPS ── # ── ACTIVE LOOPS ──
print(f"\n {BOLD}ACTIVE LOOPS{RST}") print(f"\n {BOLD}ACTIVE LOOPS{RST}")

View File

@@ -1,8 +1,8 @@
model: model:
default: gpt-5.4 default: hermes4:14b
provider: openai-codex provider: custom
context_length: 65536 context_length: 65536
base_url: https://chatgpt.com/backend-api/codex base_url: http://localhost:8081/v1
toolsets: toolsets:
- all - all
agent: agent:
@@ -188,7 +188,7 @@ custom_providers:
- name: Local llama.cpp - name: Local llama.cpp
base_url: http://localhost:8081/v1 base_url: http://localhost:8081/v1
api_key: none api_key: none
model: auto model: hermes4:14b
- name: Google Gemini - name: Google Gemini
base_url: https://generativelanguage.googleapis.com/v1beta/openai base_url: https://generativelanguage.googleapis.com/v1beta/openai
api_key_env: GEMINI_API_KEY api_key_env: GEMINI_API_KEY

139
metrics_helpers.py Normal file
View File

@@ -0,0 +1,139 @@
from __future__ import annotations
import math
from datetime import datetime, timezone
COST_TABLE = {
"claude-opus-4-6": {"input": 15.0, "output": 75.0},
"claude-sonnet-4-6": {"input": 3.0, "output": 15.0},
"claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
"claude-haiku-4-20250414": {"input": 0.25, "output": 1.25},
"hermes4:14b": {"input": 0.0, "output": 0.0},
"hermes3:8b": {"input": 0.0, "output": 0.0},
"hermes3:latest": {"input": 0.0, "output": 0.0},
"qwen3:30b": {"input": 0.0, "output": 0.0},
}
def estimate_tokens_from_chars(char_count: int) -> int:
if char_count <= 0:
return 0
return math.ceil(char_count / 4)
def build_local_metric_record(
*,
prompt: str,
response: str,
model: str,
caller: str,
session_id: str | None,
latency_s: float,
success: bool,
error: str | None = None,
) -> dict:
input_tokens = estimate_tokens_from_chars(len(prompt))
output_tokens = estimate_tokens_from_chars(len(response))
total_tokens = input_tokens + output_tokens
tokens_per_second = round(total_tokens / latency_s, 2) if latency_s > 0 else None
return {
"timestamp": datetime.now(timezone.utc).isoformat(),
"model": model,
"caller": caller,
"prompt_len": len(prompt),
"response_len": len(response),
"session_id": session_id,
"latency_s": round(latency_s, 3),
"est_input_tokens": input_tokens,
"est_output_tokens": output_tokens,
"tokens_per_second": tokens_per_second,
"success": success,
"error": error,
}
def summarize_local_metrics(records: list[dict]) -> dict:
total_calls = len(records)
successful_calls = sum(1 for record in records if record.get("success"))
failed_calls = total_calls - successful_calls
input_tokens = sum(int(record.get("est_input_tokens", 0) or 0) for record in records)
output_tokens = sum(int(record.get("est_output_tokens", 0) or 0) for record in records)
total_tokens = input_tokens + output_tokens
latencies = [float(record.get("latency_s", 0) or 0) for record in records if record.get("latency_s") is not None]
throughputs = [
float(record.get("tokens_per_second", 0) or 0)
for record in records
if record.get("tokens_per_second")
]
by_caller: dict[str, dict] = {}
by_model: dict[str, dict] = {}
for record in records:
caller = record.get("caller", "unknown")
model = record.get("model", "unknown")
bucket_tokens = int(record.get("est_input_tokens", 0) or 0) + int(record.get("est_output_tokens", 0) or 0)
for key, table in ((caller, by_caller), (model, by_model)):
if key not in table:
table[key] = {"calls": 0, "successful_calls": 0, "failed_calls": 0, "total_tokens": 0}
table[key]["calls"] += 1
table[key]["total_tokens"] += bucket_tokens
if record.get("success"):
table[key]["successful_calls"] += 1
else:
table[key]["failed_calls"] += 1
return {
"total_calls": total_calls,
"successful_calls": successful_calls,
"failed_calls": failed_calls,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"total_tokens": total_tokens,
"avg_latency_s": round(sum(latencies) / len(latencies), 2) if latencies else None,
"avg_tokens_per_second": round(sum(throughputs) / len(throughputs), 2) if throughputs else None,
"by_caller": by_caller,
"by_model": by_model,
}
def is_local_model(model: str | None) -> bool:
if not model:
return False
costs = COST_TABLE.get(model, {})
if costs.get("input", 1) == 0 and costs.get("output", 1) == 0:
return True
return ":" in model and "/" not in model and "claude" not in model
def summarize_session_rows(rows: list[tuple]) -> dict:
total_sessions = 0
local_sessions = 0
cloud_sessions = 0
local_est_tokens = 0
cloud_est_tokens = 0
cloud_est_cost_usd = 0.0
for model, source, sessions, messages, tool_calls in rows:
sessions = int(sessions or 0)
messages = int(messages or 0)
est_tokens = messages * 500
total_sessions += sessions
if is_local_model(model):
local_sessions += sessions
local_est_tokens += est_tokens
else:
cloud_sessions += sessions
cloud_est_tokens += est_tokens
pricing = COST_TABLE.get(model, {"input": 5.0, "output": 15.0})
cloud_est_cost_usd += (est_tokens / 1_000_000) * ((pricing["input"] + pricing["output"]) / 2)
return {
"total_sessions": total_sessions,
"local_sessions": local_sessions,
"cloud_sessions": cloud_sessions,
"local_est_tokens": local_est_tokens,
"cloud_est_tokens": cloud_est_tokens,
"cloud_est_cost_usd": round(cloud_est_cost_usd, 4),
}

View File

@@ -5,12 +5,14 @@ import glob
import os import os
import subprocess import subprocess
import sys import sys
import time
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from orchestration import huey from orchestration import huey
from huey import crontab from huey import crontab
from gitea_client import GiteaClient from gitea_client import GiteaClient
from metrics_helpers import build_local_metric_record
HERMES_HOME = Path.home() / ".hermes" HERMES_HOME = Path.home() / ".hermes"
TIMMY_HOME = Path.home() / ".timmy" TIMMY_HOME = Path.home() / ".timmy"
@@ -57,6 +59,7 @@ def run_hermes_local(
_model = model or HEARTBEAT_MODEL _model = model or HEARTBEAT_MODEL
tagged = f"[{caller_tag}] {prompt}" if caller_tag else prompt tagged = f"[{caller_tag}] {prompt}" if caller_tag else prompt
started = time.time()
try: try:
runner = """ runner = """
import io import io
@@ -167,15 +170,15 @@ sys.exit(exit_code)
# Log to metrics jsonl # Log to metrics jsonl
METRICS_DIR.mkdir(parents=True, exist_ok=True) METRICS_DIR.mkdir(parents=True, exist_ok=True)
metrics_file = METRICS_DIR / f"local_{datetime.now().strftime('%Y%m%d')}.jsonl" metrics_file = METRICS_DIR / f"local_{datetime.now().strftime('%Y%m%d')}.jsonl"
record = { record = build_local_metric_record(
"timestamp": datetime.now(timezone.utc).isoformat(), prompt=prompt,
"model": _model, response=response,
"caller": caller_tag or "unknown", model=_model,
"prompt_len": len(prompt), caller=caller_tag or "unknown",
"response_len": len(response), session_id=session_id,
"session_id": session_id, latency_s=time.time() - started,
"success": bool(response), success=bool(response),
} )
with open(metrics_file, "a") as f: with open(metrics_file, "a") as f:
f.write(json.dumps(record) + "\n") f.write(json.dumps(record) + "\n")
@@ -190,13 +193,16 @@ sys.exit(exit_code)
# Log failure # Log failure
METRICS_DIR.mkdir(parents=True, exist_ok=True) METRICS_DIR.mkdir(parents=True, exist_ok=True)
metrics_file = METRICS_DIR / f"local_{datetime.now().strftime('%Y%m%d')}.jsonl" metrics_file = METRICS_DIR / f"local_{datetime.now().strftime('%Y%m%d')}.jsonl"
record = { record = build_local_metric_record(
"timestamp": datetime.now(timezone.utc).isoformat(), prompt=prompt,
"model": _model, response="",
"caller": caller_tag or "unknown", model=_model,
"error": str(e), caller=caller_tag or "unknown",
"success": False, session_id=None,
} latency_s=time.time() - started,
success=False,
error=str(e),
)
with open(metrics_file, "a") as f: with open(metrics_file, "a") as f:
f.write(json.dumps(record) + "\n") f.write(json.dumps(record) + "\n")
return None return None

View File

@@ -0,0 +1,93 @@
from metrics_helpers import (
build_local_metric_record,
estimate_tokens_from_chars,
summarize_local_metrics,
summarize_session_rows,
)
def test_estimate_tokens_from_chars_uses_simple_local_heuristic() -> None:
assert estimate_tokens_from_chars(0) == 0
assert estimate_tokens_from_chars(1) == 1
assert estimate_tokens_from_chars(4) == 1
assert estimate_tokens_from_chars(5) == 2
assert estimate_tokens_from_chars(401) == 101
def test_build_local_metric_record_adds_token_and_throughput_estimates() -> None:
record = build_local_metric_record(
prompt="abcd" * 10,
response="xyz" * 20,
model="hermes4:14b",
caller="heartbeat_tick",
session_id="session-123",
latency_s=2.0,
success=True,
)
assert record["model"] == "hermes4:14b"
assert record["caller"] == "heartbeat_tick"
assert record["session_id"] == "session-123"
assert record["est_input_tokens"] == 10
assert record["est_output_tokens"] == 15
assert record["tokens_per_second"] == 12.5
def test_summarize_local_metrics_rolls_up_tokens_and_latency() -> None:
records = [
{
"caller": "heartbeat_tick",
"model": "hermes4:14b",
"success": True,
"est_input_tokens": 100,
"est_output_tokens": 40,
"latency_s": 2.0,
"tokens_per_second": 20.0,
},
{
"caller": "heartbeat_tick",
"model": "hermes4:14b",
"success": False,
"est_input_tokens": 30,
"est_output_tokens": 0,
"latency_s": 1.0,
},
{
"caller": "session_export",
"model": "hermes3:8b",
"success": True,
"est_input_tokens": 50,
"est_output_tokens": 25,
"latency_s": 5.0,
"tokens_per_second": 5.0,
},
]
summary = summarize_local_metrics(records)
assert summary["total_calls"] == 3
assert summary["successful_calls"] == 2
assert summary["failed_calls"] == 1
assert summary["input_tokens"] == 180
assert summary["output_tokens"] == 65
assert summary["total_tokens"] == 245
assert summary["avg_latency_s"] == 2.67
assert summary["avg_tokens_per_second"] == 12.5
assert summary["by_caller"]["heartbeat_tick"]["total_tokens"] == 170
assert summary["by_model"]["hermes4:14b"]["failed_calls"] == 1
def test_summarize_session_rows_separates_local_and_cloud_estimates() -> None:
rows = [
("hermes4:14b", "local", 2, 10, 4),
("claude-sonnet-4-6", "cli", 3, 9, 2),
]
summary = summarize_session_rows(rows)
assert summary["total_sessions"] == 5
assert summary["local_sessions"] == 2
assert summary["cloud_sessions"] == 3
assert summary["local_est_tokens"] == 5000
assert summary["cloud_est_tokens"] == 4500
assert summary["cloud_est_cost_usd"] > 0