feat: add local-vs-cloud token and throughput metrics

2026-03-28 10:24:07 -04:00
parent d955d2b9f1
commit d5122f1592
5 changed files with 312 additions and 47 deletions
--- a/bin/timmy-dashboard
+++ b/bin/timmy-dashboard
@@ -9,6 +9,7 @@ Usage:

 import json
 import os
+import sqlite3
 import subprocess
 import sys
 import time
@@ -16,6 +17,12 @@ import urllib.request
 from datetime import datetime, timezone, timedelta
 from pathlib import Path

+REPO_ROOT = Path(__file__).resolve().parent.parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from metrics_helpers import summarize_local_metrics, summarize_session_rows
+
 HERMES_HOME = Path.home() / ".hermes"
 TIMMY_HOME = Path.home() / ".timmy"
 METRICS_DIR = TIMMY_HOME / "metrics"
@@ -60,6 +67,30 @@ def get_hermes_sessions():
        return []


+def get_session_rows(hours=24):
+    """Aggregate recent Hermes sessions from ``state.db`` by model and source.
+
+    Args:
+        hours: Look-back window. Only sessions whose ``started_at`` is later
+            than ``time.time() - hours * 3600`` are counted.
+
+    Returns:
+        A list of ``(model, source, sessions, msgs, tools)`` tuples, one per
+        (model, source) pair with a non-empty model, or ``[]`` when the
+        database file is missing or the query fails for any reason.
+    """
+    state_db = HERMES_HOME / "state.db"
+    if not state_db.exists():
+        return []
+    cutoff = time.time() - (hours * 3600)
+    try:
+        conn = sqlite3.connect(str(state_db))
+        try:
+            return conn.execute(
+                """
+                SELECT model, source, COUNT(*) as sessions,
+                       SUM(message_count) as msgs,
+                       SUM(tool_call_count) as tools
+                FROM sessions
+                WHERE started_at > ? AND model IS NOT NULL AND model != ''
+                GROUP BY model, source
+                """,
+                (cutoff,),
+            ).fetchall()
+        finally:
+            # Release the handle even when the query raises; closing only on
+            # the success path leaked the connection on every failed query.
+            conn.close()
+    except Exception:
+        # Best effort: the dashboard must still render without session stats.
+        return []
+
+
 def get_heartbeat_ticks(date_str=None):
    if not date_str:
        date_str = datetime.now().strftime("%Y%m%d")
@@ -130,6 +161,9 @@ def render(hours=24):
    ticks = get_heartbeat_ticks()
    metrics = get_local_metrics(hours)
    sessions = get_hermes_sessions()
+    session_rows = get_session_rows(hours)
+    local_summary = summarize_local_metrics(metrics)
+    session_summary = summarize_session_rows(session_rows)

    loaded_names = {m.get("name", "") for m in loaded}
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -159,28 +193,18 @@ def render(hours=24):
    print(f"\n  {BOLD}LOCAL INFERENCE ({len(metrics)} calls, last {hours}h){RST}")
    print(f"  {DIM}{'-' * 55}{RST}")
    if metrics:
-        by_caller = {}
-        for r in metrics:
-            caller = r.get("caller", "unknown")
-            if caller not in by_caller:
-                by_caller[caller] = {"count": 0, "success": 0, "errors": 0}
-            by_caller[caller]["count"] += 1
-            if r.get("success"):
-                by_caller[caller]["success"] += 1
-            else:
-                by_caller[caller]["errors"] += 1
-        for caller, stats in by_caller.items():
-            err = f"  {RED}err:{stats['errors']}{RST}" if stats["errors"] else ""
-            print(f"    {caller:25s}  calls:{stats['count']:4d}  "
-                  f"{GREEN}ok:{stats['success']}{RST}{err}")
+        print(f"    Tokens: {local_summary['input_tokens']} in  |  {local_summary['output_tokens']} out  |  {local_summary['total_tokens']} total")
+        if local_summary.get('avg_latency_s') is not None:
+            print(f"    Avg latency: {local_summary['avg_latency_s']:.2f}s")
+        if local_summary.get('avg_tokens_per_second') is not None:
+            print(f"    Avg throughput: {GREEN}{local_summary['avg_tokens_per_second']:.2f} tok/s{RST}")
+        for caller, stats in sorted(local_summary['by_caller'].items()):
+            err = f"  {RED}err:{stats['failed_calls']}{RST}" if stats['failed_calls'] else ""
+            print(f"    {caller:25s}  calls:{stats['calls']:4d}  tokens:{stats['total_tokens']:5d}  {GREEN}ok:{stats['successful_calls']}{RST}{err}")

-        by_model = {}
-        for r in metrics:
-            model = r.get("model", "unknown")
-            by_model[model] = by_model.get(model, 0) + 1
        print(f"\n    {DIM}Models used:{RST}")
-        for model, count in sorted(by_model.items(), key=lambda x: -x[1]):
-            print(f"      {model:30s}  {count} calls")
+        for model, stats in sorted(local_summary['by_model'].items(), key=lambda x: -x[1]['calls']):
+            print(f"      {model:30s}  {stats['calls']} calls  {stats['total_tokens']} tok")
    else:
        print(f"    {DIM}(no local calls recorded yet){RST}")

@@ -211,15 +235,18 @@ def render(hours=24):
    else:
        print(f"    {DIM}(no ticks today){RST}")

-    # ── HERMES SESSIONS ──
-    local_sessions = [s for s in sessions
-                     if "localhost:11434" in str(s.get("base_url", ""))]
+    # ── HERMES SESSIONS / SOVEREIGNTY LOAD ──
+    local_sessions = [s for s in sessions if "localhost:11434" in str(s.get("base_url", ""))]
    cloud_sessions = [s for s in sessions if s not in local_sessions]
-    print(f"\n  {BOLD}HERMES SESSIONS{RST}")
+    print(f"\n  {BOLD}HERMES SESSIONS / SOVEREIGNTY LOAD{RST}")
    print(f"  {DIM}{'-' * 55}{RST}")
-    print(f"    Total: {len(sessions)}  |  "
-          f"{GREEN}Local: {len(local_sessions)}{RST}  |  "
-          f"{YELLOW}Cloud: {len(cloud_sessions)}{RST}")
+    print(f"    Session cache: {len(sessions)} total  |  {GREEN}{len(local_sessions)} local{RST}  |  {YELLOW}{len(cloud_sessions)} cloud{RST}")
+    if session_rows:
+        print(f"    Session DB:    {session_summary['total_sessions']} total  |  {GREEN}{session_summary['local_sessions']} local{RST}  |  {YELLOW}{session_summary['cloud_sessions']} cloud{RST}")
+        print(f"    Token est:     {GREEN}{session_summary['local_est_tokens']} local{RST}  |  {YELLOW}{session_summary['cloud_est_tokens']} cloud{RST}")
+        print(f"    Est cloud cost: ${session_summary['cloud_est_cost_usd']:.4f}")
+    else:
+        print(f"    {DIM}(no session-db stats available){RST}")

    # ── ACTIVE LOOPS ──
    print(f"\n  {BOLD}ACTIVE LOOPS{RST}")
--- a/config.yaml
+++ b/config.yaml
@@ -1,8 +1,8 @@
 model:
-  default: gpt-5.4
-  provider: openai-codex
+  default: hermes4:14b
+  provider: custom
  context_length: 65536
-  base_url: https://chatgpt.com/backend-api/codex
+  base_url: http://localhost:8081/v1
 toolsets:
 - all
 agent:
@@ -188,7 +188,7 @@ custom_providers:
 - name: Local llama.cpp
  base_url: http://localhost:8081/v1
  api_key: none
-  model: auto
+  model: hermes4:14b
 - name: Google Gemini
  base_url: https://generativelanguage.googleapis.com/v1beta/openai
  api_key_env: GEMINI_API_KEY
--- a/metrics_helpers.py
+++ b/metrics_helpers.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+import math
+from datetime import datetime, timezone
+
+# Per-model token prices. summarize_session_rows applies these rates per
+# 1,000,000 estimated tokens (averaging "input" and "output") to produce a
+# USD figure. Entries priced at 0.0 in both directions are classified as
+# local by is_local_model. Models missing from this table are priced by
+# summarize_session_rows with a fallback of {"input": 5.0, "output": 15.0}.
+# NOTE(review): the rates look like published list prices -- confirm they are
+# current before relying on the cost estimate.
+COST_TABLE = {
+    "claude-opus-4-6": {"input": 15.0, "output": 75.0},
+    "claude-sonnet-4-6": {"input": 3.0, "output": 15.0},
+    "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
+    "claude-haiku-4-20250414": {"input": 0.25, "output": 1.25},
+    "hermes4:14b": {"input": 0.0, "output": 0.0},
+    "hermes3:8b": {"input": 0.0, "output": 0.0},
+    "hermes3:latest": {"input": 0.0, "output": 0.0},
+    "qwen3:30b": {"input": 0.0, "output": 0.0},
+}
+
+
+def estimate_tokens_from_chars(char_count: int) -> int:
+    """Approximate a token count from a character count.
+
+    Uses a flat four-characters-per-token heuristic, rounded up, so any
+    positive length yields at least one token. Zero or negative counts
+    yield ``0``.
+    """
+    return math.ceil(char_count / 4) if char_count > 0 else 0
+
+
+
+def build_local_metric_record(
+    *,
+    prompt: str,
+    response: str,
+    model: str,
+    caller: str,
+    session_id: str | None,
+    latency_s: float,
+    success: bool,
+    error: str | None = None,
+) -> dict:
+    """Build one JSON-serialisable metrics record for a local inference call.
+
+    Token counts are estimated from the prompt and response lengths via
+    ``estimate_tokens_from_chars``. ``tokens_per_second`` is the combined
+    (input + output) estimate divided by ``latency_s``, rounded to two
+    decimals, or ``None`` when the latency is not positive. The timestamp is
+    the current UTC time in ISO-8601 form.
+    """
+    prompt_chars = len(prompt)
+    response_chars = len(response)
+    est_in = estimate_tokens_from_chars(prompt_chars)
+    est_out = estimate_tokens_from_chars(response_chars)
+
+    if latency_s > 0:
+        rate = round((est_in + est_out) / latency_s, 2)
+    else:
+        # No meaningful rate without a positive duration.
+        rate = None
+
+    record = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "model": model,
+        "caller": caller,
+        "prompt_len": prompt_chars,
+        "response_len": response_chars,
+        "session_id": session_id,
+        "latency_s": round(latency_s, 3),
+        "est_input_tokens": est_in,
+        "est_output_tokens": est_out,
+        "tokens_per_second": rate,
+        "success": success,
+        "error": error,
+    }
+    return record
+
+
+
+def summarize_local_metrics(records: list[dict]) -> dict:
+    """Roll per-call metric records up into totals and per-caller/model buckets.
+
+    Args:
+        records: Metric dicts as written to the metrics JSONL files. Missing
+            or null numeric fields count as zero; records without
+            ``latency_s`` or with a falsy ``tokens_per_second`` are left out
+            of the respective average.
+
+    Returns:
+        A dict with ``total_calls``, ``successful_calls``, ``failed_calls``,
+        ``input_tokens``, ``output_tokens``, ``total_tokens``,
+        ``avg_latency_s`` and ``avg_tokens_per_second`` (each rounded to two
+        decimals, or ``None`` when no record contributed), plus ``by_caller``
+        and ``by_model`` mappings whose values hold ``calls``,
+        ``successful_calls``, ``failed_calls`` and ``total_tokens``.
+    """
+    input_tokens = 0
+    output_tokens = 0
+    successful_calls = 0
+    latencies: list[float] = []
+    throughputs: list[float] = []
+    by_caller: dict[str, dict] = {}
+    by_model: dict[str, dict] = {}
+
+    for record in records:
+        succeeded = bool(record.get("success"))
+        tokens_in = int(record.get("est_input_tokens", 0) or 0)
+        tokens_out = int(record.get("est_output_tokens", 0) or 0)
+        input_tokens += tokens_in
+        output_tokens += tokens_out
+        successful_calls += succeeded
+
+        latency = record.get("latency_s")
+        if latency is not None:
+            latencies.append(float(latency or 0))
+        rate = record.get("tokens_per_second")
+        if rate:
+            throughputs.append(float(rate))
+
+        # A key that is present but null/empty must not become a None bucket:
+        # callers sort and string-format these keys, which fails on None.
+        caller = record.get("caller") or "unknown"
+        model = record.get("model") or "unknown"
+        for key, table in ((caller, by_caller), (model, by_model)):
+            bucket = table.setdefault(
+                key,
+                {"calls": 0, "successful_calls": 0, "failed_calls": 0, "total_tokens": 0},
+            )
+            bucket["calls"] += 1
+            bucket["total_tokens"] += tokens_in + tokens_out
+            bucket["successful_calls" if succeeded else "failed_calls"] += 1
+
+    total_calls = len(records)
+    return {
+        "total_calls": total_calls,
+        "successful_calls": successful_calls,
+        "failed_calls": total_calls - successful_calls,
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+        "total_tokens": input_tokens + output_tokens,
+        "avg_latency_s": round(sum(latencies) / len(latencies), 2) if latencies else None,
+        "avg_tokens_per_second": round(sum(throughputs) / len(throughputs), 2) if throughputs else None,
+        "by_caller": by_caller,
+        "by_model": by_model,
+    }
+
+
+
+def is_local_model(model: str | None) -> bool:
+    """Report whether ``model`` should be counted as locally hosted.
+
+    A missing or empty name is never local. Otherwise the model is local when
+    ``COST_TABLE`` prices both its input and output at zero, or when its name
+    contains ``":"`` but neither ``"/"`` nor ``"claude"``.
+    """
+    if not model:
+        return False
+    pricing = COST_TABLE.get(model, {})
+    # Unknown models default to a non-zero price so they fail this check.
+    zero_cost = pricing.get("input", 1) == 0 and pricing.get("output", 1) == 0
+    looks_like_local_tag = ":" in model and "/" not in model and "claude" not in model
+    return zero_cost or looks_like_local_tag
+
+
+
+def summarize_session_rows(rows: list[tuple]) -> dict:
+    """Split aggregated session rows into local and cloud totals.
+
+    Each row is ``(model, source, sessions, messages, tool_calls)``; only the
+    model, session count and message count are used. Tokens are estimated at
+    500 per message. Cloud cost applies the mean of the model's input and
+    output rates from ``COST_TABLE`` (falling back to 5.0 / 15.0 for unlisted
+    models) per million estimated tokens.
+    """
+    session_totals = {"local": 0, "cloud": 0}
+    token_totals = {"local": 0, "cloud": 0}
+    cloud_cost = 0.0
+    fallback_rates = {"input": 5.0, "output": 15.0}
+
+    for model, _source, session_count, message_count, _tool_calls in rows:
+        session_count = int(session_count or 0)
+        token_estimate = int(message_count or 0) * 500
+        side = "local" if is_local_model(model) else "cloud"
+        session_totals[side] += session_count
+        token_totals[side] += token_estimate
+        if side == "cloud":
+            rates = COST_TABLE.get(model, fallback_rates)
+            blended_rate = (rates["input"] + rates["output"]) / 2
+            cloud_cost += (token_estimate / 1_000_000) * blended_rate
+
+    return {
+        "total_sessions": session_totals["local"] + session_totals["cloud"],
+        "local_sessions": session_totals["local"],
+        "cloud_sessions": session_totals["cloud"],
+        "local_est_tokens": token_totals["local"],
+        "cloud_est_tokens": token_totals["cloud"],
+        "cloud_est_cost_usd": round(cloud_cost, 4),
+    }
--- a/tasks.py
+++ b/tasks.py
@@ -5,12 +5,14 @@ import glob
 import os
 import subprocess
 import sys
+import time
 from datetime import datetime, timezone
 from pathlib import Path

 from orchestration import huey
 from huey import crontab
 from gitea_client import GiteaClient
+from metrics_helpers import build_local_metric_record

 HERMES_HOME = Path.home() / ".hermes"
 TIMMY_HOME = Path.home() / ".timmy"
@@ -57,6 +59,7 @@ def run_hermes_local(
    _model = model or HEARTBEAT_MODEL
    tagged = f"[{caller_tag}] {prompt}" if caller_tag else prompt

+    started = time.time()
    try:
        runner = """
 import io
@@ -167,15 +170,15 @@ sys.exit(exit_code)
        # Log to metrics jsonl
        METRICS_DIR.mkdir(parents=True, exist_ok=True)
        metrics_file = METRICS_DIR / f"local_{datetime.now().strftime('%Y%m%d')}.jsonl"
-        record = {
-            "timestamp": datetime.now(timezone.utc).isoformat(),
-            "model": _model,
-            "caller": caller_tag or "unknown",
-            "prompt_len": len(prompt),
-            "response_len": len(response),
-            "session_id": session_id,
-            "success": bool(response),
-        }
+        record = build_local_metric_record(
+            prompt=prompt,
+            response=response,
+            model=_model,
+            caller=caller_tag or "unknown",
+            session_id=session_id,
+            latency_s=time.time() - started,
+            success=bool(response),
+        )
        with open(metrics_file, "a") as f:
            f.write(json.dumps(record) + "\n")

@@ -190,13 +193,16 @@ sys.exit(exit_code)
        # Log failure
        METRICS_DIR.mkdir(parents=True, exist_ok=True)
        metrics_file = METRICS_DIR / f"local_{datetime.now().strftime('%Y%m%d')}.jsonl"
-        record = {
-            "timestamp": datetime.now(timezone.utc).isoformat(),
-            "model": _model,
-            "caller": caller_tag or "unknown",
-            "error": str(e),
-            "success": False,
-        }
+        record = build_local_metric_record(
+            prompt=prompt,
+            response="",
+            model=_model,
+            caller=caller_tag or "unknown",
+            session_id=None,
+            latency_s=time.time() - started,
+            success=False,
+            error=str(e),
+        )
        with open(metrics_file, "a") as f:
            f.write(json.dumps(record) + "\n")
        return None
--- a/tests/test_metrics_helpers.py
+++ b/tests/test_metrics_helpers.py
@@ -0,0 +1,93 @@
+from metrics_helpers import (
+    build_local_metric_record,
+    estimate_tokens_from_chars,
+    summarize_local_metrics,
+    summarize_session_rows,
+)
+
+
+def test_estimate_tokens_from_chars_uses_simple_local_heuristic() -> None:
+    """Character counts map to tokens at four chars per token, rounded up."""
+    expected_by_length = {0: 0, 1: 1, 4: 1, 5: 2, 401: 101}
+    for length, expected in expected_by_length.items():
+        assert estimate_tokens_from_chars(length) == expected
+
+
+def test_build_local_metric_record_adds_token_and_throughput_estimates() -> None:
+    """A built record echoes its identity fields and adds token estimates."""
+    call_kwargs = {
+        "prompt": "abcd" * 10,
+        "response": "xyz" * 20,
+        "model": "hermes4:14b",
+        "caller": "heartbeat_tick",
+        "session_id": "session-123",
+        "latency_s": 2.0,
+        "success": True,
+    }
+
+    record = build_local_metric_record(**call_kwargs)
+
+    # Identity fields pass through unchanged.
+    for field in ("model", "caller", "session_id"):
+        assert record[field] == call_kwargs[field]
+    # 40 chars -> 10 tokens, 60 chars -> 15 tokens, 25 tokens / 2.0 s -> 12.5.
+    assert record["est_input_tokens"] == 10
+    assert record["est_output_tokens"] == 15
+    assert record["tokens_per_second"] == 12.5
+
+
+def test_summarize_local_metrics_rolls_up_tokens_and_latency() -> None:
+    """Totals, averages and per-caller/model buckets are rolled up together."""
+    heartbeat_ok = dict(
+        caller="heartbeat_tick",
+        model="hermes4:14b",
+        success=True,
+        est_input_tokens=100,
+        est_output_tokens=40,
+        latency_s=2.0,
+        tokens_per_second=20.0,
+    )
+    # The failed call carries no tokens_per_second field at all.
+    heartbeat_failed = dict(
+        caller="heartbeat_tick",
+        model="hermes4:14b",
+        success=False,
+        est_input_tokens=30,
+        est_output_tokens=0,
+        latency_s=1.0,
+    )
+    export_ok = dict(
+        caller="session_export",
+        model="hermes3:8b",
+        success=True,
+        est_input_tokens=50,
+        est_output_tokens=25,
+        latency_s=5.0,
+        tokens_per_second=5.0,
+    )
+
+    summary = summarize_local_metrics([heartbeat_ok, heartbeat_failed, export_ok])
+
+    expected_totals = {
+        "total_calls": 3,
+        "successful_calls": 2,
+        "failed_calls": 1,
+        "input_tokens": 180,
+        "output_tokens": 65,
+        "total_tokens": 245,
+        "avg_latency_s": 2.67,
+        "avg_tokens_per_second": 12.5,
+    }
+    for field, expected in expected_totals.items():
+        assert summary[field] == expected
+    assert summary["by_caller"]["heartbeat_tick"]["total_tokens"] == 170
+    assert summary["by_model"]["hermes4:14b"]["failed_calls"] == 1
+
+
+def test_summarize_session_rows_separates_local_and_cloud_estimates() -> None:
+    """Local and cloud rows are tallied separately and cloud cost is priced."""
+    rows = [
+        ("hermes4:14b", "local", 2, 10, 4),
+        ("claude-sonnet-4-6", "cli", 3, 9, 2),
+    ]
+
+    summary = summarize_session_rows(rows)
+
+    assert summary["total_sessions"] == 5
+    assert summary["local_sessions"] == 2
+    assert summary["cloud_sessions"] == 3
+    assert summary["local_est_tokens"] == 5000
+    assert summary["cloud_est_tokens"] == 4500
+    # 9 messages * 500 tokens = 4500 tokens, priced at the mean of the sonnet
+    # input/output rates: 4500 / 1e6 * (3.0 + 15.0) / 2 = 0.0405. Pinning the
+    # value catches pricing regressions that a bare "> 0" check would miss.
+    assert abs(summary["cloud_est_cost_usd"] - 0.0405) < 1e-9