Files
timmy-home/metrics/model_tracker.py
Alexander Whitestone 0d64d8e559 initial: sovereign home — morrowind agent, skills, training-data, research, specs, notes, operational docs
Tracked: morrowind agent (py/cfg), skills/, training-data/, research/,
notes/, specs/, test-results/, metrics/, heartbeat/, briefings/,
memories/, skins/, hooks/, decisions.md, OPERATIONS.md, SOUL.md

Excluded: screenshots, PNGs, binaries, sessions, databases, secrets,
audio cache, timmy-config/ and timmy-telemetry/ (separate repos)
2026-03-27 13:05:57 -04:00

269 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""
Timmy Model Performance Tracker — What you measure, you manage.
Tracks: local vs cloud usage, response quality, latency, cost estimates.
Stores in SQLite at ~/.timmy/metrics/model_metrics.db
Usage:
    # Record a metric
    python3 model_tracker.py record --model timmy:v0.1-q4 --task identity --score 0.9 --latency 1.2
    # Report
    python3 model_tracker.py report
    python3 model_tracker.py report --days 7
    # Ingest from hermes session DB
    python3 model_tracker.py ingest
"""
import sqlite3
import time
import json
import argparse
import os
from pathlib import Path
from datetime import datetime, timedelta
# SQLite file holding the tracker's tables (under the user's home directory).
DB_PATH = Path.home().joinpath(".timmy", "metrics", "model_metrics.db")

# Cost estimates per 1M tokens (input/output), keyed by model name.
COST_TABLE = {
    "claude-opus-4-6": {"input": 15.0, "output": 75.0},
    "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
    "claude-sonnet-4-6": {"input": 3.0, "output": 15.0},
    "claude-haiku-4-20250414": {"input": 0.25, "output": 1.25},
    # Local models = $0; every name below gets its own zero-cost entry.
    **{
        local_name: {"input": 0, "output": 0}
        for local_name in (
            "timmy:v0.1-q4",
            "hermes3:8b",
            "hermes3:latest",
            "hermes4:36b",
            "qwen3:30b",
            "qwen3.5:latest",
            "qwen2.5:14b",
            "llama3.1:latest",
            "llama3.2:latest",
            "glm-4.7-flash:latest",
        )
    },
}
def is_local(model):
    """Return True when *model* is treated as running locally (zero cloud cost).

    A model counts as local when COST_TABLE prices both its input and its
    output at zero, or when its name looks like an Ollama tag: it contains
    a colon, has no slash, and does not contain "claude". An empty or
    missing name is never local.
    """
    if not model:
        return False

    pricing = COST_TABLE.get(model, {})
    # Models missing from the table default to a non-zero price here, so
    # only explicit zero-cost entries pass this check.
    free_to_run = all(pricing.get(kind, 1) == 0 for kind in ("input", "output"))

    # Heuristic: a colon without a slash is probably an Ollama-style name.
    return free_to_run or (
        ":" in model and "/" not in model and "claude" not in model
    )
def init_db():
    """Open the metrics database, creating the file's directory and tables.

    Makes sure the parent directory of DB_PATH exists, connects to the
    SQLite file, and runs CREATE TABLE IF NOT EXISTS for the three tables
    this tool uses: evals, session_stats and sovereignty_score.

    Returns:
        The open sqlite3 connection, with the schema changes committed.
    """
    # One row per recorded evaluation of a model on a task.
    evals_ddl = (
        "\n"
        "CREATE TABLE IF NOT EXISTS evals (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "timestamp REAL NOT NULL,\n"
        "model TEXT NOT NULL,\n"
        "task TEXT NOT NULL,\n"
        "score REAL,\n"
        "latency_s REAL,\n"
        "tokens_in INTEGER,\n"
        "tokens_out INTEGER,\n"
        "notes TEXT\n"
        ")\n"
    )
    # Per-model usage totals for one ingest period.
    session_stats_ddl = (
        "\n"
        "CREATE TABLE IF NOT EXISTS session_stats (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "timestamp REAL NOT NULL,\n"
        "period TEXT NOT NULL,\n"
        "model TEXT NOT NULL,\n"
        "source TEXT,\n"
        "sessions INTEGER,\n"
        "messages INTEGER,\n"
        "tool_calls INTEGER,\n"
        "est_cost_usd REAL,\n"
        "is_local INTEGER\n"
        ")\n"
    )
    # Local-versus-cloud summary for one ingest period.
    sovereignty_ddl = (
        "\n"
        "CREATE TABLE IF NOT EXISTS sovereignty_score (\n"
        "id INTEGER PRIMARY KEY AUTOINCREMENT,\n"
        "timestamp REAL NOT NULL,\n"
        "period TEXT NOT NULL,\n"
        "total_sessions INTEGER,\n"
        "local_sessions INTEGER,\n"
        "cloud_sessions INTEGER,\n"
        "local_pct REAL,\n"
        "est_cloud_cost REAL,\n"
        "est_saved REAL\n"
        ")\n"
    )

    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    db = sqlite3.connect(str(DB_PATH))
    for ddl in (evals_ddl, session_stats_ddl, sovereignty_ddl):
        db.execute(ddl)
    db.commit()
    return db
def ingest_from_hermes(conn, days=1):
    """Pull session data from Hermes state.db and compute metrics.

    Reads sessions started within the last ``days`` days from
    ``~/.hermes/state.db``, grouped by model and source. Writes one
    ``session_stats`` row per group and one ``sovereignty_score`` summary
    row to ``conn``, commits, and prints a short summary. Each call appends
    new rows; nothing is de-duplicated here.

    If the Hermes database file does not exist, a notice is printed and
    nothing is written.

    Args:
        conn: Open sqlite3 connection to the metrics database; it must
            already contain the session_stats and sovereignty_score tables.
        days: Look-back window in days. Also stored as the period label,
            e.g. ``"3d"``.
    """
    hermes_db = Path.home() / ".hermes" / "state.db"
    if not hermes_db.exists():
        print("No hermes state.db found")
        return

    cutoff = time.time() - (days * 86400)
    period = f"{days}d"

    select_sessions = (
        "\n"
        "SELECT model, source, COUNT(*) as sessions,\n"
        "SUM(message_count) as msgs,\n"
        "SUM(tool_call_count) as tools\n"
        "FROM sessions\n"
        "WHERE started_at > ? AND model IS NOT NULL AND model != ''\n"
        "GROUP BY model, source\n"
    )
    insert_stats = (
        "\n"
        "INSERT INTO session_stats (timestamp, period, model, source, sessions, messages, tool_calls, est_cost_usd, is_local)\n"
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)\n"
    )
    insert_score = (
        "\n"
        "INSERT INTO sovereignty_score (timestamp, period, total_sessions, local_sessions, cloud_sessions, local_pct, est_cloud_cost, est_saved)\n"
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)\n"
    )

    # Read everything up front and always release the Hermes connection,
    # even when the query fails (for example, a missing sessions table).
    hconn = sqlite3.connect(str(hermes_db))
    try:
        rows = hconn.execute(select_sessions, (cutoff,)).fetchall()
    finally:
        hconn.close()

    now = time.time()
    total_sessions = 0
    local_sessions = 0
    cloud_sessions = 0
    est_cloud_cost = 0.0
    for model, source, sessions, msgs, tools in rows:
        local = is_local(model)
        if local:
            # Local models cost nothing, including ones recognised only by
            # the name heuristic that have no COST_TABLE entry. Without this
            # they would be priced at the cloud fallback rate below.
            est_cost = 0.0
        else:
            # Rough cost estimate: ~500 tokens per message avg
            avg_tokens = (msgs or 0) * 500
            costs = COST_TABLE.get(model, {"input": 5.0, "output": 15.0})
            est_cost = (avg_tokens / 1_000_000) * (costs["input"] + costs["output"]) / 2
        conn.execute(
            insert_stats,
            (now, period, model, source, sessions, msgs or 0, tools or 0,
             round(est_cost, 4), 1 if local else 0),
        )
        total_sessions += sessions
        if local:
            local_sessions += sessions
        else:
            cloud_sessions += sessions
            est_cloud_cost += est_cost

    local_pct = (local_sessions / total_sessions * 100) if total_sessions > 0 else 0
    # Estimate saved = what it would cost if everything ran on Sonnet
    est_if_all_cloud = total_sessions * 0.05  # rough $0.05/session avg
    est_saved = max(0, est_if_all_cloud - est_cloud_cost)
    conn.execute(
        insert_score,
        (now, period, total_sessions, local_sessions, cloud_sessions,
         round(local_pct, 1), round(est_cloud_cost, 4), round(est_saved, 4)),
    )
    conn.commit()

    print(f"Ingested {days}d: {total_sessions} sessions ({local_sessions} local, {cloud_sessions} cloud)")
    print(f" Sovereignty: {local_pct:.1f}% local")
    print(f" Est cloud cost: ${est_cloud_cost:.2f}")
def report(conn, days=3):
    """Print the sovereignty dashboard.

    Writes three sections to stdout:

    1. The most recent ``sovereignty_score`` row, shown as a progress bar
       with session counts and estimated cloud cost. This is always the
       latest row, regardless of ``days``.
    2. Per-model totals from ``session_stats`` rows newer than the cutoff.
    3. Per-model/task averages from ``evals`` rows newer than the cutoff,
       printed only when at least one such row exists.

    Values that are NULL in the database (for example the average latency
    of evals recorded without a latency) are shown as ``n/a`` or ``0``
    instead of raising a formatting error.

    Args:
        conn: Open sqlite3 connection to the metrics database.
        days: Look-back window in days for sections 2 and 3.
    """
    cutoff = time.time() - (days * 86400)
    print(f"\n{'='*60}")
    print(f" TIMMY SOVEREIGNTY METRICS — Last {days} days")
    print(f"{'='*60}\n")

    # Latest sovereignty score
    row = conn.execute(
        "\n"
        "SELECT local_pct, total_sessions, local_sessions, cloud_sessions, est_cloud_cost\n"
        "FROM sovereignty_score ORDER BY timestamp DESC LIMIT 1\n"
    ).fetchone()
    if row:
        pct, total, local, cloud, cost = row
        pct = pct or 0.0
        cost = cost or 0.0
        bar_len = 40
        # Clamp so an out-of-range percentage cannot distort the bar width.
        filled = max(0, min(bar_len, int(pct / 100 * bar_len)))
        # Escapes keep the glyphs intact if the file is re-encoded:
        # U+2588 FULL BLOCK for the filled part, U+2591 LIGHT SHADE for the rest.
        bar = "\u2588" * filled + "\u2591" * (bar_len - filled)
        print(f" SOVEREIGNTY SCORE: [{bar}] {pct:.1f}%")
        print(f" Sessions: {total} total | {local} local | {cloud} cloud")
        print(f" Est cloud cost: ${cost:.2f}")
    else:
        print(" No data yet. Run: python3 model_tracker.py ingest")

    # Model breakdown
    print(f"\n {'MODEL':<30} {'SESS':>6} {'MSGS':>7} {'TOOLS':>6} {'LOCAL':>6} {'$EST':>8}")
    print(f" {'-'*30} {'-'*6} {'-'*7} {'-'*6} {'-'*6} {'-'*8}")
    rows = conn.execute(
        "\n"
        "SELECT model, SUM(sessions), SUM(messages), SUM(tool_calls), is_local, SUM(est_cost_usd)\n"
        "FROM session_stats\n"
        "WHERE timestamp > ?\n"
        "GROUP BY model\n"
        "ORDER BY SUM(sessions) DESC\n",
        (cutoff,),
    ).fetchall()
    for model, sess, msgs, tools, local, cost in rows:
        flag = "yes" if local else "no"
        # SUM() yields NULL when every summed value is NULL; show 0 instead.
        print(
            f" {model:<30} {sess or 0:>6} {msgs or 0:>7} {tools or 0:>6} "
            f"{flag:>6} ${cost or 0.0:>7.2f}"
        )

    # Eval scores if any
    evals = conn.execute(
        "\n"
        "SELECT model, task, AVG(score), COUNT(*), AVG(latency_s)\n"
        "FROM evals\n"
        "WHERE timestamp > ?\n"
        "GROUP BY model, task\n"
        "ORDER BY model, task\n",
        (cutoff,),
    ).fetchall()
    if evals:
        print(f"\n {'MODEL':<25} {'TASK':<15} {'AVG SCORE':>9} {'RUNS':>5} {'AVG LAT':>8}")
        print(f" {'-'*25} {'-'*15} {'-'*9} {'-'*5} {'-'*8}")
        for model, task, score, runs, lat in evals:
            # AVG() is NULL when no row in the group has a value, e.g. evals
            # recorded without a latency.
            score_txt = f"{score:.2f}" if score is not None else "n/a"
            lat_txt = f"{lat:.1f}s" if lat is not None else "n/a"
            print(f" {model:<25} {task:<15} {score_txt:>9} {runs:>5} {lat_txt:>8}")
    print(f"\n{'='*60}\n")
def record_eval(conn, model, task, score, latency=None, tokens_in=None, tokens_out=None, notes=None):
    """Insert one evaluation result into the evals table and commit it.

    The row is stamped with the current time. Optional fields left as None
    are stored as NULL. A one-line confirmation is printed afterwards.

    Args:
        conn: Open sqlite3 connection that has an evals table.
        model: Name of the evaluated model.
        task: Name of the task the model was evaluated on.
        score: Score to store for this run.
        latency: Optional latency, stored in the latency_s column.
        tokens_in: Optional input token count.
        tokens_out: Optional output token count.
        notes: Optional free-form notes.
    """
    insert_sql = (
        "\n"
        "INSERT INTO evals (timestamp, model, task, score, latency_s, tokens_in, tokens_out, notes)\n"
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)\n"
    )
    values = (time.time(), model, task, score, latency, tokens_in, tokens_out, notes)
    conn.execute(insert_sql, values)
    conn.commit()
    print(f"Recorded: {model} | {task} | score={score}")
def main():
    """Command-line entry point for the tracker.

    Subcommands:
        ingest: Ingest recent sessions from the Hermes session DB.
        report: Print the sovereignty dashboard.
        record: Store one eval result. ``--model``, ``--task`` and
            ``--score`` are required; ``--latency``, ``--tokens-in``,
            ``--tokens-out`` and ``--notes`` are optional.

    With no subcommand, runs an ingest followed by a report, both over the
    last 3 days. The metrics database connection is always closed before
    returning, even if a subcommand raises.
    """
    parser = argparse.ArgumentParser(description="Timmy Model Performance Tracker")
    sub = parser.add_subparsers(dest="cmd")

    p_ingest = sub.add_parser("ingest", help="Ingest from Hermes session DB")
    p_ingest.add_argument("--days", type=int, default=3)

    p_report = sub.add_parser("report", help="Show sovereignty dashboard")
    p_report.add_argument("--days", type=int, default=3)

    p_record = sub.add_parser("record", help="Record an eval")
    p_record.add_argument("--model", required=True)
    p_record.add_argument("--task", required=True)
    p_record.add_argument("--score", type=float, required=True)
    p_record.add_argument("--latency", type=float)
    # The evals table and record_eval() already carry token counts; these
    # flags make them settable from the command line.
    p_record.add_argument("--tokens-in", type=int)
    p_record.add_argument("--tokens-out", type=int)
    p_record.add_argument("--notes")

    args = parser.parse_args()
    conn = init_db()
    try:
        if args.cmd == "ingest":
            ingest_from_hermes(conn, args.days)
        elif args.cmd == "report":
            report(conn, args.days)
        elif args.cmd == "record":
            record_eval(
                conn,
                args.model,
                args.task,
                args.score,
                args.latency,
                tokens_in=args.tokens_in,
                tokens_out=args.tokens_out,
                notes=args.notes,
            )
        else:
            # Default: ingest + report
            ingest_from_hermes(conn, 3)
            report(conn, 3)
    finally:
        # Release the database handle even when a subcommand fails.
        conn.close()


if __name__ == "__main__":
    main()