Tracked: morrowind agent (py/cfg), skills/, training-data/, research/, notes/, specs/, test-results/, metrics/, heartbeat/, briefings/, memories/, skins/, hooks/, decisions.md, OPERATIONS.md, SOUL.md Excluded: screenshots, PNGs, binaries, sessions, databases, secrets, audio cache, timmy-config/ and timmy-telemetry/ (separate repos)
269 lines
9.3 KiB
Python
269 lines
9.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
Timmy Model Performance Tracker — What you measure, you manage.

Tracks: local vs cloud usage, response quality, latency, cost estimates.
Stores in SQLite at ~/.timmy/metrics/model_metrics.db

Usage:
    # Record a metric
    python3 model_tracker.py record --model timmy:v0.1-q4 --task identity --score 0.9 --latency 1.2

    # Report
    python3 model_tracker.py report
    python3 model_tracker.py report --days 7

    # Ingest from hermes session DB
    python3 model_tracker.py ingest
"""
|
|
|
|
import sqlite3
|
|
import time
|
|
import json
|
|
import argparse
|
|
import os
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
|
|
# SQLite file holding all tracker tables, under the current user's home directory.
DB_PATH = Path.home().joinpath(".timmy", "metrics", "model_metrics.db")
|
|
|
|
# Cost estimates per 1M tokens (input/output).
# Maps a model name to {"input": <rate>, "output": <rate>}.
COST_TABLE = {
    model_name: {"input": input_rate, "output": output_rate}
    for model_name, input_rate, output_rate in (
        ("claude-opus-4-6", 15.0, 75.0),
        ("claude-sonnet-4-20250514", 3.0, 15.0),
        ("claude-sonnet-4-6", 3.0, 15.0),
        ("claude-haiku-4-20250414", 0.25, 1.25),
        # Local models = $0
        ("timmy:v0.1-q4", 0, 0),
        ("hermes3:8b", 0, 0),
        ("hermes3:latest", 0, 0),
        ("hermes4:36b", 0, 0),
        ("qwen3:30b", 0, 0),
        ("qwen3.5:latest", 0, 0),
        ("qwen2.5:14b", 0, 0),
        ("llama3.1:latest", 0, 0),
        ("llama3.2:latest", 0, 0),
        ("glm-4.7-flash:latest", 0, 0),
    )
}
|
|
|
|
def is_local(model):
    """Check if a model runs locally (zero cloud cost).

    Args:
        model: Model identifier string such as ``"timmy:v0.1-q4"``. Falsy
            values (``None``, ``""``) are never considered local.

    Returns:
        ``True`` if the model is treated as local, ``False`` otherwise.
    """
    if not model:
        return False

    costs = COST_TABLE.get(model)
    if costs is not None:
        # An explicit pricing entry is authoritative: the model is local only
        # when both rates are zero. A priced model must never fall through to
        # the name heuristic below and be misreported as local.
        return costs.get("input", 1) == 0 and costs.get("output", 1) == 0

    # Heuristic for models without a pricing entry: if it has a colon and no
    # slash, it's probably Ollama (and anything named "claude" is cloud).
    return ":" in model and "/" not in model and "claude" not in model
|
|
|
|
def init_db():
    """Open the metrics database, creating its folder and tables when absent.

    The parent directory of ``DB_PATH`` is created if needed, then the
    ``evals``, ``session_stats`` and ``sovereignty_score`` tables are created
    with ``CREATE TABLE IF NOT EXISTS`` and the changes are committed.

    Returns:
        The open ``sqlite3`` connection to ``DB_PATH``.
    """
    schema = (
        """
        CREATE TABLE IF NOT EXISTS evals (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            model TEXT NOT NULL,
            task TEXT NOT NULL,
            score REAL,
            latency_s REAL,
            tokens_in INTEGER,
            tokens_out INTEGER,
            notes TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS session_stats (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            period TEXT NOT NULL,
            model TEXT NOT NULL,
            source TEXT,
            sessions INTEGER,
            messages INTEGER,
            tool_calls INTEGER,
            est_cost_usd REAL,
            is_local INTEGER
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS sovereignty_score (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            period TEXT NOT NULL,
            total_sessions INTEGER,
            local_sessions INTEGER,
            cloud_sessions INTEGER,
            local_pct REAL,
            est_cloud_cost REAL,
            est_saved REAL
        )
        """,
    )

    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    db = sqlite3.connect(str(DB_PATH))
    for ddl in schema:
        db.execute(ddl)
    db.commit()
    return db
|
|
|
|
def ingest_from_hermes(conn, days=1):
    """Pull session data from Hermes state.db and compute metrics.

    Reads per-(model, source) aggregates from the ``sessions`` table of
    ``~/.hermes/state.db`` for sessions whose ``started_at`` is later than
    ``now - days * 86400`` seconds. It then writes one ``session_stats`` row
    per group and one ``sovereignty_score`` summary row to ``conn``, commits,
    and prints a short summary.

    Args:
        conn: Open sqlite3 connection to the metrics database; it must
            already contain the ``session_stats`` and ``sovereignty_score``
            tables.
        days: Size of the look-back window in days.

    Returns:
        None. If the Hermes database file does not exist, a message is
        printed and nothing is written.
    """
    hermes_db = Path.home() / ".hermes" / "state.db"
    if not hermes_db.exists():
        print("No hermes state.db found")
        return

    cutoff = time.time() - (days * 86400)
    period = f"{days}d"

    hconn = sqlite3.connect(str(hermes_db))
    try:
        rows = hconn.execute("""
            SELECT model, source, COUNT(*) as sessions,
                   SUM(message_count) as msgs,
                   SUM(tool_call_count) as tools
            FROM sessions
            WHERE started_at > ? AND model IS NOT NULL AND model != ''
            GROUP BY model, source
        """, (cutoff,)).fetchall()
    finally:
        # Always release the Hermes handle, even when the query fails
        # (for example because the sessions table is missing).
        hconn.close()

    now = time.time()
    total_sessions = 0
    local_sessions = 0
    cloud_sessions = 0
    est_cloud_cost = 0.0

    for model, source, sessions, msgs, tools in rows:
        local = is_local(model)
        msgs = msgs or 0
        tools = tools or 0

        if local:
            # Local models cost nothing, including ones recognised only by
            # the name heuristic and therefore absent from COST_TABLE.
            est_cost = 0.0
        else:
            # Rough cost estimate: ~500 tokens per message avg, priced at the
            # mean of the input and output rates. Unknown cloud models use a
            # fallback rate.
            est_tokens = msgs * 500
            costs = COST_TABLE.get(model, {"input": 5.0, "output": 15.0})
            est_cost = (est_tokens / 1_000_000) * (costs["input"] + costs["output"]) / 2

        conn.execute("""
            INSERT INTO session_stats (timestamp, period, model, source, sessions, messages, tool_calls, est_cost_usd, is_local)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (now, period, model, source, sessions, msgs, tools, round(est_cost, 4), 1 if local else 0))

        total_sessions += sessions
        if local:
            local_sessions += sessions
        else:
            cloud_sessions += sessions
            est_cloud_cost += est_cost

    local_pct = (local_sessions / total_sessions * 100) if total_sessions > 0 else 0
    # Estimated savings: a flat, rough $0.05 per session stands in for the
    # all-cloud price; the result is floored at zero.
    est_if_all_cloud = total_sessions * 0.05
    est_saved = max(0, est_if_all_cloud - est_cloud_cost)

    conn.execute("""
        INSERT INTO sovereignty_score (timestamp, period, total_sessions, local_sessions, cloud_sessions, local_pct, est_cloud_cost, est_saved)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (now, period, total_sessions, local_sessions, cloud_sessions, round(local_pct, 1), round(est_cloud_cost, 4), round(est_saved, 4)))

    conn.commit()
    print(f"Ingested {days}d: {total_sessions} sessions ({local_sessions} local, {cloud_sessions} cloud)")
    print(f" Sovereignty: {local_pct:.1f}% local")
    print(f" Est cloud cost: ${est_cloud_cost:.2f}")
|
|
|
|
def report(conn, days=3):
    """Print the sovereignty dashboard.

    Prints three sections to stdout: the most recent ``sovereignty_score``
    row as a progress bar, a per-model breakdown summed from ``session_stats``
    rows newer than the cutoff, and (when present) average eval score and
    latency per (model, task) from ``evals`` rows newer than the cutoff.

    Args:
        conn: Open sqlite3 connection to the metrics database.
        days: Look-back window in days for the model breakdown and eval
            sections. The sovereignty score line always shows the latest
            stored row regardless of ``days``.

    Returns:
        None.
    """
    cutoff = time.time() - (days * 86400)

    print(f"\n{'='*60}")
    print(f" TIMMY SOVEREIGNTY METRICS — Last {days} days")
    print(f"{'='*60}\n")

    # Latest sovereignty score
    row = conn.execute("""
        SELECT local_pct, total_sessions, local_sessions, cloud_sessions, est_cloud_cost
        FROM sovereignty_score ORDER BY timestamp DESC LIMIT 1
    """).fetchone()

    if row:
        pct, total, local, cloud, cost = row
        bar_len = 40
        filled = int(pct / 100 * bar_len)
        bar = "█" * filled + "░" * (bar_len - filled)
        print(f" SOVEREIGNTY SCORE: [{bar}] {pct:.1f}%")
        print(f" Sessions: {total} total | {local} local | {cloud} cloud")
        print(f" Est cloud cost: ${cost:.2f}")
    else:
        print(" No data yet. Run: python3 model_tracker.py ingest")

    # Model breakdown
    print(f"\n {'MODEL':<30} {'SESS':>6} {'MSGS':>7} {'TOOLS':>6} {'LOCAL':>6} {'$EST':>8}")
    print(f" {'-'*30} {'-'*6} {'-'*7} {'-'*6} {'-'*6} {'-'*8}")

    # NOTE(review): every session_stats row inside the window is summed, so
    # ingests whose periods overlap are counted more than once — confirm this
    # matches how ingest is scheduled.
    rows = conn.execute("""
        SELECT model, SUM(sessions), SUM(messages), SUM(tool_calls), is_local, SUM(est_cost_usd)
        FROM session_stats
        WHERE timestamp > ?
        GROUP BY model
        ORDER BY SUM(sessions) DESC
    """, (cutoff,)).fetchall()

    for model, sess, msgs, tools, local, cost in rows:
        flag = " ✓" if local else " ✗"
        # The summed columns are nullable; SUM() yields NULL when every value
        # in the group is NULL, so fall back to zero instead of failing to format.
        print(f" {model:<30} {sess or 0:>6} {msgs or 0:>7} {tools or 0:>6} {flag:>6} ${cost or 0.0:>7.2f}")

    # Eval scores if any
    evals = conn.execute("""
        SELECT model, task, AVG(score), COUNT(*), AVG(latency_s)
        FROM evals
        WHERE timestamp > ?
        GROUP BY model, task
        ORDER BY model, task
    """, (cutoff,)).fetchall()

    if evals:
        print(f"\n {'MODEL':<25} {'TASK':<15} {'AVG SCORE':>9} {'RUNS':>5} {'AVG LAT':>8}")
        print(f" {'-'*25} {'-'*15} {'-'*9} {'-'*5} {'-'*8}")
        for model, task, score, runs, lat in evals:
            # AVG() is NULL when no row in the group has a value, e.g. evals
            # recorded without a latency; show a placeholder of equal width.
            score_txt = f"{score:>9.2f}" if score is not None else f"{'n/a':>9}"
            lat_txt = f"{lat:>7.1f}s" if lat is not None else f"{'n/a':>8}"
            print(f" {model:<25} {task:<15} {score_txt} {runs:>5} {lat_txt}")

    print(f"\n{'='*60}\n")
|
|
|
|
def record_eval(conn, model, task, score, latency=None, tokens_in=None, tokens_out=None, notes=None):
    """Store a single evaluation result in the ``evals`` table.

    The row is stamped with the current ``time.time()``, committed
    immediately, and a one-line confirmation is printed.

    Args:
        conn: Open sqlite3 connection containing the ``evals`` table.
        model: Name of the evaluated model.
        task: Name of the evaluated task.
        score: Score to store.
        latency: Optional latency, stored in the ``latency_s`` column.
        tokens_in: Optional input token count.
        tokens_out: Optional output token count.
        notes: Optional free-form notes.

    Returns:
        None.
    """
    insert_sql = """
        INSERT INTO evals (timestamp, model, task, score, latency_s, tokens_in, tokens_out, notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    recorded_at = time.time()
    values = (recorded_at, model, task, score, latency, tokens_in, tokens_out, notes)

    conn.execute(insert_sql, values)
    conn.commit()
    print(f"Recorded: {model} | {task} | score={score}")
|
|
|
|
def main():
    """Command-line entry point.

    Subcommands:
        ingest: ingest from the Hermes session DB (``--days``, default 3).
        report: show the sovereignty dashboard (``--days``, default 3).
        record: record an eval (``--model``, ``--task`` and ``--score`` are
            required; ``--latency``, ``--tokens-in``, ``--tokens-out`` and
            ``--notes`` are optional).

    With no subcommand, a 3-day ingest is run followed by a 3-day report.
    The metrics connection is closed on exit, including when a subcommand
    raises.
    """
    parser = argparse.ArgumentParser(description="Timmy Model Performance Tracker")
    sub = parser.add_subparsers(dest="cmd")

    p_ingest = sub.add_parser("ingest", help="Ingest from Hermes session DB")
    p_ingest.add_argument("--days", type=int, default=3)

    p_report = sub.add_parser("report", help="Show sovereignty dashboard")
    p_report.add_argument("--days", type=int, default=3)

    p_record = sub.add_parser("record", help="Record an eval")
    p_record.add_argument("--model", required=True)
    p_record.add_argument("--task", required=True)
    p_record.add_argument("--score", type=float, required=True)
    p_record.add_argument("--latency", type=float)
    # Optional token counts: record_eval and the evals table already accept
    # them, so expose them on the command line as well.
    p_record.add_argument("--tokens-in", type=int)
    p_record.add_argument("--tokens-out", type=int)
    p_record.add_argument("--notes")

    args = parser.parse_args()
    conn = init_db()

    try:
        if args.cmd == "ingest":
            ingest_from_hermes(conn, args.days)
        elif args.cmd == "report":
            report(conn, args.days)
        elif args.cmd == "record":
            record_eval(
                conn,
                args.model,
                args.task,
                args.score,
                args.latency,
                tokens_in=args.tokens_in,
                tokens_out=args.tokens_out,
                notes=args.notes,
            )
        else:
            # Default: ingest + report
            ingest_from_hermes(conn, 3)
            report(conn, 3)
    finally:
        # Release the database handle even if a subcommand fails.
        conn.close()


if __name__ == "__main__":
    main()
|