#!/usr/bin/env python3
"""
Timmy Model Performance Tracker — What you measure, you manage.

Tracks: local vs cloud usage, response quality, latency, cost estimates.
Stores in SQLite at ~/.timmy/metrics/model_metrics.db

Usage:
    # Record a metric
    python3 model_tracker.py record --model timmy:v0.1-q4 --task identity --score 0.9 --latency 1.2

    # Report
    python3 model_tracker.py report
    python3 model_tracker.py report --days 7

    # Ingest from hermes session DB
    python3 model_tracker.py ingest
"""

import sqlite3
import time
import json
import argparse
import os
from pathlib import Path
from datetime import datetime, timedelta

DB_PATH = Path.home() / ".timmy" / "metrics" / "model_metrics.db"

# Cost estimates per 1M tokens (input/output)
COST_TABLE = {
    "claude-opus-4-6": {"input": 15.0, "output": 75.0},
    "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
    "claude-sonnet-4-6": {"input": 3.0, "output": 15.0},
    "claude-haiku-4-20250414": {"input": 0.25, "output": 1.25},
    # Local models = $0
    "timmy:v0.1-q4": {"input": 0, "output": 0},
    "hermes3:8b": {"input": 0, "output": 0},
    "hermes3:latest": {"input": 0, "output": 0},
    "hermes4:36b": {"input": 0, "output": 0},
    "qwen3:30b": {"input": 0, "output": 0},
    "qwen3.5:latest": {"input": 0, "output": 0},
    "qwen2.5:14b": {"input": 0, "output": 0},
    "llama3.1:latest": {"input": 0, "output": 0},
    "llama3.2:latest": {"input": 0, "output": 0},
    "glm-4.7-flash:latest": {"input": 0, "output": 0},
}

# Per-1M-token prices assumed for a non-local model missing from COST_TABLE.
_FALLBACK_COSTS = {"input": 5.0, "output": 15.0}
# Rough average used to turn a message count into a token count.
_AVG_TOKENS_PER_MESSAGE = 500
# Rough per-session price used for the "what if everything ran in the cloud" baseline.
_AVG_CLOUD_COST_PER_SESSION = 0.05


def is_local(model):
    """Check if a model runs locally (zero cloud cost).

    Returns True when the model is listed in COST_TABLE with zero input and
    output cost, or when its name looks like an Ollama tag (contains ":",
    no "/", and no "claude"). Empty or None names are never local.
    """
    if not model:
        return False
    costs = COST_TABLE.get(model, {})
    if costs.get("input", 1) == 0 and costs.get("output", 1) == 0:
        return True
    # Heuristic: if it has a colon and no slash, it's probably Ollama
    if ":" in model and "/" not in model and "claude" not in model:
        return True
    return False


def init_db():
    """Open the metrics database at DB_PATH, creating it and its tables if needed.

    Returns the open sqlite3 connection; the caller is responsible for
    closing it.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(DB_PATH))
    conn.execute("""
        CREATE TABLE IF NOT EXISTS evals (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            model TEXT NOT NULL,
            task TEXT NOT NULL,
            score REAL,
            latency_s REAL,
            tokens_in INTEGER,
            tokens_out INTEGER,
            notes TEXT
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS session_stats (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            period TEXT NOT NULL,
            model TEXT NOT NULL,
            source TEXT,
            sessions INTEGER,
            messages INTEGER,
            tool_calls INTEGER,
            est_cost_usd REAL,
            is_local INTEGER
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS sovereignty_score (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            period TEXT NOT NULL,
            total_sessions INTEGER,
            local_sessions INTEGER,
            cloud_sessions INTEGER,
            local_pct REAL,
            est_cloud_cost REAL,
            est_saved REAL
        )
    """)
    conn.commit()
    return conn


def _estimate_cost(model, messages, local):
    """Return the estimated USD cost of `messages` messages on `model`."""
    if local:
        # Local inference has no cloud bill, even when the model is only
        # recognised by the name heuristic and is absent from COST_TABLE.
        return 0.0
    tokens = (messages or 0) * _AVG_TOKENS_PER_MESSAGE
    costs = COST_TABLE.get(model, _FALLBACK_COSTS)
    # Average of the input and output price, since the split is unknown.
    return (tokens / 1_000_000) * (costs["input"] + costs["output"]) / 2


def ingest_from_hermes(conn, days=1):
    """Pull session data from Hermes state.db and compute metrics.

    Reads the `sessions` table of ~/.hermes/state.db for sessions started in
    the last `days` days, writes one `session_stats` row per (model, source)
    group and one `sovereignty_score` summary row into `conn`, commits, and
    prints a short summary. Prints a message and returns without writing
    anything when the Hermes database file does not exist.
    """
    hermes_db = Path.home() / ".hermes" / "state.db"
    if not hermes_db.exists():
        print("No hermes state.db found")
        return

    cutoff = time.time() - (days * 86400)
    period = f"{days}d"

    hconn = sqlite3.connect(str(hermes_db))
    try:
        rows = hconn.execute("""
            SELECT model, source, COUNT(*) as sessions,
                   SUM(message_count) as msgs,
                   SUM(tool_call_count) as tools
            FROM sessions
            WHERE started_at > ? AND model IS NOT NULL AND model != ''
            GROUP BY model, source
        """, (cutoff,)).fetchall()
    finally:
        # Release the Hermes DB even if the query fails (e.g. missing table).
        hconn.close()

    # One shared timestamp marks every row written by this ingest run.
    now = time.time()
    total_sessions = 0
    local_sessions = 0
    cloud_sessions = 0
    est_cloud_cost = 0.0

    for model, source, sessions, msgs, tools in rows:
        local = is_local(model)
        est_cost = _estimate_cost(model, msgs, local)

        conn.execute("""
            INSERT INTO session_stats
                (timestamp, period, model, source, sessions, messages,
                 tool_calls, est_cost_usd, is_local)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (now, period, model, source, sessions, msgs or 0, tools or 0,
              round(est_cost, 4), 1 if local else 0))

        total_sessions += sessions
        if local:
            local_sessions += sessions
        else:
            cloud_sessions += sessions
            est_cloud_cost += est_cost

    local_pct = (local_sessions / total_sessions * 100) if total_sessions > 0 else 0

    # Estimate saved = what it would cost if everything ran on Sonnet
    est_if_all_cloud = total_sessions * _AVG_CLOUD_COST_PER_SESSION
    est_saved = max(0, est_if_all_cloud - est_cloud_cost)

    conn.execute("""
        INSERT INTO sovereignty_score
            (timestamp, period, total_sessions, local_sessions, cloud_sessions,
             local_pct, est_cloud_cost, est_saved)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (now, period, total_sessions, local_sessions, cloud_sessions,
          round(local_pct, 1), round(est_cloud_cost, 4), round(est_saved, 4)))
    conn.commit()

    print(f"Ingested {days}d: {total_sessions} sessions ({local_sessions} local, {cloud_sessions} cloud)")
    print(f" Sovereignty: {local_pct:.1f}% local")
    print(f" Est cloud cost: ${est_cloud_cost:.2f}")


def _fmt(value, spec, width):
    """Format `value` with `spec`, or a right-aligned "-" when it is None."""
    if value is None:
        return f"{'-':>{width}}"
    return format(value, spec)


def report(conn, days=3):
    """Print the sovereignty dashboard.

    Shows the most recent `sovereignty_score` row (regardless of `days`),
    then per-model totals from `session_stats` and per-(model, task) averages
    from `evals` for rows newer than `days` days.

    NOTE(review): every ingest run appends fresh `session_stats` rows, so the
    per-model sums add up all snapshots inside the window; overlapping
    ingests are counted more than once.
    """
    cutoff = time.time() - (days * 86400)

    print(f"\n{'='*60}")
    print(f" TIMMY SOVEREIGNTY METRICS — Last {days} days")
    print(f"{'='*60}\n")

    # Latest sovereignty score
    row = conn.execute("""
        SELECT local_pct, total_sessions, local_sessions, cloud_sessions, est_cloud_cost
        FROM sovereignty_score
        ORDER BY timestamp DESC
        LIMIT 1
    """).fetchone()

    if row:
        pct, total, local, cloud, cost = row
        pct = pct or 0.0
        cost = cost or 0.0
        bar_len = 40
        # Clamp so an out-of-range percentage cannot over- or under-fill the bar.
        filled = max(0, min(bar_len, int(pct / 100 * bar_len)))
        bar = "█" * filled + "░" * (bar_len - filled)
        print(f" SOVEREIGNTY SCORE: [{bar}] {pct:.1f}%")
        print(f" Sessions: {total} total | {local} local | {cloud} cloud")
        print(f" Est cloud cost: ${cost:.2f}")
    else:
        print(" No data yet. Run: python3 model_tracker.py ingest")

    # Model breakdown
    print(f"\n {'MODEL':<30} {'SESS':>6} {'MSGS':>7} {'TOOLS':>6} {'LOCAL':>6} {'$EST':>8}")
    print(f" {'-'*30} {'-'*6} {'-'*7} {'-'*6} {'-'*6} {'-'*8}")

    rows = conn.execute("""
        SELECT model, SUM(sessions), SUM(messages), SUM(tool_calls),
               is_local, SUM(est_cost_usd)
        FROM session_stats
        WHERE timestamp > ?
        GROUP BY model
        ORDER BY SUM(sessions) DESC
    """, (cutoff,)).fetchall()

    for model, sess, msgs, tools, local, cost in rows:
        flag = " ✓" if local else " ✗"
        # The summed columns are nullable, so treat a NULL sum as zero.
        print(f" {model:<30} {sess or 0:>6} {msgs or 0:>7} {tools or 0:>6} {flag:>6} ${cost or 0.0:>7.2f}")

    # Eval scores if any
    evals = conn.execute("""
        SELECT model, task, AVG(score), COUNT(*), AVG(latency_s)
        FROM evals
        WHERE timestamp > ?
        GROUP BY model, task
        ORDER BY model, task
    """, (cutoff,)).fetchall()

    if evals:
        print(f"\n {'MODEL':<25} {'TASK':<15} {'AVG SCORE':>9} {'RUNS':>5} {'AVG LAT':>8}")
        print(f" {'-'*25} {'-'*15} {'-'*9} {'-'*5} {'-'*8}")
        for model, task, score, runs, lat in evals:
            # AVG() is NULL when no row in the group has a value, e.g. evals
            # recorded without --latency; show "-" instead of crashing.
            score_txt = _fmt(score, ">9.2f", 9)
            lat_txt = f"{lat:>7.1f}s" if lat is not None else f"{'-':>8}"
            print(f" {model:<25} {task:<15} {score_txt} {runs:>5} {lat_txt}")

    print(f"\n{'='*60}\n")


def record_eval(conn, model, task, score, latency=None, tokens_in=None, tokens_out=None, notes=None):
    """Insert one evaluation row into `evals`, commit, and print a confirmation.

    `latency`, `tokens_in`, `tokens_out` and `notes` are optional and are
    stored as NULL when omitted. The row is timestamped with the current time.
    """
    conn.execute("""
        INSERT INTO evals (timestamp, model, task, score, latency_s, tokens_in, tokens_out, notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (time.time(), model, task, score, latency, tokens_in, tokens_out, notes))
    conn.commit()
    print(f"Recorded: {model} | {task} | score={score}")


def main():
    """Parse the command line and run the ingest, report or record command.

    With no subcommand, ingests the last 3 days and then prints the report.
    """
    parser = argparse.ArgumentParser(description="Timmy Model Performance Tracker")
    sub = parser.add_subparsers(dest="cmd")

    p_ingest = sub.add_parser("ingest", help="Ingest from Hermes session DB")
    p_ingest.add_argument("--days", type=int, default=3)

    p_report = sub.add_parser("report", help="Show sovereignty dashboard")
    p_report.add_argument("--days", type=int, default=3)

    p_record = sub.add_parser("record", help="Record an eval")
    p_record.add_argument("--model", required=True)
    p_record.add_argument("--task", required=True)
    p_record.add_argument("--score", type=float, required=True)
    p_record.add_argument("--latency", type=float)
    p_record.add_argument("--tokens-in", type=int)
    p_record.add_argument("--tokens-out", type=int)
    p_record.add_argument("--notes")

    args = parser.parse_args()
    conn = init_db()
    try:
        if args.cmd == "ingest":
            ingest_from_hermes(conn, args.days)
        elif args.cmd == "report":
            report(conn, args.days)
        elif args.cmd == "record":
            record_eval(conn, args.model, args.task, args.score, args.latency,
                        tokens_in=args.tokens_in, tokens_out=args.tokens_out,
                        notes=args.notes)
        else:
            # Default: ingest + report
            ingest_from_hermes(conn, 3)
            report(conn, 3)
    finally:
        # Close the metrics DB even when a command raises.
        conn.close()


if __name__ == "__main__":
    main()