# timmy-home/metrics/model_tracker.py

#!/usr/bin/env python3
"""
Timmy Model Performance Tracker — What you measure, you manage.

Tracks: local vs cloud usage, response quality, latency, cost estimates.
Stores in SQLite at ~/.timmy/metrics/model_metrics.db

Usage:
  # Record a metric
  python3 model_tracker.py record --model timmy:v0.1-q4 --task identity --score 0.9 --latency 1.2

  # Report
  python3 model_tracker.py report
  python3 model_tracker.py report --days 7

  # Ingest from hermes session DB
  python3 model_tracker.py ingest
"""

import sqlite3
import time
import json
import argparse
import os
from pathlib import Path
from datetime import datetime, timedelta

# Location of the tracker's own SQLite database.
DB_PATH = Path.home().joinpath(".timmy", "metrics", "model_metrics.db")

# Models listed explicitly as local; each one gets a zero-cost entry below.
_LOCAL_MODELS = (
    "timmy:v0.1-q4",
    "hermes3:8b",
    "hermes3:latest",
    "hermes4:36b",
    "qwen3:30b",
    "qwen3.5:latest",
    "qwen2.5:14b",
    "llama3.1:latest",
    "llama3.2:latest",
    "glm-4.7-flash:latest",
)

# Cost estimates per 1M tokens (input/output)
COST_TABLE = {
    "claude-opus-4-6": {"input": 15.0, "output": 75.0},
    "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
    "claude-sonnet-4-6": {"input": 3.0, "output": 15.0},
    "claude-haiku-4-20250414": {"input": 0.25, "output": 1.25},
    # Local models = $0 (a separate dict per model, so entries stay independent)
    **{name: {"input": 0, "output": 0} for name in _LOCAL_MODELS},
}

def is_local(model):
    """Return True when *model* is treated as running locally (zero cloud cost).

    A model counts as local when COST_TABLE lists it with both input and
    output cost equal to 0, or when its name matches the Ollama-style
    heuristic: it contains a colon, has no slash, and does not mention
    "claude". Empty or missing model names are never local.
    """
    if not model:
        return False

    pricing = COST_TABLE.get(model, {})
    # Missing price fields default to 1 so unknown models are not "free".
    zero_cost = all(pricing.get(side, 1) == 0 for side in ("input", "output"))

    # Heuristic: a colon without a slash is probably an Ollama tag.
    looks_like_ollama = ":" in model and not ("/" in model or "claude" in model)

    return zero_cost or looks_like_ollama

def init_db():
    """Open the metrics database, creating its directory and tables if missing.

    Ensures the parent directory of DB_PATH exists, connects to the SQLite
    file, issues a CREATE TABLE IF NOT EXISTS for each of the three tables
    (evals, session_stats, sovereignty_score), commits, and returns the open
    connection. The caller is responsible for closing it.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    db = sqlite3.connect(str(DB_PATH))

    # One statement per table, executed in order. IF NOT EXISTS makes the
    # whole function safe to call against an already-initialised database.
    table_ddl = (
        # evals: one row per recorded evaluation result.
        """
        CREATE TABLE IF NOT EXISTS evals (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            model TEXT NOT NULL,
            task TEXT NOT NULL,
            score REAL,
            latency_s REAL,
            tokens_in INTEGER,
            tokens_out INTEGER,
            notes TEXT
        )
    """,
        # session_stats: per-model/source aggregates written by each ingest.
        """
        CREATE TABLE IF NOT EXISTS session_stats (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            period TEXT NOT NULL,
            model TEXT NOT NULL,
            source TEXT,
            sessions INTEGER,
            messages INTEGER,
            tool_calls INTEGER,
            est_cost_usd REAL,
            is_local INTEGER
        )
    """,
        # sovereignty_score: one local-vs-cloud summary row per ingest.
        """
        CREATE TABLE IF NOT EXISTS sovereignty_score (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp REAL NOT NULL,
            period TEXT NOT NULL,
            total_sessions INTEGER,
            local_sessions INTEGER,
            cloud_sessions INTEGER,
            local_pct REAL,
            est_cloud_cost REAL,
            est_saved REAL
        )
    """,
    )
    for ddl in table_ddl:
        db.execute(ddl)

    db.commit()
    return db

def ingest_from_hermes(conn, days=1):
    """Pull session data from Hermes state.db and compute metrics.

    Reads ``~/.hermes/state.db``, aggregates its ``sessions`` table per
    (model, source) for sessions whose ``started_at`` falls inside the last
    ``days`` days, and writes into ``conn``:

    * one ``session_stats`` row per (model, source) group, and
    * one ``sovereignty_score`` row summarising local vs cloud sessions.

    All rows written by one call share the same timestamp and a period label
    of ``"<days>d"``. The function commits and prints a short summary.

    Args:
        conn: Open sqlite3 connection to the metrics database (the tables
            created by ``init_db`` must exist).
        days: Size of the look-back window, in days.

    Returns:
        None. If the Hermes database file does not exist, a notice is printed
        and nothing is written.
    """
    hermes_db = Path.home() / ".hermes" / "state.db"
    if not hermes_db.exists():
        print("No hermes state.db found")
        return

    # NOTE(review): comparing started_at with time.time() assumes it is stored
    # as epoch seconds — confirm against the Hermes schema.
    cutoff = time.time() - (days * 86400)
    period = f"{days}d"

    # The result set is fully materialised by fetchall(), so the Hermes
    # connection can be released immediately — and is released even when the
    # query raises (e.g. missing table or locked database).
    hconn = sqlite3.connect(str(hermes_db))
    try:
        rows = hconn.execute("""
            SELECT model, source, COUNT(*) as sessions,
                   SUM(message_count) as msgs,
                   SUM(tool_call_count) as tools
            FROM sessions
            WHERE started_at > ? AND model IS NOT NULL AND model != ''
            GROUP BY model, source
        """, (cutoff,)).fetchall()
    finally:
        hconn.close()

    now = time.time()
    total_sessions = 0
    local_sessions = 0
    cloud_sessions = 0
    est_cloud_cost = 0.0

    for model, source, sessions, msgs, tools in rows:
        local = is_local(model)
        if local:
            # Local models cost nothing. Without this guard, a model that is
            # local only by the name heuristic (absent from COST_TABLE) would
            # be priced with the cloud fallback rates below.
            est_cost = 0.0
        else:
            # Rough cost estimate: ~500 tokens per message avg
            avg_tokens = (msgs or 0) * 500
            # Unknown cloud models fall back to a mid-range price.
            costs = COST_TABLE.get(model, {"input": 5.0, "output": 15.0})
            est_cost = (avg_tokens / 1_000_000) * (costs["input"] + costs["output"]) / 2

        conn.execute("""
            INSERT INTO session_stats (timestamp, period, model, source, sessions, messages, tool_calls, est_cost_usd, is_local)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (now, period, model, source, sessions, msgs or 0, tools or 0, round(est_cost, 4), 1 if local else 0))

        total_sessions += sessions
        if local:
            local_sessions += sessions
        else:
            cloud_sessions += sessions
            est_cloud_cost += est_cost

    local_pct = (local_sessions / total_sessions * 100) if total_sessions > 0 else 0
    # Estimated savings = hypothetical all-cloud spend (flat, rough
    # $0.05 per session) minus the estimated actual cloud spend, floored at 0.
    est_if_all_cloud = total_sessions * 0.05
    est_saved = max(0, est_if_all_cloud - est_cloud_cost)

    conn.execute("""
        INSERT INTO sovereignty_score (timestamp, period, total_sessions, local_sessions, cloud_sessions, local_pct, est_cloud_cost, est_saved)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (now, period, total_sessions, local_sessions, cloud_sessions, round(local_pct, 1), round(est_cloud_cost, 4), round(est_saved, 4)))

    conn.commit()
    print(f"Ingested {days}d: {total_sessions} sessions ({local_sessions} local, {cloud_sessions} cloud)")
    print(f"  Sovereignty: {local_pct:.1f}% local")
    print(f"  Est cloud cost: ${est_cloud_cost:.2f}")

def report(conn, days=3):
    """Print the sovereignty dashboard.

    The dashboard has three sections, all written to stdout:

    1. The most recent ``sovereignty_score`` row (regardless of ``days``),
       rendered as a 40-character bar plus session counts and cloud cost.
    2. A per-model breakdown taken from the most recent ingest snapshot in
       ``session_stats`` whose timestamp lies within the last ``days`` days.
    3. Average eval score and latency per (model, task) from ``evals`` rows
       recorded within the last ``days`` days; omitted when there are none.

    Args:
        conn: Open sqlite3 connection to the metrics database.
        days: Size of the look-back window, in days.

    Returns:
        None.
    """
    cutoff = time.time() - (days * 86400)

    print(f"\n{'='*60}")
    print(f"  TIMMY SOVEREIGNTY METRICS — Last {days} days")
    print(f"{'='*60}\n")

    # Latest sovereignty score
    row = conn.execute("""
        SELECT local_pct, total_sessions, local_sessions, cloud_sessions, est_cloud_cost
        FROM sovereignty_score ORDER BY timestamp DESC LIMIT 1
    """).fetchone()

    if row:
        pct, total, local, cloud, cost = row
        # The columns are nullable; treat NULL as zero rather than crashing.
        pct = pct or 0.0
        cost = cost or 0.0
        bar_len = 40
        # Clamp so an out-of-range percentage cannot produce a malformed bar.
        filled = max(0, min(bar_len, int(pct / 100 * bar_len)))
        bar = "█" * filled + "░" * (bar_len - filled)
        print(f"  SOVEREIGNTY SCORE: [{bar}] {pct:.1f}%")
        print(f"  Sessions: {total} total | {local} local | {cloud} cloud")
        print(f"  Est cloud cost: ${cost:.2f}")
    else:
        print("  No data yet. Run: python3 model_tracker.py ingest")

    # Model breakdown
    print(f"\n  {'MODEL':<30} {'SESS':>6} {'MSGS':>7} {'TOOLS':>6} {'LOCAL':>6} {'$EST':>8}")
    print(f"  {'-'*30} {'-'*6} {'-'*7} {'-'*6} {'-'*6} {'-'*8}")

    # Every ingest writes a complete snapshot whose rows share one timestamp.
    # Summing several overlapping snapshots would count the same sessions
    # repeatedly, so only the newest snapshot inside the window is used —
    # matching the header above, which also shows the latest ingest only.
    rows = conn.execute("""
        SELECT model, SUM(sessions), SUM(messages), SUM(tool_calls), MAX(is_local), SUM(est_cost_usd)
        FROM session_stats
        WHERE timestamp = (SELECT MAX(timestamp) FROM session_stats WHERE timestamp > ?)
        GROUP BY model
        ORDER BY SUM(sessions) DESC
    """, (cutoff,)).fetchall()

    for model, sess, msgs, tools, local, cost in rows:
        flag = "  ✓" if local else "  ✗"
        # SUM() yields NULL when every input is NULL; None cannot be
        # formatted with a width/precision spec, so fall back to zero.
        sess = sess or 0
        msgs = msgs or 0
        tools = tools or 0
        cost = cost or 0.0
        print(f"  {model:<30} {sess:>6} {msgs:>7} {tools:>6} {flag:>6} ${cost:>7.2f}")

    # Eval scores if any
    evals = conn.execute("""
        SELECT model, task, AVG(score), COUNT(*), AVG(latency_s)
        FROM evals
        WHERE timestamp > ?
        GROUP BY model, task
        ORDER BY model, task
    """, (cutoff,)).fetchall()

    if evals:
        print(f"\n  {'MODEL':<25} {'TASK':<15} {'AVG SCORE':>9} {'RUNS':>5} {'AVG LAT':>8}")
        print(f"  {'-'*25} {'-'*15} {'-'*9} {'-'*5} {'-'*8}")
        for model, task, score, runs, lat in evals:
            # Latency is optional when recording an eval, so AVG() can be
            # NULL for a group; show a placeholder instead of raising.
            score_txt = f"{score:>9.2f}" if score is not None else f"{'n/a':>9}"
            lat_txt = f"{lat:>7.1f}s" if lat is not None else f"{'n/a':>8}"
            print(f"  {model:<25} {task:<15} {score_txt} {runs:>5} {lat_txt}")

    print(f"\n{'='*60}\n")

def record_eval(conn, model, task, score, latency=None, tokens_in=None, tokens_out=None, notes=None):
    """Store one evaluation result in the ``evals`` table and commit it.

    The row is stamped with the current time. ``latency``, ``tokens_in``,
    ``tokens_out`` and ``notes`` are optional and stored as NULL when omitted.
    A one-line confirmation is printed after the commit.
    """
    insert_sql = """
        INSERT INTO evals (timestamp, model, task, score, latency_s, tokens_in, tokens_out, notes)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    recorded_at = time.time()
    values = (recorded_at, model, task, score, latency, tokens_in, tokens_out, notes)

    conn.execute(insert_sql, values)
    conn.commit()
    print(f"Recorded: {model} | {task} | score={score}")

def main():
    """Command-line entry point.

    Sub-commands:
      ingest  -- pull session data from the Hermes DB (``--days``, default 3)
      report  -- print the dashboard (``--days``, default 3)
      record  -- store one eval (``--model``, ``--task``, ``--score`` required;
                 ``--latency`` and ``--notes`` optional)

    With no sub-command, a 3-day ingest is run followed by a 3-day report.
    The metrics database is opened before dispatch and closed afterwards.
    """
    cli = argparse.ArgumentParser(description="Timmy Model Performance Tracker")
    commands = cli.add_subparsers(dest="cmd")

    ingest_cmd = commands.add_parser("ingest", help="Ingest from Hermes session DB")
    ingest_cmd.add_argument("--days", type=int, default=3)

    report_cmd = commands.add_parser("report", help="Show sovereignty dashboard")
    report_cmd.add_argument("--days", type=int, default=3)

    record_cmd = commands.add_parser("record", help="Record an eval")
    record_cmd.add_argument("--model", required=True)
    record_cmd.add_argument("--task", required=True)
    record_cmd.add_argument("--score", type=float, required=True)
    record_cmd.add_argument("--latency", type=float)
    record_cmd.add_argument("--notes")

    opts = cli.parse_args()
    db = init_db()

    command = opts.cmd
    if command == "record":
        record_eval(db, opts.model, opts.task, opts.score, opts.latency, notes=opts.notes)
    elif command == "report":
        report(db, opts.days)
    elif command == "ingest":
        ingest_from_hermes(db, opts.days)
    else:
        # No sub-command given: refresh the data, then show it.
        ingest_from_hermes(db, 3)
        report(db, 3)

    db.close()

if __name__ == "__main__":
    main()