From 585f8528b217e51f4d01cfc8dfb6909e3b4bdfd5 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Fri, 6 Mar 2026 14:50:57 -0800
Subject: [PATCH] =?UTF-8?q?fix:=20deep=20review=20=E2=80=94=20prefix=20mat?=
 =?UTF-8?q?ching,=20tool=5Fcalls=20extraction,=20query=20perf,=20serializa?=
 =?UTF-8?q?tion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issues found and fixed during deep code path review:

1. CRITICAL: Prefix matching returned wrong prices for dated model names
   - 'gpt-4o-mini-2024-07-18' matched gpt-4o ($2.50) instead of gpt-4o-mini ($0.15)
   - Same for o3-mini→o3 (9x), gpt-4.1-mini→gpt-4.1 (5x), gpt-4.1-nano→gpt-4.1 (20x)
   - Fix: use longest-match-wins strategy instead of first-match
   - Removed dangerous key.startswith(bare) reverse matching

2. CRITICAL: Top Tools section was empty for CLI sessions
   - run_agent.py doesn't set tool_name on tool response messages (pre-existing)
   - Insights now also extracts tool names from tool_calls JSON on assistant
     messages, which IS populated for all sessions
   - Uses max() merge strategy to avoid double-counting between sources

3. SELECT * replaced with explicit column list
   - Skips system_prompt and model_config blobs (can be thousands of chars)
   - Reduces memory and I/O for large session counts

4. Sets in overview dict converted to sorted lists
   - models_with_pricing / models_without_pricing were Python sets
   - Sets aren't JSON-serializable — would crash json.dumps()

5. Negative duration guard
   - end > start check prevents negative durations from clock drift

6. Model breakdown sort fallback
   - When all tokens are 0, now sorts by session count instead of arbitrary order

7. Removed unused timedelta import

Added 6 new tests: dated model pricing (4), tool_calls JSON extraction,
JSON serialization safety. Total: 69 tests.
---
 agent/insights.py      | 114 ++++++++++++++++++++++++++++++++++-------
 tests/test_insights.py |  74 +++++++++++++++++++++++++-
 2 files changed, 169 insertions(+), 19 deletions(-)

diff --git a/agent/insights.py b/agent/insights.py
index e355dcf5b..81fb52260 100644
--- a/agent/insights.py
+++ b/agent/insights.py
@@ -16,9 +16,10 @@ Usage:
     print(engine.format_terminal(report))
 """
 
+import json
 import time
 from collections import Counter, defaultdict
-from datetime import datetime, timedelta
+from datetime import datetime
 from typing import Any, Dict, List, Optional
 
 # =========================================================================
@@ -82,12 +83,18 @@ def _get_pricing(model_name: str) -> Dict[str, float]:
     if bare in MODEL_PRICING:
         return MODEL_PRICING[bare]
 
-    # Fuzzy prefix match
+    # Fuzzy prefix match — prefer the LONGEST matching key to avoid
+    # e.g. "gpt-4o" matching before "gpt-4o-mini" for "gpt-4o-mini-2024-07-18"
+    best_match = None
+    best_len = 0
     for key, price in MODEL_PRICING.items():
-        if bare.startswith(key) or key.startswith(bare):
-            return price
+        if bare.startswith(key) and len(key) > best_len:
+            best_match = price
+            best_len = len(key)
+    if best_match:
+        return best_match
 
-    # Keyword heuristics
+    # Keyword heuristics (checked in most-specific-first order)
     if "opus" in bare:
         return {"input": 15.00, "output": 75.00}
     if "sonnet" in bare:
@@ -211,26 +218,39 @@ class InsightsEngine:
     # Data gathering (SQL queries)
     # =========================================================================
 
+    # Columns we actually need (skip system_prompt, model_config blobs)
+    _SESSION_COLS = ("id, source, model, started_at, ended_at, "
+                     "message_count, tool_call_count, input_tokens, output_tokens")
+
     def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]:
         """Fetch sessions within the time window."""
         if source:
             cursor = self._conn.execute(
-                """SELECT * FROM sessions
-                   WHERE started_at >= ? AND source = ?
-                   ORDER BY started_at DESC""",
+                f"""SELECT {self._SESSION_COLS} FROM sessions
+                    WHERE started_at >= ? AND source = ?
+                    ORDER BY started_at DESC""",
                 (cutoff, source),
             )
         else:
             cursor = self._conn.execute(
-                """SELECT * FROM sessions
-                   WHERE started_at >= ?
-                   ORDER BY started_at DESC""",
+                f"""SELECT {self._SESSION_COLS} FROM sessions
+                    WHERE started_at >= ?
+                    ORDER BY started_at DESC""",
                 (cutoff,),
             )
         return [dict(row) for row in cursor.fetchall()]
 
     def _get_tool_usage(self, cutoff: float, source: str = None) -> List[Dict]:
-        """Get tool call counts from messages."""
+        """Get tool call counts from messages.
+
+        Uses two sources:
+        1. tool_name column on 'tool' role messages (set by gateway)
+        2. tool_calls JSON on 'assistant' role messages (covers CLI where
+           tool_name is not populated on tool responses)
+        """
+        tool_counts = Counter()
+
+        # Source 1: explicit tool_name on tool response messages
         if source:
             cursor = self._conn.execute(
                 """SELECT m.tool_name, COUNT(*) as count
@@ -253,7 +273,64 @@ class InsightsEngine:
                    ORDER BY count DESC""",
                 (cutoff,),
             )
-        return [dict(row) for row in cursor.fetchall()]
+        for row in cursor.fetchall():
+            tool_counts[row["tool_name"]] += row["count"]
+
+        # Source 2: extract from tool_calls JSON on assistant messages
+        # (covers CLI sessions where tool_name is NULL on tool responses)
+        if source:
+            cursor2 = self._conn.execute(
+                """SELECT m.tool_calls
+                   FROM messages m
+                   JOIN sessions s ON s.id = m.session_id
+                   WHERE s.started_at >= ? AND s.source = ?
+                     AND m.role = 'assistant' AND m.tool_calls IS NOT NULL""",
+                (cutoff, source),
+            )
+        else:
+            cursor2 = self._conn.execute(
+                """SELECT m.tool_calls
+                   FROM messages m
+                   JOIN sessions s ON s.id = m.session_id
+                   WHERE s.started_at >= ?
+                     AND m.role = 'assistant' AND m.tool_calls IS NOT NULL""",
+                (cutoff,),
+            )
+
+        tool_calls_counts = Counter()
+        for row in cursor2.fetchall():
+            try:
+                calls = row["tool_calls"]
+                if isinstance(calls, str):
+                    calls = json.loads(calls)
+                if isinstance(calls, list):
+                    for call in calls:
+                        func = call.get("function", {}) if isinstance(call, dict) else {}
+                        name = func.get("name")
+                        if name:
+                            tool_calls_counts[name] += 1
+            except (json.JSONDecodeError, TypeError, AttributeError):
+                continue
+
+        # Merge: prefer tool_name source, supplement with tool_calls source
+        # for tools not already counted
+        if not tool_counts and tool_calls_counts:
+            # No tool_name data at all — use tool_calls exclusively
+            tool_counts = tool_calls_counts
+        elif tool_counts and tool_calls_counts:
+            # Both sources have data — use whichever has the higher count per tool
+            # (they may overlap, so take the max to avoid double-counting)
+            all_tools = set(tool_counts) | set(tool_calls_counts)
+            merged = Counter()
+            for tool in all_tools:
+                merged[tool] = max(tool_counts.get(tool, 0), tool_calls_counts.get(tool, 0))
+            tool_counts = merged
+
+        # Convert to the expected format
+        return [
+            {"tool_name": name, "count": count}
+            for name, count in tool_counts.most_common()
+        ]
 
     def _get_message_stats(self, cutoff: float, source: str = None) -> Dict:
         """Get aggregate message statistics."""
@@ -314,12 +391,12 @@ class InsightsEngine:
             else:
                 models_without_pricing.add(display)
 
-        # Session duration stats
+        # Session duration stats (guard against negative durations from clock drift)
         durations = []
         for s in sessions:
             start = s.get("started_at")
             end = s.get("ended_at")
-            if start and end:
+            if start and end and end > start:
                 durations.append(end - start)
 
         total_hours = sum(durations) / 3600 if durations else 0
@@ -347,8 +424,8 @@ class InsightsEngine:
             "tool_messages": message_stats.get("tool_messages") or 0,
             "date_range_start": date_range_start,
             "date_range_end": date_range_end,
-            "models_with_pricing": models_with_pricing,
-            "models_without_pricing": models_without_pricing,
+            "models_with_pricing": sorted(models_with_pricing),
+            "models_without_pricing": sorted(models_without_pricing),
         }
 
     def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]:
@@ -377,7 +454,8 @@ class InsightsEngine:
             {"model": model, **data}
             for model, data in model_data.items()
         ]
-        result.sort(key=lambda x: x["total_tokens"], reverse=True)
+        # Sort by tokens first, fall back to session count when tokens are 0
+        result.sort(key=lambda x: (x["total_tokens"], x["sessions"]), reverse=True)
         return result
 
     def _compute_platform_breakdown(self, sessions: List[Dict]) -> List[Dict]:
diff --git a/tests/test_insights.py b/tests/test_insights.py
index b6a95c612..0f598f9a6 100644
--- a/tests/test_insights.py
+++ b/tests/test_insights.py
@@ -176,6 +176,26 @@ class TestPricing:
         pricing = _get_pricing("gemini-3.0-ultra")
         assert pricing["input"] == 0.15
 
+    def test_dated_model_gpt4o_mini(self):
+        """gpt-4o-mini-2024-07-18 should match gpt-4o-mini, NOT gpt-4o."""
+        pricing = _get_pricing("gpt-4o-mini-2024-07-18")
+        assert pricing["input"] == 0.15  # gpt-4o-mini price, not gpt-4o's 2.50
+
+    def test_dated_model_o3_mini(self):
+        """o3-mini-2025-01-31 should match o3-mini, NOT o3."""
+        pricing = _get_pricing("o3-mini-2025-01-31")
+        assert pricing["input"] == 1.10  # o3-mini price, not o3's 10.00
+
+    def test_dated_model_gpt41_mini(self):
+        """gpt-4.1-mini-2025-04-14 should match gpt-4.1-mini, NOT gpt-4.1."""
+        pricing = _get_pricing("gpt-4.1-mini-2025-04-14")
+        assert pricing["input"] == 0.40  # gpt-4.1-mini, not gpt-4.1's 2.00
+
+    def test_dated_model_gpt41_nano(self):
+        """gpt-4.1-nano-2025-04-14 should match gpt-4.1-nano, NOT gpt-4.1."""
+        pricing = _get_pricing("gpt-4.1-nano-2025-04-14")
+        assert pricing["input"] == 0.10  # gpt-4.1-nano, not gpt-4.1's 2.00
+
 
 class TestHasKnownPricing:
     def test_known_commercial_model(self):
@@ -585,6 +605,58 @@ class TestEdgeCases:
         assert custom["cost"] == 0.0
         assert custom["has_pricing"] is False
 
+    def test_tool_usage_from_tool_calls_json(self, db):
+        """Tool usage should be extracted from tool_calls JSON when tool_name is NULL."""
+        import json as _json
+        db.create_session(session_id="s1", source="cli", model="test")
+        # Assistant message with tool_calls (this is what CLI produces)
+        db.append_message("s1", role="assistant", content="Let me search",
+                          tool_calls=[{"id": "call_1", "type": "function",
+                                       "function": {"name": "search_files", "arguments": "{}"}}])
+        # Tool response WITHOUT tool_name (this is the CLI bug)
+        db.append_message("s1", role="tool", content="found results",
+                          tool_call_id="call_1")
+        db.append_message("s1", role="assistant", content="Now reading",
+                          tool_calls=[{"id": "call_2", "type": "function",
+                                       "function": {"name": "read_file", "arguments": "{}"}}])
+        db.append_message("s1", role="tool", content="file content",
+                          tool_call_id="call_2")
+        db.append_message("s1", role="assistant", content="And searching again",
+                          tool_calls=[{"id": "call_3", "type": "function",
+                                       "function": {"name": "search_files", "arguments": "{}"}}])
+        db.append_message("s1", role="tool", content="more results",
+                          tool_call_id="call_3")
+        db._conn.commit()
+
+        engine = InsightsEngine(db)
+        report = engine.generate(days=30)
+        tools = report["tools"]
+
+        # Should find tools from tool_calls JSON even though tool_name is NULL
+        tool_names = [t["tool"] for t in tools]
+        assert "search_files" in tool_names
+        assert "read_file" in tool_names
+
+        # search_files was called twice
+        sf = next(t for t in tools if t["tool"] == "search_files")
+        assert sf["count"] == 2
+
+    def test_overview_pricing_sets_are_lists(self, db):
+        """models_with/without_pricing should be JSON-serializable lists."""
+        import json as _json
+        db.create_session(session_id="s1", source="cli", model="gpt-4o")
+        db.create_session(session_id="s2", source="cli", model="my-custom")
+        db._conn.commit()
+
+        engine = InsightsEngine(db)
+        report = engine.generate(days=30)
+        overview = report["overview"]
+
+        assert isinstance(overview["models_with_pricing"], list)
+        assert isinstance(overview["models_without_pricing"], list)
+        # Should be JSON-serializable
+        _json.dumps(report["overview"])  # would raise if sets present
+
     def test_mixed_commercial_and_custom_models(self, db):
         """Mix of commercial and custom models: only commercial ones get costs."""
         db.create_session(session_id="s1", source="cli", model="gpt-4o")
@@ -599,7 +671,7 @@ class TestEdgeCases:
         # Cost should only come from gpt-4o, not from the custom model
         overview = report["overview"]
         assert overview["estimated_cost"] > 0
-        assert "gpt-4o" in overview["models_with_pricing"]
+        assert "gpt-4o" in overview["models_with_pricing"]  # list now, not set
         assert "my-local-llama" in overview["models_without_pricing"]
 
         # Verify individual model entries