From 585f8528b217e51f4d01cfc8dfb6909e3b4bdfd5 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Fri, 6 Mar 2026 14:50:57 -0800 Subject: [PATCH] =?UTF-8?q?fix:=20deep=20review=20=E2=80=94=20prefix=20mat?= =?UTF-8?q?ching,=20tool=5Fcalls=20extraction,=20query=20perf,=20serializa?= =?UTF-8?q?tion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issues found and fixed during deep code path review: 1. CRITICAL: Prefix matching returned wrong prices for dated model names - 'gpt-4o-mini-2024-07-18' matched gpt-4o ($2.50) instead of gpt-4o-mini ($0.15) - Same for o3-mini→o3 (9x), gpt-4.1-mini→gpt-4.1 (5x), gpt-4.1-nano→gpt-4.1 (20x) - Fix: use longest-match-wins strategy instead of first-match - Removed dangerous key.startswith(bare) reverse matching 2. CRITICAL: Top Tools section was empty for CLI sessions - run_agent.py doesn't set tool_name on tool response messages (pre-existing) - Insights now also extracts tool names from tool_calls JSON on assistant messages, which IS populated for all sessions - Uses max() merge strategy to avoid double-counting between sources 3. SELECT * replaced with explicit column list - Skips system_prompt and model_config blobs (can be thousands of chars) - Reduces memory and I/O for large session counts 4. Sets in overview dict converted to sorted lists - models_with_pricing / models_without_pricing were Python sets - Sets aren't JSON-serializable — would crash json.dumps() 5. Negative duration guard - end > start check prevents negative durations from clock drift 6. Model breakdown sort fallback - When all tokens are 0, now sorts by session count instead of arbitrary order 7. Removed unused timedelta import Added 6 new tests: dated model pricing (4), tool_calls JSON extraction, JSON serialization safety. Total: 69 tests. --- agent/insights.py | 114 ++++++++++++++++++++++++++++++++++------- tests/test_insights.py | 74 +++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 19 deletions(-) diff --git a/agent/insights.py b/agent/insights.py index e355dcf5b..81fb52260 100644 --- a/agent/insights.py +++ b/agent/insights.py @@ -16,9 +16,10 @@ Usage: print(engine.format_terminal(report)) """ +import json import time from collections import Counter, defaultdict -from datetime import datetime, timedelta +from datetime import datetime from typing import Any, Dict, List, Optional # ========================================================================= @@ -82,12 +83,18 @@ def _get_pricing(model_name: str) -> Dict[str, float]: if bare in MODEL_PRICING: return MODEL_PRICING[bare] - # Fuzzy prefix match + # Fuzzy prefix match — prefer the LONGEST matching key to avoid + # e.g. "gpt-4o" matching before "gpt-4o-mini" for "gpt-4o-mini-2024-07-18" + best_match = None + best_len = 0 for key, price in MODEL_PRICING.items(): - if bare.startswith(key) or key.startswith(bare): - return price + if bare.startswith(key) and len(key) > best_len: + best_match = price + best_len = len(key) + if best_match: + return best_match - # Keyword heuristics + # Keyword heuristics (checked in most-specific-first order) if "opus" in bare: return {"input": 15.00, "output": 75.00} if "sonnet" in bare: @@ -211,26 +218,39 @@ class InsightsEngine: # Data gathering (SQL queries) # ========================================================================= + # Columns we actually need (skip system_prompt, model_config blobs) + _SESSION_COLS = ("id, source, model, started_at, ended_at, " + "message_count, tool_call_count, input_tokens, output_tokens") + def _get_sessions(self, cutoff: float, source: str = None) -> List[Dict]: """Fetch sessions within the time window.""" if source: cursor = self._conn.execute( - """SELECT * FROM sessions - WHERE started_at >= ? AND source = ? - ORDER BY started_at DESC""", + f"""SELECT {self._SESSION_COLS} FROM sessions + WHERE started_at >= ? AND source = ? + ORDER BY started_at DESC""", (cutoff, source), ) else: cursor = self._conn.execute( - """SELECT * FROM sessions - WHERE started_at >= ? - ORDER BY started_at DESC""", + f"""SELECT {self._SESSION_COLS} FROM sessions + WHERE started_at >= ? + ORDER BY started_at DESC""", (cutoff,), ) return [dict(row) for row in cursor.fetchall()] def _get_tool_usage(self, cutoff: float, source: str = None) -> List[Dict]: - """Get tool call counts from messages.""" + """Get tool call counts from messages. + + Uses two sources: + 1. tool_name column on 'tool' role messages (set by gateway) + 2. tool_calls JSON on 'assistant' role messages (covers CLI where + tool_name is not populated on tool responses) + """ + tool_counts = Counter() + + # Source 1: explicit tool_name on tool response messages if source: cursor = self._conn.execute( """SELECT m.tool_name, COUNT(*) as count @@ -253,7 +273,64 @@ class InsightsEngine: ORDER BY count DESC""", (cutoff,), ) - return [dict(row) for row in cursor.fetchall()] + for row in cursor.fetchall(): + tool_counts[row["tool_name"]] += row["count"] + + # Source 2: extract from tool_calls JSON on assistant messages + # (covers CLI sessions where tool_name is NULL on tool responses) + if source: + cursor2 = self._conn.execute( + """SELECT m.tool_calls + FROM messages m + JOIN sessions s ON s.id = m.session_id + WHERE s.started_at >= ? AND s.source = ? + AND m.role = 'assistant' AND m.tool_calls IS NOT NULL""", + (cutoff, source), + ) + else: + cursor2 = self._conn.execute( + """SELECT m.tool_calls + FROM messages m + JOIN sessions s ON s.id = m.session_id + WHERE s.started_at >= ? + AND m.role = 'assistant' AND m.tool_calls IS NOT NULL""", + (cutoff,), + ) + + tool_calls_counts = Counter() + for row in cursor2.fetchall(): + try: + calls = row["tool_calls"] + if isinstance(calls, str): + calls = json.loads(calls) + if isinstance(calls, list): + for call in calls: + func = call.get("function", {}) if isinstance(call, dict) else {} + name = func.get("name") + if name: + tool_calls_counts[name] += 1 + except (json.JSONDecodeError, TypeError, AttributeError): + continue + + # Merge: prefer tool_name source, supplement with tool_calls source + # for tools not already counted + if not tool_counts and tool_calls_counts: + # No tool_name data at all — use tool_calls exclusively + tool_counts = tool_calls_counts + elif tool_counts and tool_calls_counts: + # Both sources have data — use whichever has the higher count per tool + # (they may overlap, so take the max to avoid double-counting) + all_tools = set(tool_counts) | set(tool_calls_counts) + merged = Counter() + for tool in all_tools: + merged[tool] = max(tool_counts.get(tool, 0), tool_calls_counts.get(tool, 0)) + tool_counts = merged + + # Convert to the expected format + return [ + {"tool_name": name, "count": count} + for name, count in tool_counts.most_common() + ] def _get_message_stats(self, cutoff: float, source: str = None) -> Dict: """Get aggregate message statistics.""" @@ -314,12 +391,12 @@ class InsightsEngine: else: models_without_pricing.add(display) - # Session duration stats + # Session duration stats (guard against negative durations from clock drift) durations = [] for s in sessions: start = s.get("started_at") end = s.get("ended_at") - if start and end: + if start and end and end > start: durations.append(end - start) total_hours = sum(durations) / 3600 if durations else 0 @@ -347,8 +424,8 @@ class InsightsEngine: "tool_messages": message_stats.get("tool_messages") or 0, "date_range_start": date_range_start, "date_range_end": date_range_end, - "models_with_pricing": models_with_pricing, - "models_without_pricing": models_without_pricing, + "models_with_pricing": sorted(models_with_pricing), + "models_without_pricing": sorted(models_without_pricing), } def _compute_model_breakdown(self, sessions: List[Dict]) -> List[Dict]: @@ -377,7 +454,8 @@ class InsightsEngine: {"model": model, **data} for model, data in model_data.items() ] - result.sort(key=lambda x: x["total_tokens"], reverse=True) + # Sort by tokens first, fall back to session count when tokens are 0 + result.sort(key=lambda x: (x["total_tokens"], x["sessions"]), reverse=True) return result def _compute_platform_breakdown(self, sessions: List[Dict]) -> List[Dict]: diff --git a/tests/test_insights.py b/tests/test_insights.py index b6a95c612..0f598f9a6 100644 --- a/tests/test_insights.py +++ b/tests/test_insights.py @@ -176,6 +176,26 @@ class TestPricing: pricing = _get_pricing("gemini-3.0-ultra") assert pricing["input"] == 0.15 + def test_dated_model_gpt4o_mini(self): + """gpt-4o-mini-2024-07-18 should match gpt-4o-mini, NOT gpt-4o.""" + pricing = _get_pricing("gpt-4o-mini-2024-07-18") + assert pricing["input"] == 0.15 # gpt-4o-mini price, not gpt-4o's 2.50 + + def test_dated_model_o3_mini(self): + """o3-mini-2025-01-31 should match o3-mini, NOT o3.""" + pricing = _get_pricing("o3-mini-2025-01-31") + assert pricing["input"] == 1.10 # o3-mini price, not o3's 10.00 + + def test_dated_model_gpt41_mini(self): + """gpt-4.1-mini-2025-04-14 should match gpt-4.1-mini, NOT gpt-4.1.""" + pricing = _get_pricing("gpt-4.1-mini-2025-04-14") + assert pricing["input"] == 0.40 # gpt-4.1-mini, not gpt-4.1's 2.00 + + def test_dated_model_gpt41_nano(self): + """gpt-4.1-nano-2025-04-14 should match gpt-4.1-nano, NOT gpt-4.1.""" + pricing = _get_pricing("gpt-4.1-nano-2025-04-14") + assert pricing["input"] == 0.10 # gpt-4.1-nano, not gpt-4.1's 2.00 + class TestHasKnownPricing: def test_known_commercial_model(self): @@ -585,6 +605,58 @@ class TestEdgeCases: assert custom["cost"] == 0.0 assert custom["has_pricing"] is False + def test_tool_usage_from_tool_calls_json(self, db): + """Tool usage should be extracted from tool_calls JSON when tool_name is NULL.""" + import json as _json + db.create_session(session_id="s1", source="cli", model="test") + # Assistant message with tool_calls (this is what CLI produces) + db.append_message("s1", role="assistant", content="Let me search", + tool_calls=[{"id": "call_1", "type": "function", + "function": {"name": "search_files", "arguments": "{}"}}]) + # Tool response WITHOUT tool_name (this is the CLI bug) + db.append_message("s1", role="tool", content="found results", + tool_call_id="call_1") + db.append_message("s1", role="assistant", content="Now reading", + tool_calls=[{"id": "call_2", "type": "function", + "function": {"name": "read_file", "arguments": "{}"}}]) + db.append_message("s1", role="tool", content="file content", + tool_call_id="call_2") + db.append_message("s1", role="assistant", content="And searching again", + tool_calls=[{"id": "call_3", "type": "function", + "function": {"name": "search_files", "arguments": "{}"}}]) + db.append_message("s1", role="tool", content="more results", + tool_call_id="call_3") + db._conn.commit() + + engine = InsightsEngine(db) + report = engine.generate(days=30) + tools = report["tools"] + + # Should find tools from tool_calls JSON even though tool_name is NULL + tool_names = [t["tool"] for t in tools] + assert "search_files" in tool_names + assert "read_file" in tool_names + + # search_files was called twice + sf = next(t for t in tools if t["tool"] == "search_files") + assert sf["count"] == 2 + + def test_overview_pricing_sets_are_lists(self, db): + """models_with/without_pricing should be JSON-serializable lists.""" + import json as _json + db.create_session(session_id="s1", source="cli", model="gpt-4o") + db.create_session(session_id="s2", source="cli", model="my-custom") + db._conn.commit() + + engine = InsightsEngine(db) + report = engine.generate(days=30) + overview = report["overview"] + + assert isinstance(overview["models_with_pricing"], list) + assert isinstance(overview["models_without_pricing"], list) + # Should be JSON-serializable + _json.dumps(report["overview"]) # would raise if sets present + def test_mixed_commercial_and_custom_models(self, db): """Mix of commercial and custom models: only commercial ones get costs.""" db.create_session(session_id="s1", source="cli", model="gpt-4o") @@ -599,7 +671,7 @@ class TestEdgeCases: # Cost should only come from gpt-4o, not from the custom model overview = report["overview"] assert overview["estimated_cost"] > 0 - assert "gpt-4o" in overview["models_with_pricing"] + assert "gpt-4o" in overview["models_with_pricing"] # list now, not set assert "my-local-llama" in overview["models_without_pricing"] # Verify individual model entries