fix: gateway token double-counting with cached agents (#3306)

The cached agent accumulates session_input_tokens across messages, so run_conversation() returns cumulative totals. But update_session() used += (increment), double-counting on every message after the first. - session.py: change in-memory entry updates from += to = (direct assignment for cumulative values) - hermes_state.py: add absolute=True flag to update_token_counts() that uses SET column = ? instead of SET column = column + ? - session.py: pass absolute=True to the DB call CLI path is unchanged — it passes per-API-call deltas directly to update_token_counts() with the default absolute=False (increment). Reported by @zaycruz in #3222. Closes #3222.
2026-03-26 19:04:53 -07:00
parent 1519c4d477
commit a8df7f9964
3 changed files with 45 additions and 10 deletions
--- a/gateway/session.py
+++ b/gateway/session.py
@@ -762,14 +762,16 @@ class SessionStore:
            if session_key in self._entries:
                entry = self._entries[session_key]
                entry.updated_at = _now()
-                entry.input_tokens += input_tokens
-                entry.output_tokens += output_tokens
-                entry.cache_read_tokens += cache_read_tokens
-                entry.cache_write_tokens += cache_write_tokens
+                # Direct assignment — the gateway receives cumulative totals
+                # from the cached agent, not per-call deltas.
+                entry.input_tokens = input_tokens
+                entry.output_tokens = output_tokens
+                entry.cache_read_tokens = cache_read_tokens
+                entry.cache_write_tokens = cache_write_tokens
                if last_prompt_tokens is not None:
                    entry.last_prompt_tokens = last_prompt_tokens
                if estimated_cost_usd is not None:
-                    entry.estimated_cost_usd += estimated_cost_usd
+                    entry.estimated_cost_usd = estimated_cost_usd
                if cost_status:
                    entry.cost_status = cost_status
                entry.total_tokens = (
@@ -795,6 +797,7 @@ class SessionStore:
                    billing_provider=provider,
                    billing_base_url=base_url,
                    model=model,
+                    absolute=True,
                )
            except Exception as e:
                logger.debug("Session DB operation failed: %s", e)
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -319,11 +319,39 @@ class SessionDB:
        billing_provider: Optional[str] = None,
        billing_base_url: Optional[str] = None,
        billing_mode: Optional[str] = None,
+        absolute: bool = False,
    ) -> None:
-        """Increment token counters and backfill model if not already set."""
-        with self._lock:
-            self._conn.execute(
-                """UPDATE sessions SET
+        """Update token counters and backfill model if not already set.
+
+        When *absolute* is False (default), values are **incremented** — use
+        this for per-API-call deltas (CLI path).
+
+        When *absolute* is True, values are **set directly** — use this when
+        the caller already holds cumulative totals (gateway path, where the
+        cached agent accumulates across messages).
+        """
+        if absolute:
+            sql = """UPDATE sessions SET
+                   input_tokens = ?,
+                   output_tokens = ?,
+                   cache_read_tokens = ?,
+                   cache_write_tokens = ?,
+                   reasoning_tokens = ?,
+                   estimated_cost_usd = COALESCE(?, 0),
+                   actual_cost_usd = CASE
+                       WHEN ? IS NULL THEN actual_cost_usd
+                       ELSE ?
+                   END,
+                   cost_status = COALESCE(?, cost_status),
+                   cost_source = COALESCE(?, cost_source),
+                   pricing_version = COALESCE(?, pricing_version),
+                   billing_provider = COALESCE(billing_provider, ?),
+                   billing_base_url = COALESCE(billing_base_url, ?),
+                   billing_mode = COALESCE(billing_mode, ?),
+                   model = COALESCE(model, ?)
+                   WHERE id = ?"""
+        else:
+            sql = """UPDATE sessions SET
                   input_tokens = input_tokens + ?,
                   output_tokens = output_tokens + ?,
                   cache_read_tokens = cache_read_tokens + ?,
@@ -341,7 +369,10 @@ class SessionDB:
                   billing_base_url = COALESCE(billing_base_url, ?),
                   billing_mode = COALESCE(billing_mode, ?),
                   model = COALESCE(model, ?)
-                   WHERE id = ?""",
+                   WHERE id = ?"""
+        with self._lock:
+            self._conn.execute(
+                sql,
                (
                    input_tokens,
                    output_tokens,
--- a/tests/gateway/test_session.py
+++ b/tests/gateway/test_session.py
@@ -858,6 +858,7 @@ class TestLastPromptTokens:
            billing_provider=None,
            billing_base_url=None,
            model="openai/gpt-5.4",
+            absolute=True,
        )