fix: gateway token double-counting with cached agents (#3306)

The cached agent accumulates session_input_tokens across messages, so
run_conversation() returns cumulative totals. But update_session() used
+= (increment), double-counting on every message after the first.

- session.py: change in-memory entry updates from += to = (direct
  assignment for cumulative values)
- hermes_state.py: add absolute=True flag to update_token_counts()
  that uses SET column = ? instead of SET column = column + ?
- session.py: pass absolute=True to the DB call

CLI path is unchanged — it passes per-API-call deltas directly to
update_token_counts() with the default absolute=False (increment).

Reported by @zaycruz in #3222. Closes #3222.
This commit is contained in:
Teknium
2026-03-26 19:04:53 -07:00
committed by GitHub
parent 1519c4d477
commit a8df7f9964
3 changed files with 45 additions and 10 deletions

View File

@@ -762,14 +762,16 @@ class SessionStore:
if session_key in self._entries:
entry = self._entries[session_key]
entry.updated_at = _now()
entry.input_tokens += input_tokens
entry.output_tokens += output_tokens
entry.cache_read_tokens += cache_read_tokens
entry.cache_write_tokens += cache_write_tokens
# Direct assignment — the gateway receives cumulative totals
# from the cached agent, not per-call deltas.
entry.input_tokens = input_tokens
entry.output_tokens = output_tokens
entry.cache_read_tokens = cache_read_tokens
entry.cache_write_tokens = cache_write_tokens
if last_prompt_tokens is not None:
entry.last_prompt_tokens = last_prompt_tokens
if estimated_cost_usd is not None:
entry.estimated_cost_usd += estimated_cost_usd
entry.estimated_cost_usd = estimated_cost_usd
if cost_status:
entry.cost_status = cost_status
entry.total_tokens = (
@@ -795,6 +797,7 @@ class SessionStore:
billing_provider=provider,
billing_base_url=base_url,
model=model,
absolute=True,
)
except Exception as e:
logger.debug("Session DB operation failed: %s", e)

View File

@@ -319,11 +319,39 @@ class SessionDB:
billing_provider: Optional[str] = None,
billing_base_url: Optional[str] = None,
billing_mode: Optional[str] = None,
absolute: bool = False,
) -> None:
"""Increment token counters and backfill model if not already set."""
with self._lock:
self._conn.execute(
"""UPDATE sessions SET
"""Update token counters and backfill model if not already set.
When *absolute* is False (default), values are **incremented** — use
this for per-API-call deltas (CLI path).
When *absolute* is True, values are **set directly** — use this when
the caller already holds cumulative totals (gateway path, where the
cached agent accumulates across messages).
"""
if absolute:
sql = """UPDATE sessions SET
input_tokens = ?,
output_tokens = ?,
cache_read_tokens = ?,
cache_write_tokens = ?,
reasoning_tokens = ?,
estimated_cost_usd = COALESCE(?, 0),
actual_cost_usd = CASE
WHEN ? IS NULL THEN actual_cost_usd
ELSE ?
END,
cost_status = COALESCE(?, cost_status),
cost_source = COALESCE(?, cost_source),
pricing_version = COALESCE(?, pricing_version),
billing_provider = COALESCE(billing_provider, ?),
billing_base_url = COALESCE(billing_base_url, ?),
billing_mode = COALESCE(billing_mode, ?),
model = COALESCE(model, ?)
WHERE id = ?"""
else:
sql = """UPDATE sessions SET
input_tokens = input_tokens + ?,
output_tokens = output_tokens + ?,
cache_read_tokens = cache_read_tokens + ?,
@@ -341,7 +369,10 @@ class SessionDB:
billing_base_url = COALESCE(billing_base_url, ?),
billing_mode = COALESCE(billing_mode, ?),
model = COALESCE(model, ?)
WHERE id = ?""",
WHERE id = ?"""
with self._lock:
self._conn.execute(
sql,
(
input_tokens,
output_tokens,

View File

@@ -858,6 +858,7 @@ class TestLastPromptTokens:
billing_provider=None,
billing_base_url=None,
model="openai/gpt-5.4",
absolute=True,
)