fix(state): SQLite concurrency hardening + session transcript integrity (#3249)

* fix(session-db): survive CLI/gateway concurrent write contention

Closes #3139

Four layered fixes for the scenario where the CLI and gateway write to
state.db concurrently, causing create_session() to fail with
'database is locked' and permanently disabling session_search on the
gateway side.

1. Increase SQLite connection timeout: 10s -> 30s
   hermes_state.py: longer window for the WAL writer to finish a batch
   flush before the other process gives up entirely.

2. INSERT OR IGNORE in create_session
   hermes_state.py: prevents IntegrityError on duplicate session IDs
   (e.g. gateway restarts while CLI session is still alive).

3. Don't null out _session_db on create_session failure  (main fix)
   run_agent.py: a transient lock at agent startup must not permanently
   disable session_search for the lifetime of that agent instance.
   _session_db now stays alive so subsequent flushes and searches work
   once the lock clears.

4. New ensure_session() helper + call it during flush
   hermes_state.py: INSERT OR IGNORE for a minimal session row.
   run_agent.py _flush_messages_to_session_db: calls ensure_session()
   before appending messages, so the FK constraint is satisfied even
   when create_session() failed at startup. No-op when the row exists.
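Taken together, fixes 1, 2, and 4 reduce to a longer busy timeout plus an idempotent INSERT. A minimal sketch, assuming a `sessions` table keyed by `session_id` — the real schema and function signatures in hermes_state.py may differ:

```python
import sqlite3

def connect(db_path: str) -> sqlite3.Connection:
    # Fix 1: a 30s timeout gives the other process's WAL writer time to
    # finish a batch flush before this connection gives up with
    # 'database is locked'.
    conn = sqlite3.connect(db_path, timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    return conn

def ensure_session(conn: sqlite3.Connection, session_id: str) -> None:
    # Fixes 2/4: INSERT OR IGNORE is a no-op when the row already exists,
    # so duplicate session IDs (gateway restart, retry after a transient
    # lock) never raise IntegrityError, and the FK constraint on the
    # messages table is satisfied before any append.
    conn.execute(
        "INSERT OR IGNORE INTO sessions (session_id) VALUES (?)",
        (session_id,),
    )
    conn.commit()
```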

* fix(state): release lock between context queries in search_messages

The context-window queries (one per FTS5 match) were running inside
the same lock acquisition as the primary FTS5 query, holding the lock
for O(N) sequential SQLite round-trips. Move per-match context fetches
outside the outer lock block so each acquires the lock independently,
keeping critical sections short and allowing other threads to interleave.
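The shape of that change, sketched with `fts_search` and `fetch_context` as hypothetical stand-ins for the real queries:

```python
import threading

_lock = threading.Lock()

def search_messages(query, fts_search, fetch_context):
    # Short critical section: only the primary FTS5 query holds the lock.
    with _lock:
        matches = fts_search(query)

    # Each per-match context fetch acquires the lock independently, so
    # other threads can interleave between the O(N) SQLite round-trips
    # instead of waiting out one long acquisition.
    results = []
    for m in matches:
        with _lock:
            results.append((m, fetch_context(m)))
    return results
```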

* fix(session): prefer longer source in load_transcript to prevent legacy truncation

When a long-lived session pre-dates SQLite storage (e.g. sessions
created before the DB layer was introduced, or after a clean
deployment that reset the DB), _flush_messages_to_session_db only
writes the *new* messages from the current turn to SQLite — it skips
messages already present in conversation_history, assuming they are
already persisted.

That assumption fails for legacy JSONL-only sessions:

  Turn N (first after DB migration):
    load_transcript(id)       → SQLite: 0  → falls back to JSONL: 994 ✓
    _flush_messages_to_session_db: skip first 994, write 2 new → SQLite: 2

  Turn N+1:
    load_transcript(id)       → SQLite: 2  → returns immediately ✗
    Agent sees 2 messages of history instead of 996

The same pattern causes the reported symptom: session JSON truncated
to 4 messages (_save_session_log writes agent.messages which only has
2 history + 2 new = 4).

Fix: always load both sources and return whichever is longer.  For a
fully-migrated session SQLite will always be ≥ JSONL, so there is no
regression.  For a legacy session that hasn't been bootstrapped yet,
JSONL wins and the full history is restored.
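The selection rule is small enough to state in code. A sketch with plain lists standing in for the SQLite and JSONL readers (the real load_transcript reads both stores itself):

```python
def pick_transcript(sqlite_msgs: list, jsonl_msgs: list) -> list:
    # Ties go to SQLite: same count, but rows carry richer fields
    # (e.g. reasoning) that the JSONL lines lack.
    if len(sqlite_msgs) >= len(jsonl_msgs):
        return sqlite_msgs
    return jsonl_msgs
```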

Closes #3212

* test: add load_transcript source preference tests for #3212

Covers: JSONL longer returns JSONL, SQLite longer returns SQLite,
SQLite empty falls back to JSONL, both empty returns empty, equal
length prefers SQLite (richer reasoning fields).

---------

Co-authored-by: Mibayy <mibayy@hermes.ai>
Co-authored-by: kewe63 <kewe.3217@gmail.com>
Co-authored-by: Mibayy <mibayy@users.noreply.github.com>
Author: Teknium
Date: 2026-03-26 13:47:14 -07:00
Committed by: GitHub
Parent: 3a7907b278
Commit: b81d49dc45
5 changed files with 247 additions and 33 deletions


@@ -386,6 +386,100 @@ class TestLoadTranscriptCorruptLines:
        assert messages[1]["content"] == "b"


class TestLoadTranscriptPreferLongerSource:
    """Regression: load_transcript must return whichever source (SQLite or JSONL)
    has more messages to prevent silent truncation. GH-3212."""

    @pytest.fixture()
    def store_with_db(self, tmp_path):
        """SessionStore with both SQLite and JSONL active."""
        from hermes_state import SessionDB

        config = GatewayConfig()
        with patch("gateway.session.SessionStore._ensure_loaded"):
            s = SessionStore(sessions_dir=tmp_path, config=config)
        s._db = SessionDB(db_path=tmp_path / "state.db")
        s._loaded = True
        return s

    def test_jsonl_longer_than_sqlite_returns_jsonl(self, store_with_db):
        """Legacy session: JSONL has full history, SQLite has only recent turn."""
        sid = "legacy_session"
        store_with_db._db.create_session(session_id=sid, source="gateway", model="m")
        # JSONL has 10 messages (legacy history — written before SQLite existed)
        for i in range(10):
            role = "user" if i % 2 == 0 else "assistant"
            store_with_db.append_to_transcript(
                sid, {"role": role, "content": f"msg-{i}"}, skip_db=True,
            )
        # SQLite has only 2 messages (recent turn after migration)
        store_with_db._db.append_message(session_id=sid, role="user", content="new-q")
        store_with_db._db.append_message(session_id=sid, role="assistant", content="new-a")

        result = store_with_db.load_transcript(sid)

        assert len(result) == 10
        assert result[0]["content"] == "msg-0"

    def test_sqlite_longer_than_jsonl_returns_sqlite(self, store_with_db):
        """Fully migrated session: SQLite has more (JSONL stopped growing)."""
        sid = "migrated_session"
        store_with_db._db.create_session(session_id=sid, source="gateway", model="m")
        # JSONL has 2 old messages
        store_with_db.append_to_transcript(
            sid, {"role": "user", "content": "old-q"}, skip_db=True,
        )
        store_with_db.append_to_transcript(
            sid, {"role": "assistant", "content": "old-a"}, skip_db=True,
        )
        # SQLite has 4 messages (superset after migration)
        for i in range(4):
            role = "user" if i % 2 == 0 else "assistant"
            store_with_db._db.append_message(session_id=sid, role=role, content=f"db-{i}")

        result = store_with_db.load_transcript(sid)

        assert len(result) == 4
        assert result[0]["content"] == "db-0"

    def test_sqlite_empty_falls_back_to_jsonl(self, store_with_db):
        """No SQLite rows — falls back to JSONL (original behavior preserved)."""
        sid = "no_db_rows"
        store_with_db.append_to_transcript(
            sid, {"role": "user", "content": "hello"}, skip_db=True,
        )
        store_with_db.append_to_transcript(
            sid, {"role": "assistant", "content": "hi"}, skip_db=True,
        )

        result = store_with_db.load_transcript(sid)

        assert len(result) == 2
        assert result[0]["content"] == "hello"

    def test_both_empty_returns_empty(self, store_with_db):
        """Neither source has data — returns empty list."""
        result = store_with_db.load_transcript("nonexistent")
        assert result == []

    def test_equal_length_prefers_sqlite(self, store_with_db):
        """When both have same count, SQLite wins (has richer fields like reasoning)."""
        sid = "equal_session"
        store_with_db._db.create_session(session_id=sid, source="gateway", model="m")
        # Write 2 messages to JSONL only
        store_with_db.append_to_transcript(
            sid, {"role": "user", "content": "jsonl-q"}, skip_db=True,
        )
        store_with_db.append_to_transcript(
            sid, {"role": "assistant", "content": "jsonl-a"}, skip_db=True,
        )
        # Write 2 different messages to SQLite only
        store_with_db._db.append_message(session_id=sid, role="user", content="db-q")
        store_with_db._db.append_message(session_id=sid, role="assistant", content="db-a")

        result = store_with_db.load_transcript(sid)

        assert len(result) == 2
        # Should be the SQLite version (equal count → prefers SQLite)
        assert result[0]["content"] == "db-q"


class TestWhatsAppDMSessionKeyConsistency:
    """Regression: all session-key construction must go through build_session_key
    so DMs are isolated by chat_id across platforms."""