Merge PR #565: fix: sanitize FTS5 queries and close mirror DB connections
Authored by 0xbyt4. Fixes #N/A (no linked issue). - Sanitize user input before FTS5 MATCH to prevent OperationalError on special characters (C++, unbalanced quotes, dangling operators, etc.) - Close SessionDB connection in mirror._append_to_sqlite() via finally block - Added tests for both fixes
This commit is contained in:
@@ -111,6 +111,7 @@ def _append_to_jsonl(session_id: str, message: dict) -> None:
|
||||
|
||||
def _append_to_sqlite(session_id: str, message: dict) -> None:
|
||||
"""Append a message to the SQLite session database."""
|
||||
db = None
|
||||
try:
|
||||
from hermes_state import SessionDB
|
||||
db = SessionDB()
|
||||
@@ -121,3 +122,6 @@ def _append_to_sqlite(session_id: str, message: dict) -> None:
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Mirror SQLite write failed: %s", e)
|
||||
finally:
|
||||
if db is not None:
|
||||
db.close()
|
||||
|
||||
@@ -16,6 +16,7 @@ Key design decisions:
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -557,6 +558,32 @@ class SessionDB:
|
||||
# Search
|
||||
# =========================================================================
|
||||
|
||||
@staticmethod
|
||||
def _sanitize_fts5_query(query: str) -> str:
|
||||
"""Sanitize user input for safe use in FTS5 MATCH queries.
|
||||
|
||||
FTS5 has its own query syntax where characters like ``"``, ``(``, ``)``,
|
||||
``+``, ``*``, ``{``, ``}`` and bare boolean operators (``AND``, ``OR``,
|
||||
``NOT``) have special meaning. Passing raw user input directly to
|
||||
MATCH can cause ``sqlite3.OperationalError``.
|
||||
|
||||
Strategy: strip characters that are only meaningful as FTS5 operators
|
||||
and would otherwise cause syntax errors. This preserves normal keyword
|
||||
search while preventing crashes on inputs like ``C++``, ``"unterminated``,
|
||||
or ``hello AND``.
|
||||
"""
|
||||
# Remove FTS5-special characters that are not useful in keyword search
|
||||
sanitized = re.sub(r'[+{}()"^]', " ", query)
|
||||
# Collapse repeated * (e.g. "***") into a single one, and remove
|
||||
# leading * (prefix-only matching requires at least one char before *)
|
||||
sanitized = re.sub(r"\*+", "*", sanitized)
|
||||
sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized)
|
||||
# Remove dangling boolean operators at start/end that would cause
|
||||
# syntax errors (e.g. "hello AND" or "OR world")
|
||||
sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
|
||||
sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())
|
||||
return sanitized.strip()
|
||||
|
||||
def search_messages(
|
||||
self,
|
||||
query: str,
|
||||
@@ -580,6 +607,10 @@ class SessionDB:
|
||||
if not query or not query.strip():
|
||||
return []
|
||||
|
||||
query = self._sanitize_fts5_query(query)
|
||||
if not query:
|
||||
return []
|
||||
|
||||
if source_filter is None:
|
||||
source_filter = ["cli", "telegram", "discord", "whatsapp", "slack"]
|
||||
|
||||
@@ -619,7 +650,11 @@ class SessionDB:
|
||||
LIMIT ? OFFSET ?
|
||||
"""
|
||||
|
||||
cursor = self._conn.execute(sql, params)
|
||||
try:
|
||||
cursor = self._conn.execute(sql, params)
|
||||
except sqlite3.OperationalError:
|
||||
# FTS5 query syntax error despite sanitization — return empty
|
||||
return []
|
||||
matches = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# Add surrounding context (1 message before + after each match)
|
||||
|
||||
@@ -160,3 +160,27 @@ class TestMirrorToSession:
|
||||
result = mirror_to_session("telegram", "123", "msg")
|
||||
|
||||
assert result is False
|
||||
|
||||
|
||||
class TestAppendToSqlite:
|
||||
def test_connection_is_closed_after_use(self, tmp_path):
|
||||
"""Verify _append_to_sqlite closes the SessionDB connection."""
|
||||
from gateway.mirror import _append_to_sqlite
|
||||
mock_db = MagicMock()
|
||||
|
||||
with patch("hermes_state.SessionDB", return_value=mock_db):
|
||||
_append_to_sqlite("sess_1", {"role": "assistant", "content": "hello"})
|
||||
|
||||
mock_db.append_message.assert_called_once()
|
||||
mock_db.close.assert_called_once()
|
||||
|
||||
def test_connection_closed_even_on_error(self, tmp_path):
|
||||
"""Verify connection is closed even when append_message raises."""
|
||||
from gateway.mirror import _append_to_sqlite
|
||||
mock_db = MagicMock()
|
||||
mock_db.append_message.side_effect = Exception("db error")
|
||||
|
||||
with patch("hermes_state.SessionDB", return_value=mock_db):
|
||||
_append_to_sqlite("sess_1", {"role": "assistant", "content": "hello"})
|
||||
|
||||
mock_db.close.assert_called_once()
|
||||
|
||||
@@ -216,6 +216,54 @@ class TestFTS5Search:
|
||||
assert isinstance(results[0]["context"], list)
|
||||
assert len(results[0]["context"]) > 0
|
||||
|
||||
def test_search_special_chars_do_not_crash(self, db):
|
||||
"""FTS5 special characters in queries must not raise OperationalError."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="How do I use C++ templates?")
|
||||
|
||||
# Each of these previously caused sqlite3.OperationalError
|
||||
dangerous_queries = [
|
||||
'C++', # + is FTS5 column filter
|
||||
'"unterminated', # unbalanced double-quote
|
||||
'(problem', # unbalanced parenthesis
|
||||
'hello AND', # dangling boolean operator
|
||||
'***', # repeated wildcard
|
||||
'{test}', # curly braces (column reference)
|
||||
'OR hello', # leading boolean operator
|
||||
'a AND OR b', # adjacent operators
|
||||
]
|
||||
for query in dangerous_queries:
|
||||
# Must not raise — should return list (possibly empty)
|
||||
results = db.search_messages(query)
|
||||
assert isinstance(results, list), f"Query {query!r} did not return a list"
|
||||
|
||||
def test_search_sanitized_query_still_finds_content(self, db):
|
||||
"""Sanitization must not break normal keyword search."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="Learning C++ templates today")
|
||||
|
||||
# "C++" sanitized to "C" should still match "C++"
|
||||
results = db.search_messages("C++")
|
||||
# The word "C" appears in the content, so FTS5 should find it
|
||||
assert isinstance(results, list)
|
||||
|
||||
def test_sanitize_fts5_query_strips_dangerous_chars(self):
|
||||
"""Unit test for _sanitize_fts5_query static method."""
|
||||
from hermes_state import SessionDB
|
||||
s = SessionDB._sanitize_fts5_query
|
||||
assert s('hello world') == 'hello world'
|
||||
assert '+' not in s('C++')
|
||||
assert '"' not in s('"unterminated')
|
||||
assert '(' not in s('(problem')
|
||||
assert '{' not in s('{test}')
|
||||
# Dangling operators removed
|
||||
assert s('hello AND') == 'hello'
|
||||
assert s('OR world') == 'world'
|
||||
# Leading bare * removed
|
||||
assert s('***') == ''
|
||||
# Valid prefix kept
|
||||
assert s('deploy*') == 'deploy*'
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Session search and listing
|
||||
|
||||
Reference in New Issue
Block a user