From d7a2e3ddae71e95adb8e4ea960b7a813524c2000 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:44:01 -0700 Subject: [PATCH] fix: handle hyphenated FTS5 queries and preserve quoted literals (#1776) _sanitize_fts5_query() was stripping ALL double quotes (including properly paired ones), breaking user-provided quoted phrases like "exact phrase". Hyphenated terms like chat-send also silently expanded to chat AND send, returning unexpected or zero results. Fix: 1. Extract balanced quoted phrases into placeholders before stripping FTS5-special characters, then restore them. 2. Wrap unquoted hyphenated terms (word-word) in double quotes so FTS5 matches them as exact phrases instead of splitting on the hyphen. 3. Unmatched quotes are still stripped as before. Based on issue report by @bailob (#1770) and PR #1773 by @Jah-yee (whose branch contained unrelated changes and couldn't be merged directly). Closes #1770 Closes #1773 Co-authored-by: Jah-yee --- hermes_state.py | 44 ++++++++++++++++++++------- tests/test_hermes_state.py | 61 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 10 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index a38870809..e87997ece 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -689,21 +689,45 @@ class SessionDB: ``NOT``) have special meaning. Passing raw user input directly to MATCH can cause ``sqlite3.OperationalError``. - Strategy: strip characters that are only meaningful as FTS5 operators - and would otherwise cause syntax errors. This preserves normal keyword - search while preventing crashes on inputs like ``C++``, ``"unterminated``, - or ``hello AND``. + Strategy: + - Preserve properly paired quoted phrases (``"exact phrase"``) + - Strip unmatched FTS5-special characters that would cause errors + - Wrap unquoted hyphenated terms in quotes so FTS5 matches them + as exact phrases instead of splitting on the hyphen """ - # Remove FTS5-special characters that are not useful in keyword search - sanitized = re.sub(r'[+{}()"^]', " ", query) - # Collapse repeated * (e.g. "***") into a single one, and remove - # leading * (prefix-only matching requires at least one char before *) + # Step 1: Extract balanced double-quoted phrases and protect them + # from further processing via numbered placeholders. + _quoted_parts: list = [] + + def _preserve_quoted(m: re.Match) -> str: + _quoted_parts.append(m.group(0)) + return f"\x00Q{len(_quoted_parts) - 1}\x00" + + sanitized = re.sub(r'"[^"]*"', _preserve_quoted, query) + + # Step 2: Strip remaining (unmatched) FTS5-special characters + sanitized = re.sub(r'[+{}()\"^]', " ", sanitized) + + # Step 3: Collapse repeated * (e.g. "***") into a single one, + # and remove leading * (prefix-only needs at least one char before *) sanitized = re.sub(r"\*+", "*", sanitized) sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized) - # Remove dangling boolean operators at start/end that would cause - # syntax errors (e.g. "hello AND" or "OR world") + + # Step 4: Remove dangling boolean operators at start/end that would + # cause syntax errors (e.g. "hello AND" or "OR world") sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip()) sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip()) + + # Step 5: Wrap unquoted hyphenated terms (e.g. ``chat-send``) in + # double quotes. FTS5's tokenizer splits on hyphens, turning + # ``chat-send`` into ``chat AND send``. Quoting preserves the + # intended phrase match. + sanitized = re.sub(r"\b(\w+(?:-\w+)+)\b", r'"\1"', sanitized) + + # Step 6: Restore preserved quoted phrases + for i, quoted in enumerate(_quoted_parts): + sanitized = sanitized.replace(f"\x00Q{i}\x00", quoted) + return sanitized.strip() def search_messages( diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index 01d9c37ca..f9155d3f2 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -261,6 +261,30 @@ class TestFTS5Search: # The word "C" appears in the content, so FTS5 should find it assert isinstance(results, list) + def test_search_hyphenated_term_does_not_crash(self, db): + """Hyphenated terms like 'chat-send' must not crash FTS5.""" + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="Run the chat-send command") + + results = db.search_messages("chat-send") + assert isinstance(results, list) + assert len(results) >= 1 + assert any("chat-send" in (r.get("snippet") or r.get("content", "")).lower() + for r in results) + + def test_search_quoted_phrase_preserved(self, db): + """User-provided quoted phrases should be preserved for exact matching.""" + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="docker networking is complex") + db.append_message("s1", role="assistant", content="networking docker tips") + + # Quoted phrase should match only the exact order + results = db.search_messages('"docker networking"') + assert isinstance(results, list) + # Should find the user message (exact phrase) but may or may not find + # the assistant message depending on FTS5 phrase matching + assert len(results) >= 1 + def test_sanitize_fts5_query_strips_dangerous_chars(self): """Unit test for _sanitize_fts5_query static method.""" from hermes_state import SessionDB @@ -278,6 +302,43 @@ class TestFTS5Search: # Valid prefix kept assert s('deploy*') == 'deploy*' + def test_sanitize_fts5_preserves_quoted_phrases(self): + """Properly paired double-quoted phrases should be preserved.""" + from hermes_state import SessionDB + s = SessionDB._sanitize_fts5_query + # Simple quoted phrase + assert s('"exact phrase"') == '"exact phrase"' + # Quoted phrase alongside unquoted terms + assert '"docker networking"' in s('"docker networking" setup') + # Multiple quoted phrases + result = s('"hello world" OR "foo bar"') + assert '"hello world"' in result + assert '"foo bar"' in result + # Unmatched quote still stripped + assert '"' not in s('"unterminated') + + def test_sanitize_fts5_quotes_hyphenated_terms(self): + """Hyphenated terms should be wrapped in quotes for exact matching.""" + from hermes_state import SessionDB + s = SessionDB._sanitize_fts5_query + # Simple hyphenated term + assert s('chat-send') == '"chat-send"' + # Multiple hyphens + assert s('docker-compose-up') == '"docker-compose-up"' + # Hyphenated term with other words + result = s('fix chat-send bug') + assert '"chat-send"' in result + assert 'fix' in result + assert 'bug' in result + # Multiple hyphenated terms with OR + result = s('chat-send OR deploy-prod') + assert '"chat-send"' in result + assert '"deploy-prod"' in result + # Already-quoted hyphenated term — no double quoting + assert s('"chat-send"') == '"chat-send"' + # Hyphenated inside a quoted phrase stays as-is + assert s('"my chat-send thing"') == '"my chat-send thing"' + # ========================================================================= # Session search and listing