fix: handle hyphenated FTS5 queries and preserve quoted literals (#1776)
_sanitize_fts5_query() was stripping ALL double quotes (including properly paired ones), breaking user-provided quoted phrases like "exact phrase". Hyphenated terms like chat-send also silently expanded to chat AND send, returning unexpected or zero results. Fix: 1. Extract balanced quoted phrases into placeholders before stripping FTS5-special characters, then restore them. 2. Wrap unquoted hyphenated terms (word-word) in double quotes so FTS5 matches them as exact phrases instead of splitting on the hyphen. 3. Unmatched quotes are still stripped as before. Based on issue report by @bailob (#1770) and PR #1773 by @Jah-yee (whose branch contained unrelated changes and couldn't be merged directly). Closes #1770 Closes #1773 Co-authored-by: Jah-yee <Jah-yee@users.noreply.github.com>
This commit is contained in:
@@ -689,21 +689,45 @@ class SessionDB:
|
||||
``NOT``) have special meaning. Passing raw user input directly to
|
||||
MATCH can cause ``sqlite3.OperationalError``.
|
||||
|
||||
Strategy: strip characters that are only meaningful as FTS5 operators
|
||||
and would otherwise cause syntax errors. This preserves normal keyword
|
||||
search while preventing crashes on inputs like ``C++``, ``"unterminated``,
|
||||
or ``hello AND``.
|
||||
Strategy:
|
||||
- Preserve properly paired quoted phrases (``"exact phrase"``)
|
||||
- Strip unmatched FTS5-special characters that would cause errors
|
||||
- Wrap unquoted hyphenated terms in quotes so FTS5 matches them
|
||||
as exact phrases instead of splitting on the hyphen
|
||||
"""
|
||||
# Remove FTS5-special characters that are not useful in keyword search
|
||||
sanitized = re.sub(r'[+{}()"^]', " ", query)
|
||||
# Collapse repeated * (e.g. "***") into a single one, and remove
|
||||
# leading * (prefix-only matching requires at least one char before *)
|
||||
# Step 1: Extract balanced double-quoted phrases and protect them
|
||||
# from further processing via numbered placeholders.
|
||||
_quoted_parts: list = []
|
||||
|
||||
def _preserve_quoted(m: re.Match) -> str:
|
||||
_quoted_parts.append(m.group(0))
|
||||
return f"\x00Q{len(_quoted_parts) - 1}\x00"
|
||||
|
||||
sanitized = re.sub(r'"[^"]*"', _preserve_quoted, query)
|
||||
|
||||
# Step 2: Strip remaining (unmatched) FTS5-special characters
|
||||
sanitized = re.sub(r'[+{}()\"^]', " ", sanitized)
|
||||
|
||||
# Step 3: Collapse repeated * (e.g. "***") into a single one,
|
||||
# and remove leading * (prefix-only needs at least one char before *)
|
||||
sanitized = re.sub(r"\*+", "*", sanitized)
|
||||
sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized)
|
||||
# Remove dangling boolean operators at start/end that would cause
|
||||
# syntax errors (e.g. "hello AND" or "OR world")
|
||||
|
||||
# Step 4: Remove dangling boolean operators at start/end that would
|
||||
# cause syntax errors (e.g. "hello AND" or "OR world")
|
||||
sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
|
||||
sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())
|
||||
|
||||
# Step 5: Wrap unquoted hyphenated terms (e.g. ``chat-send``) in
|
||||
# double quotes. FTS5's tokenizer splits on hyphens, turning
|
||||
# ``chat-send`` into ``chat AND send``. Quoting preserves the
|
||||
# intended phrase match.
|
||||
sanitized = re.sub(r"\b(\w+(?:-\w+)+)\b", r'"\1"', sanitized)
|
||||
|
||||
# Step 6: Restore preserved quoted phrases
|
||||
for i, quoted in enumerate(_quoted_parts):
|
||||
sanitized = sanitized.replace(f"\x00Q{i}\x00", quoted)
|
||||
|
||||
return sanitized.strip()
|
||||
|
||||
def search_messages(
|
||||
|
||||
@@ -261,6 +261,30 @@ class TestFTS5Search:
|
||||
# The word "C" appears in the content, so FTS5 should find it
|
||||
assert isinstance(results, list)
|
||||
|
||||
def test_search_hyphenated_term_does_not_crash(self, db):
|
||||
"""Hyphenated terms like 'chat-send' must not crash FTS5."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="Run the chat-send command")
|
||||
|
||||
results = db.search_messages("chat-send")
|
||||
assert isinstance(results, list)
|
||||
assert len(results) >= 1
|
||||
assert any("chat-send" in (r.get("snippet") or r.get("content", "")).lower()
|
||||
for r in results)
|
||||
|
||||
def test_search_quoted_phrase_preserved(self, db):
|
||||
"""User-provided quoted phrases should be preserved for exact matching."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="docker networking is complex")
|
||||
db.append_message("s1", role="assistant", content="networking docker tips")
|
||||
|
||||
# Quoted phrase should match only the exact order
|
||||
results = db.search_messages('"docker networking"')
|
||||
assert isinstance(results, list)
|
||||
# Should find the user message (exact phrase) but may or may not find
|
||||
# the assistant message depending on FTS5 phrase matching
|
||||
assert len(results) >= 1
|
||||
|
||||
def test_sanitize_fts5_query_strips_dangerous_chars(self):
|
||||
"""Unit test for _sanitize_fts5_query static method."""
|
||||
from hermes_state import SessionDB
|
||||
@@ -278,6 +302,43 @@ class TestFTS5Search:
|
||||
# Valid prefix kept
|
||||
assert s('deploy*') == 'deploy*'
|
||||
|
||||
def test_sanitize_fts5_preserves_quoted_phrases(self):
|
||||
"""Properly paired double-quoted phrases should be preserved."""
|
||||
from hermes_state import SessionDB
|
||||
s = SessionDB._sanitize_fts5_query
|
||||
# Simple quoted phrase
|
||||
assert s('"exact phrase"') == '"exact phrase"'
|
||||
# Quoted phrase alongside unquoted terms
|
||||
assert '"docker networking"' in s('"docker networking" setup')
|
||||
# Multiple quoted phrases
|
||||
result = s('"hello world" OR "foo bar"')
|
||||
assert '"hello world"' in result
|
||||
assert '"foo bar"' in result
|
||||
# Unmatched quote still stripped
|
||||
assert '"' not in s('"unterminated')
|
||||
|
||||
def test_sanitize_fts5_quotes_hyphenated_terms(self):
|
||||
"""Hyphenated terms should be wrapped in quotes for exact matching."""
|
||||
from hermes_state import SessionDB
|
||||
s = SessionDB._sanitize_fts5_query
|
||||
# Simple hyphenated term
|
||||
assert s('chat-send') == '"chat-send"'
|
||||
# Multiple hyphens
|
||||
assert s('docker-compose-up') == '"docker-compose-up"'
|
||||
# Hyphenated term with other words
|
||||
result = s('fix chat-send bug')
|
||||
assert '"chat-send"' in result
|
||||
assert 'fix' in result
|
||||
assert 'bug' in result
|
||||
# Multiple hyphenated terms with OR
|
||||
result = s('chat-send OR deploy-prod')
|
||||
assert '"chat-send"' in result
|
||||
assert '"deploy-prod"' in result
|
||||
# Already-quoted hyphenated term — no double quoting
|
||||
assert s('"chat-send"') == '"chat-send"'
|
||||
# Hyphenated inside a quoted phrase stays as-is
|
||||
assert s('"my chat-send thing"') == '"my chat-send thing"'
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Session search and listing
|
||||
|
||||
Reference in New Issue
Block a user