Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 41s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Nix / nix (ubuntu-latest) (pull_request) Failing after 6s
Docs Site Checks / docs-site-checks (pull_request) Failing after 6m44s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 30s
Tests / e2e (pull_request) Successful in 4m19s
Tests / test (pull_request) Failing after 35m31s
Nix / nix (macos-latest) (pull_request) Has been cancelled
Implements hybrid search combining three backends: - FTS5: keyword/exact search (existing SessionDB) - Qdrant: semantic vector search (graceful degradation) - HRR: compositional/contradiction detection (pure numpy) Query Router: auto-detects query type (semantic/keyword/compositional/ multi-modal) and routes to optimal backends with per-type weights. Result Merger: Reciprocal Rank Fusion with consensus boost for multi-backend matches. 9 files, 1272 insertions. 32 tests passing. docs/hybrid-search.md for architecture and usage.
208 lines
6.5 KiB
Python
208 lines
6.5 KiB
Python
"""Query Router — analyze queries and dispatch to optimal search backends.
|
|
|
|
Query types:
|
|
- semantic: natural language, conceptual ("What did we discuss about X?")
|
|
- keyword: exact terms, identifiers ("Find references to function_name")
|
|
- compositional: relational, contradiction detection ("Is there a contradiction?")
|
|
- multi-modal: cross-domain ("Find code related to pattern")
|
|
|
|
Each type routes to one or more backends. Results merged via Reciprocal Rank Fusion.
|
|
"""

import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


class QueryType(Enum):
    """Category a search query is classified into; drives backend selection."""

    SEMANTIC = "semantic"            # natural language, conceptual questions
    KEYWORD = "keyword"              # exact terms, code identifiers, paths
    COMPOSITIONAL = "compositional"  # relational / contradiction-style queries
    MULTI_MODAL = "multi_modal"      # cross-domain queries (e.g. code-vs-pattern)


@dataclass
class SearchQuery:
    """Parsed query with routing metadata."""

    raw: str                       # original query text exactly as given by the caller
    query_type: QueryType          # detected category (see detect_query_type)
    keywords: List[str] = field(default_factory=list)   # terms for FTS5 keyword search
    semantic_text: str = ""        # text handed to the semantic/vector backend
    confidence: float = 0.0        # classification confidence in [0, 1]
    backends: List[str] = field(default_factory=list)   # backend names to query, in priority order


@dataclass
class SearchResult:
    """Unified search result from any backend."""

    content: str                       # matched content returned by the backend
    source: str                        # backend name: "fts5", "qdrant", "hrr"
    score: float                       # backend-native relevance score
    metadata: Dict[str, Any] = field(default_factory=dict)  # backend-specific extras
    session_id: Optional[str] = None   # originating session, when the backend provides one
    timestamp: Optional[float] = None  # result timestamp, when the backend provides one


# --- Query Type Detection ---

# Patterns for keyword queries (code identifiers, paths, exact terms).
# NOTE: these are matched case-sensitively against the raw query in
# detect_query_type, so CamelCase detection works.
_KEYWORD_PATTERNS = [
    r'\b[a-z_]+\.[a-z_]+\.[a-z_]+\b',  # dotted identifiers (module.class.method)
    r'\b[A-Z][a-z]+[A-Z][a-z]+\b',     # CamelCase identifiers
    r'\b\w+\(\)',                      # function calls
    r'\bdef\s+\w+',                    # def function_name
    r'\bclass\s+\w+',                  # class ClassName
    r'\bimport\s+\w+',                 # import statements
    r'["\'][^"\']+["\']',              # quoted strings
    r'\b\w+\.\w{1,4}\b',               # file.ext patterns
    r'/[\w/]+',                        # path-like strings
]

# Patterns for compositional queries (relations, contradictions, causality).
# Matched against the lowercased query in detect_query_type.
_COMPOSITIONAL_PATTERNS = [
    r'\bcontradiction\b',
    r'\bconflict\b',
    r'\bdiffer\b',
    r'\bagree\b',
    r'\brelate\b',
    r'\bcompare\b',
    r'\bversus\b',
    r'\bvs\b',
    r'\bboth\b.*\band\b',
    r'\beither\b.*\bor\b',
    r'\bif\b.*\bthen\b',
    r'\bbecause\b',
    r'\bimplies\b',
    r'\bdepends on\b',
    r'\bwhy\b',
]

# Patterns for multi-modal (cross-domain) queries.
# Matched against the lowercased query in detect_query_type.
_MULTIMODAL_PATTERNS = [
    r'\bcode\b.*\brelat\w*\b',
    r'\bpattern\b',
    r'\bsimilar\b',
    r'\bresembl\w*\b',
    r'\banalog\w*\b',
    r'\bcorrespond\w*\b',
    r'\bcross\b.*\breferenc\w*\b',
]


def detect_query_type(query: str) -> Tuple[QueryType, float]:
    """Detect the type of a search query.

    Classification is checked most-specific first: compositional, then
    multi-modal, then keyword, with semantic as the default for natural
    language.

    Args:
        query: Raw query text.

    Returns:
        (query_type, confidence) with confidence in [0, 1].
    """
    query_lower = query.lower().strip()

    # Check compositional first (most specific).
    comp_score = sum(
        1 for p in _COMPOSITIONAL_PATTERNS
        if re.search(p, query_lower)
    )
    if comp_score >= 2:
        return QueryType.COMPOSITIONAL, min(0.9, 0.5 + comp_score * 0.15)

    # Check multi-modal. Two multi-modal cues on their own are enough:
    # the canonical example "Find code related to pattern" matches two
    # multi-modal patterns but zero compositional ones ("related" does
    # not match r'\brelate\b'). A single multi-modal cue still needs a
    # compositional cue as support, to avoid misclassifying ordinary
    # semantic queries that merely mention e.g. "pattern".
    multi_score = sum(
        1 for p in _MULTIMODAL_PATTERNS
        if re.search(p, query_lower)
    )
    if multi_score >= 2 or (multi_score >= 1 and comp_score >= 1):
        return QueryType.MULTI_MODAL, min(0.85, 0.5 + multi_score * 0.15)

    # Check keyword (code identifiers, exact terms). Matched against the
    # raw query because CamelCase detection is case-sensitive.
    kw_score = sum(
        1 for p in _KEYWORD_PATTERNS
        if re.search(p, query)
    )
    if kw_score >= 2:
        return QueryType.KEYWORD, min(0.95, 0.6 + kw_score * 0.1)
    # Short queries with a single identifier-like match are likely keyword.
    if kw_score >= 1 and len(query.split()) <= 5:
        return QueryType.KEYWORD, 0.7

    # Default: semantic (natural language).
    # Longer queries or those with question words are likely semantic.
    question_words = ['what', 'how', 'why', 'when', 'where', 'who', 'which', 'explain', 'describe', 'tell me']
    has_question = any(w in query_lower for w in question_words)
    if has_question or len(query.split()) > 4:
        return QueryType.SEMANTIC, 0.8

    # Short, ambiguous queries — lean keyword.
    if len(query.split()) <= 3:
        return QueryType.KEYWORD, 0.5

    return QueryType.SEMANTIC, 0.6


def extract_keywords(query: str) -> List[str]:
    """Extract keyword terms from a query for FTS5 search.

    Tokenizes the lowercased query, discards stop words and tokens of
    two characters or fewer, then appends any double-quoted phrases
    verbatim as exact terms.
    """
    # Common stop words that carry no search signal.
    stop_words = frozenset({
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'shall',
        'i', 'you', 'he', 'she', 'it', 'we', 'they',
        'this', 'that', 'these', 'those',
        'what', 'how', 'why', 'when', 'where', 'who', 'which',
        'and', 'or', 'but', 'not', 'if', 'then', 'else',
        'about', 'for', 'with', 'from', 'into', 'during', 'before',
        'after', 'above', 'below', 'between', 'through',
        'find', 'search', 'look', 'show', 'tell', 'get',
    })

    def _useful(token: str) -> bool:
        # Keep only tokens long enough to be meaningful search terms.
        return len(token) > 2 and token not in stop_words

    keywords = [tok for tok in re.findall(r'\b\w+\b', query.lower()) if _useful(tok)]

    # Double-quoted phrases are treated as exact terms (original casing kept).
    keywords.extend(re.findall(r'"([^"]+)"', query))

    return keywords
def select_backends(query_type: QueryType, confidence: float) -> List[str]:
    """Select which backends to query based on query type.

    Low-confidence classifications hedge by fanning out to every
    backend regardless of the detected type.
    """
    # Uncertain classification: query everything.
    if confidence < 0.6:
        return ["qdrant", "fts5", "hrr"]

    # Per-type routing table (built per call, so each caller gets a fresh list).
    routing = {
        QueryType.SEMANTIC: ["qdrant", "fts5"],          # semantic primary, FTS5 for recall
        QueryType.KEYWORD: ["fts5", "qdrant"],           # FTS5 primary, Qdrant for fuzzy
        QueryType.COMPOSITIONAL: ["hrr", "fts5"],        # HRR primary for compositional
        QueryType.MULTI_MODAL: ["qdrant", "hrr", "fts5"],  # all three
    }
    return routing.get(query_type, [])
def route_query(query: str) -> SearchQuery:
    """Analyze a query and return routing information.

    This is the main entry point for the query router: it classifies
    the query, extracts keyword terms, and selects target backends.
    """
    qtype, conf = detect_query_type(query)

    return SearchQuery(
        raw=query,
        query_type=qtype,
        keywords=extract_keywords(query),
        semantic_text=query,  # full text is passed through for semantic search
        confidence=conf,
        backends=select_backends(qtype, conf),
    )