210 lines
5.9 KiB
Python
210 lines
5.9 KiB
Python
"""
Hybrid Memory Query Router

Routes queries to the best search method:
- HRR: Compositional/conceptual queries
- Vector: Semantic similarity
- FTS5: Exact keyword matching

Issue: #663
"""
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
|
|
class SearchMethod(Enum):
    """Search backends a query can be routed to.

    Each member's value is the lowercase string identifier of the backend.
    """

    # Holographic Reduced Representations
    HRR = "hrr"
    # Semantic vector search
    VECTOR = "vector"
    # Full-text search (SQLite)
    FTS5 = "fts5"
    # Combine multiple methods
    HYBRID = "hybrid"
|
|
|
|
|
|
@dataclass
class QueryClassification:
    """Outcome of classifying a single query.

    Bundles the chosen search method with a confidence score, a short
    human-readable reason, and an optional list of sub-queries.
    """

    # Search method the query was routed to.
    method: SearchMethod
    # Confidence score attached to the routing decision.
    confidence: float
    # Short explanation of why this method was chosen.
    reason: str
    # Optional sub-queries; defaults to None when not supplied.
    sub_queries: Optional[List[str]] = None
|
|
|
|
|
|
# Query patterns for routing.
#
# Each list holds regular-expression source strings that are applied to the
# raw query with ``re.search``. Word patterns carry an inline ``(?i)`` flag so
# they match case-insensitively.

# Phrases that ask how concepts relate to or combine with each other.
COMPOSITIONAL_PATTERNS = [
    r"(?i)\brelated\s+to\b",
    r"(?i)\bcombined\s+with\b",
    r"(?i)\bbound\s+to\b",
    r"(?i)\bassociated\s+with\b",
    r"(?i)\bwhat\s+connects?\b",
    r"(?i)\bhow\s+.*\s+relate\b",
    r"(?i)\brelationship\s+between\b",
]

# Phrases that ask about contradiction or disagreement.
CONTRADICTION_PATTERNS = [
    r"(?i)\bcontradicts?\b",
    r"(?i)\bconflicts?\s+with\b",
    r"(?i)\binconsistent\b",
    r"(?i)\bopposite\s+of\b",
    r"(?i)\bopposes?\b",
    r"(?i)\bdisagrees?\s+with\b",
]

# Signals that the user wants a literal / exact match.
EXACT_KEYWORD_PATTERNS = [
    r'"[^"]+"',  # Quoted phrases
    # Single-quoted phrases. The quotes must not touch a word character on
    # their outer side, so apostrophes in contractions and possessives
    # ("don't", "what's") are not mistaken for quote marks.
    r"(?<!\w)'[^']+'(?!\w)",
    r"(?i)\bexact\b",
    r"(?i)\bprecisely\b",
    r"(?i)\bspecifically\b",
]

# Time-related words and phrases.
TEMPORAL_PATTERNS = [
    r"(?i)\brecent\b",
    r"(?i)\btoday\b",
    r"(?i)\byesterday\b",
    r"(?i)\blast\s+(week|month|hour)\b",
    r"(?i)\bsince\b",
    r"(?i)\bbefore\b",
    r"(?i)\bafter\b",
]
|
|
|
|
|
|
class QueryRouter:
    """Pick a search method for a query using regex and length heuristics."""

    def classify(self, query: str) -> QueryClassification:
        """Return the routing decision for ``query``.

        Pattern groups are tried in a fixed priority order and the first
        group with a matching pattern decides the result. If none match,
        queries of at most three whitespace-separated words go to FTS5 and
        everything else falls back to vector search.
        """
        # (patterns, method, confidence, reason), highest priority first.
        # Built per call so the module-level lists are read at call time.
        rules = (
            (CONTRADICTION_PATTERNS, SearchMethod.HRR, 0.95,
             "Contradiction detection query"),
            (COMPOSITIONAL_PATTERNS, SearchMethod.HRR, 0.90,
             "Compositional/conceptual query"),
            (EXACT_KEYWORD_PATTERNS, SearchMethod.FTS5, 0.85,
             "Exact keyword query"),
            (TEMPORAL_PATTERNS, SearchMethod.FTS5, 0.80,
             "Temporal query"),
        )
        for patterns, method, confidence, reason in rules:
            if any(re.search(regex, query) for regex in patterns):
                return QueryClassification(
                    method=method,
                    confidence=confidence,
                    reason=reason,
                )

        word_count = len(query.split())
        if word_count > 3:
            # Nothing specific matched and the query is long: semantic search.
            return QueryClassification(
                method=SearchMethod.VECTOR,
                confidence=0.60,
                reason="Semantic similarity query",
            )

        # Short queries tend to be keyword searches.
        return QueryClassification(
            method=SearchMethod.FTS5,
            confidence=0.70,
            reason="Short query (likely keyword)",
        )

    def should_use_hybrid(self, query: str) -> bool:
        """Return True when ``query`` should be answered with hybrid search.

        That is the case when the classification confidence is below 0.70,
        or when the query matches both a compositional pattern and an
        exact-keyword pattern (mixed signals).
        """
        if self.classify(query).confidence < 0.70:
            return True

        def matches_any(patterns):
            return any(re.search(regex, query) for regex in patterns)

        return matches_any(COMPOSITIONAL_PATTERNS) and matches_any(
            EXACT_KEYWORD_PATTERNS
        )
|
|
|
|
|
|
def reciprocal_rank_fusion(
    results: Dict[str, List[Tuple[str, float]]],
    k: int = 60
) -> List[Tuple[str, float]]:
    """
    Fuse several ranked result lists with Reciprocal Rank Fusion.

    Every appearance of an item at 1-based position ``rank`` in a list adds
    ``1 / (k + rank)`` to that item's fused score. Only positions matter;
    the per-method scores in the input are ignored.

    Args:
        results: Dict of method -> [(item_id, score), ...], best first
        k: RRF constant (default 60)

    Returns:
        List of (item_id, fused_score), highest fused score first
    """
    fused = {}

    for ranking in results.values():
        for position, entry in enumerate(ranking, start=1):
            item_id = entry[0]
            fused[item_id] = fused.get(item_id, 0.0) + 1.0 / (k + position)

    ordered = list(fused.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered
|
|
|
|
|
|
def merge_with_hrr_priority(
    hrr_results: List[Tuple[str, float]],
    vector_results: List[Tuple[str, float]],
    fts5_results: List[Tuple[str, float]],
    query_type: str = "default"
) -> List[Tuple[str, float]]:
    """
    Merge results with HRR priority for compositional queries.

    Args:
        hrr_results: Ranked (item_id, score) pairs from HRR search.
        vector_results: Ranked (item_id, score) pairs from vector search.
        fts5_results: Ranked (item_id, score) pairs from FTS5 search. Only
            used on the default (non-compositional) path.
        query_type: "compositional" selects the HRR-first merge; any other
            value selects Reciprocal Rank Fusion over all three lists.

    Returns:
        For compositional queries: the top five HRR results, followed by
        those of the top five vector results whose id has not already been
        emitted, each with its original score. Otherwise: the output of
        ``reciprocal_rank_fusion``.
    """
    if query_type == "compositional":
        # HRR first, vector as supplement. Copy so the caller's list is
        # never mutated by the appends below.
        merged = list(hrr_results[:5])
        seen = {r[0] for r in merged}

        for r in vector_results[:5]:
            item_id = r[0]
            if item_id not in seen:
                # Record the id so a repeat later in vector_results is not
                # appended a second time.
                seen.add(item_id)
                merged.append(r)

        return merged

    # Default: RRF merge
    return reciprocal_rank_fusion({
        "hrr": hrr_results,
        "vector": vector_results,
        "fts5": fts5_results,
    })
|
|
|
|
|
|
# Shared router instance used by the module-level convenience functions.
_router: QueryRouter = QueryRouter()
|
|
|
|
|
|
def route_query(query: str) -> QueryClassification:
    """Classify ``query`` with the module-level router and return the result."""
    classification = _router.classify(query)
    return classification
|
|
|
|
|
|
def should_use_hybrid(query: str) -> bool:
    """Return whether the module-level router recommends hybrid search for ``query``."""
    use_hybrid = _router.should_use_hybrid(query)
    return use_hybrid
|