Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 41s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Nix / nix (ubuntu-latest) (pull_request) Failing after 6s
Docs Site Checks / docs-site-checks (pull_request) Failing after 6m44s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 30s
Tests / e2e (pull_request) Successful in 4m19s
Tests / test (pull_request) Failing after 35m31s
Nix / nix (macos-latest) (pull_request) Has been cancelled
Implements hybrid search combining three backends: - FTS5: keyword/exact search (existing SessionDB) - Qdrant: semantic vector search (graceful degradation) - HRR: compositional/contradiction detection (pure numpy) Query Router: auto-detects query type (semantic/keyword/compositional/ multi-modal) and routes to optimal backends with per-type weights. Result Merger: Reciprocal Rank Fusion with consensus boost for multi-backend matches. 9 files, 1272 insertions. 32 tests passing. docs/hybrid-search.md for architecture and usage.
208 lines
6.5 KiB
Python
208 lines
6.5 KiB
Python
"""Query Router — analyze queries and dispatch to optimal search backends.
|
|
|
|
Query types:
|
|
- semantic: natural language, conceptual ("What did we discuss about X?")
|
|
- keyword: exact terms, identifiers ("Find references to function_name")
|
|
- compositional: relational, contradiction detection ("Is there a contradiction?")
|
|
- multi-modal: cross-domain ("Find code related to pattern")
|
|
|
|
Each type routes to one or more backends. Results merged via Reciprocal Rank Fusion.
|
|
"""

import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


class QueryType(Enum):
    """Category a search query is classified into; drives backend selection."""

    SEMANTIC = "semantic"            # natural language, conceptual questions
    KEYWORD = "keyword"              # exact terms, code identifiers, paths
    COMPOSITIONAL = "compositional"  # relational / contradiction-style queries
    MULTI_MODAL = "multi_modal"      # cross-domain queries (e.g. code-vs-pattern)


@dataclass
class SearchQuery:
    """Parsed query with routing metadata."""

    raw: str                       # original query text exactly as given by the caller
    query_type: QueryType          # detected category (see detect_query_type)
    keywords: List[str] = field(default_factory=list)   # terms for FTS5 keyword search
    semantic_text: str = ""        # text handed to the semantic/vector backend
    confidence: float = 0.0        # classification confidence in [0, 1]
    backends: List[str] = field(default_factory=list)   # backend names to query, in priority order


@dataclass
class SearchResult:
    """Unified search result from any backend."""

    content: str                       # matched content returned by the backend
    source: str                        # backend name: "fts5", "qdrant", "hrr"
    score: float                       # backend-native relevance score
    metadata: Dict[str, Any] = field(default_factory=dict)  # backend-specific extras
    session_id: Optional[str] = None   # originating session, when the backend provides one
    timestamp: Optional[float] = None  # result timestamp, when the backend provides one


# --- Query Type Detection ---

# Patterns for keyword queries (code identifiers, paths, exact terms).
# NOTE: these are matched case-sensitively against the raw query in
# detect_query_type, so CamelCase detection works.
_KEYWORD_PATTERNS = [
    r'\b[a-z_]+\.[a-z_]+\.[a-z_]+\b',  # dotted identifiers (module.class.method)
    r'\b[A-Z][a-z]+[A-Z][a-z]+\b',     # CamelCase identifiers
    r'\b\w+\(\)',                      # function calls
    r'\bdef\s+\w+',                    # def function_name
    r'\bclass\s+\w+',                  # class ClassName
    r'\bimport\s+\w+',                 # import statements
    r'["\'][^"\']+["\']',              # quoted strings
    r'\b\w+\.\w{1,4}\b',               # file.ext patterns
    r'/[\w/]+',                        # path-like strings
]

# Patterns for compositional queries (relations, contradictions, causality).
# Matched against the lowercased query in detect_query_type.
_COMPOSITIONAL_PATTERNS = [
    r'\bcontradiction\b',
    r'\bconflict\b',
    r'\bdiffer\b',
    r'\bagree\b',
    r'\brelate\b',
    r'\bcompare\b',
    r'\bversus\b',
    r'\bvs\b',
    r'\bboth\b.*\band\b',
    r'\beither\b.*\bor\b',
    r'\bif\b.*\bthen\b',
    r'\bbecause\b',
    r'\bimplies\b',
    r'\bdepends on\b',
    r'\bwhy\b',
]

# Patterns for multi-modal (cross-domain) queries.
# Matched against the lowercased query in detect_query_type.
_MULTIMODAL_PATTERNS = [
    r'\bcode\b.*\brelat\w*\b',
    r'\bpattern\b',
    r'\bsimilar\b',
    r'\bresembl\w*\b',
    r'\banalog\w*\b',
    r'\bcorrespond\w*\b',
    r'\bcross\b.*\breferenc\w*\b',
]


def detect_query_type(query: str) -> Tuple[QueryType, float]:
    """Detect the type of a search query.

    Classification is checked most-specific first: compositional, then
    multi-modal, then keyword, with semantic as the default for natural
    language.

    Args:
        query: Raw query text.

    Returns:
        (query_type, confidence) with confidence in [0, 1].
    """
    query_lower = query.lower().strip()

    # Check compositional first (most specific).
    comp_score = sum(
        1 for p in _COMPOSITIONAL_PATTERNS
        if re.search(p, query_lower)
    )
    if comp_score >= 2:
        return QueryType.COMPOSITIONAL, min(0.9, 0.5 + comp_score * 0.15)

    # Check multi-modal. Two multi-modal cues on their own are enough:
    # the canonical example "Find code related to pattern" matches two
    # multi-modal patterns but zero compositional ones ("related" does
    # not match r'\brelate\b'). A single multi-modal cue still needs a
    # compositional cue as support, to avoid misclassifying ordinary
    # semantic queries that merely mention e.g. "pattern".
    multi_score = sum(
        1 for p in _MULTIMODAL_PATTERNS
        if re.search(p, query_lower)
    )
    if multi_score >= 2 or (multi_score >= 1 and comp_score >= 1):
        return QueryType.MULTI_MODAL, min(0.85, 0.5 + multi_score * 0.15)

    # Check keyword (code identifiers, exact terms). Matched against the
    # raw query because CamelCase detection is case-sensitive.
    kw_score = sum(
        1 for p in _KEYWORD_PATTERNS
        if re.search(p, query)
    )
    if kw_score >= 2:
        return QueryType.KEYWORD, min(0.95, 0.6 + kw_score * 0.1)
    # Short queries with a single identifier-like match are likely keyword.
    if kw_score >= 1 and len(query.split()) <= 5:
        return QueryType.KEYWORD, 0.7

    # Default: semantic (natural language).
    # Longer queries or those with question words are likely semantic.
    question_words = ['what', 'how', 'why', 'when', 'where', 'who', 'which', 'explain', 'describe', 'tell me']
    has_question = any(w in query_lower for w in question_words)
    if has_question or len(query.split()) > 4:
        return QueryType.SEMANTIC, 0.8

    # Short, ambiguous queries — lean keyword.
    if len(query.split()) <= 3:
        return QueryType.KEYWORD, 0.5

    return QueryType.SEMANTIC, 0.6


def extract_keywords(query: str) -> List[str]:
    """Extract keyword terms from a query for FTS5 search.

    Tokenizes the lowercased query, discards stop words and tokens of
    two characters or fewer, then appends any double-quoted phrases
    verbatim as exact terms.
    """
    # Common stop words that carry no search signal.
    stop_words = frozenset({
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'shall',
        'i', 'you', 'he', 'she', 'it', 'we', 'they',
        'this', 'that', 'these', 'those',
        'what', 'how', 'why', 'when', 'where', 'who', 'which',
        'and', 'or', 'but', 'not', 'if', 'then', 'else',
        'about', 'for', 'with', 'from', 'into', 'during', 'before',
        'after', 'above', 'below', 'between', 'through',
        'find', 'search', 'look', 'show', 'tell', 'get',
    })

    def _useful(token: str) -> bool:
        # Keep only tokens long enough to be meaningful search terms.
        return len(token) > 2 and token not in stop_words

    keywords = [tok for tok in re.findall(r'\b\w+\b', query.lower()) if _useful(tok)]

    # Double-quoted phrases are treated as exact terms (original casing kept).
    keywords.extend(re.findall(r'"([^"]+)"', query))

    return keywords
def select_backends(query_type: QueryType, confidence: float) -> List[str]:
    """Select which backends to query based on query type.

    Low-confidence classifications hedge by fanning out to every
    backend regardless of the detected type.
    """
    # Uncertain classification: query everything.
    if confidence < 0.6:
        return ["qdrant", "fts5", "hrr"]

    # Per-type routing table (built per call, so each caller gets a fresh list).
    routing = {
        QueryType.SEMANTIC: ["qdrant", "fts5"],          # semantic primary, FTS5 for recall
        QueryType.KEYWORD: ["fts5", "qdrant"],           # FTS5 primary, Qdrant for fuzzy
        QueryType.COMPOSITIONAL: ["hrr", "fts5"],        # HRR primary for compositional
        QueryType.MULTI_MODAL: ["qdrant", "hrr", "fts5"],  # all three
    }
    return routing.get(query_type, [])
def route_query(query: str) -> SearchQuery:
    """Analyze a query and return routing information.

    This is the main entry point for the query router: it classifies
    the query, extracts keyword terms, and selects target backends.
    """
    qtype, conf = detect_query_type(query)

    return SearchQuery(
        raw=query,
        query_type=qtype,
        keywords=extract_keywords(query),
        semantic_text=query,  # full text is passed through for semantic search
        confidence=conf,
        backends=select_backends(qtype, conf),
    )