"""Hybrid keyword/BM25 retrieval for the memory store.
|
|||
|
|
|
|||
|
|
Ported from KIK memory_agent.py — combines FTS5 full-text search with
|
|||
|
|
Jaccard similarity reranking and trust-weighted scoring.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import math
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from typing import TYPE_CHECKING
|
|||
|
|
|
|||
|
|
if TYPE_CHECKING:
|
|||
|
|
from .store import MemoryStore
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from . import holographic as hrr
|
|||
|
|
except ImportError:
|
|||
|
|
import holographic as hrr # type: ignore[no-redef]
|
|||
|
|
|
|||
|
|
|
|||
|
|
class FactRetriever:
    """Multi-strategy fact retrieval with trust-weighted scoring."""

    def __init__(
        self,
        store: MemoryStore,
        temporal_decay_half_life: int = 0,  # days, 0 = disabled
        fts_weight: float = 0.4,
        jaccard_weight: float = 0.3,
        hrr_weight: float = 0.3,
        hrr_dim: int = 1024,
    ):
        self.store = store
        self.half_life = temporal_decay_half_life
        self.hrr_dim = hrr_dim

        # Without numpy the HRR channel cannot run — fold its share of the
        # scoring weight back into the two lexical channels.
        if hrr_weight > 0 and not hrr._HAS_NUMPY:
            fts_weight, jaccard_weight, hrr_weight = 0.6, 0.4, 0.0

        self.fts_weight = fts_weight
        self.jaccard_weight = jaccard_weight
        self.hrr_weight = hrr_weight
|
|||
|
|
|
|||
|
|
def search(
    self,
    query: str,
    category: str | None = None,
    min_trust: float = 0.3,
    limit: int = 10,
) -> list[dict]:
    """Hybrid search: FTS5 candidates → Jaccard rerank → trust weighting.

    Pipeline:
        1. FTS5 search: get ``limit * 3`` candidates from SQLite full-text search
        2. Jaccard boost: token overlap between query and fact content/tags
        3. HRR boost: holographic similarity against the stored fact vector
        4. Trust weighting: ``final_score = relevance * trust_score``
        5. Temporal decay (optional): ``decay = 0.5 ** (age_days / half_life)``

    Args:
        query: Free-text query string (passed through to FTS5 MATCH).
        category: Optional category filter.
        min_trust: Minimum trust_score a candidate must have.
        limit: Maximum number of results returned.

    Returns:
        List of fact dicts with an added 'score' field, sorted by score
        descending. Raw HRR vector bytes are stripped from the output.
    """
    # Stage 1: over-fetch candidates to give the reranker headroom.
    candidates = self._fts_candidates(query, category, min_trust, limit * 3)
    if not candidates:
        return []

    # Stage 2: rerank with Jaccard + HRR + trust + optional decay.
    query_tokens = self._tokenize(query)
    # FIX: the query's HRR encoding is loop-invariant; previously it was
    # re-encoded once per candidate. Computed lazily so the work is also
    # skipped entirely when no candidate carries an HRR vector.
    query_vec = None

    scored = []
    for fact in candidates:
        all_tokens = self._tokenize(fact["content"]) | self._tokenize(fact.get("tags", ""))
        jaccard = self._jaccard_similarity(query_tokens, all_tokens)
        fts_score = fact.get("fts_rank", 0.0)

        # HRR similarity, shifted from [-1, 1] into [0, 1].
        if self.hrr_weight > 0 and fact.get("hrr_vector"):
            if query_vec is None:
                query_vec = hrr.encode_text(query, self.hrr_dim)
            fact_vec = hrr.bytes_to_phases(fact["hrr_vector"])
            hrr_sim = (hrr.similarity(query_vec, fact_vec) + 1.0) / 2.0
        else:
            hrr_sim = 0.5  # neutral when the channel is unavailable

        # Combine FTS5 + Jaccard + HRR, then weight by trust.
        relevance = (self.fts_weight * fts_score
                     + self.jaccard_weight * jaccard
                     + self.hrr_weight * hrr_sim)
        score = relevance * fact["trust_score"]

        # Optional temporal decay keyed off the most recent timestamp.
        if self.half_life > 0:
            score *= self._temporal_decay(fact.get("updated_at") or fact.get("created_at"))

        fact["score"] = score
        scored.append(fact)

    scored.sort(key=lambda x: x["score"], reverse=True)
    results = scored[:limit]
    # Strip raw HRR bytes — callers expect JSON-serializable dicts.
    for fact in results:
        fact.pop("hrr_vector", None)
    return results
|
|||
|
|
|
|||
|
|
def probe(
    self,
    entity: str,
    category: str | None = None,
    limit: int = 10,
) -> list[dict]:
    """Compositional entity query using HRR algebra.

    Unbinds the entity from the memory bank to extract associated content.
    This is NOT keyword search — it uses algebraic structure to find facts
    where the entity plays a structural role.

    Falls back to FTS5 search if numpy is unavailable.
    """
    if not hrr._HAS_NUMPY:
        # Fallback to keyword search on the entity name
        return self.search(entity, category=category, limit=limit)

    conn = self.store._conn

    # Encode entity as a role-bound probe key.
    role_entity = hrr.encode_atom("__hrr_role_entity__", self.hrr_dim)
    entity_vec = hrr.encode_atom(entity.lower(), self.hrr_dim)
    probe_key = hrr.bind(entity_vec, role_entity)

    # Try the category-specific bank first, then fall through to all facts.
    if category:
        bank_name = f"cat:{category}"
        bank_row = conn.execute(
            "SELECT vector FROM memory_banks WHERE bank_name = ?",
            (bank_name,),
        ).fetchone()
        if bank_row:
            bank_vec = hrr.bytes_to_phases(bank_row["vector"])
            extracted = hrr.unbind(bank_vec, probe_key)
            # Use the extracted signal to score individual facts.
            return self._score_facts_by_vector(
                extracted, category=category, limit=limit
            )

    # Score against individual fact vectors directly.
    where = "WHERE hrr_vector IS NOT NULL"
    params: list = []
    if category:
        where += " AND category = ?"
        params.append(category)

    rows = conn.execute(
        f"""
        SELECT fact_id, content, category, tags, trust_score,
               retrieval_count, helpful_count, created_at, updated_at,
               hrr_vector
        FROM facts
        {where}
        """,
        params,
    ).fetchall()

    if not rows:
        # Final fallback: keyword search
        return self.search(entity, category=category, limit=limit)

    # FIX: the content-role atom is loop-invariant — previously it was
    # re-encoded once per row; encode it a single time here.
    role_content = hrr.encode_atom("__hrr_role_content__", self.hrr_dim)

    scored = []
    for row in rows:
        fact = dict(row)
        fact_vec = hrr.bytes_to_phases(fact.pop("hrr_vector"))
        # Unbind the probe key: if the entity is structurally present,
        # the residual should resemble the fact's role-bound content.
        residual = hrr.unbind(fact_vec, probe_key)
        content_vec = hrr.bind(hrr.encode_text(fact["content"], self.hrr_dim), role_content)
        sim = hrr.similarity(residual, content_vec)
        fact["score"] = (sim + 1.0) / 2.0 * fact["trust_score"]
        scored.append(fact)

    scored.sort(key=lambda x: x["score"], reverse=True)
    return scored[:limit]
|
|||
|
|
|
|||
|
|
def related(
    self,
    entity: str,
    category: str | None = None,
    limit: int = 10,
) -> list[dict]:
    """Discover facts that share structural connections with an entity.

    Unlike probe (which finds facts *about* an entity), related finds
    facts that are connected through shared context — e.g., other entities
    mentioned alongside this one, or content that overlaps structurally.

    Falls back to FTS5 search if numpy is unavailable.
    """
    if not hrr._HAS_NUMPY:
        return self.search(entity, category=category, limit=limit)

    conn = self.store._conn

    # Encode entity as a bare atom (not role-bound — we want ANY structural match)
    entity_vec = hrr.encode_atom(entity.lower(), self.hrr_dim)

    # Get all facts with vectors
    where = "WHERE hrr_vector IS NOT NULL"
    params: list = []
    if category:
        where += " AND category = ?"
        params.append(category)

    rows = conn.execute(
        f"""
        SELECT fact_id, content, category, tags, trust_score,
               retrieval_count, helpful_count, created_at, updated_at,
               hrr_vector
        FROM facts
        {where}
        """,
        params,
    ).fetchall()

    if not rows:
        return self.search(entity, category=category, limit=limit)

    # FIX: both role atoms are loop-invariant — previously they were
    # re-encoded for every row; encode them once up front.
    role_entity = hrr.encode_atom("__hrr_role_entity__", self.hrr_dim)
    role_content = hrr.encode_atom("__hrr_role_content__", self.hrr_dim)

    # Score each fact by how much the entity's atom appears in its vector.
    # This catches both role-bound entity matches AND content word matches.
    scored = []
    for row in rows:
        fact = dict(row)
        fact_vec = hrr.bytes_to_phases(fact.pop("hrr_vector"))

        # Unbind the entity from the fact: a residual similar to ANY known
        # role vector means the entity plays a structural role in the fact.
        residual = hrr.unbind(fact_vec, entity_vec)
        entity_role_sim = hrr.similarity(residual, role_entity)
        content_role_sim = hrr.similarity(residual, role_content)
        # Take the max — the entity could appear in either role.
        best_sim = max(entity_role_sim, content_role_sim)

        fact["score"] = (best_sim + 1.0) / 2.0 * fact["trust_score"]
        scored.append(fact)

    scored.sort(key=lambda x: x["score"], reverse=True)
    return scored[:limit]
|
|||
|
|
|
|||
|
|
def reason(
    self,
    entities: list[str],
    category: str | None = None,
    limit: int = 10,
) -> list[dict]:
    """Multi-entity compositional query — vector-space JOIN.

    Given multiple entities, algebraically intersects their structural
    connections to find facts related to ALL of them simultaneously.
    This is compositional reasoning that no embedding DB can do.

    Example: reason(["peppi", "backend"]) finds facts where peppi AND
    backend both play structural roles — without keyword matching.

    Falls back to FTS5 search if numpy is unavailable.
    """
    if not hrr._HAS_NUMPY or not entities:
        # Fallback: treat every entity name as a keyword and run one search.
        return self.search(" ".join(entities), category=category, limit=limit)

    conn = self.store._conn
    role_entity = hrr.encode_atom("__hrr_role_entity__", self.hrr_dim)

    # One role-bound probe key per entity.
    probe_keys = [
        hrr.bind(hrr.encode_atom(name.lower(), self.hrr_dim), role_entity)
        for name in entities
    ]

    # Fetch every fact that carries an HRR vector.
    where = "WHERE hrr_vector IS NOT NULL"
    params: list = []
    if category:
        where += " AND category = ?"
        params.append(category)

    rows = conn.execute(
        f"""
        SELECT fact_id, content, category, tags, trust_score,
               retrieval_count, helpful_count, created_at, updated_at,
               hrr_vector
        FROM facts
        {where}
        """,
        params,
    ).fetchall()

    if not rows:
        return self.search(" ".join(entities), category=category, limit=limit)

    # AND semantics: a fact is scored by its WEAKEST entity match (min),
    # so it ranks high only when every entity is structurally present.
    # (OR semantics would use mean/max instead.)
    role_content = hrr.encode_atom("__hrr_role_content__", self.hrr_dim)

    scored = []
    for row in rows:
        fact = dict(row)
        fact_vec = hrr.bytes_to_phases(fact.pop("hrr_vector"))

        min_sim = min(
            hrr.similarity(hrr.unbind(fact_vec, key), role_content)
            for key in probe_keys
        )
        fact["score"] = (min_sim + 1.0) / 2.0 * fact["trust_score"]
        scored.append(fact)

    scored.sort(key=lambda f: f["score"], reverse=True)
    return scored[:limit]
|
|||
|
|
|
|||
|
|
def contradict(
    self,
    category: str | None = None,
    threshold: float = 0.3,
    limit: int = 10,
) -> list[dict]:
    """Find potentially contradictory facts via entity overlap + content divergence.

    Two facts contradict when they share entities (same subject) but have
    low content-vector similarity (different claims). This is automated
    memory hygiene — no other memory system does this.

    Returns pairs of facts with a contradiction score.
    Falls back to an empty list if numpy is unavailable.
    """
    if not hrr._HAS_NUMPY:
        return []

    conn = self.store._conn

    # Get all facts with vectors and their linked entities
    where = "WHERE f.hrr_vector IS NOT NULL"
    params: list = []
    if category:
        where += " AND f.category = ?"
        params.append(category)

    rows = conn.execute(
        f"""
        SELECT f.fact_id, f.content, f.category, f.tags, f.trust_score,
               f.created_at, f.updated_at, f.hrr_vector
        FROM facts f
        {where}
        """,
        params,
    ).fetchall()

    if len(rows) < 2:
        return []

    # Guard against O(n²) explosion on large fact stores.
    # At 500 facts, that's ~125K comparisons — acceptable.
    # Above that, only check the most recently updated facts.
    _MAX_CONTRADICT_FACTS = 500
    if len(rows) > _MAX_CONTRADICT_FACTS:
        rows = sorted(rows, key=lambda r: r["updated_at"] or r["created_at"], reverse=True)
        rows = rows[:_MAX_CONTRADICT_FACTS]

    # Build entity sets per fact (one small query each; bounded by the cap above).
    fact_entities: dict[int, set[str]] = {}
    for row in rows:
        fid = row["fact_id"]
        entity_rows = conn.execute(
            """
            SELECT e.name FROM entities e
            JOIN fact_entities fe ON fe.entity_id = e.entity_id
            WHERE fe.fact_id = ?
            """,
            (fid,),
        ).fetchall()
        fact_entities[fid] = {r["name"].lower() for r in entity_rows}

    facts = [dict(r) for r in rows]

    # FIX: decode each fact's HRR vector at most once. Previously
    # bytes_to_phases ran inside the pair loop, re-decoding the same
    # bytes up to n-1 times. Decoding stays lazy so facts that never
    # pass the entity-overlap gate are never decoded at all.
    _vec_cache: dict[int, object] = {}

    def _decoded(fact: dict):
        # Memoized decode of a fact's stored HRR vector.
        fid = fact["fact_id"]
        vec = _vec_cache.get(fid)
        if vec is None:
            vec = hrr.bytes_to_phases(fact["hrr_vector"])
            _vec_cache[fid] = vec
        return vec

    # Compare all pairs: high entity overlap + low content similarity = contradiction
    contradictions = []
    for i in range(len(facts)):
        for j in range(i + 1, len(facts)):
            f1, f2 = facts[i], facts[j]
            ents1 = fact_entities.get(f1["fact_id"], set())
            ents2 = fact_entities.get(f2["fact_id"], set())

            if not ents1 or not ents2:
                continue

            # Entity overlap (Jaccard)
            entity_overlap = len(ents1 & ents2) / len(ents1 | ents2) if (ents1 | ents2) else 0.0

            if entity_overlap < 0.3:
                continue  # Not enough entity overlap to be contradictory

            # Content similarity via HRR vectors
            content_sim = hrr.similarity(_decoded(f1), _decoded(f2))

            # High entity overlap + low content similarity = potential contradiction
            # contradiction_score: higher = more contradictory
            contradiction_score = entity_overlap * (1.0 - (content_sim + 1.0) / 2.0)

            if contradiction_score >= threshold:
                # Strip hrr_vector from output (not JSON serializable)
                f1_clean = {k: v for k, v in f1.items() if k != "hrr_vector"}
                f2_clean = {k: v for k, v in f2.items() if k != "hrr_vector"}
                contradictions.append({
                    "fact_a": f1_clean,
                    "fact_b": f2_clean,
                    "entity_overlap": round(entity_overlap, 3),
                    "content_similarity": round(content_sim, 3),
                    "contradiction_score": round(contradiction_score, 3),
                    "shared_entities": sorted(ents1 & ents2),
                })

    contradictions.sort(key=lambda x: x["contradiction_score"], reverse=True)
    return contradictions[:limit]
|
|||
|
|
|
|||
|
|
def _score_facts_by_vector(
    self,
    target_vec: "np.ndarray",
    category: str | None = None,
    limit: int = 10,
) -> list[dict]:
    """Score facts by similarity to a target vector."""
    conn = self.store._conn

    where = "WHERE hrr_vector IS NOT NULL"
    params: list = []
    if category:
        where += " AND category = ?"
        params.append(category)

    rows = conn.execute(
        f"""
        SELECT fact_id, content, category, tags, trust_score,
               retrieval_count, helpful_count, created_at, updated_at,
               hrr_vector
        FROM facts
        {where}
        """,
        params,
    ).fetchall()

    def _as_scored(row) -> dict:
        # Shift similarity from [-1, 1] to [0, 1] and weight by trust.
        fact = dict(row)
        vec = hrr.bytes_to_phases(fact.pop("hrr_vector"))
        cosine = hrr.similarity(target_vec, vec)
        fact["score"] = (cosine + 1.0) / 2.0 * fact["trust_score"]
        return fact

    ranked = sorted(
        (_as_scored(r) for r in rows),
        key=lambda f: f["score"],
        reverse=True,
    )
    return ranked[:limit]
|
|||
|
|
|
|||
|
|
def _fts_candidates(
    self,
    query: str,
    category: str | None,
    min_trust: float,
    limit: int,
) -> list[dict]:
    """Get raw FTS5 candidates from the store.

    Uses the store's database connection directly for FTS5 MATCH
    with rank scoring. Normalizes FTS5 rank to [0, 1] range.
    """
    conn = self.store._conn

    # FTS5 rank is negative (lower = better match); join facts_fts back
    # to facts to pull every fact column alongside the rank.
    filters = ["facts_fts MATCH ?"]
    params: list = [query]

    if category:
        filters.append("f.category = ?")
        params.append(category)

    filters.append("f.trust_score >= ?")
    params.append(min_trust)

    sql = f"""
        SELECT f.*, facts_fts.rank as fts_rank_raw
        FROM facts_fts
        JOIN facts f ON f.fact_id = facts_fts.rowid
        WHERE {" AND ".join(filters)}
        ORDER BY facts_fts.rank
        LIMIT ?
    """
    params.append(limit)

    try:
        rows = conn.execute(sql, params).fetchall()
    except Exception:
        # FTS5 MATCH can fail on malformed queries — fall back to empty
        return []

    if not rows:
        return []

    # Normalize: |rank| is larger for better matches, so dividing by the
    # max magnitude lands every score in [0, 1] with the best match at 1.
    magnitudes = [abs(row["fts_rank_raw"]) for row in rows]
    scale = max(magnitudes) if magnitudes else 1.0
    scale = max(scale, 1e-6)  # avoid div by zero

    results = []
    for row, magnitude in zip(rows, magnitudes):
        fact = dict(row)
        fact.pop("fts_rank_raw", None)
        fact["fts_rank"] = magnitude / scale
        results.append(fact)

    return results
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
def _tokenize(text: str) -> set[str]:
|
|||
|
|
"""Simple whitespace tokenization with lowercasing.
|
|||
|
|
|
|||
|
|
Strips common punctuation. No stemming/lemmatization (Phase 1).
|
|||
|
|
"""
|
|||
|
|
if not text:
|
|||
|
|
return set()
|
|||
|
|
# Split on whitespace, lowercase, strip punctuation
|
|||
|
|
tokens = set()
|
|||
|
|
for word in text.lower().split():
|
|||
|
|
cleaned = word.strip(".,;:!?\"'()[]{}#@<>")
|
|||
|
|
if cleaned:
|
|||
|
|
tokens.add(cleaned)
|
|||
|
|
return tokens
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
def _jaccard_similarity(set_a: set, set_b: set) -> float:
|
|||
|
|
"""Jaccard similarity coefficient: |A ∩ B| / |A ∪ B|."""
|
|||
|
|
if not set_a or not set_b:
|
|||
|
|
return 0.0
|
|||
|
|
intersection = len(set_a & set_b)
|
|||
|
|
union = len(set_a | set_b)
|
|||
|
|
return intersection / union if union > 0 else 0.0
|
|||
|
|
|
|||
|
|
def _temporal_decay(self, timestamp_str: str | None) -> float:
|
|||
|
|
"""Exponential decay: 0.5^(age_days / half_life_days).
|
|||
|
|
|
|||
|
|
Returns 1.0 if decay is disabled or timestamp is missing.
|
|||
|
|
"""
|
|||
|
|
if not self.half_life or not timestamp_str:
|
|||
|
|
return 1.0
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
if isinstance(timestamp_str, str):
|
|||
|
|
# Parse ISO format timestamp from SQLite
|
|||
|
|
ts = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
|
|||
|
|
else:
|
|||
|
|
ts = timestamp_str
|
|||
|
|
|
|||
|
|
if ts.tzinfo is None:
|
|||
|
|
ts = ts.replace(tzinfo=timezone.utc)
|
|||
|
|
|
|||
|
|
age_days = (datetime.now(timezone.utc) - ts).total_seconds() / 86400
|
|||
|
|
if age_days < 0:
|
|||
|
|
return 1.0
|
|||
|
|
|
|||
|
|
return math.pow(0.5, age_days / self.half_life)
|
|||
|
|
except (ValueError, TypeError):
|
|||
|
|
return 1.0
|