SOUL.md compliance: 'When I have verified sources, I must consult them before I generate from pattern alone. Retrieval is not a feature. It is the primary mechanism by which I avoid lying.' scripts/grounding.py: GroundingLayer with ground() - queries memory files + context before generation GroundingResult with grounded flag, confidence, sources, hedging indicator format_sources() for display Searches memory/*.md and provided context text Tests: 6 passing
156 lines · 5.0 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
# grounding.py - Grounding before generation.
|
|
# SOUL.md: "When I have verified sources, I must consult them
|
|
# before I generate from pattern alone. Retrieval is not a feature.
|
|
# It is the primary mechanism by which I avoid lying."
|
|
# Part of #792
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from dataclasses import dataclass, field
|
|
|
|
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
|
|
MEMORY_DIR = HERMES_HOME / "memory"
|
|
|
|
|
|
@dataclass
|
|
class GroundingResult:
|
|
query: str
|
|
sources_found: List[Dict[str, Any]] = field(default_factory=list)
|
|
grounded: bool = False
|
|
confidence: float = 0.0
|
|
source_text: str = ""
|
|
source_type: str = "" # memory, file, chain, tool_result
|
|
|
|
@property
|
|
def needs_hedging(self):
|
|
return not self.grounded
|
|
|
|
|
|
class GroundingLayer:
|
|
def __init__(self, memory_dir=None):
|
|
self.memory_dir = Path(memory_dir) if memory_dir else MEMORY_DIR
|
|
|
|
def ground(self, query, context=None):
|
|
"""Query local sources before generation."""
|
|
sources = []
|
|
|
|
# 1. Search memory files
|
|
memory_hits = self._search_memory(query)
|
|
sources.extend(memory_hits)
|
|
|
|
# 2. Search context files if provided
|
|
if context:
|
|
context_hits = self._search_context(query, context)
|
|
sources.extend(context_hits)
|
|
|
|
# 3. Build result
|
|
grounded = len(sources) > 0
|
|
confidence = min(0.95, 0.3 + len(sources) * 0.2) if grounded else 0.0
|
|
|
|
source_text = ""
|
|
source_type = ""
|
|
if sources:
|
|
best = max(sources, key=lambda s: s.get("score", 0))
|
|
source_text = best.get("text", "")[:200]
|
|
source_type = best.get("type", "unknown")
|
|
|
|
return GroundingResult(
|
|
query=query, sources_found=sources, grounded=grounded,
|
|
confidence=confidence, source_text=source_text, source_type=source_type,
|
|
)
|
|
|
|
def _search_memory(self, query):
|
|
"""Search memory files for relevant content."""
|
|
results = []
|
|
if not self.memory_dir.exists():
|
|
return results
|
|
|
|
query_lower = query.lower()
|
|
query_words = set(query_lower.split())
|
|
|
|
for mem_file in self.memory_dir.rglob("*.md"):
|
|
try:
|
|
content = mem_file.read_text(encoding="utf-8", errors="replace")
|
|
except Exception:
|
|
continue
|
|
|
|
content_lower = content.lower()
|
|
# Simple relevance: count query word matches
|
|
matches = sum(1 for w in query_words if w in content_lower)
|
|
if matches > 0:
|
|
score = matches / max(len(query_words), 1)
|
|
# Extract relevant snippet
|
|
lines = content.split("\n")
|
|
snippet = ""
|
|
for line in lines:
|
|
if any(w in line.lower() for w in query_words):
|
|
snippet = line.strip()[:200]
|
|
break
|
|
|
|
results.append({
|
|
"text": snippet or content[:200],
|
|
"source": str(mem_file.relative_to(self.memory_dir)),
|
|
"type": "memory",
|
|
"score": round(score, 3),
|
|
})
|
|
|
|
return sorted(results, key=lambda r: -r["score"])[:5]
|
|
|
|
def _search_context(self, query, context):
|
|
"""Search provided context text for relevant content."""
|
|
results = []
|
|
if not context:
|
|
return results
|
|
|
|
query_lower = query.lower()
|
|
query_words = set(query_lower.split())
|
|
|
|
for ctx in context:
|
|
if isinstance(ctx, dict):
|
|
text = ctx.get("content", "") or ctx.get("text", "")
|
|
source = ctx.get("source", "context")
|
|
else:
|
|
text = str(ctx)
|
|
source = "context"
|
|
|
|
text_lower = text.lower()
|
|
matches = sum(1 for w in query_words if w in text_lower)
|
|
if matches > 0:
|
|
score = matches / max(len(query_words), 1)
|
|
results.append({
|
|
"text": text[:200],
|
|
"source": source,
|
|
"type": "context",
|
|
"score": round(score, 3),
|
|
})
|
|
|
|
return sorted(results, key=lambda r: -r["score"])[:5]
|
|
|
|
def format_sources(self, result):
|
|
"""Format grounding result for display."""
|
|
if not result.grounded:
|
|
return "No verified sources found. Proceeding from pattern matching."
|
|
|
|
lines = ["Based on verified sources:"]
|
|
for s in result.sources_found[:3]:
|
|
ref = s.get("source", "unknown")
|
|
text = s.get("text", "")[:100]
|
|
lines.append(" - [" + ref + "] " + text)
|
|
return "\n".join(lines)
|
|
|
|
|
|
# Convenience
|
|
_default_layer = None
|
|
|
|
def get_grounding_layer():
|
|
global _default_layer
|
|
if _default_layer is None:
|
|
_default_layer = GroundingLayer()
|
|
return _default_layer
|
|
|
|
def ground(query, **kwargs):
|
|
return get_grounding_layer().ground(query, **kwargs)
|