SOUL.md compliance: 'When I have verified sources, I must consult them before I generate from pattern alone. Retrieval is not a feature. It is the primary mechanism by which I avoid lying.' scripts/grounding.py: GroundingLayer with ground() - queries memory files + context before generation GroundingResult with grounded flag, confidence, sources, hedging indicator format_sources() for display Searches memory/*.md and provided context text Tests: 6 passing
156 lines · 5.0 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
# grounding.py - Grounding before generation.
|
|
# SOUL.md: "When I have verified sources, I must consult them
|
|
# before I generate from pattern alone. Retrieval is not a feature.
|
|
# It is the primary mechanism by which I avoid lying."
|
|
# Part of #792
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from dataclasses import dataclass, field
|
|
|
|
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
|
|
MEMORY_DIR = HERMES_HOME / "memory"
|
|
|
|
|
|
@dataclass
|
|
class GroundingResult:
|
|
query: str
|
|
sources_found: List[Dict[str, Any]] = field(default_factory=list)
|
|
grounded: bool = False
|
|
confidence: float = 0.0
|
|
source_text: str = ""
|
|
source_type: str = "" # memory, file, chain, tool_result
|
|
|
|
@property
|
|
def needs_hedging(self):
|
|
return not self.grounded
|
|
|
|
|
|
class GroundingLayer:
|
|
def __init__(self, memory_dir=None):
|
|
self.memory_dir = Path(memory_dir) if memory_dir else MEMORY_DIR
|
|
|
|
def ground(self, query, context=None):
|
|
"""Query local sources before generation."""
|
|
sources = []
|
|
|
|
# 1. Search memory files
|
|
memory_hits = self._search_memory(query)
|
|
sources.extend(memory_hits)
|
|
|
|
# 2. Search context files if provided
|
|
if context:
|
|
context_hits = self._search_context(query, context)
|
|
sources.extend(context_hits)
|
|
|
|
# 3. Build result
|
|
grounded = len(sources) > 0
|
|
confidence = min(0.95, 0.3 + len(sources) * 0.2) if grounded else 0.0
|
|
|
|
source_text = ""
|
|
source_type = ""
|
|
if sources:
|
|
best = max(sources, key=lambda s: s.get("score", 0))
|
|
source_text = best.get("text", "")[:200]
|
|
source_type = best.get("type", "unknown")
|
|
|
|
return GroundingResult(
|
|
query=query, sources_found=sources, grounded=grounded,
|
|
confidence=confidence, source_text=source_text, source_type=source_type,
|
|
)
|
|
|
|
def _search_memory(self, query):
|
|
"""Search memory files for relevant content."""
|
|
results = []
|
|
if not self.memory_dir.exists():
|
|
return results
|
|
|
|
query_lower = query.lower()
|
|
query_words = set(query_lower.split())
|
|
|
|
for mem_file in self.memory_dir.rglob("*.md"):
|
|
try:
|
|
content = mem_file.read_text(encoding="utf-8", errors="replace")
|
|
except Exception:
|
|
continue
|
|
|
|
content_lower = content.lower()
|
|
# Simple relevance: count query word matches
|
|
matches = sum(1 for w in query_words if w in content_lower)
|
|
if matches > 0:
|
|
score = matches / max(len(query_words), 1)
|
|
# Extract relevant snippet
|
|
lines = content.split("\n")
|
|
snippet = ""
|
|
for line in lines:
|
|
if any(w in line.lower() for w in query_words):
|
|
snippet = line.strip()[:200]
|
|
break
|
|
|
|
results.append({
|
|
"text": snippet or content[:200],
|
|
"source": str(mem_file.relative_to(self.memory_dir)),
|
|
"type": "memory",
|
|
"score": round(score, 3),
|
|
})
|
|
|
|
return sorted(results, key=lambda r: -r["score"])[:5]
|
|
|
|
def _search_context(self, query, context):
|
|
"""Search provided context text for relevant content."""
|
|
results = []
|
|
if not context:
|
|
return results
|
|
|
|
query_lower = query.lower()
|
|
query_words = set(query_lower.split())
|
|
|
|
for ctx in context:
|
|
if isinstance(ctx, dict):
|
|
text = ctx.get("content", "") or ctx.get("text", "")
|
|
source = ctx.get("source", "context")
|
|
else:
|
|
text = str(ctx)
|
|
source = "context"
|
|
|
|
text_lower = text.lower()
|
|
matches = sum(1 for w in query_words if w in text_lower)
|
|
if matches > 0:
|
|
score = matches / max(len(query_words), 1)
|
|
results.append({
|
|
"text": text[:200],
|
|
"source": source,
|
|
"type": "context",
|
|
"score": round(score, 3),
|
|
})
|
|
|
|
return sorted(results, key=lambda r: -r["score"])[:5]
|
|
|
|
def format_sources(self, result):
|
|
"""Format grounding result for display."""
|
|
if not result.grounded:
|
|
return "No verified sources found. Proceeding from pattern matching."
|
|
|
|
lines = ["Based on verified sources:"]
|
|
for s in result.sources_found[:3]:
|
|
ref = s.get("source", "unknown")
|
|
text = s.get("text", "")[:100]
|
|
lines.append(" - [" + ref + "] " + text)
|
|
return "\n".join(lines)
|
|
|
|
|
|
# Convenience
|
|
_default_layer = None
|
|
|
|
def get_grounding_layer():
|
|
global _default_layer
|
|
if _default_layer is None:
|
|
_default_layer = GroundingLayer()
|
|
return _default_layer
|
|
|
|
def ground(query, **kwargs):
|
|
return get_grounding_layer().ground(query, **kwargs)
|