Files
hermes-agent/tests/agent/test_fact_calibration.py
Alexander Whitestone f3fd5142ac
Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 53s
Fix #252: Automatic fact trust calibration from usage feedback
fact_feedback tool existed but was never called automatically. Trust
scores never changed after initial assignment. Facts lived forever
regardless of accuracy.

Changes:
- MemoryProvider: add get_prefetched_fact_ids() for feedback loop
- HolographicMemoryProvider: track fact IDs returned by prefetch()
- MemoryManager: auto_calibrate_feedback() detects corrections and
  applies helpful/unhelpful feedback automatically
- Correction detection: regex patterns for 'no', 'wrong', 'actually',
  'i said', 'correction:', 'undo', etc.
- MemoryManager: get_pruning_candidates() for below-threshold facts
- Wired into run_agent.py: calibration runs after prefetch, before tool loop

Trust mechanics:
- Successful interaction: trust += 0.05 per fact (helpful)
- Correction detected: trust -= 0.10 per fact (unhelpful)
- Trust clamped to [0.0, 1.0]
- Facts below threshold (default 0.15) are pruning candidates

Tests: 23 new tests, all passing. 139 total memory tests green.

Refs: Timmy_Foundation/hermes-agent#252
2026-04-13 18:22:58 -04:00

253 lines
8.4 KiB
Python

"""Tests for automatic fact trust calibration (Issue #252)."""
import json
import pytest
from agent.memory_manager import MemoryManager, _detect_correction
from plugins.memory.holographic import HolographicMemoryProvider
def _make_holographic_provider(db_path=":memory:"):
    """Build a HolographicMemoryProvider over an in-memory SQLite DB.

    The HRR dimension is deliberately tiny so the suite stays fast.
    """
    config = {
        "db_path": db_path,
        "default_trust": 0.5,
        "min_trust_threshold": 0.3,
        "hrr_dim": 64,  # small vectors keep tests quick
    }
    provider = HolographicMemoryProvider(config=config)
    provider.initialize(session_id="test")
    return provider
class TestDetectCorrection:
    """Pattern matching for the user-correction detector."""

    # Messages that must be recognized as corrections.
    CORRECTIONS = [
        "No, that's wrong",
        "Actually, it's Python 3.12",
        "That's not right",
        "I said the config is in YAML",
        "Correction: the port is 8080",
        "Nope, wrong file",
        "Not quite what I meant",
        "Undo that last change",
        "that is not correct",
        "what i meant was different",
    ]

    # Ordinary messages that must NOT trigger correction handling.
    NON_CORRECTIONS = [
        "",
        "Hello",
        "What's the weather today?",
        "I need you to build a new feature. " * 10,
        "yes that's correct",
    ]

    @pytest.mark.parametrize("msg", CORRECTIONS)
    def test_correction_detected(self, msg):
        assert _detect_correction(msg) is True

    @pytest.mark.parametrize("msg", NON_CORRECTIONS)
    def test_not_a_correction(self, msg):
        assert _detect_correction(msg) is False
class TestAutoCalibrateFeedback:
    """Auto-calibration integration: trust scores respond to usage feedback.

    Shared boilerplate (provider/manager wiring, storing a fact, and
    re-reading it via the list action) is factored into private helpers so
    each test states only its scenario.
    """

    # ---- helpers -----------------------------------------------------

    def _setup(self):
        """Return a (manager, provider) pair wired together."""
        provider = _make_holographic_provider()
        manager = MemoryManager()
        manager.add_provider(provider)
        return manager, provider

    def _store_fact(self, manager, content):
        """Add *content* via the fact_store tool and return its fact_id."""
        result = manager.handle_tool_call(
            "fact_store",
            {"action": "add", "content": content},
        )
        return json.loads(result)["fact_id"]

    def _fetch_fact(self, manager, fact_id):
        """List all facts (min_trust=0.0 disables filtering) and return the
        entry whose id is *fact_id*. Raises StopIteration if absent."""
        result = manager.handle_tool_call(
            "fact_store",
            {"action": "list", "min_trust": 0.0},
        )
        facts = json.loads(result)["facts"]
        return next(f for f in facts if f["fact_id"] == fact_id)

    # ---- tests -------------------------------------------------------

    def test_correction_marks_unhelpful(self):
        """A detected correction applies unhelpful feedback (-0.10)."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The project uses Flask framework")
        # Simulate: this fact was prefetched into context.
        provider._last_prefetch_ids = [fact_id]
        # User corrects the agent.
        manager.auto_calibrate_feedback("No, it uses FastAPI")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] < 0.5  # dropped from default 0.5
        assert target["trust_score"] == pytest.approx(0.4, abs=0.01)  # 0.5 - 0.1

    def test_successful_interaction_gains_trust(self):
        """A non-correction message applies helpful feedback (+0.05)."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The project uses Django framework")
        provider._last_prefetch_ids = [fact_id]
        # A normal follow-up question counts as implicit success.
        manager.auto_calibrate_feedback("What version of Django?")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] > 0.5  # rose from default 0.5
        assert target["trust_score"] == pytest.approx(0.55, abs=0.01)  # 0.5 + 0.05

    def test_no_prefetch_no_calibration(self):
        """Calibration is a no-op when no facts were prefetched."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The database is PostgreSQL")
        # Nothing was prefetched, so even a correction has no target.
        provider._last_prefetch_ids = []
        manager.auto_calibrate_feedback("No, it's MySQL")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] == 0.5  # unchanged

    def test_multiple_corrections_drives_trust_low(self):
        """Repeated corrections drive trust toward the floor."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The server runs on port 3000")
        provider._last_prefetch_ids = [fact_id]
        for _ in range(5):
            manager.auto_calibrate_feedback("Wrong, it's port 8080")
        target = self._fetch_fact(manager, fact_id)
        # 0.5 - 5*0.1 is ~0.0 (possibly clamped); assert a loose upper bound
        # rather than exact equality to allow for float accumulation error.
        assert target["trust_score"] < 0.2

    def test_trust_floor_at_zero(self):
        """Trust clamps at 0.0 and never goes negative."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "Test fact for floor")
        provider._last_prefetch_ids = [fact_id]
        # 10 corrections would reach -0.5 without clamping.
        for _ in range(10):
            manager.auto_calibrate_feedback("Wrong!")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] == 0.0

    def test_trust_ceiling_at_one(self):
        """Trust clamps at 1.0 and never exceeds it."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "Test fact for ceiling")
        provider._last_prefetch_ids = [fact_id]
        # 20 successes would reach 1.5 without clamping.
        for _ in range(20):
            manager.auto_calibrate_feedback("Thanks, what else?")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] == 1.0

    def test_get_pruning_candidates(self):
        """Facts driven below the threshold show up as pruning candidates."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "Bad fact to be pruned")
        provider._last_prefetch_ids = [fact_id]
        # Drive trust below the 0.15 pruning threshold via corrections.
        for _ in range(5):
            manager.auto_calibrate_feedback("Wrong!")
        candidates = manager.get_pruning_candidates(threshold=0.15)
        assert any(c["fact_id"] == fact_id for c in candidates)

    def test_prefetch_tracks_fact_ids(self):
        """Verify prefetch populates _last_prefetch_ids."""
        provider = _make_holographic_provider()
        # Add facts directly on the provider.
        provider.handle_tool_call("fact_store", {
            "action": "add",
            "content": "Alexander uses Python for development",
        })
        provider.handle_tool_call("fact_store", {
            "action": "add",
            "content": "Alexander prefers dark mode editors",
        })
        # Prefetch should find them and record their IDs.
        result = provider.prefetch("Alexander")
        assert "Holographic Memory" in result
        assert len(provider._last_prefetch_ids) > 0
        # An empty query clears the tracked IDs.
        provider.prefetch("")
        assert provider._last_prefetch_ids == []