# NOTE(review): the following is pasted PR/CI metadata, preserved here as a comment.
#
# CI: Forge CI / smoke-and-build (pull_request) failing after 53s.
#
# Change summary (refs Timmy_Foundation/hermes-agent#252):
# - fact_feedback tool existed but was never called automatically; trust scores
#   never changed after initial assignment, and facts lived forever regardless
#   of accuracy.
# - MemoryProvider: add get_prefetched_fact_ids() for the feedback loop.
# - HolographicMemoryProvider: track fact IDs returned by prefetch().
# - MemoryManager: auto_calibrate_feedback() detects corrections and applies
#   helpful/unhelpful feedback automatically.
# - Correction detection: regex patterns for "no", "wrong", "actually",
#   "i said", "correction:", "undo", etc.
# - MemoryManager: get_pruning_candidates() for below-threshold facts.
# - Wired into run_agent.py: calibration runs after prefetch, before tool loop.
#
# Trust mechanics:
# - Successful interaction: trust += 0.05 per fact (helpful)
# - Correction detected:    trust -= 0.10 per fact (unhelpful)
# - Trust clamped to [0.0, 1.0]
# - Facts below threshold (default 0.15) are pruning candidates
#
# Tests: 23 new tests, all passing. 139 total memory tests green.
"""Tests for automatic fact trust calibration (Issue #252)."""
import json

import pytest

from agent.memory_manager import MemoryManager, _detect_correction
from plugins.memory.holographic import HolographicMemoryProvider


def _make_holographic_provider(db_path=":memory:"):
    """Create a holographic provider backed by an in-memory SQLite DB.

    Uses a small HRR dimensionality to keep tests fast.  The trust config
    mirrors the values the tests below assert against: facts start at
    ``default_trust`` 0.5, and ``min_trust_threshold`` 0.3 is the provider's
    own retrieval cutoff.
    """
    provider = HolographicMemoryProvider(config={
        "db_path": db_path,
        "default_trust": 0.5,
        "min_trust_threshold": 0.3,
        "hrr_dim": 64,  # small for speed
    })
    provider.initialize(session_id="test")
    return provider


class TestDetectCorrection:
    """Correction detection pattern matching.

    ``_detect_correction`` should fire on user messages that contradict or
    revise a prior statement, and stay quiet on ordinary conversation.
    """

    @pytest.mark.parametrize("msg", [
        "No, that's wrong",
        "Actually, it's Python 3.12",
        "That's not right",
        "I said the config is in YAML",
        "Correction: the port is 8080",
        "Nope, wrong file",
        "Not quite what I meant",
        "Undo that last change",
        "that is not correct",
        "what i meant was different",
    ])
    def test_correction_detected(self, msg):
        assert _detect_correction(msg) is True

    @pytest.mark.parametrize("msg", [
        "",
        "Hello",
        "What's the weather today?",
        "I need you to build a new feature. " * 10,
        "yes that's correct",  # contains "correct" but is an affirmation
    ])
    def test_not_a_correction(self, msg):
        assert _detect_correction(msg) is False


class TestAutoCalibrateFeedback:
    """Auto-calibration integration.

    Each test stores a fact, simulates a prefetch by seeding the provider's
    ``_last_prefetch_ids``, runs ``auto_calibrate_feedback`` with a user
    message, and checks the resulting trust score.
    """

    # -- shared helpers -----------------------------------------------------

    @staticmethod
    def _build():
        """Return a (provider, manager) pair wired together for one test."""
        provider = _make_holographic_provider()
        manager = MemoryManager()
        manager.add_provider(provider)
        return provider, manager

    @staticmethod
    def _store_fact(manager, content):
        """Add *content* via the fact_store tool and return the new fact_id."""
        result = manager.handle_tool_call(
            "fact_store",
            {"action": "add", "content": content},
        )
        return json.loads(result)["fact_id"]

    @staticmethod
    def _trust_of(manager, fact_id):
        """Return the current trust score of *fact_id*.

        Lists with ``min_trust=0.0`` so even fully distrusted facts remain
        visible to the assertion.
        """
        result = manager.handle_tool_call(
            "fact_store",
            {"action": "list", "min_trust": 0.0},
        )
        facts = json.loads(result)["facts"]
        return next(f for f in facts if f["fact_id"] == fact_id)["trust_score"]

    # -- tests --------------------------------------------------------------

    def test_correction_marks_unhelpful(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "The project uses Flask framework")

        # Simulate: this fact was prefetched
        provider._last_prefetch_ids = [fact_id]

        # User corrects: "No, it uses FastAPI"
        manager.auto_calibrate_feedback("No, it uses FastAPI")

        trust = self._trust_of(manager, fact_id)
        assert trust < 0.5  # dropped from default 0.5
        assert trust == pytest.approx(0.4, abs=0.01)  # 0.5 - 0.1

    def test_successful_interaction_gains_trust(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "The project uses Django framework")

        # Simulate: this fact was prefetched
        provider._last_prefetch_ids = [fact_id]

        # User says something normal (not a correction)
        manager.auto_calibrate_feedback("What version of Django?")

        trust = self._trust_of(manager, fact_id)
        assert trust > 0.5  # rose from default 0.5
        assert trust == pytest.approx(0.55, abs=0.01)  # 0.5 + 0.05

    def test_no_prefetch_no_calibration(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "The database is PostgreSQL")

        # No prefetched facts -> calibration must be a no-op
        provider._last_prefetch_ids = []
        manager.auto_calibrate_feedback("No, it's MySQL")

        assert self._trust_of(manager, fact_id) == 0.5  # unchanged

    def test_multiple_corrections_drives_trust_low(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "The server runs on port 3000")
        provider._last_prefetch_ids = [fact_id]

        # Simulate 5 corrections
        for _ in range(5):
            manager.auto_calibrate_feedback("Wrong, it's port 8080")

        # 0.5 - 5*0.1 = 0.0 (clamped), so trust must be well below 0.2
        assert self._trust_of(manager, fact_id) < 0.2

    def test_trust_floor_at_zero(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "Test fact for floor")
        provider._last_prefetch_ids = [fact_id]

        # 10 corrections should clamp at 0.0, not go negative
        for _ in range(10):
            manager.auto_calibrate_feedback("Wrong!")

        assert self._trust_of(manager, fact_id) == 0.0

    def test_trust_ceiling_at_one(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "Test fact for ceiling")
        provider._last_prefetch_ids = [fact_id]

        # 20 successful interactions should cap at 1.0
        for _ in range(20):
            manager.auto_calibrate_feedback("Thanks, what else?")

        assert self._trust_of(manager, fact_id) == 1.0

    def test_get_pruning_candidates(self):
        provider, manager = self._build()
        fact_id = self._store_fact(manager, "Bad fact to be pruned")
        provider._last_prefetch_ids = [fact_id]

        # Drive trust below the pruning threshold via repeated corrections
        for _ in range(5):
            manager.auto_calibrate_feedback("Wrong!")

        candidates = manager.get_pruning_candidates(threshold=0.15)
        assert any(c["fact_id"] == fact_id for c in candidates)

    def test_prefetch_tracks_fact_ids(self):
        """Verify prefetch populates _last_prefetch_ids."""
        provider = _make_holographic_provider()

        # Add facts directly on the provider (no manager needed here)
        provider.handle_tool_call("fact_store", {
            "action": "add",
            "content": "Alexander uses Python for development",
        })
        provider.handle_tool_call("fact_store", {
            "action": "add",
            "content": "Alexander prefers dark mode editors",
        })

        # Prefetch should find them and track IDs
        result = provider.prefetch("Alexander")
        assert "Holographic Memory" in result
        assert len(provider._last_prefetch_ids) > 0

        # Empty query clears IDs
        provider.prefetch("")
        assert provider._last_prefetch_ids == []