hermes-agent/agent/memory/evaluation.py
Alexander Whitestone 3563896f86
Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 1m1s
feat: pluggable memory backends — evaluate Honcho vs local (#322)
Research evaluation of the Honcho memory integration from the plastic-labs
fork. Builds a pluggable memory backend system that supports both
cloud (Honcho) and local (SQLite) implementations.

Architecture:
  agent/memory/__init__.py — MemoryBackend ABC, NullBackend, singleton
  agent/memory/local_backend.py — SQLite-backed local storage (default)
  agent/memory/honcho_backend.py — Honcho cloud backend (opt-in)
  agent/memory/evaluation.py — structured comparison framework
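
The evaluation framework exercises roughly this interface; the sketch below is
reconstructed from the calls made in evaluation.py, not the literal contents of
agent/memory/__init__.py, and the parameter names and signatures are illustrative:

    from abc import ABC, abstractmethod
    from typing import Any, Dict, List, Optional

    class MemoryBackend(ABC):
        backend_name: str = "base"
        is_cloud: bool = False

        @abstractmethod
        def is_available(self) -> bool: ...

        @abstractmethod
        def store(self, user_id: str, key: str, value: str,
                  metadata: Optional[Dict[str, Any]] = None) -> bool: ...

        @abstractmethod
        def retrieve(self, user_id: str, key: str) -> Optional[str]: ...

        @abstractmethod
        def query(self, user_id: str, text: str, limit: int = 5) -> List[Any]: ...

        @abstractmethod
        def delete(self, user_id: str, key: str): ...

NullBackend implements the same interface as no-ops (its backend_name is
"null (disabled)"), which keeps the disabled path at zero overhead.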

Key design decisions:
  - NullBackend default: zero overhead when disabled
  - LocalBackend: zero cloud dependency, stores in ~/.hermes/memory.db
  - HonchoBackend: opt-in via HONCHO_API_KEY, lazy-loaded
  - Evaluation framework scores latency, functionality, privacy
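
For illustration, the opt-in behaviour could be wired roughly as below; this is
a sketch only (get_memory_backend and HERMES_MEMORY_ENABLED are hypothetical
names, and the real singleton lives in agent/memory/__init__.py):

    import os

    _backend = None  # process-wide singleton

    def get_memory_backend():
        global _backend
        if _backend is None:
            if not os.getenv("HERMES_MEMORY_ENABLED"):  # hypothetical switch
                _backend = NullBackend()  # zero overhead when disabled
            elif os.getenv("HONCHO_API_KEY"):
                # Lazy import: the Honcho SDK is only needed when opted in
                from agent.memory.honcho_backend import HonchoBackend
                _backend = HonchoBackend()
            else:
                from agent.memory.local_backend import LocalBackend
                _backend = LocalBackend()
        return _backend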

Evaluation scoring:
  - Availability (20pts), Functionality (40pts), Latency (20pts), Privacy (20pts)
  - Local scores higher on privacy (20 vs 5) — sovereignty-first
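
  Worked example (hypothetical latencies): a local backend that passes all
  three functional checks with sub-10 ms average latency scores
  20 + 40 + 20 + 20 = 100 (RECOMMENDED), while the same functionality from a
  cloud backend averaging 50-200 ms scores 20 + 40 + 10 + 5 = 75 (ACCEPTABLE).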

RECOMMENDATION: LocalBackend for sovereignty. Honcho adds a cloud
dependency without a clear advantage over local SQLite for our use case.

25 tests, all passing.

Closes #322
2026-04-13 20:56:44 -04:00

264 lines
7.3 KiB
Python

"""Memory Backend Evaluation Framework.
Provides structured evaluation for comparing memory backends on:
1. Latency (store/retrieve/query operations)
2. Relevance (does query return useful results?)
3. Privacy (where is data stored?)
4. Reliability (availability, error handling)
5. Cost (API calls, cloud dependency)
Usage:
from agent.memory.evaluation import evaluate_backends
report = evaluate_backends()
"""
import json
import logging
import time
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


@dataclass
class BackendEvaluation:
    """Evaluation results for a single backend."""

    backend_name: str
    is_cloud: bool
    available: bool

    # Latency (milliseconds)
    store_latency_ms: float = 0
    retrieve_latency_ms: float = 0
    query_latency_ms: float = 0

    # Functionality
    store_success: bool = False
    retrieve_success: bool = False
    query_returns_results: bool = False
    query_result_count: int = 0

    # Privacy
    data_location: str = "unknown"
    requires_api_key: bool = False

    # Overall
    score: float = 0  # 0-100
    recommendation: str = ""
    notes: List[str] = field(default_factory=list)


def _measure_latency(func, *args, **kwargs) -> tuple:
    """Measure function latency in milliseconds.

    Returns (elapsed_ms, result, error); error is None on success, and
    result is None when the call raised.
    """
    start = time.perf_counter()
    try:
        result = func(*args, **kwargs)
        elapsed = (time.perf_counter() - start) * 1000
        return elapsed, result, None
    except Exception as e:
        elapsed = (time.perf_counter() - start) * 1000
        return elapsed, None, e


def evaluate_backend(backend, test_user: str = "eval_user") -> BackendEvaluation:
    """Evaluate a single memory backend."""
    from agent.memory import MemoryBackend

    eval_result = BackendEvaluation(
        backend_name=backend.backend_name,
        is_cloud=backend.is_cloud,
        available=backend.is_available(),
    )

    if not eval_result.available:
        eval_result.notes.append("Backend not available")
        eval_result.score = 0
        eval_result.recommendation = "NOT AVAILABLE"
        return eval_result

    # Privacy assessment
    if backend.is_cloud:
        eval_result.data_location = "cloud (external)"
        eval_result.requires_api_key = True
    else:
        eval_result.data_location = "local (~/.hermes/)"

    # Test store
    latency, success, err = _measure_latency(
        backend.store,
        test_user,
        "eval_test_key",
        "eval_test_value",
        {"source": "evaluation"},
    )
    eval_result.store_latency_ms = latency
    eval_result.store_success = success is True
    if err:
        eval_result.notes.append(f"Store error: {err}")

    # Test retrieve
    latency, result, err = _measure_latency(
        backend.retrieve,
        test_user,
        "eval_test_key",
    )
    eval_result.retrieve_latency_ms = latency
    eval_result.retrieve_success = result is not None
    if err:
        eval_result.notes.append(f"Retrieve error: {err}")

    # Test query
    latency, results, err = _measure_latency(
        backend.query,
        test_user,
        "eval_test",
        5,
    )
    eval_result.query_latency_ms = latency
    eval_result.query_returns_results = bool(results)
    eval_result.query_result_count = len(results) if results else 0
    if err:
        eval_result.notes.append(f"Query error: {err}")

    # Cleanup
    try:
        backend.delete(test_user, "eval_test_key")
    except Exception:
        pass

    # Score calculation (0-100)
    score = 0

    # Availability (20 points)
    score += 20

    # Functionality (40 points)
    if eval_result.store_success:
        score += 15
    if eval_result.retrieve_success:
        score += 15
    if eval_result.query_returns_results:
        score += 10

    # Latency (20 points) — lower is better
    avg_latency = (
        eval_result.store_latency_ms +
        eval_result.retrieve_latency_ms +
        eval_result.query_latency_ms
    ) / 3
    if avg_latency < 10:
        score += 20
    elif avg_latency < 50:
        score += 15
    elif avg_latency < 200:
        score += 10
    else:
        score += 5

    # Privacy (20 points) — local is better for sovereignty
    if not backend.is_cloud:
        score += 20
    else:
        score += 5  # cloud has privacy trade-offs

    eval_result.score = score

    # Recommendation
    if score >= 80:
        eval_result.recommendation = "RECOMMENDED"
    elif score >= 60:
        eval_result.recommendation = "ACCEPTABLE"
    elif score >= 40:
        eval_result.recommendation = "MARGINAL"
    else:
        eval_result.recommendation = "NOT RECOMMENDED"

    return eval_result


def evaluate_backends() -> Dict[str, Any]:
    """Evaluate all available memory backends.

    Returns a comparison report.
    """
    from agent.memory import NullBackend
    from agent.memory.local_backend import LocalBackend

    backends = []

    # Always evaluate Null (baseline)
    backends.append(NullBackend())

    # Evaluate Local
    try:
        backends.append(LocalBackend())
    except Exception as e:
        logger.warning("Local backend init failed: %s", e)

    # Try Honcho if configured
    import os
    if os.getenv("HONCHO_API_KEY"):
        try:
            from agent.memory.honcho_backend import HonchoBackend
            backends.append(HonchoBackend())
        except ImportError:
            logger.debug("Honcho not installed, skipping evaluation")

    evaluations = []
    for backend in backends:
        try:
            evaluations.append(evaluate_backend(backend))
        except Exception as e:
            logger.warning("Evaluation failed for %s: %s", backend.backend_name, e)

    # Build report
    report = {
        "timestamp": time.time(),
        "backends_evaluated": len(evaluations),
        "evaluations": [asdict(e) for e in evaluations],
        "recommendation": _build_recommendation(evaluations),
    }
    return report


def _build_recommendation(evaluations: List[BackendEvaluation]) -> str:
    """Build overall recommendation from evaluations."""
    if not evaluations:
        return "No backends evaluated"

    # Find best non-null backend
    viable = [
        e for e in evaluations
        if e.backend_name != "null (disabled)" and e.available
    ]
    if not viable:
        return "No viable backends found. Use NullBackend (default)."

    best = max(viable, key=lambda e: e.score)
    parts = [f"Best backend: {best.backend_name} (score: {best.score})"]

    if best.is_cloud:
        parts.append(
            "WARNING: Cloud backend has privacy trade-offs. "
            "Data leaves your machine. Consider LocalBackend for sovereignty."
        )

    # Compare local vs cloud if both available
    local = [e for e in viable if not e.is_cloud]
    cloud = [e for e in viable if e.is_cloud]
    if local and cloud:
        local_score = max(e.score for e in local)
        cloud_score = max(e.score for e in cloud)
        if local_score >= cloud_score:
            parts.append(
                f"Local backend (score {local_score}) matches or beats "
                f"cloud (score {cloud_score}). RECOMMEND: stay local for sovereignty."
            )
        else:
            parts.append(
                f"Cloud backend (score {cloud_score}) outperforms "
                f"local (score {local_score}) but adds cloud dependency."
            )

    return " ".join(parts)