Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 1m1s
Research evaluation of Honcho memory integration from plastic-labs fork. Builds a pluggable memory backend system that supports both cloud (Honcho) and local (SQLite) implementations. Architecture: agent/memory/__init__.py — MemoryBackend ABC, NullBackend, singleton agent/memory/local_backend.py — SQLite-backed local storage (default) agent/memory/honcho_backend.py — Honcho cloud backend (opt-in) agent/memory/evaluation.py — structured comparison framework Key design decisions: - NullBackend default: zero overhead when disabled - LocalBackend: zero cloud dependency, stores in ~/.hermes/memory.db - HonchoBackend: opt-in via HONCHO_API_KEY, lazy-loaded - Evaluation framework scores latency, functionality, privacy Evaluation scoring: - Availability (20pts), Functionality (40pts), Latency (20pts), Privacy (20pts) - Local scores higher on privacy (20 vs 5) — sovereignty-first RECOMMENDATION: LocalBackend for sovereignty. Honcho adds cloud dependency without clear advantage over local SQLite for our use case. 25 tests, all passing. Closes #322
264 lines · 7.3 KiB · Python
"""Memory Backend Evaluation Framework.
|
|
|
|
Provides structured evaluation for comparing memory backends on:
|
|
1. Latency (store/retrieve/query operations)
|
|
2. Relevance (does query return useful results?)
|
|
3. Privacy (where is data stored?)
|
|
4. Reliability (availability, error handling)
|
|
5. Cost (API calls, cloud dependency)
|
|
|
|
Usage:
|
|
from agent.memory.evaluation import evaluate_backends
|
|
report = evaluate_backends()
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class BackendEvaluation:
    """Evaluation results for a single backend.

    Populated by ``evaluate_backend()`` and serialized into the comparison
    report via ``dataclasses.asdict()``.
    """

    backend_name: str  # human-readable backend identifier
    is_cloud: bool     # True when data leaves the local machine
    available: bool    # result of backend.is_available()

    # Latency (milliseconds) of each probed operation.
    # Defaults are 0.0 (not 0) to match the float annotations.
    store_latency_ms: float = 0.0
    retrieve_latency_ms: float = 0.0
    query_latency_ms: float = 0.0

    # Functionality: outcome of the store/retrieve/query round-trip.
    store_success: bool = False
    retrieve_success: bool = False
    query_returns_results: bool = False
    query_result_count: int = 0

    # Privacy
    data_location: str = "unknown"  # where records physically live
    requires_api_key: bool = False  # cloud backends require a credential

    # Overall
    score: float = 0.0       # 0-100 composite score
    recommendation: str = ""  # e.g. "RECOMMENDED", "MARGINAL", "NOT AVAILABLE"
    notes: List[str] = field(default_factory=list)  # errors and observations


def _measure_latency(func, *args, **kwargs) -> tuple:
|
|
"""Measure function latency in milliseconds."""
|
|
start = time.perf_counter()
|
|
try:
|
|
result = func(*args, **kwargs)
|
|
elapsed = (time.perf_counter() - start) * 1000
|
|
return elapsed, result, None
|
|
except Exception as e:
|
|
elapsed = (time.perf_counter() - start) * 1000
|
|
return elapsed, None, e
|
|
|
|
|
|
def evaluate_backend(backend, test_user: str = "eval_user") -> BackendEvaluation:
    """Evaluate a single memory backend against the scoring rubric.

    Runs a store/retrieve/query round-trip with throwaway test data, records
    per-operation latency and success, then scores the backend 0-100
    (availability 20, functionality 40, latency 20, privacy 20).

    Args:
        backend: A memory backend exposing ``backend_name``, ``is_cloud``,
            ``is_available()``, ``store``, ``retrieve``, ``query``, ``delete``.
        test_user: User id used for the throwaway evaluation records.

    Returns:
        A populated BackendEvaluation with score and recommendation set.
    """
    # NOTE: the previous `from agent.memory import MemoryBackend` import was
    # unused and has been removed.
    eval_result = BackendEvaluation(
        backend_name=backend.backend_name,
        is_cloud=backend.is_cloud,
        available=backend.is_available(),
    )

    # An unavailable backend scores zero; nothing else can be measured.
    if not eval_result.available:
        eval_result.notes.append("Backend not available")
        eval_result.score = 0
        eval_result.recommendation = "NOT AVAILABLE"
        return eval_result

    # Privacy assessment: cloud backends hold data externally and need a key.
    if backend.is_cloud:
        eval_result.data_location = "cloud (external)"
        eval_result.requires_api_key = True
    else:
        eval_result.data_location = "local (~/.hermes/)"

    # Probe store: expected to return True on success.
    latency, success, err = _measure_latency(
        backend.store,
        test_user,
        "eval_test_key",
        "eval_test_value",
        {"source": "evaluation"},
    )
    eval_result.store_latency_ms = latency
    eval_result.store_success = success is True
    if err:
        eval_result.notes.append(f"Store error: {err}")

    # Probe retrieve: success means the stored value came back non-None.
    latency, result, err = _measure_latency(
        backend.retrieve,
        test_user,
        "eval_test_key",
    )
    eval_result.retrieve_latency_ms = latency
    eval_result.retrieve_success = result is not None
    if err:
        eval_result.notes.append(f"Retrieve error: {err}")

    # Probe query: any non-empty result set counts as functional.
    latency, results, err = _measure_latency(
        backend.query,
        test_user,
        "eval_test",
        5,
    )
    eval_result.query_latency_ms = latency
    eval_result.query_returns_results = bool(results)
    eval_result.query_result_count = len(results) if results else 0
    if err:
        eval_result.notes.append(f"Query error: {err}")

    # Best-effort cleanup of the throwaway record; failure is non-fatal.
    try:
        backend.delete(test_user, "eval_test_key")
    except Exception:
        pass

    eval_result.score = _score_evaluation(eval_result)
    eval_result.recommendation = _recommendation_for_score(eval_result.score)
    return eval_result


def _score_evaluation(ev: BackendEvaluation) -> float:
    """Compute the 0-100 composite score for an available backend.

    Rubric: availability 20, functionality 40, latency 20, privacy 20.
    """
    # Availability (20 points) -- only reached when the backend is available.
    score = 20

    # Functionality (40 points).
    if ev.store_success:
        score += 15
    if ev.retrieve_success:
        score += 15
    if ev.query_returns_results:
        score += 10

    # Latency (20 points) -- lower average latency scores higher.
    avg_latency = (
        ev.store_latency_ms +
        ev.retrieve_latency_ms +
        ev.query_latency_ms
    ) / 3
    if avg_latency < 10:
        score += 20
    elif avg_latency < 50:
        score += 15
    elif avg_latency < 200:
        score += 10
    else:
        score += 5

    # Privacy (20 points) -- local storage scores higher for sovereignty;
    # cloud has privacy trade-offs.
    score += 5 if ev.is_cloud else 20
    return score


def _recommendation_for_score(score: float) -> str:
    """Map a 0-100 composite score onto a recommendation label."""
    if score >= 80:
        return "RECOMMENDED"
    if score >= 60:
        return "ACCEPTABLE"
    if score >= 40:
        return "MARGINAL"
    return "NOT RECOMMENDED"


def evaluate_backends() -> Dict[str, Any]:
    """Evaluate all available memory backends and build a comparison report.

    Always evaluates the Null baseline, evaluates LocalBackend when it can be
    constructed, and evaluates HonchoBackend only when HONCHO_API_KEY is set.

    Returns:
        Dict with ``timestamp``, ``backends_evaluated``, per-backend
        ``evaluations`` (as dicts), and an overall ``recommendation`` string.
    """
    import os

    from agent.memory import NullBackend
    from agent.memory.local_backend import LocalBackend

    backends = []

    # Always evaluate Null (baseline)
    backends.append(NullBackend())

    # Evaluate Local; init can fail (e.g. unwritable home dir) without
    # aborting the rest of the evaluation.
    try:
        backends.append(LocalBackend())
    except Exception as e:
        logger.warning("Local backend init failed: %s", e)

    # Honcho is opt-in: only attempted when an API key is configured.
    if os.getenv("HONCHO_API_KEY"):
        try:
            from agent.memory.honcho_backend import HonchoBackend
            backends.append(HonchoBackend())
        except ImportError:
            logger.debug("Honcho not installed, skipping evaluation")
        except Exception as e:
            # Consistent with the LocalBackend guard above: a failing cloud
            # backend (bad key, network error during init) must not crash
            # the whole evaluation run.
            logger.warning("Honcho backend init failed: %s", e)

    evaluations = []
    for backend in backends:
        try:
            evaluations.append(evaluate_backend(backend))
        except Exception as e:
            logger.warning("Evaluation failed for %s: %s", backend.backend_name, e)

    # Build report
    report = {
        "timestamp": time.time(),
        "backends_evaluated": len(evaluations),
        "evaluations": [asdict(e) for e in evaluations],
        "recommendation": _build_recommendation(evaluations),
    }

    return report


def _build_recommendation(evaluations: List[BackendEvaluation]) -> str:
    """Summarize a set of backend evaluations into one recommendation string."""
    if not evaluations:
        return "No backends evaluated"

    # The null baseline never counts as a viable choice.
    viable = [
        ev for ev in evaluations
        if ev.available and ev.backend_name != "null (disabled)"
    ]
    if not viable:
        return "No viable backends found. Use NullBackend (default)."

    best = max(viable, key=lambda ev: ev.score)
    parts = [f"Best backend: {best.backend_name} (score: {best.score})"]

    # Flag the privacy trade-off whenever the top scorer is a cloud backend.
    if best.is_cloud:
        parts.append(
            "WARNING: Cloud backend has privacy trade-offs. "
            "Data leaves your machine. Consider LocalBackend for sovereignty."
        )

    # Head-to-head note when both local and cloud backends are viable.
    local_scores = [ev.score for ev in viable if not ev.is_cloud]
    cloud_scores = [ev.score for ev in viable if ev.is_cloud]
    if local_scores and cloud_scores:
        local_score = max(local_scores)
        cloud_score = max(cloud_scores)
        if local_score >= cloud_score:
            parts.append(
                f"Local backend (score {local_score}) matches or beats "
                f"cloud (score {cloud_score}). RECOMMEND: stay local for sovereignty."
            )
        else:
            parts.append(
                f"Cloud backend (score {cloud_score}) outperforms "
                f"local (score {local_score}) but adds cloud dependency."
            )

    return " ".join(parts)