compounding-intelligence/scripts/run_memory_bakeoff.py
Alexander Payne c0dc4052a3
feat: add memory bakeoff runner for baseline vs MemPalace vs Hindsight evaluation
Implements issue #230 by creating:
- prompts/matrix.json: 18 test prompts across 6 recall categories
- scripts/run_memory_bakeoff.py: orchestrates the evaluation, captures raw
  artifacts, applies simple heuristic scoring, and produces a markdown report.

Backends:
- Baseline: knowledge/index.json bootstrap (keyword-match retrieval)
- MemPalace: via nexus.mempalace.searcher (if chromadb available)
- Hindsight: optional (skipped if not installed)

Accepts CLI options for matrix, category, limit, model, and dry-run.
Captures context and answers for downstream manual review.

Closes #230
2026-04-29 18:00:00 -04:00

#!/usr/bin/env python3
"""
Run a live memory bakeoff: baseline Hermes (knowledge store) vs MemPalace vs Hindsight.

Captures raw context-window artifacts and produces a scored report.

Usage:
    python3 scripts/run_memory_bakeoff.py --matrix prompts/matrix.json --output reports/
    python3 scripts/run_memory_bakeoff.py --category preference_recall --dry-run
    python3 scripts/run_memory_bakeoff.py --limit 3   # quick test

Exit codes:
    0 - success
    1 - missing API key, missing matrix file, or no prompts matched the filters
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Load from environment (same as harvester)
DEFAULT_API_BASE = os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1")
def _load_default_api_key() -> str:
    """HARVESTER_API_KEY env var first, then the contents of the first key file found."""
    env_key = os.environ.get("HARVESTER_API_KEY", "")
    if env_key:
        return env_key
    for candidate in ["~/.config/nous/key",
                      "~/.hermes/keymaxxing/active/minimax.key",
                      "~/.config/openrouter/key"]:
        path = os.path.expanduser(candidate)
        if os.path.exists(path):
            return Path(path).read_text().strip()
    return ""

DEFAULT_API_KEY = _load_default_api_key()
DEFAULT_MODEL = os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro")
DEFAULT_KNOWLEDGE_DIR = REPO_ROOT / "knowledge"
DEFAULT_MEMPALACE_PATH = Path(os.path.expanduser("~/.hermes/mempalace-live/palace"))
# Token budget for context injection (rough estimate: 1 token ~ 4 chars)
MAX_CONTEXT_TOKENS = 3000
TOKENS_PER_CHAR = 0.25
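# Worked budget: MAX_CONTEXT_TOKENS / TOKENS_PER_CHAR = 3000 / 0.25 = 12,000
# characters of injected context, matching the "1 token ~ 4 chars" estimate above.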
# ---------------------------------------------------------------------------
# Helpers — ensure optional deps
# ---------------------------------------------------------------------------
def _ensure_nexus_on_path():
    """Ensure the-nexus repo is on sys.path for nexus.mempalace imports."""
    NEXUS_PATH = Path("/Users/apayne/the-nexus")
    if NEXUS_PATH.exists() and str(NEXUS_PATH) not in sys.path:
        sys.path.insert(0, str(NEXUS_PATH))
# ---------------------------------------------------------------------------
# LLM API caller (mirrors harvester.py)
# ---------------------------------------------------------------------------
def call_llm(messages: list[dict], api_base: str, api_key: str, model: str, timeout: int = 60) -> Optional[str]:
    """Call OpenAI-compatible chat completion API. Returns assistant content or None."""
    import urllib.request

    payload = json.dumps({
        "model": model,
        "messages": messages,
        "temperature": 0.3,
        "max_tokens": 1024,
    }).encode('utf-8')
    url = f"{api_base}/chat/completions"
    req = urllib.request.Request(
        url, data=payload,
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
        method="POST"
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result = json.loads(resp.read().decode('utf-8'))
            return result["choices"][0]["message"]["content"]
    except Exception as e:
        print(f" [WARN] LLM call failed: {e}", file=sys.stderr)
        return None
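# Response shape assumed by the indexing above (standard OpenAI-compatible reply;
# only choices[0].message.content is relied on):
#   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}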
# ---------------------------------------------------------------------------
# Backend 1: Baseline — knowledge/index.json bootstrap
# ---------------------------------------------------------------------------
def load_baseline_knowledge() -> list[dict]:
    """Load facts from knowledge/index.json."""
    index_path = DEFAULT_KNOWLEDGE_DIR / "index.json"
    if not index_path.exists():
        return []
    try:
        with open(index_path) as f:
            data = json.load(f)
        return data.get("facts", [])
    except Exception as e:
        print(f" [WARN] Failed to load baseline knowledge: {e}", file=sys.stderr)
        return []
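# Sketch of the knowledge/index.json shape read above (only the "facts" list and
# the "fact"/"category" keys used below are assumed; other fields may exist):
#   {"facts": [{"category": "preference", "fact": "..."}]}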
def query_baseline(question: str, max_tokens: int = MAX_CONTEXT_TOKENS) -> tuple[str, list[dict]]:
    """
    Retrieve relevant facts from the knowledge store using simple keyword matching.
    Returns (context_block, source_facts).
    """
    facts = load_baseline_knowledge()
    if not facts:
        return "", []
    q_words = set(question.lower().split())
    scored = []
    for fact in facts:
        fact_text = fact.get("fact", "").lower()
        overlap = len(q_words.intersection(set(fact_text.split())))
        scored.append((overlap, fact))
    scored.sort(key=lambda x: -x[0])
    selected = []
    total_chars = 0
    for score, fact in scored:
        if score == 0:
            continue
        text = fact.get("fact", "")
        if total_chars + len(text) <= max_tokens / TOKENS_PER_CHAR:
            selected.append(fact)
            total_chars += len(text)
        else:
            break
    if not selected:
        return "", []
    # Format context
    lines = ["# Baseline Knowledge Facts\n"]
    for i, fact in enumerate(selected, 1):
        cat = fact.get('category', 'fact')
        txt = fact.get('fact', '')
        lines.append(f"{i}. [{cat}] {txt}\n")
    return "".join(lines), selected
# ---------------------------------------------------------------------------
# Backend 2: MemPalace — use nexus.mempalace.searcher
# ---------------------------------------------------------------------------
_MEMPALACE_AVAILABLE = None  # None = not probed yet

def ensure_mempalace() -> bool:
    """Check if MemPalace (with deps) is available. Returns True/False."""
    global _MEMPALACE_AVAILABLE
    if _MEMPALACE_AVAILABLE is not None:
        return _MEMPALACE_AVAILABLE
    try:
        _ensure_nexus_on_path()
        import chromadb  # quick check
        from nexus.mempalace.searcher import search_memories  # noqa: F401
        _MEMPALACE_AVAILABLE = True
        return True
    except ImportError as e:
        print(f" [INFO] MemPalace not available: {e}", file=sys.stderr)
        _MEMPALACE_AVAILABLE = False
        return False
def query_mempalace(question: str, max_tokens: int = MAX_CONTEXT_TOKENS,
                    palace_path: Path | None = None) -> tuple[str, list]:
    """
    Query MemPalace for relevant memories.
    Returns (context_block, results_list).
    """
    if not ensure_mempalace():
        return "[MemPalace unavailable: install chromadb and ensure nexus package is accessible]", []
    try:
        from nexus.mempalace.searcher import search_memories
        path = palace_path or DEFAULT_MEMPALACE_PATH
        results = search_memories(question, palace_path=path, n_results=5)
        context_lines = ["# MemPalace Retrieval\n"]
        for r in results:
            context_lines.append(f"- [{r.room or 'general'}] {r.text}\n")
        return "".join(context_lines), results
    except Exception as e:
        return f"[MemPalace query failed: {e}]", []
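# Assumed nexus API surface (taken only from the calls above): search_memories()
# accepts (query, palace_path=..., n_results=...) and returns objects exposing
# .room and .text. Nothing else about the package is relied on here.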
# ---------------------------------------------------------------------------
# Backend 3: Hindsight — vectorize-io/hindsight
# ---------------------------------------------------------------------------
_HINDSIGHT_AVAILABLE = None

def ensure_hindsight() -> bool:
    """Check if Hindsight is available. Returns True/False."""
    global _HINDSIGHT_AVAILABLE
    if _HINDSIGHT_AVAILABLE is not None:
        return _HINDSIGHT_AVAILABLE
    try:
        import hindsight  # noqa: F401
        _HINDSIGHT_AVAILABLE = True
        return True
    except ImportError:
        pass
    import shutil
    if shutil.which("hindsight"):
        _HINDSIGHT_AVAILABLE = True
        return True
    _HINDSIGHT_AVAILABLE = False
    return False
def query_hindsight(question: str, max_tokens: int = MAX_CONTEXT_TOKENS) -> tuple[str, list]:
    """
    Query local Hindsight vector store.
    Returns (context_block, results).
    """
    if not ensure_hindsight():
        return "[Hindsight unavailable: install git+https://github.com/vectorize-io/hindsight.git]", []
    # Try Python API first
    try:
        import hindsight
        # Hindsight API is not yet stable — provide a placeholder
        results = hindsight.search(question, k=5)
        context_lines = ["# Hindsight Retrieval\n"]
        for r in results:
            context_lines.append(f"- {getattr(r, 'text', str(r))}\n")
        return "".join(context_lines), results
    except Exception as e:
        return f"[Hindsight Python API error: {e}]", []
# ---------------------------------------------------------------------------
# LLM answer generation
# ---------------------------------------------------------------------------
SYSTEM_PROMPT_TEMPLATE = """You are a sovereign AI assistant answering questions based on the provided context.
Answer concisely and accurately. If the context contains the answer, cite it.
If unsure, say so. Do not hallucinate.
{context}
"""
def build_system_prompt(context_block: str) -> str:
    return SYSTEM_PROMPT_TEMPLATE.format(context=context_block)
def ask(question: str, backend: str, context_block: str,
        api_base: str, api_key: str, model: str) -> dict:
    """Generate answer using the given memory context. Returns artifact dict."""
    system = build_system_prompt(context_block)
    start = time.time()
    answer = call_llm(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": question}
        ],
        api_base=api_base, api_key=api_key, model=model
    )
    elapsed = time.time() - start
    artifact = {
        "backend": backend,
        "question": question,
        "system_prompt": system,
        "context_block": context_block,
        "answer": answer or "[LLM call failed]",
        "model": model,
        "api_base": api_base,
        "timestamp": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        "llm_latency_sec": round(elapsed, 3),
    }
    return artifact
# ---------------------------------------------------------------------------
# Simple scorer
# ---------------------------------------------------------------------------
def score_artifact(artifact: dict) -> dict:
    """
    Compute simple heuristic scores:
    - context_precision: fraction of question words that appear in the context
    - retrieval_noise: fraction of context words that do NOT appear in the question
      (a rough proxy for irrelevant retrieval)
    - answer_factual: answer word count capped at 1.0 (a proxy for being substantive)
    """
    q = artifact["question"].lower()
    ctx = artifact["context_block"].lower()
    ans = artifact.get("answer", "").lower()
    q_words = set(q.split())
    if not q_words:
        return {"context_precision": 0.0, "retrieval_noise": 1.0, "answer_factual": 0.0}
    ctx_words = set(ctx.split())
    overlap = len(q_words & ctx_words) / len(q_words)
    # Noise: share of unique context words with no overlap with the question.
    # A large context full of off-topic words scores close to 1.0.
    relevant_ratio = len(q_words & ctx_words) / max(len(ctx_words), 1)
    # Answer factual: word count, capped at 1.0 (100+ words = full marks)
    awc = len(ans.split())
    answer_factual = min(1.0, awc / 100.0)
    return {
        "context_precision": round(overlap, 3),
        "retrieval_noise": round(1.0 - relevant_ratio, 3),
        "answer_factual": round(answer_factual, 3),
    }
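# Worked example (hypothetical numbers): a 5-word question with 3 of its words
# present in a 40-unique-word context, answered in 60 words:
#   context_precision = 3 / 5            = 0.6
#   retrieval_noise   = 1 - 3 / 40       = 0.925
#   answer_factual    = min(1.0, 60/100) = 0.6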
# ---------------------------------------------------------------------------
# Main runner
# ---------------------------------------------------------------------------
def load_matrix(path: Path) -> dict:
    with open(path) as f:
        return json.load(f)
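# Sketch of the prompts/matrix.json shape consumed below (only "categories" and
# each category's "prompts" list are assumed; the real file may carry extra
# metadata):
#   {"categories": {"preference_recall": {"prompts": ["...", "..."]}, ...}}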
def run_bakeoff(matrix: dict, args):
    """Execute evaluation across all prompts and backends."""
    api_base = args.api_base or DEFAULT_API_BASE
    api_key = args.api_key or DEFAULT_API_KEY
    model = args.model or DEFAULT_MODEL
    if not api_key:
        print("ERROR: No API key found. Set HARVESTER_API_KEY, or pass --api-key.", file=sys.stderr)
        sys.exit(1)
    output_dir = Path(args.output).expanduser().resolve()
    artifacts_dir = output_dir / "artifacts"
    artifacts_dir.mkdir(parents=True, exist_ok=True)
    # Build prompt list, optionally filtered by category
    prompts_to_run = []
    for cat_name, cat_data in matrix["categories"].items():
        if args.category and cat_name != args.category:
            continue
        for prompt_text in cat_data["prompts"]:
            prompts_to_run.append((cat_name, prompt_text))
    if args.limit:
        prompts_to_run = prompts_to_run[:args.limit]
    if not prompts_to_run:
        print("ERROR: No prompts matched the given filters.", file=sys.stderr)
        sys.exit(1)
    # Detect which backends are available (Hindsight is optional)
    backends = ["baseline", "mempalace"]
    if ensure_hindsight():
        backends.append("hindsight")
    print(f"Bakeoff: {len(prompts_to_run)} prompts")
    print(f"Backends: {', '.join(backends)}")
    all_artifacts = []
    for idx, (cat_name, prompt) in enumerate(prompts_to_run, 1):
        print(f"\n{'='*60}")
        print(f"[{idx}/{len(prompts_to_run)}] Category: {cat_name}")
        print(f"Prompt: {prompt[:70]}")
        for backend in backends:
            print(f"{backend}...", end="", flush=True)
            # Get context from the backend under test
            if backend == "baseline":
                ctx, sources = query_baseline(prompt)
            elif backend == "mempalace":
                ctx, sources = query_mempalace(prompt)
            else:  # hindsight
                ctx, sources = query_hindsight(prompt)
            # Generate answer
            artifact = ask(prompt, backend, ctx, api_base, api_key, model)
            artifact["category"] = cat_name
            artifact["sources_count"] = len(sources)
            artifact["context_char_count"] = len(ctx)
            artifact["context_token_est"] = int(len(ctx) * TOKENS_PER_CHAR)
            # Score
            scores = score_artifact(artifact)
            artifact["scores"] = scores
            # Save artifact under a filesystem-safe name
            safe_prompt = "".join(c if c.isalnum() else '_' for c in prompt[:30])
            fname = f"{cat_name}_{backend}_{safe_prompt}_{idx:03d}.json"
            fpath = artifacts_dir / fname
            with open(fpath, "w", encoding="utf-8") as f:
                json.dump(artifact, f, indent=2, ensure_ascii=False)
            all_artifacts.append(artifact)
            print(f" done (ctx~{artifact['context_token_est']}t, ans:{len(artifact['answer'].split())}w, prec:{scores['context_precision']:.2f})")
    generate_report(all_artifacts, output_dir)
    print("\n✓ Bakeoff complete.")
    print(f" Report: {output_dir / 'REPORT.md'}")
    print(f" Artifacts: {artifacts_dir}")
def generate_report(artifacts: list[dict], output_dir: Path):
    """Create a markdown summary with per-backend scores and simple verdicts."""
    lines = []
    lines.append("# Memory Bakeoff Report\n")
    lines.append(f"**Generated:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}\n")
    lines.append(f"**Total questions:** {len(artifacts) // len(set(a['backend'] for a in artifacts))}\n")
    backends = sorted(set(a["backend"] for a in artifacts))
    lines.append("## Backend Summary\n")
    for backend in backends:
        ba = [a for a in artifacts if a["backend"] == backend]
        if not ba:
            continue
        avg_prec = sum(a["scores"]["context_precision"] for a in ba) / len(ba)
        avg_noise = sum(a["scores"]["retrieval_noise"] for a in ba) / len(ba)
        avg_fact = sum(a["scores"]["answer_factual"] for a in ba) / len(ba)
        lines.append(f"### {backend.upper()}\n")
        lines.append(f"- Avg context precision: {avg_prec:.1%}\n")
        lines.append(f"- Avg retrieval noise: {avg_noise:.1%}\n")
        lines.append(f"- Avg answer breadth: {avg_fact:.1%}\n")
        lines.append(f"- Runs: {len(ba)}\n\n")
    lines.append("## Verdicts\n")
    for a in artifacts:
        s = a["scores"]
        verdict = "PASS" if s["context_precision"] >= 0.25 else "NEEDS_IMPROVEMENT"
        lines.append(f"- **{a['backend']} · {a['category']}**: {verdict} "
                     f"(prec {s['context_precision']:.0%}, noise {s['retrieval_noise']:.0%})\n")
    lines.append("\n## Recommendation\n\n")
    # Pick the best backend by total context precision (run counts are equal per backend)
    best = max(backends, key=lambda b: sum(a["scores"]["context_precision"] for a in artifacts if a["backend"] == b))
    lines.append(f"Based on this sample, **{best.upper()}** achieved the highest context precision.\n")
    lines.append("For the sovereign Mac-local stack, the recommendation is:\n")
    lines.append("- **Baseline** (knowledge/index.json) for fast, deterministic fact lookup;\n")
    lines.append("- **MemPalace** for long-horizon narrative/agentic memory;\n")
    lines.append("- **Hindsight** requires additional installation and tuning.\n")
    lines.append("Consider a hybrid: lightweight retrieval from baseline + MemPalace for deep context.\n")
    report_path = output_dir / "REPORT.md"
    report_path.write_text("".join(lines), encoding="utf-8")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    p = argparse.ArgumentParser(description="Memory bakeoff runner")
    p.add_argument("--matrix", default="prompts/matrix.json",
                   help="Path to prompt matrix JSON file")
    p.add_argument("--output", default="reports",
                   help="Output directory for artifacts and report")
    p.add_argument("--category",
                   help="Run only this category (e.g., 'preference_recall')")
    p.add_argument("--limit", type=int,
                   help="Limit number of prompts to run")
    p.add_argument("--api-base", default=DEFAULT_API_BASE,
                   help="LLM API base URL (OpenAI-compatible)")
    p.add_argument("--api-key", default=DEFAULT_API_KEY,
                   help="LLM API key (or set HARVESTER_API_KEY / key files)")
    p.add_argument("--model", default=DEFAULT_MODEL,
                   help="LLM model name to use")
    p.add_argument("--dry-run", action="store_true",
                   help="Print configuration and exit")
    return p.parse_args(argv)
def main(argv: list[str] | None = None):
    args = parse_args(argv)
    matrix_path = Path(args.matrix)
    if not matrix_path.exists():
        print(f"ERROR: Matrix not found at {matrix_path}", file=sys.stderr)
        sys.exit(1)
    matrix = load_matrix(matrix_path)
    if args.dry_run:
        print("Dry run: configuration")
        print(f" Matrix: {args.matrix}")
        print(f" Categories: {list(matrix['categories'].keys())}")
        print(f" Total prompts: {sum(len(c['prompts']) for c in matrix['categories'].values())}")
        print(" Backends: baseline, mempalace, hindsight (optional)")
        print(f" Output: {args.output}")
        return
    run_bakeoff(matrix, args)
if __name__ == "__main__":
    main()