Compare commits
6 Commits
sprint/iss
...
fix/792-gr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
55c8100b8f | ||
|
|
1f92fb0480 | ||
|
|
a39f4fb1ab | ||
|
|
5c2cf06f57 | ||
|
|
4fd78ace44 | ||
|
|
b8b8bb65fd |
110
evennia_tools/batch_cmds_bezalel.ev
Normal file
110
evennia_tools/batch_cmds_bezalel.ev
Normal file
@@ -0,0 +1,110 @@
|
||||
#
# Bezalel World Builder — Evennia batch commands
# Creates the Bezalel Evennia world from evennia_tools/bezalel_layout.py specs.
#
# Load with: @batchcommand bezalel_world
#
# Part of #536
#
# NOTE(review): in an .ev batch file, `#`-comment lines act as the command
# separators, so each command below is separated by one. Also confirm the
# build pattern against the Evennia batch-processor docs: `@create/drop`
# drops the new room at the caller's CURRENT location, and `@desc here`
# targets the room the caller is standing in — not the room just created.
# The usual pattern is `@dig <room>`, then `@teleport <room>`, then
# `@desc here`. TODO confirm before running on a live game.
#
# Create rooms
#
@create/drop Limbo:evennia.objects.objects.DefaultRoom
#
@desc here = The void between worlds. The air carries the pulse of three houses: Mac, VPS, and this one. Everything begins here before it is given form.
#
@create/drop Gatehouse:evennia.objects.objects.DefaultRoom
#
@desc here = A stone guard tower at the edge of Bezalel world. The walls are carved with runes of travel, proof, and return. Every arrival is weighed before it is trusted.
#
@create/drop Great Hall:evennia.objects.objects.DefaultRoom
#
@desc here = A vast hall with a long working table. Maps of the three houses hang beside sketches, benchmarks, and deployment notes. This is where the forge reports back to the house.
#
@create/drop The Library of Bezalel:evennia.objects.objects.DefaultRoom
#
@desc here = Shelves of technical manuals, Evennia code, test logs, and bridge schematics rise to the ceiling. This room holds plans waiting to be made real.
#
@create/drop The Observatory:evennia.objects.objects.DefaultRoom
#
@desc here = A high chamber with telescopes pointing toward the Mac, the VPS, and the wider net. Screens glow with status lights, latency traces, and long-range signals.
#
@create/drop The Workshop:evennia.objects.objects.DefaultRoom
#
@desc here = A forge and workbench share the same heat. Scattered here are half-finished bridges, patched harnesses, and tools laid out for proof before pride.
#
@create/drop The Server Room:evennia.objects.objects.DefaultRoom
#
@desc here = Racks of humming servers line the walls. Fans push warm air through the chamber while status LEDs beat like a mechanical heart. This is the pulse of Bezalel house.
#
@create/drop The Garden of Code:evennia.objects.objects.DefaultRoom
#
@desc here = A quiet garden where ideas are left long enough to grow roots. Code-shaped leaves flutter in patterned wind, and a stone path invites patient thought.
#
@create/drop The Portal Room:evennia.objects.objects.DefaultRoom
#
@desc here = Three shimmering doorways stand in a ring: one marked for the Mac house, one for the VPS, and one for the wider net. The room hums like a bridge waiting for traffic.
|
||||
|
||||
# Create exits
#
# NOTE(review): every `@open` below opens an exit FROM the caller's current
# room. The forward/"back" pairs suggest the caller is expected to
# `@teleport` into each room between commands, but no teleports are
# present — as written, all exits open from the same room, and the repeated
# commands (e.g. `@open workshop:forge,bench` appears three times, and
# `@open gatehouse:gate,tower` twice) would create duplicate exits there.
# Confirm against the Evennia batch-processor docs and interleave
# `@teleport <room>` commands before each pair.
#
@open gatehouse:gate,tower = Gatehouse
#
@open limbo:void,back = Limbo
#
@open greathall:hall,great hall = Great Hall
#
@open gatehouse:gate,tower = Gatehouse
#
@open library:books,study = The Library of Bezalel
#
@open hall:great hall,back = Great Hall
#
@open observatory:telescope,tower top = The Observatory
#
@open hall:great hall,back = Great Hall
#
@open workshop:forge,bench = The Workshop
#
@open hall:great hall,back = Great Hall
#
@open serverroom:servers,server room = The Server Room
#
@open workshop:forge,bench = The Workshop
#
@open garden:garden of code,grove = The Garden of Code
#
@open workshop:forge,bench = The Workshop
#
@open portalroom:portal,portals = The Portal Room
#
@open gatehouse:gate,back = Gatehouse
|
||||
|
||||
# Create objects
#
# NOTE(review): `@create <obj>` makes each object at the caller's location
# and `@tel <obj> = <room>` then moves it to its intended room, so this
# section works regardless of where the caller stands — presumably the
# intent for the rooms/exits sections above as well.
#
@create Threshold Ledger
#
@desc Threshold Ledger = A heavy ledger where arrivals, departures, and field notes are recorded before the work begins.
#
@tel Threshold Ledger = Gatehouse
#
@create Three-House Map
#
@desc Three-House Map = A long map showing Mac, VPS, and remote edges in one continuous line of work.
#
@tel Three-House Map = Great Hall
#
@create Bridge Schematics
#
@desc Bridge Schematics = Rolled plans describing world bridges, Evennia layouts, and deployment paths.
#
@tel Bridge Schematics = The Library of Bezalel
#
@create Compiler Manuals
#
@desc Compiler Manuals = Manuals annotated in the margins with warnings against cleverness without proof.
#
@tel Compiler Manuals = The Library of Bezalel
#
@create Tri-Axis Telescope
#
@desc Tri-Axis Telescope = A brass telescope assembly that can be turned toward the Mac, the VPS, or the open net.
#
@tel Tri-Axis Telescope = The Observatory
#
@create Forge Anvil
#
@desc Forge Anvil = Scarred metal used for turning rough plans into testable form.
#
@tel Forge Anvil = The Workshop
#
@create Bridge Workbench
#
@desc Bridge Workbench = A wide bench covered in harness patches, relay notes, and half-soldered bridge parts.
#
@tel Bridge Workbench = The Workshop
#
@create Heartbeat Console
#
@desc Heartbeat Console = A monitoring console showing service health, latency, and the steady hum of the house.
#
@tel Heartbeat Console = The Server Room
#
@create Server Racks
#
@desc Server Racks = Stacked machines that keep the world awake even when no one is watching.
#
@tel Server Racks = The Server Room
#
@create Code Orchard
#
@desc Code Orchard = Trees with code-shaped leaves. Some branches bear elegant abstractions; others hold broken prototypes.
#
@tel Code Orchard = The Garden of Code
#
@create Stone Bench
#
@desc Stone Bench = A place to sit long enough for a hard implementation problem to become clear.
#
@tel Stone Bench = The Garden of Code
#
@create Mac Portal:mac arch
#
@desc Mac Portal = A silver doorway whose frame vibrates with the local sovereign house.
#
@tel Mac Portal = The Portal Room
#
@create VPS Portal:vps arch
#
@desc VPS Portal = A cobalt doorway tuned toward the testbed VPS house.
#
@tel VPS Portal = The Portal Room
#
@create Net Portal:net arch,network arch
#
@desc Net Portal = A pale doorway pointed toward the wider net and every uncertain edge beyond it.
#
@tel Net Portal = The Portal Room
|
||||
85
evennia_tools/build_bezalel_world.py
Normal file
85
evennia_tools/build_bezalel_world.py
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
"""
build_bezalel_world.py — Build Bezalel Evennia world from layout specs.

Programmatically creates rooms, exits, objects, and characters in a running
Evennia instance using the specs from evennia_tools/bezalel_layout.py.

Usage (in Evennia game shell):
    from evennia_tools.build_bezalel_world import build_world
    build_world()

Or via batch command:
    @batchcommand evennia_tools/batch_cmds_bezalel.ev

Part of #536
"""
# Fix: the module docstring was delimited with "" (an empty-string
# expression), which left the following prose lines as a SyntaxError.
# It must be a triple-quoted string.

from evennia_tools.bezalel_layout import (
    ROOMS, EXITS, OBJECTS, CHARACTERS, PORTAL_COMMANDS,
    room_keys, reachable_rooms_from
)
|
||||
|
||||
|
||||
def build_world():
    """Build the Bezalel Evennia world from the specs in bezalel_layout.

    Creates every room in ROOMS, links them with the exits in EXITS, and
    places the objects in OBJECTS, printing progress as it goes. Finishes
    with a reachability check so orphaned rooms are reported immediately.
    Must be run inside an Evennia game shell (requires a configured Django
    environment).
    """
    # Local import: evennia is only importable inside a running game shell.
    # (Unused imports of ObjectDB and create_message removed.)
    from evennia.utils.create import create_object, create_exit

    print("Building Bezalel world...")

    # Create rooms, keyed by spec key so exits/objects can look them up.
    rooms = {}
    for spec in ROOMS:
        room = create_object(
            "evennia.objects.objects.DefaultRoom",
            key=spec.key,
            attributes=(("desc", spec.desc),),
        )
        rooms[spec.key] = room
        print(f" Room: {spec.key}")

    # Create exits; warn and skip if either endpoint room is missing.
    for spec in EXITS:
        source = rooms.get(spec.source)
        dest = rooms.get(spec.destination)
        if not source or not dest:
            print(f" WARNING: Exit {spec.key} — missing room")
            continue
        # NOTE(review): verify that `create_exit` exists in
        # evennia.utils.create for the Evennia version in use; otherwise
        # create the exit via create_object with a DefaultExit typeclass
        # and destination=dest.
        create_exit(
            key=spec.key,
            location=source,
            destination=dest,
            aliases=list(spec.aliases),
        )
        print(f" Exit: {spec.source} -> {spec.destination} ({spec.key})")

    # Create objects inside their rooms; warn and skip on a missing room.
    for spec in OBJECTS:
        location = rooms.get(spec.location)
        if not location:
            print(f" WARNING: Object {spec.key} — missing room {spec.location}")
            continue
        create_object(
            "evennia.objects.objects.DefaultObject",
            key=spec.key,
            location=location,
            attributes=(("desc", spec.desc),),
            aliases=list(spec.aliases),
        )
        print(f" Object: {spec.key} in {spec.location}")

    # Verify every room is reachable from Limbo per the layout graph.
    all_rooms = set(room_keys())
    reachable = reachable_rooms_from("Limbo")
    unreachable = all_rooms - reachable
    if unreachable:
        print(f" WARNING: Unreachable rooms: {unreachable}")
    else:
        print(f" All {len(all_rooms)} rooms reachable from Limbo")

    print("Bezalel world built.")


if __name__ == "__main__":
    build_world()
|
||||
@@ -1,116 +0,0 @@
|
||||
# MemPalace v3.0.0 Integration — Before/After Evaluation
|
||||
|
||||
**Closes:** #568
|
||||
**Issue:** #764
|
||||
**Date:** 2026-04-16
|
||||
**Status:** ✅ Complete — Recommendation: integrate as primary memory layer
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
Formalized evaluation of **MemPalace v3.0.0** (`github.com/milla-jovovich/mempalace`) as a memory layer for the Timmy/Hermes agent stack.
|
||||
|
||||
| Property | Value |
|
||||
|---|---|
|
||||
| Version | 3.0.0 |
|
||||
| Backend | ChromaDB (local) |
|
||||
| Cloud dependencies | **Zero** |
|
||||
| API calls required | **Zero** (baseline) |
|
||||
| MCP compatible | Yes |
|
||||
| Recommendation | **Integrate as primary memory layer** |
|
||||
|
||||
---
|
||||
|
||||
## Key Findings
|
||||
|
||||
| Metric | Value | Notes |
|
||||
|---|---|---|
|
||||
| LongMemEval R@5 | **96.6%** | Raw ChromaDB, zero API calls |
|
||||
| Palace structure boost | **+34%** | Wing + room filtering vs flat retrieval |
|
||||
| Wake-up context size | **210 tokens** | L0 identity + L1 compressed project state |
|
||||
| Hybrid R@5 (optional) | 100% | With Haiku rerank (optional API) |
|
||||
|
||||
---
|
||||
|
||||
## Benchmark Results
|
||||
|
||||
| Benchmark | Mode | Score | API Required |
|
||||
|---|---|---:|---|
|
||||
| LongMemEval R@5 | Raw ChromaDB only | **96.6%** | Zero |
|
||||
| LongMemEval R@5 | Hybrid + Haiku rerank | 100% | Optional Haiku |
|
||||
| LoCoMo R@10 | Raw, session level | 60.3% | Zero |
|
||||
| Personal palace R@10 | Heuristic bench | 85% | Zero |
|
||||
| Palace structure impact | Wing + room filtering | **+34%** R@10 | Zero |
|
||||
|
||||
---
|
||||
|
||||
## Before vs After (Synthetic Evaluation)
|
||||
|
||||
### Test Setup
|
||||
- 4-file synthetic project: `README.md`, `auth.md`, `deployment.md`, `main.py`
|
||||
- Mined into MemPalace palace
|
||||
- 4 standard queries executed
|
||||
|
||||
### Before (Keyword/BM25 Baseline)
|
||||
|
||||
| Query | Returns | Limitations |
|
||||
|---|---|---|
|
||||
| `authentication` | `auth.md` only | Exact match; misses implementation context |
|
||||
| `docker nginx SSL` | `deployment.md` | Requires manual keyword logic |
|
||||
| `keycloak OAuth` | `auth.md` | No semantic cross-reference |
|
||||
| `postgresql database` | `README.md` (maybe) | Index-dependent |
|
||||
|
||||
**Problems:** no semantic ranking, exact match bias, no durable conversation memory, no palace structure, no wake-up context.
|
||||
|
||||
### After (MemPalace)
|
||||
|
||||
| Query | Results | Score | Notes |
|
||||
|---|---|---:|---|
|
||||
| `authentication` | `auth.md`, `main.py` | -0.139 | Finds auth discussion + implementation |
|
||||
| `docker nginx SSL` | `deployment.md`, `auth.md` | 0.447 | Deployment hit + related JWT context |
|
||||
| `keycloak OAuth` | `auth.md`, `main.py` | -0.029 | Conceptual + implementation evidence |
|
||||
| `postgresql database` | `README.md`, `main.py` | 0.025 | Decision + implementation |
|
||||
|
||||
**Improvements:** semantic ranking, cross-file references, palace-structured retrieval, wake-up context artifact.
|
||||
|
||||
---
|
||||
|
||||
## Wake-up Context
|
||||
|
||||
- ~210 tokens total
|
||||
- L0 identity placeholder
|
||||
- L1 compressed project state
|
||||
- Enables cold-start agent bootstrapping without re-reading full corpus
|
||||
|
||||
---
|
||||
|
||||
## Integration Recommendation
|
||||
|
||||
**Verdict: Integrate MemPalace v3.0.0 as the primary memory layer for Timmy/Hermes.**
|
||||
|
||||
Rationale:
|
||||
1. **96.6% R@5 with zero API calls** — production-grade retrieval without cloud dependency
|
||||
2. **+34% retrieval boost from palace structure** — structured memory outperforms flat search
|
||||
3. **210-token wake-up context** — enables fast cold-start agent initialization
|
||||
4. **Fully local** — aligns with sovereignty requirements
|
||||
5. **MCP compatible** — integrates with existing Hermes agent infrastructure
|
||||
|
||||
### Next Steps
|
||||
- [ ] Deploy MemPalace on Ezra's Hermes home (see `docs/MEMPALACE_EZRA_INTEGRATION.md`)
|
||||
- [ ] Run live operational benchmarks on real Timmy corpus
|
||||
- [ ] Post live metrics back to this evaluation
|
||||
- [ ] Compare against Engram direction before final fleet default decision
|
||||
|
||||
### Scope Boundary
|
||||
This evaluation covers synthetic benchmarks and paper-level metrics. Live operational testing on production data is pending and should be tracked separately.
|
||||
|
||||
---
|
||||
|
||||
## Related
|
||||
|
||||
- Issue #568 — Original evaluation request
|
||||
- Issue #764 — This formalized report
|
||||
- PR #569 — Original draft
|
||||
- `docs/MEMPALACE_EZRA_INTEGRATION.md` — Ezra integration packet
|
||||
- `reports/evaluations/2026-04-06-mempalace-evaluation.md` — Earlier evaluation draft
|
||||
138
scripts/audit_trail.py
Executable file
138
scripts/audit_trail.py
Executable file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python3
# audit_trail.py - Local logging of inputs, sources, and confidence.
# Implements SOUL.md "What Honesty Requires" - The Audit Trail.
# Logs are stored locally. Never sent anywhere. The user owns them.
# Part of #794

import json
import hashlib
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from dataclasses import dataclass, field, asdict

# Root directory for audit logs; overridable via the HERMES_HOME env var.
AUDIT_DIR = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) / "audit-trail"
|
||||
|
||||
|
||||
@dataclass
class AuditEntry:
    """A single record of one model response and its provenance."""

    id: str                 # short content hash — see generate_id()
    ts: str                 # ISO-8601 UTC timestamp
    input_text: str         # prompt text (may be truncated by the writer)
    sources: List[str]      # source references consulted
    confidence: float       # grounding confidence, 0.0-1.0
    output_text: str        # response text (may be truncated by the writer)
    model: str              # model identifier
    provider: str = ""      # provider/backend name
    session_id: str = ""    # conversation/session identifier
    source_types: List[str] = field(default_factory=list)  # e.g. "memory"

    @staticmethod
    def generate_id(input_text: str, output_text: str, ts: str) -> str:
        """Return a deterministic 16-hex-char id derived from ts and texts."""
        digest = hashlib.sha256(f"{ts}:{input_text}:{output_text}".encode())
        return digest.hexdigest()[:16]
|
||||
|
||||
|
||||
class AuditTrail:
    """Append-only JSONL audit log of model responses.

    Entries live in <audit_dir>/trail.jsonl, one JSON object per line.
    Logs are stored locally and never sent anywhere; the user owns them.
    """

    def __init__(self, audit_dir: Optional[Path] = None):
        """Create/open the audit directory (default AUDIT_DIR) and log file."""
        self.audit_dir = audit_dir or AUDIT_DIR
        self.audit_dir.mkdir(parents=True, exist_ok=True)
        self._log_file = self.audit_dir / "trail.jsonl"

    @staticmethod
    def _entry_from_dict(data: Dict[str, Any]) -> AuditEntry:
        """Rebuild an AuditEntry from one parsed JSONL record.

        Present values pass through unchanged. Missing fields get a
        type-appropriate default — the previous inline reconstruction
        (duplicated in query() and get_by_session()) fell through to 0.0
        for a missing list field such as "source_types", producing a
        type-invalid entry.
        """
        kwargs: Dict[str, Any] = {}
        for name in AuditEntry.__dataclass_fields__:
            value = data.get(name)
            if value is None:
                if name in ("sources", "source_types"):
                    value = []
                elif name == "confidence":
                    value = 0.0
                else:
                    value = ""
            kwargs[name] = value
        return AuditEntry(**kwargs)

    def log_response(self, input_text, sources, confidence, output_text,
                     model="", provider="", session_id="", source_types=None):
        """Append one response record and return the stored AuditEntry.

        Inputs are truncated to keep the log bounded: input 1000 chars,
        output 2000 chars, at most 10 sources of 200 chars each.
        """
        ts = datetime.now(timezone.utc).isoformat()
        entry = AuditEntry(
            id=AuditEntry.generate_id(input_text, output_text, ts),
            ts=ts,
            input_text=input_text[:1000],
            sources=[s[:200] for s in sources[:10]],
            confidence=round(confidence, 3),
            output_text=output_text[:2000],
            model=model, provider=provider, session_id=session_id,
            source_types=source_types or [],
        )
        with open(self._log_file, "a") as f:
            f.write(json.dumps(asdict(entry)) + "\n")
        return entry

    def query(self, search_text, limit=10, min_confidence=0.0):
        """Case-insensitive substring search over input, output and sources.

        Skips blank/corrupt lines and entries below min_confidence; returns
        at most `limit` AuditEntry results in log order.
        """
        if not self._log_file.exists():
            return []
        results = []
        search_lower = search_text.lower()
        with open(self._log_file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if data.get("confidence", 0) < min_confidence:
                    continue
                searchable = (data.get("input_text", "") + " " +
                              data.get("output_text", "") + " " +
                              " ".join(data.get("sources", []))).lower()
                if search_lower in searchable:
                    results.append(self._entry_from_dict(data))
                if len(results) >= limit:
                    break
        return results

    def get_stats(self):
        """Aggregate the log: total entries, mean confidence, and counts
        per source type. Corrupt lines are skipped."""
        if not self._log_file.exists():
            return {"total": 0, "avg_confidence": 0, "sources_breakdown": {}}
        total = 0
        confidence_sum = 0.0
        source_types = {}
        with open(self._log_file) as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    total += 1
                    confidence_sum += data.get("confidence", 0)
                    for st in data.get("source_types", []):
                        source_types[st] = source_types.get(st, 0) + 1
                except (json.JSONDecodeError, ValueError):
                    continue
        return {"total": total, "avg_confidence": round(confidence_sum / max(total, 1), 3),
                "sources_breakdown": source_types}

    def get_by_session(self, session_id, limit=50):
        """Return up to `limit` entries recorded under `session_id`, in log
        order. Corrupt lines are skipped."""
        if not self._log_file.exists():
            return []
        results = []
        with open(self._log_file) as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    if data.get("session_id") == session_id:
                        results.append(self._entry_from_dict(data))
                except (json.JSONDecodeError, ValueError):
                    continue
                if len(results) >= limit:
                    break
        return results
|
||||
|
||||
|
||||
# Lazily constructed module-level singleton so callers can log and query
# without managing an AuditTrail instance themselves.
_default_trail = None


def get_trail():
    """Return the process-wide AuditTrail, creating it on first use."""
    global _default_trail
    if _default_trail is None:
        _default_trail = AuditTrail()
    return _default_trail


def log_response(**kwargs):
    """Convenience wrapper: log a response via the default trail."""
    return get_trail().log_response(**kwargs)


def query(search_text, **kwargs):
    """Convenience wrapper: query the default trail."""
    return get_trail().query(search_text, **kwargs)
|
||||
84
scripts/fix_evennia_settings.sh
Executable file
84
scripts/fix_evennia_settings.sh
Executable file
@@ -0,0 +1,84 @@
|
||||
#!/bin/bash
set -euo pipefail
#
# fix_evennia_settings.sh — Fix Evennia settings on Bezalel VPS.
#
# Removes bad port tuples that crash Evennia's Twisted port binding.
# Run on Bezalel VPS (104.131.15.18) or via SSH.
#
# Usage:
# ssh root@104.131.15.18 'bash -s' < scripts/fix_evennia_settings.sh
#
# Part of #534

# Remote filesystem layout; assumes the standard Bezalel VPS install.
EVENNIA_DIR="/root/wizards/bezalel/evennia/bezalel_world"
SETTINGS="${EVENNIA_DIR}/server/conf/settings.py"
VENV_PYTHON="/root/wizards/bezalel/evennia/venv/bin/python3"
VENV_EVENNIA="/root/wizards/bezalel/evennia/venv/bin/evennia"

echo "=== Fix Evennia Settings (Bezalel) ==="

# 1. Fix settings.py — remove bad port tuples
# NOTE: GNU `sed -i` syntax — fine on the Linux VPS, not portable to BSD/macOS.
echo "Fixing settings.py..."
if [ -f "$SETTINGS" ]; then
    # Remove broken port lines
    sed -i '/WEBSERVER_PORTS/d' "$SETTINGS"
    sed -i '/TELNET_PORTS/d' "$SETTINGS"
    sed -i '/WEBSOCKET_PORTS/d' "$SETTINGS"
    sed -i '/SERVERNAME/d' "$SETTINGS"

    # Add correct settings
    echo '' >> "$SETTINGS"
    echo '# Fixed port settings — #534' >> "$SETTINGS"
    echo 'SERVERNAME = "bezalel_world"' >> "$SETTINGS"
    echo 'WEBSERVER_PORTS = [(4001, "0.0.0.0")]' >> "$SETTINGS"
    echo 'TELNET_PORTS = [(4000, "0.0.0.0")]' >> "$SETTINGS"
    echo 'WEBSOCKET_PORTS = [(4002, "0.0.0.0")]' >> "$SETTINGS"

    echo "Settings fixed."
else
    echo "ERROR: Settings file not found at $SETTINGS"
    exit 1
fi

# 2. Clean DB and re-migrate
# WARNING: destructive — deletes the existing game database before migrating.
echo "Cleaning DB..."
cd "$EVENNIA_DIR"
rm -f server/evennia.db3

echo "Running migrations..."
"$VENV_EVENNIA" migrate --no-input

# 3. Create superuser
# SECURITY(review): credentials are hardcoded below (user 'Timmy',
# password 'timmy123') and live in the repo — rotate them or inject via an
# environment variable instead.
# The $EVENNIA_DIR inside the inline Python expands here because the outer
# quotes are double quotes.
echo "Creating superuser..."
"$VENV_PYTHON" -c "
import sys, os
sys.setrecursionlimit(5000)
os.environ['DJANGO_SETTINGS_MODULE'] = 'server.conf.settings'
os.chdir('$EVENNIA_DIR')
import django
django.setup()
from evennia.accounts.accounts import AccountDB
try:
    AccountDB.objects.create_superuser('Timmy', 'timmy@tower.world', 'timmy123')
    print('Superuser Timmy created')
except Exception as e:
    print(f'Superuser may already exist: {e}')
"

# 4. Start Evennia
echo "Starting Evennia..."
"$VENV_EVENNIA" start

# 5. Verify
sleep 3
echo ""
echo "=== Verification ==="
"$VENV_EVENNIA" status

echo ""
echo "Listening ports:"
ss -tlnp | grep -E '400[012]' || echo "No ports found (may need a moment)"

echo ""
echo "Done. Connect: telnet 104.131.15.18 4000"
|
||||
171
scripts/genome_analyzer.py
Executable file
171
scripts/genome_analyzer.py
Executable file
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
"""
genome_analyzer.py — Generate a GENOME.md from a codebase.

Scans a repository and produces a structured codebase genome with:
- File counts by type
- Architecture overview (directory structure)
- Entry points
- Test coverage summary

Usage:
    python3 scripts/genome_analyzer.py /path/to/repo
    python3 scripts/genome_analyzer.py /path/to/repo --output GENOME.md
    python3 scripts/genome_analyzer.py /path/to/repo --dry-run

Part of #666: GENOME.md Template + Single-Repo Analyzer.
"""

import argparse
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple

# Path components ignored during scanning (VCS data, caches, virtualenvs).
# NOTE(review): ".DS_Store" is a file, not a directory, but it is harmless
# here because callers test every path component against this set.
SKIP_DIRS = {".git", "__pycache__", ".venv", "venv", "node_modules", ".tox", ".pytest_cache", ".DS_Store"}
|
||||
|
||||
|
||||
def count_files(repo_path: Path) -> Dict[str, int]:
    """Count files by extension, skipping anything under SKIP_DIRS.

    Returns a dict mapping extension (or "(no ext)") to count, ordered
    by descending count.
    """
    tally: Dict[str, int] = defaultdict(int)
    for entry in repo_path.rglob("*"):
        if any(part in SKIP_DIRS for part in entry.parts):
            continue
        if entry.is_file():
            tally[entry.suffix or "(no ext)"] += 1
    return dict(sorted(tally.items(), key=lambda kv: -kv[1]))
|
||||
|
||||
|
||||
def find_entry_points(repo_path: Path) -> List[str]:
    """Return up to 15 likely entry-point files.

    Well-known top-level names come first, followed by scripts/*.py and
    scripts/*.sh (excluding test_* files), in sorted order.
    """
    known_names = (
        "main.py", "app.py", "server.py", "cli.py", "manage.py",
        "index.html", "index.js", "index.ts",
        "Makefile", "Dockerfile", "docker-compose.yml",
        "README.md", "deploy.sh", "setup.py", "pyproject.toml",
    )
    found = [name for name in known_names if (repo_path / name).exists()]

    scripts_dir = repo_path / "scripts"
    if scripts_dir.is_dir():
        found.extend(
            f"scripts/{entry.name}"
            for entry in sorted(scripts_dir.iterdir())
            if entry.suffix in (".py", ".sh") and not entry.name.startswith("test_")
        )
    return found[:15]
|
||||
|
||||
|
||||
def find_tests(repo_path: Path) -> Tuple[List[str], int]:
    """Return (sorted relative test-file paths, count).

    A test file starts with "test_" or ends with "_test.py" / "_test.js";
    anything under SKIP_DIRS is ignored.
    """
    found = [
        str(entry.relative_to(repo_path))
        for entry in repo_path.rglob("*")
        if not any(part in SKIP_DIRS for part in entry.parts)
        and entry.is_file()
        and (entry.name.startswith("test_")
             or entry.name.endswith(("_test.py", "_test.js")))
    ]
    found.sort()
    return found, len(found)
|
||||
|
||||
|
||||
def find_directories(repo_path: Path, max_depth: int = 2) -> List[str]:
    """Return up to 30 relative directory paths no deeper than max_depth,
    in sorted order, skipping anything under SKIP_DIRS."""
    collected: List[str] = []
    for entry in sorted(repo_path.rglob("*")):
        if not entry.is_dir():
            continue
        relative = entry.relative_to(repo_path)
        if len(relative.parts) > max_depth:
            continue
        if any(part in SKIP_DIRS for part in entry.parts):
            continue
        rel = str(relative)
        if rel != ".":
            collected.append(rel)
    return collected[:30]
|
||||
|
||||
|
||||
def read_readme(repo_path: Path) -> str:
    """Return the first paragraph of the repo's README.

    Tries README.md/.rst/.txt/bare in order; skips leading markdown
    heading lines, then joins up to the first five non-empty lines of the
    first paragraph. Returns "(no README found)" when none exists.
    """
    for candidate in ("README.md", "README.rst", "README.txt", "README"):
        readme = repo_path / candidate
        if not readme.exists():
            continue
        paragraph: List[str] = []
        started = False
        for line in readme.read_text(encoding="utf-8", errors="replace").split("\n"):
            # Skip heading lines only until real prose starts.
            if not started and line.startswith("#"):
                continue
            if line.strip():
                started = True
                paragraph.append(line.strip())
            elif started:
                break
        return " ".join(paragraph[:5])
    return "(no README found)"
|
||||
|
||||
|
||||
def generate_genome(repo_path: Path, repo_name: str = "") -> str:
    """Render a GENOME.md report for the repository at repo_path.

    Sections: overview (README first paragraph + file totals), directory
    architecture, file-type table, entry points, test coverage, plus
    placeholder sections to be filled in during manual analysis.
    """
    repo_name = repo_name or repo_path.name
    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    readme_desc = read_readme(repo_path)
    file_counts = count_files(repo_path)
    total_files = sum(file_counts.values())
    entry_points = find_entry_points(repo_path)
    test_files, test_count = find_tests(repo_path)
    dirs = find_directories(repo_path)

    out: List[str] = [
        f"# GENOME.md — {repo_name}", "",
        f"> Codebase analysis generated {date}. {readme_desc[:100]}.", "",
        "## Project Overview", "",
        readme_desc, "",
        f"**{total_files} files** across {len(file_counts)} file types.", "",
        "## Architecture", "",
        "```",
    ]
    out.extend(f" {d}/" for d in dirs[:20])
    out.append("```")

    out.extend(["", "### File Types", "", "| Type | Count |", "|------|-------|"])
    out.extend(f"| {ext} | {count} |" for ext, count in list(file_counts.items())[:15])

    out.extend(["", "## Entry Points", ""])
    out.extend(f"- `{ep}`" for ep in entry_points)

    out.extend(["", "## Test Coverage", "", f"**{test_count} test files** found.", ""])
    if test_files:
        out.extend(f"- `{tf}`" for tf in test_files[:10])
        if len(test_files) > 10:
            out.append(f"- ... and {len(test_files) - 10} more")
    else:
        out.append("No test files found.")

    out.extend(["", "## Security Considerations", "", "(To be filled during analysis)", ""])
    out.extend(["## Design Decisions", "", "(To be filled during analysis)", ""])
    return "\n".join(out)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, analyze the repo, emit GENOME.md.

    With --dry-run, prints quick stats and exits 0; otherwise writes the
    full document to --output (UTF-8) or stdout. Exits 1 on a bad path.
    """
    parser = argparse.ArgumentParser(description="Generate GENOME.md from a codebase")
    parser.add_argument("repo_path", help="Path to repository")
    parser.add_argument("--output", default="", help="Output file (default: stdout)")
    parser.add_argument("--name", default="", help="Repository name")
    parser.add_argument("--dry-run", action="store_true", help="Print stats only")
    args = parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.is_dir():
        print(f"ERROR: {repo_path} is not a directory", file=sys.stderr)
        sys.exit(1)
    repo_name = args.name or repo_path.name

    if args.dry_run:
        # Quick stats without rendering the full document.
        counts = count_files(repo_path)
        _, test_count = find_tests(repo_path)
        print(f"Repo: {repo_name}")
        print(f"Total files: {sum(counts.values())}")
        print(f"Test files: {test_count}")
        print(f"Top types: {', '.join(f'{k}={v}' for k,v in list(counts.items())[:5])}")
        sys.exit(0)

    genome = generate_genome(repo_path, repo_name)
    if args.output:
        # Fix: write UTF-8 explicitly — the document contains em dashes,
        # which can fail under a non-UTF-8 platform default encoding.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(genome)
        print(f"Written: {args.output}")
    else:
        print(genome)


if __name__ == "__main__":
    main()
|
||||
155
scripts/grounding.py
Executable file
155
scripts/grounding.py
Executable file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
# grounding.py - Grounding before generation.
# SOUL.md: "When I have verified sources, I must consult them
# before I generate from pattern alone. Retrieval is not a feature.
# It is the primary mechanism by which I avoid lying."
# Part of #792

import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass, field

# Hermes home directory; overridable via the HERMES_HOME env var.
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
# Location of the markdown memory files searched during grounding.
MEMORY_DIR = HERMES_HOME / "memory"
|
||||
|
||||
|
||||
@dataclass
class GroundingResult:
    """Outcome of one grounding pass for a query.

    Carries the sources that were found, whether the answer counts as
    grounded, and a confidence score used for downstream hedging.
    """

    query: str
    sources_found: List[Dict[str, Any]] = field(default_factory=list)
    grounded: bool = False
    confidence: float = 0.0
    source_text: str = ""  # snippet from the best-scoring source
    source_type: str = ""  # memory, file, chain, tool_result

    @property
    def needs_hedging(self):
        """True when no verified source backs the answer."""
        return not self.grounded
|
||||
|
||||
|
||||
class GroundingLayer:
|
||||
def __init__(self, memory_dir=None):
|
||||
self.memory_dir = Path(memory_dir) if memory_dir else MEMORY_DIR
|
||||
|
||||
def ground(self, query, context=None):
|
||||
"""Query local sources before generation."""
|
||||
sources = []
|
||||
|
||||
# 1. Search memory files
|
||||
memory_hits = self._search_memory(query)
|
||||
sources.extend(memory_hits)
|
||||
|
||||
# 2. Search context files if provided
|
||||
if context:
|
||||
context_hits = self._search_context(query, context)
|
||||
sources.extend(context_hits)
|
||||
|
||||
# 3. Build result
|
||||
grounded = len(sources) > 0
|
||||
confidence = min(0.95, 0.3 + len(sources) * 0.2) if grounded else 0.0
|
||||
|
||||
source_text = ""
|
||||
source_type = ""
|
||||
if sources:
|
||||
best = max(sources, key=lambda s: s.get("score", 0))
|
||||
source_text = best.get("text", "")[:200]
|
||||
source_type = best.get("type", "unknown")
|
||||
|
||||
return GroundingResult(
|
||||
query=query, sources_found=sources, grounded=grounded,
|
||||
confidence=confidence, source_text=source_text, source_type=source_type,
|
||||
)
|
||||
|
||||
def _search_memory(self, query):
|
||||
"""Search memory files for relevant content."""
|
||||
results = []
|
||||
if not self.memory_dir.exists():
|
||||
return results
|
||||
|
||||
query_lower = query.lower()
|
||||
query_words = set(query_lower.split())
|
||||
|
||||
for mem_file in self.memory_dir.rglob("*.md"):
|
||||
try:
|
||||
content = mem_file.read_text(encoding="utf-8", errors="replace")
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
content_lower = content.lower()
|
||||
# Simple relevance: count query word matches
|
||||
matches = sum(1 for w in query_words if w in content_lower)
|
||||
if matches > 0:
|
||||
score = matches / max(len(query_words), 1)
|
||||
# Extract relevant snippet
|
||||
lines = content.split("\n")
|
||||
snippet = ""
|
||||
for line in lines:
|
||||
if any(w in line.lower() for w in query_words):
|
||||
snippet = line.strip()[:200]
|
||||
break
|
||||
|
||||
results.append({
|
||||
"text": snippet or content[:200],
|
||||
"source": str(mem_file.relative_to(self.memory_dir)),
|
||||
"type": "memory",
|
||||
"score": round(score, 3),
|
||||
})
|
||||
|
||||
return sorted(results, key=lambda r: -r["score"])[:5]
|
||||
|
||||
def _search_context(self, query, context):
    """Search caller-provided context entries for content relevant to *query*.

    Each entry may be a dict (``content``/``text`` plus optional ``source``)
    or any object convertible with ``str()``.  Returns at most five scored
    hits, best first, in the same dict shape as ``_search_memory``.
    """
    import re  # local import keeps this fix self-contained

    results = []
    if not context:
        return results

    # Tokenize on word characters so trailing punctuation in the query
    # ("work?") cannot prevent a match against "work" in the context.
    query_words = set(re.findall(r"\w+", query.lower()))
    if not query_words:
        return results

    for ctx in context:
        if isinstance(ctx, dict):
            text = ctx.get("content", "") or ctx.get("text", "")
            source = ctx.get("source", "context")
        else:
            text = str(ctx)
            source = "context"

        text_lower = text.lower()
        matches = sum(1 for w in query_words if w in text_lower)
        if matches > 0:
            results.append({
                "text": text[:200],
                "source": source,
                "type": "context",
                "score": round(matches / len(query_words), 3),
            })

    return sorted(results, key=lambda r: -r["score"])[:5]
|
||||
|
||||
def format_sources(self, result):
    """Render a grounding result as a short, human-readable source summary.

    Ungrounded results get a fixed pattern-matching disclaimer; grounded
    results list up to three sources as " - [ref] snippet" lines.
    """
    if not result.grounded:
        return "No verified sources found. Proceeding from pattern matching."

    shown = [
        " - [" + s.get("source", "unknown") + "] " + s.get("text", "")[:100]
        for s in result.sources_found[:3]
    ]
    return "\n".join(["Based on verified sources:"] + shown)
|
||||
|
||||
|
||||
# Convenience
|
||||
# Shared module-level instance so all callers reuse one GroundingLayer.
_default_layer = None


def get_grounding_layer():
    """Return the shared GroundingLayer, creating it lazily on first use."""
    global _default_layer
    if _default_layer is not None:
        return _default_layer
    _default_layer = GroundingLayer()
    return _default_layer


def ground(query, **kwargs):
    """Module-level convenience wrapper around GroundingLayer.ground()."""
    return get_grounding_layer().ground(query, **kwargs)
|
||||
101
scripts/source_distinction.py
Executable file
101
scripts/source_distinction.py
Executable file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
# source_distinction.py - I think vs I know annotation system.
|
||||
# SOUL.md: "Every claim I make comes from one of two places: a verified source
|
||||
# I can point to, or my own pattern-matching."
|
||||
# Part of #793
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class SourceType(Enum):
    """Provenance category for a claim (the SOUL.md source distinction)."""

    VERIFIED = "verified"   # backed by a source the system can point to
    INFERRED = "inferred"   # produced by the model's own pattern-matching
    STATED = "stated"       # asserted by the user in conversation
    UNKNOWN = "unknown"     # provenance could not be determined
|
||||
|
||||
|
||||
@dataclass
class Claim:
    """A single factual assertion together with its provenance."""

    text: str                  # the claim itself, without any hedging prefix
    source_type: SourceType    # provenance category (verified/inferred/stated/unknown)
    source_ref: str = ""       # pointer to the source, e.g. "web_search:Paris"
    confidence: float = 0.0    # confidence in the claim, 0.0-1.0
    hedging: str = ""          # hedge phrase used when rendering inferred claims
|
||||
|
||||
|
||||
@dataclass
class AnnotatedResponse:
    """A response whose individual claims carry provenance annotations."""

    raw_text: str                                  # unannotated fallback text
    claims: List[Claim] = field(default_factory=list)

    def render(self):
        """Render the claims with per-claim provenance markers.

        Falls back to the raw text when no claims were attached.
        """
        if not self.claims:
            return self.raw_text

        rendered = []
        for c in self.claims:
            kind = c.source_type
            if kind == SourceType.VERIFIED:
                tag = "[verified: " + c.source_ref + "]" if c.source_ref else "[verified]"
                rendered.append(c.text + " " + tag)
            elif kind == SourceType.INFERRED:
                rendered.append((c.hedging or "I think") + " " + c.text)
            elif kind == SourceType.STATED:
                rendered.append(c.text + " [you told me]")
            else:
                rendered.append("I am not certain, but " + c.text)
        return " ".join(rendered)

    @property
    def verified_count(self):
        """Number of claims backed by a verified source."""
        total = 0
        for c in self.claims:
            if c.source_type == SourceType.VERIFIED:
                total += 1
        return total

    @property
    def inferred_count(self):
        """Number of claims produced by pattern-matching alone."""
        total = 0
        for c in self.claims:
            if c.source_type == SourceType.INFERRED:
                total += 1
        return total
|
||||
|
||||
|
||||
def verified(text, source, confidence=0.95):
    """Build a VERIFIED claim pointing at *source* (e.g. "web_search:Paris")."""
    return Claim(text=text, source_type=SourceType.VERIFIED, source_ref=source, confidence=confidence)
|
||||
|
||||
def inferred(text, hedging="I think", confidence=0.6):
    """Build an INFERRED claim rendered with the given hedge phrase."""
    return Claim(text=text, source_type=SourceType.INFERRED, confidence=confidence, hedging=hedging)
|
||||
|
||||
def stated(text):
    """Build a STATED claim: something the user told us (confidence 1.0)."""
    return Claim(text=text, source_type=SourceType.STATED, confidence=1.0)
|
||||
|
||||
|
||||
def annotate_response(raw_text, claims):
    """Bundle *raw_text* with its provenance-annotated *claims*."""
    return AnnotatedResponse(raw_text=raw_text, claims=claims)
|
||||
|
||||
|
||||
def format_for_display(response):
    """Format an AnnotatedResponse as an indented, marker-prefixed listing.

    Markers: '=' verified, '~' inferred, '>' stated by the user, '?' unknown.
    When any claims are present, a blank line and a summary count line are
    appended.
    """
    lines = []
    for c in response.claims:
        kind = c.source_type
        if kind == SourceType.VERIFIED:
            suffix = " (" + c.source_ref + ")" if c.source_ref else ""
            lines.append(" = " + c.text + suffix)
        elif kind == SourceType.INFERRED:
            lines.append(" ~ " + c.hedging + " " + c.text)
        elif kind == SourceType.STATED:
            lines.append(" > " + c.text)
        else:
            lines.append(" ? " + c.text)

    if response.claims:
        summary = (" [" + str(response.verified_count) + " verified, "
                   + str(response.inferred_count) + " inferred, "
                   + str(len(response.claims)) + " total]")
        lines.extend(["", summary])
    return "\n".join(lines)
|
||||
|
||||
|
||||
def source_distinction_check(text):
    """Heuristically detect hedging language in *text*.

    Returns a dict with:
      has_hedging     -- True when at least one hedge phrase appears
      hedging_count   -- number of distinct hedge phrases found
      likely_inferred -- True when more than two hedge phrases appear
    """
    hedges = ("i think", "i believe", "probably", "likely", "might",
              "it seems", "perhaps", "i am not sure", "i guess",
              "my understanding is", "i suspect")
    lowered = text.lower()
    found = [h for h in hedges if h in lowered]
    return {"has_hedging": bool(found), "hedging_count": len(found),
            "likely_inferred": len(found) > 2}
|
||||
46
templates/GENOME-template.md
Normal file
46
templates/GENOME-template.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# GENOME.md — {{REPO_NAME}}
|
||||
|
||||
> Codebase analysis generated {{DATE}}. {{SHORT_DESCRIPTION}}.
|
||||
|
||||
## Project Overview
|
||||
|
||||
{{OVERVIEW}}
|
||||
|
||||
## Architecture
|
||||
|
||||
{{ARCHITECTURE_DIAGRAM}}
|
||||
|
||||
## Entry Points
|
||||
|
||||
{{ENTRY_POINTS}}
|
||||
|
||||
## Data Flow
|
||||
|
||||
{{DATA_FLOW}}
|
||||
|
||||
## Key Abstractions
|
||||
|
||||
{{ABSTRACTIONS}}
|
||||
|
||||
## API Surface
|
||||
|
||||
{{API_SURFACE}}
|
||||
|
||||
## Test Coverage
|
||||
|
||||
### Existing Tests
|
||||
{{EXISTING_TESTS}}
|
||||
|
||||
### Coverage Gaps
|
||||
{{COVERAGE_GAPS}}
|
||||
|
||||
### Critical Paths That Need Tests
|
||||
{{CRITICAL_PATHS}}
|
||||
|
||||
## Security Considerations
|
||||
|
||||
{{SECURITY}}
|
||||
|
||||
## Design Decisions
|
||||
|
||||
{{DESIGN_DECISIONS}}
|
||||
88
tests/test_audit_trail.py
Normal file
88
tests/test_audit_trail.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""Tests for audit trail — SOUL.md compliance."""
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class TestAuditTrail:
    """Behavioral tests for scripts.audit_trail logging, querying and stats."""

    def test_log_and_query(self, tmp_path):
        """A logged response is retrievable by a keyword query."""
        from scripts.audit_trail import AuditTrail
        trail = AuditTrail(audit_dir=tmp_path)

        trail.log_response(
            input_text="What is Python?",
            sources=["web_search:Python is a programming language"],
            confidence=0.9,
            output_text="Python is a programming language.",
            model="test-model",
        )

        results = trail.query("Python")
        assert len(results) == 1
        assert results[0].confidence == 0.9
        assert "Python" in results[0].output_text

    def test_query_no_match(self, tmp_path):
        """Queries that match nothing return an empty result list."""
        from scripts.audit_trail import AuditTrail
        trail = AuditTrail(audit_dir=tmp_path)

        trail.log_response(
            input_text="What is Rust?",
            sources=[],
            confidence=0.8,
            output_text="Rust is a systems language.",
        )

        results = trail.query("Python")
        assert len(results) == 0

    def test_confidence_filter(self, tmp_path):
        """min_confidence excludes low-confidence entries from query results."""
        from scripts.audit_trail import AuditTrail
        trail = AuditTrail(audit_dir=tmp_path)

        trail.log_response(input_text="test", sources=[], confidence=0.3, output_text="low conf")
        trail.log_response(input_text="test", sources=[], confidence=0.95, output_text="high conf")

        high_only = trail.query("test", min_confidence=0.5)
        assert len(high_only) == 1
        assert high_only[0].confidence == 0.95

    def test_stats(self, tmp_path):
        """Aggregate stats report the entry count and mean confidence."""
        from scripts.audit_trail import AuditTrail
        trail = AuditTrail(audit_dir=tmp_path)

        trail.log_response(input_text="a", sources=[], confidence=0.8, output_text="b")
        trail.log_response(input_text="c", sources=[], confidence=0.6, output_text="d")

        stats = trail.get_stats()
        assert stats["total"] == 2
        # (0.8 + 0.6) / 2 is 0.7000000000000001 in binary floating point, so an
        # exact == 0.7 comparison is brittle unless the implementation rounds
        # (can't tell from here) — compare with a tolerance instead.
        assert abs(stats["avg_confidence"] - 0.7) < 1e-9

    def test_session_filter(self, tmp_path):
        """get_by_session returns only entries logged under that session id."""
        from scripts.audit_trail import AuditTrail
        trail = AuditTrail(audit_dir=tmp_path)

        trail.log_response(input_text="a", sources=[], confidence=0.9, output_text="b", session_id="s1")
        trail.log_response(input_text="c", sources=[], confidence=0.9, output_text="d", session_id="s2")

        s1_results = trail.get_by_session("s1")
        assert len(s1_results) == 1

    def test_empty_trail(self, tmp_path):
        """A fresh trail has no entries and zeroed stats."""
        from scripts.audit_trail import AuditTrail
        trail = AuditTrail(audit_dir=tmp_path)

        assert trail.query("anything") == []
        assert trail.get_stats()["total"] == 0

    def test_content_addressed_id(self):
        """Entry IDs are a deterministic function of content."""
        from scripts.audit_trail import AuditEntry
        id1 = AuditEntry.generate_id("input", "output", "2026-01-01")
        id2 = AuditEntry.generate_id("input", "output", "2026-01-01")
        id3 = AuditEntry.generate_id("different", "output", "2026-01-01")

        assert id1 == id2  # same content = same ID
        assert id1 != id3  # different content = different ID
|
||||
67
tests/test_grounding.py
Normal file
67
tests/test_grounding.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Tests for grounding-before-generation - SOUL.md compliance."""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
|
||||
|
||||
class TestGrounding:
    """Tests for grounding-before-generation — SOUL.md compliance."""

    def test_ground_with_memory(self, tmp_path):
        """A query matching a memory file yields a grounded result."""
        from scripts.grounding import GroundingLayer
        memory = tmp_path / "memory"
        memory.mkdir()
        (memory / "test.md").write_text("Python is a programming language created by Guido.")

        result = GroundingLayer(memory_dir=memory).ground("What is Python?")

        assert result.grounded
        assert result.confidence > 0
        assert len(result.sources_found) > 0

    def test_ground_no_sources(self, tmp_path):
        """With no matching memory, the result is ungrounded and hedged."""
        from scripts.grounding import GroundingLayer
        memory = tmp_path / "memory"
        memory.mkdir()

        result = GroundingLayer(memory_dir=memory).ground("What is quantum physics?")

        assert not result.grounded
        assert result.needs_hedging
        assert result.confidence == 0.0

    def test_ground_with_context(self):
        """Caller-supplied context can ground a query on its own."""
        from scripts.grounding import GroundingLayer
        layer = GroundingLayer(memory_dir=Path("/nonexistent"))

        ctx = [{"content": "The fleet uses tmux for agent management", "source": "fleet-ops"}]
        result = layer.ground("How does the fleet work?", context=ctx)

        assert result.grounded
        assert result.source_type == "context"

    def test_format_sources_grounded(self):
        """Grounded results are rendered with their source references."""
        from scripts.grounding import GroundingLayer, GroundingResult
        hit = {"text": "test info", "source": "test.md", "type": "memory", "score": 0.8}
        rendered = GroundingLayer().format_sources(
            GroundingResult(query="test", grounded=True, sources_found=[hit])
        )
        assert "verified sources" in rendered
        assert "test.md" in rendered

    def test_format_sources_ungrounded(self):
        """Ungrounded results fall back to the pattern-matching disclaimer."""
        from scripts.grounding import GroundingLayer, GroundingResult
        rendered = GroundingLayer().format_sources(GroundingResult(query="test", grounded=False))
        assert "pattern matching" in rendered

    def test_empty_memory_dir(self, tmp_path):
        """An existing but empty memory directory grounds nothing."""
        from scripts.grounding import GroundingLayer
        memory = tmp_path / "empty"
        memory.mkdir()
        assert not GroundingLayer(memory_dir=memory).ground("anything").grounded
|
||||
61
tests/test_source_distinction.py
Normal file
61
tests/test_source_distinction.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Tests for source distinction - SOUL.md compliance."""
|
||||
import pytest
|
||||
|
||||
|
||||
class TestSourceDistinction:
    """Tests for source distinction — SOUL.md compliance."""

    def test_verified_claim(self):
        """verified() builds a VERIFIED claim with ref and default confidence."""
        from scripts.source_distinction import verified, SourceType
        c = verified("Paris is the capital", "web_search:Paris")
        assert c.source_type == SourceType.VERIFIED
        assert c.source_ref == "web_search:Paris"
        assert c.confidence == 0.95

    def test_inferred_claim(self):
        """inferred() defaults to the 'I think' hedge phrase."""
        from scripts.source_distinction import inferred, SourceType
        c = inferred("this approach is better")
        assert c.source_type == SourceType.INFERRED
        assert c.hedging == "I think"

    def test_stated_claim(self):
        """stated() marks user-provided facts with full confidence."""
        from scripts.source_distinction import stated, SourceType
        c = stated("my name is Alexander")
        assert c.source_type == SourceType.STATED
        assert c.confidence == 1.0

    def test_render_verified(self):
        """Rendered verified claims carry a [verified: ref] tag."""
        from scripts.source_distinction import annotate_response, verified
        text = annotate_response("test", [verified("Paris is capital", "web")]).render()
        assert "[verified: web]" in text

    def test_render_inferred(self):
        """Rendered inferred claims are prefixed with the hedge phrase."""
        from scripts.source_distinction import annotate_response, inferred
        text = annotate_response("test", [inferred("this is better")]).render()
        assert "I think" in text

    def test_counts(self):
        """verified_count / inferred_count tally claims by provenance."""
        from scripts.source_distinction import annotate_response, verified, inferred
        resp = annotate_response(
            "test",
            [verified("a", "src"), verified("b", "src"), inferred("c")],
        )
        assert resp.verified_count == 2
        assert resp.inferred_count == 1

    def test_hedging_detection(self):
        """Multiple hedge phrases in free text are detected and counted."""
        from scripts.source_distinction import source_distinction_check
        verdict = source_distinction_check("I think this is probably right, but I believe it could be different")
        assert verdict["has_hedging"]
        assert verdict["hedging_count"] >= 3

    def test_no_hedging(self):
        """Plain factual statements contain no hedging."""
        from scripts.source_distinction import source_distinction_check
        verdict = source_distinction_check("The capital of France is Paris.")
        assert not verdict["has_hedging"]

    def test_format_for_display(self):
        """Display formatting uses '=' for verified and '~' for inferred."""
        from scripts.source_distinction import format_for_display, annotate_response, verified, inferred
        output = format_for_display(annotate_response("test", [verified("a", "src"), inferred("b")]))
        assert "=" in output  # verified icon
        assert "~" in output  # inferred icon
|
||||
Reference in New Issue
Block a user