Complete four-pass evolution to production-ready architecture: **Pass 1 → Foundation:** - Tool registry, basic harness, 19 tools - VPS provisioning, Syncthing mesh - Health daemon, systemd services **Pass 2 → Three-House Canon:** - Timmy (Sovereign), Ezra (Archivist), Bezalel (Artificer) - Provenance tracking, artifact-flow discipline - House-aware policy enforcement **Pass 3 → Self-Improvement:** - Pattern database with SQLite backend - Adaptive policies (auto-adjust thresholds) - Predictive execution (success prediction) - Hermes bridge for shortest-loop telemetry - Learning velocity tracking **Pass 4 → Production Integration:** - Unified API: `from uni_wizard import Harness, House, Mode` - Three modes: SIMPLE / INTELLIGENT / SOVEREIGN - Circuit breaker pattern for fault tolerance - Async/concurrent execution support - Production hardening (timeouts, retries) **Allegro Lane Definition:** - Narrowed to: Gitea integration, Hermes bridge, redundancy/failover - Provides: Cloud connectivity, telemetry streaming, issue routing - Does NOT: Make sovereign decisions, authenticate as Timmy **Files:** - v3/: Intelligence engine, adaptive harness, Hermes bridge - v4/: Unified API, production harness, final architecture Total: ~25KB architecture documentation + production code
494 lines · 16 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Test Suite for Uni-Wizard v3 — Self-Improving Intelligence
|
|
|
|
Tests:
|
|
- Pattern database operations
|
|
- Intelligence engine learning
|
|
- Adaptive policy changes
|
|
- Prediction accuracy
|
|
- Hermes bridge integration
|
|
- End-to-end self-improvement
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import tempfile
|
|
import shutil
|
|
import time
|
|
import threading
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
|
|
# Add parent to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from intelligence_engine import (
|
|
PatternDatabase, IntelligenceEngine,
|
|
ExecutionPattern, AdaptationEvent
|
|
)
|
|
from harness import (
|
|
UniWizardHarness, AdaptivePolicy,
|
|
House, Provenance, ExecutionResult
|
|
)
|
|
from hermes_bridge import (
|
|
HermesStateReader, HermesSessionEvent,
|
|
TelemetryStreamProcessor, ShortestLoopIntegrator
|
|
)
|
|
|
|
|
|
class TestPatternDatabase:
    """Exercise pattern persistence: recording, aggregation, and lookups."""

    def setup_method(self):
        # Fresh throwaway directory so every test gets an isolated database.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_record_execution(self):
        """A single recorded execution should yield a retrievable pattern."""
        record = {
            "tool": "git_status",
            "house": "ezra",
            "model": "hermes3:8b",
            "success": True,
            "latency_ms": 150,
            "confidence": 0.9,
        }
        self.db.record_execution(record)

        # The write must be visible as an aggregated pattern immediately.
        pattern = self.db.get_pattern("git_status", "ezra")
        assert pattern is not None
        assert pattern.success_rate == 1.0
        assert pattern.sample_count == 1

    def test_pattern_aggregation(self):
        """Ten executions (8 ok, 2 failed) aggregate into one pattern."""
        for n in range(10):
            self.db.record_execution({
                "tool": "deploy",
                "house": "bezalel",
                "success": n < 8,
                "latency_ms": 200 + n * 10,
                "confidence": 0.8,
            })

        pattern = self.db.get_pattern("deploy", "bezalel")
        assert pattern.success_rate == 0.8
        assert pattern.sample_count == 10
        # Latencies run 200..290 in steps of 10, so the mean is 245.
        assert pattern.avg_latency_ms == 245

    def test_best_model_selection(self):
        """The model with the higher success rate should win selection."""
        # model_a: 8/10 success at 100 ms; model_b: 9/10 success at 120 ms.
        for model, wins, latency in (("model_a", 8, 100), ("model_b", 9, 120)):
            for n in range(10):
                self.db.record_execution({
                    "tool": "read",
                    "house": "ezra",
                    "model": model,
                    "task_type": "read",
                    "success": n < wins,
                    "latency_ms": latency,
                })

        assert self.db.get_best_model("read", min_samples=5) == "model_b"

    def test_house_performance(self):
        """House-level metrics should reflect recorded executions."""
        for n in range(5):  # 4/5 succeed -> 80%
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": n < 4,
                "latency_ms": 100,
            })

        perf = self.db.get_house_performance("ezra", days=7)
        assert perf["house"] == "ezra"
        assert perf["success_rate"] == 0.8
        assert perf["total_executions"] == 5

    def test_adaptation_tracking(self):
        """Recorded adaptations should be retrievable with fields intact."""
        event = AdaptationEvent(
            timestamp="2026-03-30T20:00:00Z",
            trigger="low_success_rate",
            change_type="policy.threshold",
            old_value=0.8,
            new_value=0.7,
            reason="Performance below threshold",
            expected_improvement=0.1,
        )
        self.db.record_adaptation(event)

        stored = self.db.get_adaptations(limit=10)
        assert len(stored) == 1
        assert stored[0].change_type == "policy.threshold"
|
|
|
|
|
|
class TestIntelligenceEngine:
    """Test intelligence and learning.

    Covers success prediction (with and without history), optimal-house
    selection, and the shape of the learning-velocity report.
    """

    def setup_method(self):
        # Isolated pattern store + engine per test.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_predict_success_with_data(self):
        """Test prediction with historical data"""
        # Record a perfect track record: 10/10 successes.
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "ezra",
                "success": True,
                "latency_ms": 100,
                "confidence": 0.9
            })

        prob, reason = self.engine.predict_success("git_status", "ezra")
        assert prob == 1.0
        assert "excellent track record" in reason

    def test_predict_success_without_data(self):
        """Test prediction without historical data"""
        # Nothing recorded -> engine should fall back to a 0.5 prior.
        prob, reason = self.engine.predict_success("unknown_tool", "timmy")
        assert prob == 0.5
        assert "Insufficient data" in reason

    def test_optimal_house_selection(self):
        """Test finding optimal house for task"""
        # Ezra: 90% success on git_status
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "ezra",
                "success": i < 9,
                "latency_ms": 100
            })

        # Bezalel: 50% success on git_status
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "bezalel",
                "success": i < 5,
                "latency_ms": 100
            })

        house, confidence = self.engine.get_optimal_house("git_status")
        assert house == "ezra"
        assert confidence == 0.9

    def test_learning_velocity(self):
        """Test learning velocity calculation (report shape only).

        BUGFIX: removed two dead locals from the original — `now =
        time.time()` was never used, and `conn = self.db.db_path` was both
        unused and misleadingly named (it is a Path, not a connection).
        A stronger test would backdate half the executions to measure
        improvement over time; timestamp manipulation is not wired up
        here, so we only assert the keys of the returned report.
        """
        for i in range(10):
            self.db.record_execution({
                "tool": "test",
                "house": "timmy",
                "success": i < 5,  # 50% success
                "latency_ms": 100
            })

        velocity = self.engine._calculate_learning_velocity()
        assert "velocity" in velocity
        assert "improvement" in velocity
|
|
|
|
|
|
class TestAdaptivePolicy:
    """Verify that house policies adapt to observed performance."""

    def setup_method(self):
        # Each test gets its own throwaway database and engine.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def _record_runs(self, successes, total=10):
        # Record `total` executions for ezra, the first `successes` succeeding.
        for n in range(total):
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": n < successes,
                "latency_ms": 100
            })

    def test_policy_loads_defaults(self):
        """A fresh policy exposes its default values."""
        policy = AdaptivePolicy(House.EZRA, self.engine)

        assert policy.get("evidence_threshold") == 0.8
        assert policy.get("must_read_before_write") is True

    def test_policy_adapts_on_low_performance(self):
        """Poor results should loosen the evidence threshold."""
        policy = AdaptivePolicy(House.EZRA, self.engine)
        self._record_runs(successes=4)  # 40% success rate

        adapt = policy.adapt("low_performance", "Testing adaptation")

        assert policy.get("evidence_threshold") < 0.8
        assert adapt is not None

    def test_policy_adapts_on_high_performance(self):
        """Excellent results should tighten the threshold back up."""
        policy = AdaptivePolicy(House.EZRA, self.engine)
        policy.policy["evidence_threshold"] = 0.7  # start from a lowered value
        self._record_runs(successes=10)  # 100% success rate

        adapt = policy.adapt("high_performance", "Testing adaptation")

        assert policy.get("evidence_threshold") > 0.7
|
|
|
|
|
|
class TestHarness:
    """Verify the v3 harness: provenance, learning hooks, batch adaptation."""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def _timmy_execution_count(self):
        # Current number of recorded executions for the timmy house.
        return self.engine.db.get_house_performance("timmy")["total_executions"]

    def test_harness_creates_provenance(self):
        """Every execution carries house/tool/prediction provenance."""
        result = UniWizardHarness("ezra", intelligence=self.engine).execute("system_info")

        prov = result.provenance
        assert prov.house == "ezra"
        assert prov.tool == "system_info"
        assert prov.prediction >= 0

    def test_harness_records_for_learning(self):
        """With learning enabled, each execution is recorded."""
        harness = UniWizardHarness("timmy", intelligence=self.engine, enable_learning=True)

        before = self._timmy_execution_count()
        harness.execute("test_tool")
        assert self._timmy_execution_count() == before + 1

    def test_harness_does_not_record_when_learning_disabled(self):
        """With learning disabled, executions leave no trace."""
        harness = UniWizardHarness("timmy", intelligence=self.engine, enable_learning=False)

        before = self._timmy_execution_count()
        harness.execute("test_tool")
        assert self._timmy_execution_count() == before

    def test_learn_from_batch_triggers_adaptation(self):
        """Enough accumulated history should trigger batch adaptation."""
        harness = UniWizardHarness("ezra", intelligence=self.engine)

        # Build up enough history to cross the batch-learning threshold.
        for _ in range(15):
            harness.execute("test_tool")

        outcome = harness.learn_from_batch(min_executions=10)
        assert outcome["status"] == "adapted"
|
|
|
|
|
|
class TestHermesBridge:
    """Verify Hermes telemetry events convert into intelligence records."""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_event_conversion(self):
        """A Hermes session event maps onto an internal execution record."""
        processor = TelemetryStreamProcessor(self.engine)

        sample = HermesSessionEvent(
            session_id="test_session",
            timestamp=time.time(),
            event_type="tool_call",
            tool_name="terminal",
            success=True,
            latency_ms=150,
            model="hermes3:8b",
            provider="local",
            token_count=100,
            error=None,
        )

        record = processor._convert_event(sample)

        # "terminal" is expected to map onto the internal system_shell tool.
        assert record["tool"] == "system_shell"
        assert record["house"] == "timmy"
        assert record["success"] is True

    def test_task_type_inference(self):
        """Task types are inferred from tool names."""
        infer = TelemetryStreamProcessor(self.engine)._infer_task_type

        expectations = {
            "git_status": "read",
            "file_write": "build",
            "run_tests": "test",
        }
        for tool, expected in expectations.items():
            assert infer(tool) == expected
|
|
|
|
|
|
class TestEndToEnd:
    """End-to-end integration of harness, database, and engine."""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_full_learning_cycle(self):
        """Full cycle: execute -> pattern -> predict -> adapt -> report."""
        harness = UniWizardHarness("ezra", intelligence=self.engine)

        # Build up execution history.
        for _ in range(20):
            harness.execute("git_status", repo_path="/tmp")

        # History is aggregated into a single pattern.
        pattern = self.engine.db.get_pattern("git_status", "ezra")
        assert pattern.sample_count == 20

        # The harness can predict the next execution from that pattern.
        prob, reason = harness.predict_execution("git_status", {})
        assert prob > 0
        assert len(reason) > 0

        # Batch learning adapts policy from the accumulated history.
        outcome = harness.learn_from_batch()
        assert outcome["status"] == "adapted"

        # The engine surfaces an overall intelligence report.
        report = self.engine.get_intelligence_report()
        assert "house_performance" in report
        assert "learning_velocity" in report
|
|
|
|
|
|
def run_tests():
    """Run every test class and report a pass/fail summary.

    A lightweight stand-in for pytest: each test_* method runs on a fresh
    instance with setup_method/teardown_method honoured.

    BUGFIX: the original only called teardown_method when a test passed,
    so every failing test leaked the temp directory created in
    setup_method; teardown now runs in a finally block. The bare
    `except:` on the smoke-check teardown is narrowed to `except
    Exception:` so KeyboardInterrupt/SystemExit still propagate.

    Returns:
        bool: True when every test passed.
    """
    import inspect

    test_classes = [
        TestPatternDatabase,
        TestIntelligenceEngine,
        TestAdaptivePolicy,
        TestHarness,
        TestHermesBridge,
        TestEndToEnd
    ]

    passed = 0
    failed = 0

    print("=" * 60)
    print("UNI-WIZARD v3 TEST SUITE")
    print("=" * 60)

    for cls in test_classes:
        print(f"\n📦 {cls.__name__}")
        print("-" * 40)

        # Smoke-check that the class's setup works at all before running
        # its individual tests; skip the whole class if it cannot set up.
        instance = cls()
        if hasattr(instance, 'setup_method'):
            try:
                instance.setup_method()
            except Exception as e:
                print(f"  ⚠️ Setup failed: {e}")
                continue

        for name, method in inspect.getmembers(cls, predicate=inspect.isfunction):
            if not name.startswith('test_'):
                continue
            # Fresh instance per test so state never bleeds between tests.
            test_instance = cls()
            try:
                if hasattr(test_instance, 'setup_method'):
                    test_instance.setup_method()
                method(test_instance)
                print(f"  ✅ {name}")
                passed += 1
            except Exception as e:
                print(f"  ❌ {name}: {e}")
                failed += 1
            finally:
                # Always tear down, even when the test (or its setup)
                # raised — this is what plugs the temp-dir leak.
                if hasattr(test_instance, 'teardown_method'):
                    try:
                        test_instance.teardown_method()
                    except Exception:
                        pass

        # Tear down the smoke-check instance (best-effort).
        if hasattr(instance, 'teardown_method'):
            try:
                instance.teardown_method()
            except Exception:
                pass

    print("\n" + "=" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    print("=" * 60)

    return failed == 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Exit non-zero when any test failed, so CI can gate on the result.
    all_passed = run_tests()
    sys.exit(0 if all_passed else 1)
|