#!/usr/bin/env python3
"""
Test Suite for Uni-Wizard v3 — Self-Improving Intelligence

Tests:
- Pattern database operations
- Intelligence engine learning
- Adaptive policy changes
- Prediction accuracy
- Hermes bridge integration
- End-to-end self-improvement

The classes follow the pytest xunit convention (``setup_method`` /
``teardown_method``) and can also be run without pytest through
``run_tests()`` by executing this file directly.
"""

import sys
import json
import math
import tempfile
import shutil
import time
import threading
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from intelligence_engine import (
    PatternDatabase, IntelligenceEngine,
    ExecutionPattern, AdaptationEvent
)
from harness import (
    UniWizardHarness, AdaptivePolicy,
    House, Provenance, ExecutionResult
)
from hermes_bridge import (
    HermesStateReader, HermesSessionEvent,
    TelemetryStreamProcessor, ShortestLoopIntegrator
)


class TestPatternDatabase:
    """Test pattern storage and retrieval"""

    def setup_method(self):
        # Each test gets its own database file in a throwaway directory.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_record_execution(self):
        """Test recording execution outcomes"""
        self.db.record_execution({
            "tool": "git_status",
            "house": "ezra",
            "model": "hermes3:8b",
            "success": True,
            "latency_ms": 150,
            "confidence": 0.9
        })

        # Verify pattern created
        pattern = self.db.get_pattern("git_status", "ezra")
        assert pattern is not None
        assert pattern.success_rate == 1.0
        assert pattern.sample_count == 1

    def test_pattern_aggregation(self):
        """Test pattern aggregation across multiple executions"""
        # Record 10 executions, 8 successful
        for i in range(10):
            self.db.record_execution({
                "tool": "deploy",
                "house": "bezalel",
                "success": i < 8,
                "latency_ms": 200 + i * 10,
                "confidence": 0.8
            })

        pattern = self.db.get_pattern("deploy", "bezalel")
        # Aggregates are computed values, so compare with a float tolerance.
        assert math.isclose(pattern.success_rate, 0.8)
        assert pattern.sample_count == 10
        assert math.isclose(pattern.avg_latency_ms, 245)  # Average of 200-290

    def test_best_model_selection(self):
        """Test finding best model for task"""
        # Model A: 10 calls, 8 success = 80%
        for i in range(10):
            self.db.record_execution({
                "tool": "read",
                "house": "ezra",
                "model": "model_a",
                "task_type": "read",
                "success": i < 8,
                "latency_ms": 100
            })

        # Model B: 10 calls, 9 success = 90%
        for i in range(10):
            self.db.record_execution({
                "tool": "read",
                "house": "ezra",
                "model": "model_b",
                "task_type": "read",
                "success": i < 9,
                "latency_ms": 120
            })

        best = self.db.get_best_model("read", min_samples=5)
        assert best == "model_b"

    def test_house_performance(self):
        """Test house performance metrics"""
        # Record executions for ezra
        for i in range(5):
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": i < 4,  # 80% success
                "latency_ms": 100
            })

        perf = self.db.get_house_performance("ezra", days=7)
        assert perf["house"] == "ezra"
        assert math.isclose(perf["success_rate"], 0.8)
        assert perf["total_executions"] == 5

    def test_adaptation_tracking(self):
        """Test recording adaptations"""
        adapt = AdaptationEvent(
            timestamp="2026-03-30T20:00:00Z",
            trigger="low_success_rate",
            change_type="policy.threshold",
            old_value=0.8,
            new_value=0.7,
            reason="Performance below threshold",
            expected_improvement=0.1
        )

        self.db.record_adaptation(adapt)

        adaptations = self.db.get_adaptations(limit=10)
        assert len(adaptations) == 1
        assert adaptations[0].change_type == "policy.threshold"


class TestIntelligenceEngine:
    """Test intelligence and learning"""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_predict_success_with_data(self):
        """Test prediction with historical data"""
        # Record successful pattern
        for _ in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "ezra",
                "success": True,
                "latency_ms": 100,
                "confidence": 0.9
            })

        prob, reason = self.engine.predict_success("git_status", "ezra")
        assert prob == 1.0
        assert "excellent track record" in reason

    def test_predict_success_without_data(self):
        """Test prediction without historical data"""
        prob, reason = self.engine.predict_success("unknown_tool", "timmy")
        assert prob == 0.5
        assert "Insufficient data" in reason

    def test_optimal_house_selection(self):
        """Test finding optimal house for task"""
        # Ezra: 90% success on git_status
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "ezra",
                "success": i < 9,
                "latency_ms": 100
            })

        # Bezalel: 50% success on git_status
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "bezalel",
                "success": i < 5,
                "latency_ms": 100
            })

        house, confidence = self.engine.get_optimal_house("git_status")
        assert house == "ezra"
        assert math.isclose(confidence, 0.9)

    def test_learning_velocity(self):
        """Test learning velocity calculation"""
        for i in range(10):
            self.db.record_execution({
                "tool": "test",
                "house": "timmy",
                "success": i < 5,  # 50% success
                "latency_ms": 100
            })

        # NOTE: the recorded executions are not backdated here (a fuller test
        # would manipulate their timestamps), so this only checks the shape
        # of the returned mapping, not the computed values.
        velocity = self.engine._calculate_learning_velocity()
        assert "velocity" in velocity
        assert "improvement" in velocity


class TestAdaptivePolicy:
    """Test policy adaptation"""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_policy_loads_defaults(self):
        """Test policy loads default values"""
        policy = AdaptivePolicy(House.EZRA, self.engine)

        assert policy.get("evidence_threshold") == 0.8
        assert policy.get("must_read_before_write") is True

    def test_policy_adapts_on_low_performance(self):
        """Test policy adapts when performance is poor"""
        policy = AdaptivePolicy(House.EZRA, self.engine)

        # Record poor performance for ezra
        for i in range(10):
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": i < 4,  # 40% success
                "latency_ms": 100
            })

        # Trigger adaptation
        adapt = policy.adapt("low_performance", "Testing adaptation")

        # Threshold should have decreased
        assert policy.get("evidence_threshold") < 0.8
        assert adapt is not None

    def test_policy_adapts_on_high_performance(self):
        """Test policy adapts when performance is excellent"""
        policy = AdaptivePolicy(House.EZRA, self.engine)

        # Start with lower threshold
        policy.policy["evidence_threshold"] = 0.7

        # Record excellent performance
        for _ in range(10):
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": True,  # 100% success
                "latency_ms": 100
            })

        # Trigger adaptation
        policy.adapt("high_performance", "Testing adaptation")

        # Threshold should have increased
        assert policy.get("evidence_threshold") > 0.7


class TestHarness:
    """Test v3 harness with intelligence"""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_harness_creates_provenance(self):
        """Test harness creates proper provenance"""
        harness = UniWizardHarness("ezra", intelligence=self.engine)
        result = harness.execute("system_info")

        assert result.provenance.house == "ezra"
        assert result.provenance.tool == "system_info"
        assert result.provenance.prediction >= 0

    def test_harness_records_for_learning(self):
        """Test harness records executions"""
        harness = UniWizardHarness(
            "timmy", intelligence=self.engine, enable_learning=True
        )

        initial_count = self.engine.db.get_house_performance("timmy")["total_executions"]

        harness.execute("test_tool")

        new_count = self.engine.db.get_house_performance("timmy")["total_executions"]
        assert new_count == initial_count + 1

    def test_harness_does_not_record_when_learning_disabled(self):
        """Test harness respects learning flag"""
        harness = UniWizardHarness(
            "timmy", intelligence=self.engine, enable_learning=False
        )

        initial_count = self.engine.db.get_house_performance("timmy")["total_executions"]

        harness.execute("test_tool")

        new_count = self.engine.db.get_house_performance("timmy")["total_executions"]
        assert new_count == initial_count

    def test_learn_from_batch_triggers_adaptation(self):
        """Test batch learning triggers adaptations"""
        harness = UniWizardHarness("ezra", intelligence=self.engine)

        # Execute multiple times
        for _ in range(15):
            harness.execute("test_tool")

        # Trigger learning
        result = harness.learn_from_batch(min_executions=10)

        assert result["status"] == "adapted"


class TestHermesBridge:
    """Test Hermes integration"""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_event_conversion(self):
        """Test Hermes event to intelligence record conversion"""
        processor = TelemetryStreamProcessor(self.engine)

        event = HermesSessionEvent(
            session_id="test_session",
            timestamp=time.time(),
            event_type="tool_call",
            tool_name="terminal",
            success=True,
            latency_ms=150,
            model="hermes3:8b",
            provider="local",
            token_count=100,
            error=None
        )

        record = processor._convert_event(event)

        assert record["tool"] == "system_shell"  # Mapped from terminal
        assert record["house"] == "timmy"
        assert record["success"] is True

    def test_task_type_inference(self):
        """Test task type inference from tool"""
        processor = TelemetryStreamProcessor(self.engine)

        assert processor._infer_task_type("git_status") == "read"
        assert processor._infer_task_type("file_write") == "build"
        assert processor._infer_task_type("run_tests") == "test"


class TestEndToEnd:
    """End-to-end integration tests"""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_full_learning_cycle(self):
        """Test complete learning cycle"""
        # 1. Create harness
        harness = UniWizardHarness("ezra", intelligence=self.engine)

        # 2. Execute multiple times
        for _ in range(20):
            harness.execute("git_status", repo_path="/tmp")

        # 3. Get pattern
        pattern = self.engine.db.get_pattern("git_status", "ezra")
        assert pattern.sample_count == 20

        # 4. Predict next execution
        prob, reason = harness.predict_execution("git_status", {})
        assert prob > 0
        assert len(reason) > 0

        # 5. Learn from batch
        result = harness.learn_from_batch()
        assert result["status"] == "adapted"

        # 6. Get intelligence report
        report = self.engine.get_intelligence_report()
        assert "house_performance" in report
        assert "learning_velocity" in report


def run_tests():
    """Run every ``test_*`` method of the suite and print a summary.

    Each test runs on a fresh instance of its class: ``setup_method`` is
    called first, then the test, and ``teardown_method`` always runs once
    setup has succeeded, even when the test fails, so temporary directories
    are not leaked. A failure in setup, in the test body, or in teardown
    counts as exactly one failure for that test.

    Returns:
        bool: True when no test failed, False otherwise.
    """
    import inspect

    test_classes = [
        TestPatternDatabase,
        TestIntelligenceEngine,
        TestAdaptivePolicy,
        TestHarness,
        TestHermesBridge,
        TestEndToEnd
    ]

    passed = 0
    failed = 0

    print("=" * 60)
    print("UNI-WIZARD v3 TEST SUITE")
    print("=" * 60)

    for cls in test_classes:
        print(f"\n📦 {cls.__name__}")
        print("-" * 40)

        for name, method in inspect.getmembers(cls, predicate=inspect.isfunction):
            if not name.startswith("test_"):
                continue

            # Get fresh instance for each test
            test_instance = cls()

            if hasattr(test_instance, "setup_method"):
                try:
                    test_instance.setup_method()
                except Exception as e:
                    # A broken fixture must fail the run, not silently skip.
                    print(f" ⚠️ Setup failed: {name}: {type(e).__name__}: {e}")
                    failed += 1
                    continue

            try:
                try:
                    method(test_instance)
                finally:
                    # Always clean up, whether the test passed or failed.
                    if hasattr(test_instance, "teardown_method"):
                        test_instance.teardown_method()
            except Exception as e:
                # Include the type: a bare ``assert`` has an empty message.
                print(f" ❌ {name}: {type(e).__name__}: {e}")
                failed += 1
            else:
                print(f" ✅ {name}")
                passed += 1

    print("\n" + "=" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    print("=" * 60)

    return failed == 0


if __name__ == "__main__":
    success = run_tests()
    sys.exit(0 if success else 1)