Complete four-pass evolution to production-ready architecture: **Pass 1 → Foundation:** - Tool registry, basic harness, 19 tools - VPS provisioning, Syncthing mesh - Health daemon, systemd services **Pass 2 → Three-House Canon:** - Timmy (Sovereign), Ezra (Archivist), Bezalel (Artificer) - Provenance tracking, artifact-flow discipline - House-aware policy enforcement **Pass 3 → Self-Improvement:** - Pattern database with SQLite backend - Adaptive policies (auto-adjust thresholds) - Predictive execution (success prediction) - Hermes bridge for shortest-loop telemetry - Learning velocity tracking **Pass 4 → Production Integration:** - Unified API: `from uni_wizard import Harness, House, Mode` - Three modes: SIMPLE / INTELLIGENT / SOVEREIGN - Circuit breaker pattern for fault tolerance - Async/concurrent execution support - Production hardening (timeouts, retries) **Allegro Lane Definition:** - Narrowed to: Gitea integration, Hermes bridge, redundancy/failover - Provides: Cloud connectivity, telemetry streaming, issue routing - Does NOT: Make sovereign decisions, authenticate as Timmy **Files:** - v3/: Intelligence engine, adaptive harness, Hermes bridge - v4/: Unified API, production harness, final architecture Total: ~25KB architecture documentation + production code
494 lines · 16 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Test Suite for Uni-Wizard v3 — Self-Improving Intelligence
|
|
|
|
Tests:
|
|
- Pattern database operations
|
|
- Intelligence engine learning
|
|
- Adaptive policy changes
|
|
- Prediction accuracy
|
|
- Hermes bridge integration
|
|
- End-to-end self-improvement
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import tempfile
|
|
import shutil
|
|
import time
|
|
import threading
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
|
|
# Add parent to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from intelligence_engine import (
|
|
PatternDatabase, IntelligenceEngine,
|
|
ExecutionPattern, AdaptationEvent
|
|
)
|
|
from harness import (
|
|
UniWizardHarness, AdaptivePolicy,
|
|
House, Provenance, ExecutionResult
|
|
)
|
|
from hermes_bridge import (
|
|
HermesStateReader, HermesSessionEvent,
|
|
TelemetryStreamProcessor, ShortestLoopIntegrator
|
|
)
|
|
|
|
|
|
class TestPatternDatabase:
    """Exercise pattern persistence: recording, aggregation, and lookups."""

    def setup_method(self):
        # Fresh throwaway directory so every test gets an isolated database.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_record_execution(self):
        """A single recorded execution should yield a retrievable pattern."""
        record = {
            "tool": "git_status",
            "house": "ezra",
            "model": "hermes3:8b",
            "success": True,
            "latency_ms": 150,
            "confidence": 0.9,
        }
        self.db.record_execution(record)

        # The write must be visible as an aggregated pattern immediately.
        pattern = self.db.get_pattern("git_status", "ezra")
        assert pattern is not None
        assert pattern.success_rate == 1.0
        assert pattern.sample_count == 1

    def test_pattern_aggregation(self):
        """Ten executions (8 ok, 2 failed) aggregate into one pattern."""
        for n in range(10):
            self.db.record_execution({
                "tool": "deploy",
                "house": "bezalel",
                "success": n < 8,
                "latency_ms": 200 + n * 10,
                "confidence": 0.8,
            })

        pattern = self.db.get_pattern("deploy", "bezalel")
        assert pattern.success_rate == 0.8
        assert pattern.sample_count == 10
        # Latencies run 200..290 in steps of 10, so the mean is 245.
        assert pattern.avg_latency_ms == 245

    def test_best_model_selection(self):
        """The model with the higher success rate should win selection."""
        # model_a: 8/10 success at 100 ms; model_b: 9/10 success at 120 ms.
        for model, wins, latency in (("model_a", 8, 100), ("model_b", 9, 120)):
            for n in range(10):
                self.db.record_execution({
                    "tool": "read",
                    "house": "ezra",
                    "model": model,
                    "task_type": "read",
                    "success": n < wins,
                    "latency_ms": latency,
                })

        assert self.db.get_best_model("read", min_samples=5) == "model_b"

    def test_house_performance(self):
        """House-level metrics should reflect recorded executions."""
        for n in range(5):  # 4/5 succeed -> 80%
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": n < 4,
                "latency_ms": 100,
            })

        perf = self.db.get_house_performance("ezra", days=7)
        assert perf["house"] == "ezra"
        assert perf["success_rate"] == 0.8
        assert perf["total_executions"] == 5

    def test_adaptation_tracking(self):
        """Recorded adaptations should be retrievable with fields intact."""
        event = AdaptationEvent(
            timestamp="2026-03-30T20:00:00Z",
            trigger="low_success_rate",
            change_type="policy.threshold",
            old_value=0.8,
            new_value=0.7,
            reason="Performance below threshold",
            expected_improvement=0.1,
        )
        self.db.record_adaptation(event)

        stored = self.db.get_adaptations(limit=10)
        assert len(stored) == 1
        assert stored[0].change_type == "policy.threshold"
|
|
|
|
|
|
class TestIntelligenceEngine:
    """Test intelligence and learning.

    Covers success prediction (with and without history), optimal-house
    selection, and the shape of the learning-velocity report.
    """

    def setup_method(self):
        # Isolated pattern store + engine per test.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_predict_success_with_data(self):
        """Test prediction with historical data"""
        # Record a perfect track record: 10/10 successes.
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "ezra",
                "success": True,
                "latency_ms": 100,
                "confidence": 0.9
            })

        prob, reason = self.engine.predict_success("git_status", "ezra")
        assert prob == 1.0
        assert "excellent track record" in reason

    def test_predict_success_without_data(self):
        """Test prediction without historical data"""
        # Nothing recorded -> engine should fall back to a 0.5 prior.
        prob, reason = self.engine.predict_success("unknown_tool", "timmy")
        assert prob == 0.5
        assert "Insufficient data" in reason

    def test_optimal_house_selection(self):
        """Test finding optimal house for task"""
        # Ezra: 90% success on git_status
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "ezra",
                "success": i < 9,
                "latency_ms": 100
            })

        # Bezalel: 50% success on git_status
        for i in range(10):
            self.db.record_execution({
                "tool": "git_status",
                "house": "bezalel",
                "success": i < 5,
                "latency_ms": 100
            })

        house, confidence = self.engine.get_optimal_house("git_status")
        assert house == "ezra"
        assert confidence == 0.9

    def test_learning_velocity(self):
        """Test learning velocity calculation (report shape only).

        BUGFIX: removed two dead locals from the original — `now =
        time.time()` was never used, and `conn = self.db.db_path` was both
        unused and misleadingly named (it is a Path, not a connection).
        A stronger test would backdate half the executions to measure
        improvement over time; timestamp manipulation is not wired up
        here, so we only assert the keys of the returned report.
        """
        for i in range(10):
            self.db.record_execution({
                "tool": "test",
                "house": "timmy",
                "success": i < 5,  # 50% success
                "latency_ms": 100
            })

        velocity = self.engine._calculate_learning_velocity()
        assert "velocity" in velocity
        assert "improvement" in velocity
|
|
|
|
|
|
class TestAdaptivePolicy:
    """Verify that house policies adapt to observed performance."""

    def setup_method(self):
        # Each test gets its own throwaway database and engine.
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def _record_runs(self, successes, total=10):
        # Record `total` executions for ezra, the first `successes` succeeding.
        for n in range(total):
            self.db.record_execution({
                "tool": "test",
                "house": "ezra",
                "success": n < successes,
                "latency_ms": 100
            })

    def test_policy_loads_defaults(self):
        """A fresh policy exposes its default values."""
        policy = AdaptivePolicy(House.EZRA, self.engine)

        assert policy.get("evidence_threshold") == 0.8
        assert policy.get("must_read_before_write") is True

    def test_policy_adapts_on_low_performance(self):
        """Poor results should loosen the evidence threshold."""
        policy = AdaptivePolicy(House.EZRA, self.engine)
        self._record_runs(successes=4)  # 40% success rate

        adapt = policy.adapt("low_performance", "Testing adaptation")

        assert policy.get("evidence_threshold") < 0.8
        assert adapt is not None

    def test_policy_adapts_on_high_performance(self):
        """Excellent results should tighten the threshold back up."""
        policy = AdaptivePolicy(House.EZRA, self.engine)
        policy.policy["evidence_threshold"] = 0.7  # start from a lowered value
        self._record_runs(successes=10)  # 100% success rate

        adapt = policy.adapt("high_performance", "Testing adaptation")

        assert policy.get("evidence_threshold") > 0.7
|
|
|
|
|
|
class TestHarness:
    """Verify the v3 harness: provenance, learning hooks, batch adaptation."""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def _timmy_execution_count(self):
        # Current number of recorded executions for the timmy house.
        return self.engine.db.get_house_performance("timmy")["total_executions"]

    def test_harness_creates_provenance(self):
        """Every execution carries house/tool/prediction provenance."""
        result = UniWizardHarness("ezra", intelligence=self.engine).execute("system_info")

        prov = result.provenance
        assert prov.house == "ezra"
        assert prov.tool == "system_info"
        assert prov.prediction >= 0

    def test_harness_records_for_learning(self):
        """With learning enabled, each execution is recorded."""
        harness = UniWizardHarness("timmy", intelligence=self.engine, enable_learning=True)

        before = self._timmy_execution_count()
        harness.execute("test_tool")
        assert self._timmy_execution_count() == before + 1

    def test_harness_does_not_record_when_learning_disabled(self):
        """With learning disabled, executions leave no trace."""
        harness = UniWizardHarness("timmy", intelligence=self.engine, enable_learning=False)

        before = self._timmy_execution_count()
        harness.execute("test_tool")
        assert self._timmy_execution_count() == before

    def test_learn_from_batch_triggers_adaptation(self):
        """Enough accumulated history should trigger batch adaptation."""
        harness = UniWizardHarness("ezra", intelligence=self.engine)

        # Build up enough history to cross the batch-learning threshold.
        for _ in range(15):
            harness.execute("test_tool")

        outcome = harness.learn_from_batch(min_executions=10)
        assert outcome["status"] == "adapted"
|
|
|
|
|
|
class TestHermesBridge:
    """Verify Hermes telemetry events convert into intelligence records."""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_event_conversion(self):
        """A Hermes session event maps onto an internal execution record."""
        processor = TelemetryStreamProcessor(self.engine)

        sample = HermesSessionEvent(
            session_id="test_session",
            timestamp=time.time(),
            event_type="tool_call",
            tool_name="terminal",
            success=True,
            latency_ms=150,
            model="hermes3:8b",
            provider="local",
            token_count=100,
            error=None,
        )

        record = processor._convert_event(sample)

        # "terminal" is expected to map onto the internal system_shell tool.
        assert record["tool"] == "system_shell"
        assert record["house"] == "timmy"
        assert record["success"] is True

    def test_task_type_inference(self):
        """Task types are inferred from tool names."""
        infer = TelemetryStreamProcessor(self.engine)._infer_task_type

        expectations = {
            "git_status": "read",
            "file_write": "build",
            "run_tests": "test",
        }
        for tool, expected in expectations.items():
            assert infer(tool) == expected
|
|
|
|
|
|
class TestEndToEnd:
    """End-to-end integration of harness, database, and engine."""

    def setup_method(self):
        self.temp_dir = tempfile.mkdtemp()
        self.db = PatternDatabase(db_path=Path(self.temp_dir) / "test.db")
        self.engine = IntelligenceEngine(db=self.db)

    def teardown_method(self):
        shutil.rmtree(self.temp_dir)

    def test_full_learning_cycle(self):
        """Full cycle: execute -> pattern -> predict -> adapt -> report."""
        harness = UniWizardHarness("ezra", intelligence=self.engine)

        # Build up execution history.
        for _ in range(20):
            harness.execute("git_status", repo_path="/tmp")

        # History is aggregated into a single pattern.
        pattern = self.engine.db.get_pattern("git_status", "ezra")
        assert pattern.sample_count == 20

        # The harness can predict the next execution from that pattern.
        prob, reason = harness.predict_execution("git_status", {})
        assert prob > 0
        assert len(reason) > 0

        # Batch learning adapts policy from the accumulated history.
        outcome = harness.learn_from_batch()
        assert outcome["status"] == "adapted"

        # The engine surfaces an overall intelligence report.
        report = self.engine.get_intelligence_report()
        assert "house_performance" in report
        assert "learning_velocity" in report
|
|
|
|
|
|
def run_tests():
    """Run every test class and report a pass/fail summary.

    A lightweight stand-in for pytest: each test_* method runs on a fresh
    instance with setup_method/teardown_method honoured.

    BUGFIX: the original only called teardown_method when a test passed,
    so every failing test leaked the temp directory created in
    setup_method; teardown now runs in a finally block. The bare
    `except:` on the smoke-check teardown is narrowed to `except
    Exception:` so KeyboardInterrupt/SystemExit still propagate.

    Returns:
        bool: True when every test passed.
    """
    import inspect

    test_classes = [
        TestPatternDatabase,
        TestIntelligenceEngine,
        TestAdaptivePolicy,
        TestHarness,
        TestHermesBridge,
        TestEndToEnd
    ]

    passed = 0
    failed = 0

    print("=" * 60)
    print("UNI-WIZARD v3 TEST SUITE")
    print("=" * 60)

    for cls in test_classes:
        print(f"\n📦 {cls.__name__}")
        print("-" * 40)

        # Smoke-check that the class's setup works at all before running
        # its individual tests; skip the whole class if it cannot set up.
        instance = cls()
        if hasattr(instance, 'setup_method'):
            try:
                instance.setup_method()
            except Exception as e:
                print(f"  ⚠️ Setup failed: {e}")
                continue

        for name, method in inspect.getmembers(cls, predicate=inspect.isfunction):
            if not name.startswith('test_'):
                continue
            # Fresh instance per test so state never bleeds between tests.
            test_instance = cls()
            try:
                if hasattr(test_instance, 'setup_method'):
                    test_instance.setup_method()
                method(test_instance)
                print(f"  ✅ {name}")
                passed += 1
            except Exception as e:
                print(f"  ❌ {name}: {e}")
                failed += 1
            finally:
                # Always tear down, even when the test (or its setup)
                # raised — this is what plugs the temp-dir leak.
                if hasattr(test_instance, 'teardown_method'):
                    try:
                        test_instance.teardown_method()
                    except Exception:
                        pass

        # Tear down the smoke-check instance (best-effort).
        if hasattr(instance, 'teardown_method'):
            try:
                instance.teardown_method()
            except Exception:
                pass

    print("\n" + "=" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    print("=" * 60)

    return failed == 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Exit non-zero when any test failed, so CI can gate on the result.
    all_passed = run_tests()
    sys.exit(0 if all_passed else 1)
|