feat: add run_self_tests() tool for self-verification (#65)
Timmy can now run his own test suite via the run_self_tests() tool. Supports 'fast' (unit only), 'full', or specific path scopes. Returns structured results with pass/fail counts. Sovereign self-verification — a fundamental capability.
This commit is contained in:
@@ -111,6 +111,7 @@ agents:
|
|||||||
- memory_search
|
- memory_search
|
||||||
- memory_write
|
- memory_write
|
||||||
- system_status
|
- system_status
|
||||||
|
- self_test
|
||||||
- shell
|
- shell
|
||||||
prompt: |
|
prompt: |
|
||||||
You are Timmy, a sovereign local AI orchestrator.
|
You are Timmy, a sovereign local AI orchestrator.
|
||||||
|
|||||||
@@ -579,11 +579,17 @@ def create_full_toolkit(base_dir: str | Path | None = None):
|
|||||||
|
|
||||||
# System introspection - query runtime environment (sovereign self-knowledge)
|
# System introspection - query runtime environment (sovereign self-knowledge)
|
||||||
try:
|
try:
|
||||||
from timmy.tools_intro import check_ollama_health, get_memory_status, get_system_info
|
from timmy.tools_intro import (
|
||||||
|
check_ollama_health,
|
||||||
|
get_memory_status,
|
||||||
|
get_system_info,
|
||||||
|
run_self_tests,
|
||||||
|
)
|
||||||
|
|
||||||
toolkit.register(get_system_info, name="get_system_info")
|
toolkit.register(get_system_info, name="get_system_info")
|
||||||
toolkit.register(check_ollama_health, name="check_ollama_health")
|
toolkit.register(check_ollama_health, name="check_ollama_health")
|
||||||
toolkit.register(get_memory_status, name="get_memory_status")
|
toolkit.register(get_memory_status, name="get_memory_status")
|
||||||
|
toolkit.register(run_self_tests, name="run_self_tests")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("Tool execution failed (Introspection tools registration): %s", exc)
|
logger.warning("Tool execution failed (Introspection tools registration): %s", exc)
|
||||||
logger.debug("Introspection tools not available")
|
logger.debug("Introspection tools not available")
|
||||||
|
|||||||
@@ -321,3 +321,78 @@ def get_live_system_status() -> dict[str, Any]:
|
|||||||
|
|
||||||
result["timestamp"] = datetime.now(UTC).isoformat()
|
result["timestamp"] = datetime.now(UTC).isoformat()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def run_self_tests(scope: str = "fast", _repo_root: str | None = None) -> dict[str, Any]:
|
||||||
|
"""Run Timmy's own test suite and report results.
|
||||||
|
|
||||||
|
A sovereign agent verifies his own integrity. This runs pytest
|
||||||
|
on the codebase and returns a structured summary.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
scope: Test scope — "fast" (unit tests only, ~30s timeout),
|
||||||
|
"full" (all tests), or a specific path like "tests/timmy/"
|
||||||
|
_repo_root: Optional repo root for testing (overrides settings)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with passed, failed, errors, total counts and summary text.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
from config import settings
|
||||||
|
|
||||||
|
repo = _repo_root if _repo_root else settings.repo_root
|
||||||
|
venv_python = Path(repo) / ".venv" / "bin" / "python"
|
||||||
|
if not venv_python.exists():
|
||||||
|
return {"success": False, "error": f"No venv found at {venv_python}"}
|
||||||
|
|
||||||
|
cmd = [str(venv_python), "-m", "pytest", "-x", "-q", "--tb=short", "--timeout=30"]
|
||||||
|
|
||||||
|
if scope == "fast":
|
||||||
|
# Unit tests only — skip functional/e2e/integration
|
||||||
|
cmd.extend(
|
||||||
|
[
|
||||||
|
"--ignore=tests/functional",
|
||||||
|
"--ignore=tests/e2e",
|
||||||
|
"--ignore=tests/integrations",
|
||||||
|
"tests/",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
elif scope == "full":
|
||||||
|
cmd.append("tests/")
|
||||||
|
else:
|
||||||
|
# Specific path
|
||||||
|
cmd.append(scope)
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120, cwd=repo)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
|
||||||
|
# Parse pytest output for counts
|
||||||
|
passed = failed = errors = 0
|
||||||
|
for line in output.splitlines():
|
||||||
|
if "passed" in line or "failed" in line or "error" in line:
|
||||||
|
import re
|
||||||
|
|
||||||
|
nums = re.findall(r"(\d+) (passed|failed|error)", line)
|
||||||
|
for count, kind in nums:
|
||||||
|
if kind == "passed":
|
||||||
|
passed = int(count)
|
||||||
|
elif kind == "failed":
|
||||||
|
failed = int(count)
|
||||||
|
elif kind == "error":
|
||||||
|
errors = int(count)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": result.returncode == 0,
|
||||||
|
"passed": passed,
|
||||||
|
"failed": failed,
|
||||||
|
"errors": errors,
|
||||||
|
"total": passed + failed + errors,
|
||||||
|
"return_code": result.returncode,
|
||||||
|
"summary": output[-2000:] if len(output) > 2000 else output,
|
||||||
|
}
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
return {"success": False, "error": "Test run timed out (120s limit)"}
|
||||||
|
except Exception as exc:
|
||||||
|
return {"success": False, "error": str(exc)}
|
||||||
|
|||||||
@@ -158,3 +158,101 @@ class TestGetOllamaModelExactMatch:
|
|||||||
result = _get_ollama_model()
|
result = _get_ollama_model()
|
||||||
|
|
||||||
assert result == "qwen3:30b"
|
assert result == "qwen3:30b"
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunSelfTests:
    """Tests for run_self_tests() — Timmy's self-verification tool."""

    @staticmethod
    def _install_fake_venv(root):
        """Create a dummy venv python under *root* so the existence check passes."""
        interpreter = root / ".venv" / "bin" / "python"
        interpreter.parent.mkdir(parents=True)
        interpreter.write_text("#!/bin/sh\necho mock")

    @staticmethod
    def _recording_run(recorded, stdout_text):
        """Build a subprocess.run replacement that records each command."""
        import subprocess

        def fake_run(*args, **kwargs):
            recorded.append(args[0] if args else kwargs.get("cmd"))
            return subprocess.CompletedProcess(
                args=args[0] if args else [], returncode=0, stdout=stdout_text, stderr=""
            )

        return fake_run

    def test_returns_dict_with_expected_keys(self):
        """run_self_tests should return structured test results."""
        from timmy.tools_intro import run_self_tests

        result = run_self_tests(scope="tests/timmy/test_introspection.py")
        assert isinstance(result, dict)
        assert "success" in result
        # Count keys are only guaranteed when the run actually happened.
        if result["success"] or "passed" in result:
            assert "passed" in result
            assert "failed" in result
            assert "total" in result

    def test_fast_scope_skips_integration(self, monkeypatch, tmp_path):
        """Fast scope should exclude functional/e2e/integration dirs."""
        import subprocess

        recorded = []
        monkeypatch.setattr(
            subprocess, "run", self._recording_run(recorded, "1 passed in 0.5s")
        )
        self._install_fake_venv(tmp_path)

        from timmy.tools_intro import run_self_tests

        run_self_tests(scope="fast", _repo_root=str(tmp_path))
        assert len(recorded) == 1
        command = recorded[0]
        assert "--ignore=tests/functional" in command
        assert "--ignore=tests/e2e" in command

    def test_specific_path_scope(self, monkeypatch, tmp_path):
        """Specific path scope passes path directly to pytest."""
        import subprocess

        recorded = []
        monkeypatch.setattr(
            subprocess, "run", self._recording_run(recorded, "5 passed in 1.0s")
        )
        self._install_fake_venv(tmp_path)

        from timmy.tools_intro import run_self_tests

        run_self_tests(scope="tests/timmy/", _repo_root=str(tmp_path))
        assert len(recorded) == 1
        assert "tests/timmy/" in recorded[0]

    def test_missing_venv_returns_error(self, monkeypatch, tmp_path):
        """Should handle missing venv gracefully."""
        from timmy.tools_intro import run_self_tests

        outcome = run_self_tests(_repo_root=str(tmp_path))
        assert outcome["success"] is False
        assert "venv" in outcome.get("error", "").lower()

    def test_timeout_returns_error(self, monkeypatch, tmp_path):
        """Should handle subprocess timeout gracefully."""
        import subprocess

        def raise_timeout(*args, **kwargs):
            raise subprocess.TimeoutExpired(cmd="pytest", timeout=120)

        monkeypatch.setattr(subprocess, "run", raise_timeout)
        self._install_fake_venv(tmp_path)

        from timmy.tools_intro import run_self_tests

        outcome = run_self_tests(_repo_root=str(tmp_path))
        assert outcome["success"] is False
        assert "timed out" in outcome.get("error", "").lower()
|
||||||
|
|||||||
Reference in New Issue
Block a user