Merge pull request #4 from Alexspayne/claude/implement-todo-item-61a2S

This commit is contained in:
Alexander Whitestone
2026-02-21 11:59:10 -05:00
committed by GitHub
16 changed files with 759 additions and 15 deletions

View File

@@ -11,3 +11,13 @@
# Enable FastAPI interactive docs at /docs and /redoc (default: false)
# DEBUG=true
# ── AirLLM / big-brain backend ───────────────────────────────────────────────
# Inference backend: "ollama" (default) | "airllm" | "auto"
# "auto" → uses AirLLM on Apple Silicon if installed, otherwise Ollama.
# Requires: pip install ".[bigbrain]"
# TIMMY_MODEL_BACKEND=ollama
# AirLLM model size (default: 70b).
# 8b ~16 GB RAM | 70b ~140 GB RAM | 405b ~810 GB RAM
# AIRLLM_MODEL_SIZE=70b

View File

@@ -138,6 +138,65 @@ timmy status
---
## Big Brain — AirLLM backend (Apple Silicon / large RAM)
Run 70B or 405B models locally with no GPU required, using AirLLM's
layer-by-layer loading strategy. On M-series Macs the MLX backend is
selected automatically for maximum throughput. Everything stays local.
No cloud. No telemetry. Sats are sovereignty, boss.
### One-line install
```bash
pip install ".[bigbrain]"
# Apple Silicon only — adds the MLX-accelerated backend:
pip install "airllm[mlx]"
```
### Run with the big brain
```bash
# Explicit flag — works anywhere airllm is installed:
timmy chat "Explain self-custody" --backend airllm --model-size 70b
# Or set it once in .env and forget about it:
echo "TIMMY_MODEL_BACKEND=auto" >> .env
echo "AIRLLM_MODEL_SIZE=70b" >> .env
timmy chat "What is sovereignty?"
```
`--backend auto` (or `TIMMY_MODEL_BACKEND=auto`) selects AirLLM automatically
on Apple Silicon when the package is installed, and falls back to Ollama
everywhere else — so the same `.env` works on any machine.
### Model sizes
| Flag | Parameters | Approx. RAM needed |
|------|-----------|-------------------|
| `8b` | 8 billion | ~16 GB |
| `70b` | 70 billion | ~140 GB |
| `405b` | 405 billion | ~810 GB |
Models are downloaded from HuggingFace on first run and cached locally.
You need a HuggingFace account and `huggingface-cli login` for gated models
(Llama 3.1 requires accepting Meta's license at hf.co/meta-llama).
### Architecture with AirLLM
```
timmy chat --backend airllm
TimmyAirLLMAgent (src/timmy/backends.py)
├─ Apple Silicon? ──► AirLLMMLX (MLX tensors, Metal GPU)
└─ Everything else ──► AutoModel (PyTorch, CPU/CUDA)
└─ Layers loaded on-demand from ~/.cache/huggingface/
```
---
## Architecture
```mermaid

74
activate_self_tdd.sh Executable file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env bash
# activate_self_tdd.sh — Timmy Time dev environment bootstrapper
#
# Usage:
#   bash activate_self_tdd.sh               # standard (Ollama) setup
#   bash activate_self_tdd.sh --big-brain   # install AirLLM extra too
#
# What it does:
#   1. Creates a Python venv (or reuses an existing one)
#   2. Installs Timmy Time (+ dev deps, optionally bigbrain)
#   3. Runs the full test suite — aborts if anything fails
#   4. Launches the self-TDD watchdog in the background
#   5. Starts the dashboard
#
# Everything stays local. No cloud. Sats are sovereignty, boss.

# Strict mode: abort on any error (-e) or unset variable (-u); a pipeline
# fails if any stage fails (pipefail).
set -euo pipefail

# Resolve the repo root from this script's own location so it works no matter
# which directory it is invoked from.
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$REPO_DIR/.venv"
BIG_BRAIN=0

# Only one flag is recognized; anything else is a hard error (set -e exits).
for arg in "$@"; do
  case $arg in
    --big-brain) BIG_BRAIN=1 ;;
    *) echo "Unknown argument: $arg"; exit 1 ;;
  esac
done

echo "==> Timmy Time — sovereign AI agent bootstrapper"
echo "    Working directory: $REPO_DIR"

# ── 1. Virtual environment ────────────────────────────────────────────────────
if [[ ! -d "$VENV_DIR" ]]; then
  echo "==> Creating virtual environment..."
  python3 -m venv "$VENV_DIR"
fi

# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"
echo "==> Virtual environment active: $VENV_DIR"

# ── 2. Install dependencies ───────────────────────────────────────────────────
if [[ $BIG_BRAIN -eq 1 ]]; then
  echo "==> Installing with bigbrain extra (AirLLM)..."
  pip install --quiet -e "$REPO_DIR[dev,bigbrain]"
  # On Apple Silicon, also install the MLX backend.
  if [[ "$(uname -m)" == "arm64" && "$(uname -s)" == "Darwin" ]]; then
    echo "==> Apple Silicon detected — installing AirLLM MLX backend..."
    pip install --quiet "airllm[mlx]"
  fi
else
  echo "==> Installing standard dependencies..."
  pip install --quiet -e "$REPO_DIR[dev]"
fi

# ── 3. Run tests ──────────────────────────────────────────────────────────────
# set -e aborts the whole bootstrap here if pytest exits nonzero.
echo "==> Running test suite..."
python -m pytest "$REPO_DIR/tests/" -q --tb=short
echo "==> All tests passed."

# ── 4. Self-TDD watchdog (background) ────────────────────────────────────────
# NOTE: the watchdog is deliberately NOT killed when the dashboard exits;
# the PID is printed so the user can stop it manually.
echo "==> Starting self-TDD watchdog (60s interval) in background..."
self-tdd watch --interval 60 &
WATCHDOG_PID=$!
echo "    Watchdog PID: $WATCHDOG_PID"
echo "    Kill with: kill $WATCHDOG_PID"

# ── 5. Dashboard ─────────────────────────────────────────────────────────────
echo ""
echo "==> Starting Timmy Time dashboard at http://localhost:8000"
echo "    Ctrl-C stops the dashboard (watchdog continues until you kill it)"
echo ""
# --reload is a dev convenience; this call blocks until Ctrl-C.
uvicorn dashboard.app:app --reload --host 0.0.0.0 --port 8000

View File

@@ -28,13 +28,20 @@ dev = [
"pytest-asyncio>=0.24.0",
"pytest-cov>=5.0.0",
]
# Big-brain: run 8B / 70B / 405B models locally via layer-by-layer loading.
# pip install ".[bigbrain]"
# On Apple Silicon: pip install "airllm[mlx]" for the MLX-accelerated backend.
bigbrain = [
"airllm>=2.9.0",
]
[project.scripts]
timmy = "timmy.cli:main"
self-tdd = "self_tdd.watchdog:main"
[tool.hatch.build.targets.wheel]
sources = {"src" = ""}
include = ["src/timmy", "src/dashboard", "src/config.py"]
include = ["src/timmy", "src/dashboard", "src/config.py", "src/self_tdd"]
[tool.pytest.ini_options]
testpaths = ["tests"]

View File

@@ -1,3 +1,5 @@
from typing import Literal
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -11,6 +13,18 @@ class Settings(BaseSettings):
# Set DEBUG=true to enable /docs and /redoc (disabled by default)
debug: bool = False
# ── AirLLM / backend selection ───────────────────────────────────────────
# "ollama" — always use Ollama (default, safe everywhere)
# "airllm" — always use AirLLM (requires pip install ".[bigbrain]")
# "auto" — use AirLLM on Apple Silicon if airllm is installed,
# fall back to Ollama otherwise
timmy_model_backend: Literal["ollama", "airllm", "auto"] = "ollama"
# AirLLM model size when backend is airllm or auto.
# Larger = smarter, but needs more RAM / disk.
# 8b ~16 GB | 70b ~140 GB | 405b ~810 GB
airllm_model_size: Literal["8b", "70b", "405b"] = "70b"
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",

0
src/self_tdd/__init__.py Normal file
View File

71
src/self_tdd/watchdog.py Normal file
View File

@@ -0,0 +1,71 @@
"""Self-TDD Watchdog — polls pytest on a schedule and reports regressions.
Run in a terminal alongside your normal dev work:
self-tdd watch
self-tdd watch --interval 30
The watchdog runs silently while tests pass. When a regression appears it
prints the full short-traceback output so you can see exactly what broke.
No files are modified; no commits are made. Ctrl-C to stop.
"""
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
import typer
# Project root is three levels up from src/self_tdd/watchdog.py
PROJECT_ROOT = Path(__file__).parent.parent.parent
app = typer.Typer(help="Self-TDD watchdog — continuous test runner")
def _run_tests() -> tuple[bool, str]:
    """Run the test suite and return (passed, combined_output).

    Returns:
        passed: True when pytest exited with code 0.
        combined_output: stdout + stderr of the pytest run, stripped.

    Fix: a hung test suite no longer kills the watchdog. Previously
    subprocess.TimeoutExpired propagated out of the watch loop after 60s;
    now a timeout is reported as a failing run with whatever partial
    output pytest produced.
    """
    cmd = [sys.executable, "-m", "pytest", "tests/", "-q", "--tb=short"]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=PROJECT_ROOT,
            timeout=60,
        )
    except subprocess.TimeoutExpired as exc:
        # exc.stdout / exc.stderr hold output captured before the timeout
        # (may be None when nothing was produced).
        partial = ((exc.stdout or "") + (exc.stderr or "")).strip()
        return False, f"Test run timed out after 60s.\n{partial}".strip()
    return result.returncode == 0, (result.stdout + result.stderr).strip()
@app.command()
def watch(
    interval: int = typer.Option(60, "--interval", "-i", help="Seconds between test runs"),
) -> None:
    """Continuously poll the test suite and surface regressions as they appear."""
    typer.echo(f"Self-TDD watchdog started — polling every {interval}s. Ctrl-C to stop.")
    previous: bool | None = None  # outcome of the prior cycle; None = first run
    try:
        while True:
            ok, details = _run_tests()
            now = datetime.now().strftime("%H:%M:%S")
            if not ok:
                # Failing cycles re-print the traceback every poll so it
                # stays visible in the terminal.
                typer.secho(f"[{now}] Regression detected:", fg=typer.colors.RED)
                typer.echo(details)
            elif previous is not True:
                # Announce green only on the first run and on red→green
                # transitions; a steadily-passing suite stays silent.
                typer.secho(f"[{now}] All tests passing.", fg=typer.colors.GREEN)
            previous = ok
            time.sleep(interval)
    except KeyboardInterrupt:
        typer.echo("\nWatchdog stopped.")
def main() -> None:
    """Console-script entry point (wired up via [project.scripts] → self-tdd)."""
    app()


if __name__ == "__main__":
    main()

View File

@@ -1,13 +1,67 @@
from typing import TYPE_CHECKING, Union
from agno.agent import Agent
from agno.models.ollama import Ollama
from agno.db.sqlite import SqliteDb
from agno.models.ollama import Ollama
from timmy.prompts import TIMMY_SYSTEM_PROMPT
from config import settings
from timmy.prompts import TIMMY_SYSTEM_PROMPT
if TYPE_CHECKING:
from timmy.backends import TimmyAirLLMAgent
# Union type for callers that want to hint the return type.
TimmyAgent = Union[Agent, "TimmyAirLLMAgent"]
def create_timmy(db_file: str = "timmy.db") -> Agent:
"""Instantiate Timmy with Agno + Ollama + SQLite memory."""
def _resolve_backend(requested: str | None) -> str:
"""Return the backend name to use, resolving 'auto' and explicit overrides.
Priority (highest → lowest):
1. CLI flag passed directly to create_timmy()
2. TIMMY_MODEL_BACKEND env var / .env setting
3. 'ollama' (safe default — no surprises)
'auto' triggers Apple Silicon detection: uses AirLLM if both
is_apple_silicon() and airllm_available() return True.
"""
if requested is not None:
return requested
configured = settings.timmy_model_backend # "ollama" | "airllm" | "auto"
if configured != "auto":
return configured
# "auto" path — lazy import to keep startup fast and tests clean.
from timmy.backends import airllm_available, is_apple_silicon
if is_apple_silicon() and airllm_available():
return "airllm"
return "ollama"
def create_timmy(
db_file: str = "timmy.db",
backend: str | None = None,
model_size: str | None = None,
) -> TimmyAgent:
"""Instantiate Timmy — Ollama or AirLLM, same public interface either way.
Args:
db_file: SQLite file for Agno conversation memory (Ollama path only).
backend: "ollama" | "airllm" | "auto" | None (reads config/env).
model_size: AirLLM size — "8b" | "70b" | "405b" | None (reads config).
Returns an Agno Agent (Ollama) or TimmyAirLLMAgent — both expose
print_response(message, stream).
"""
resolved = _resolve_backend(backend)
size = model_size or settings.airllm_model_size
if resolved == "airllm":
from timmy.backends import TimmyAirLLMAgent
return TimmyAirLLMAgent(model_size=size)
# Default: Ollama via Agno.
return Agent(
name="Timmy",
model=Ollama(id=settings.ollama_model),

119
src/timmy/backends.py Normal file
View File

@@ -0,0 +1,119 @@
"""AirLLM backend — only imported when the airllm extra is installed.
Provides TimmyAirLLMAgent: a drop-in replacement for an Agno Agent that
exposes the same print_response(message, stream) surface while routing
inference through AirLLM. On Apple Silicon (arm64 Darwin) the MLX backend
is selected automatically; everywhere else AutoModel (PyTorch) is used.
No cloud. No telemetry. Sats are sovereignty, boss.
"""
import platform
from typing import Literal
from timmy.prompts import TIMMY_SYSTEM_PROMPT
# HuggingFace model IDs for each supported size.
_AIRLLM_MODELS: dict[str, str] = {
"8b": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"70b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"405b": "meta-llama/Meta-Llama-3.1-405B-Instruct",
}
ModelSize = Literal["8b", "70b", "405b"]
def is_apple_silicon() -> bool:
    """Detect an M-series Mac: macOS kernel ("Darwin") running on arm64."""
    system, machine = platform.system(), platform.machine()
    return (system, machine) == ("Darwin", "arm64")
def airllm_available() -> bool:
    """Report whether the optional ``airllm`` package can be imported."""
    try:
        import airllm  # noqa: F401
    except ImportError:
        return False
    return True
class TimmyAirLLMAgent:
    """Thin AirLLM wrapper with the same print_response interface as Agno Agent.

    Maintains a rolling 10-turn in-memory history so Timmy remembers the
    conversation within a session — no SQLite needed at this layer.
    """

    # Number of stored history lines (user + assistant entries) retained.
    # Fix: this window is now actually enforced — previously _history grew
    # without bound even though prompts only ever used the last 10 entries.
    _HISTORY_LIMIT = 10

    def __init__(self, model_size: str = "70b") -> None:
        """Load the AirLLM model for the requested size.

        Args:
            model_size: "8b" | "70b" | "405b" — a key of _AIRLLM_MODELS.

        Raises:
            ValueError: When model_size is not a supported size.
        """
        model_id = _AIRLLM_MODELS.get(model_size)
        if model_id is None:
            raise ValueError(
                f"Unknown model size {model_size!r}. "
                f"Choose from: {list(_AIRLLM_MODELS)}"
            )
        # Lazy imports: airllm is an optional extra ([bigbrain]); importing
        # it here keeps `import timmy.backends` cheap for helper-only use.
        if is_apple_silicon():
            from airllm import AirLLMMLX  # type: ignore[import]

            self._model = AirLLMMLX(model_id)
        else:
            from airllm import AutoModel  # type: ignore[import]

            self._model = AutoModel.from_pretrained(model_id)
        self._history: list[str] = []
        self._model_size = model_size

    # ── public interface (mirrors Agno Agent) ────────────────────────────────

    def print_response(self, message: str, *, stream: bool = True) -> None:
        """Run inference, update history, and render the response to stdout.

        `stream` is accepted for API compatibility but AirLLM generates the
        full output in one pass — the result is still printed as soon as it
        is ready.
        """
        prompt = self._build_prompt(message)
        input_tokens = self._model.tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048,
        )
        output = self._model.generate(
            **input_tokens,
            max_new_tokens=512,
            use_cache=True,
            do_sample=True,
            temperature=0.7,
        )
        # Decode only the newly generated tokens, not the prompt.
        input_len = input_tokens["input_ids"].shape[1]
        response = self._model.tokenizer.decode(
            output[0][input_len:], skip_special_tokens=True
        ).strip()
        self._history.append(f"User: {message}")
        self._history.append(f"Timmy: {response}")
        # Trim to the rolling window so memory stays bounded in long sessions.
        if len(self._history) > self._HISTORY_LIMIT:
            del self._history[: -self._HISTORY_LIMIT]
        self._render(response)

    # ── private helpers ──────────────────────────────────────────────────────

    def _build_prompt(self, message: str) -> str:
        """Assemble system prompt + recent history + the new user message."""
        context = TIMMY_SYSTEM_PROMPT + "\n\n"
        # Include the last 10 turns (5 exchanges) for continuity.
        if self._history:
            context += "\n".join(self._history[-self._HISTORY_LIMIT:]) + "\n\n"
        return context + f"User: {message}\nTimmy:"

    @staticmethod
    def _render(text: str) -> None:
        """Print response with rich markdown when available, plain text otherwise."""
        try:
            from rich.console import Console
            from rich.markdown import Markdown

            Console().print(Markdown(text))
        except ImportError:
            print(text)

View File

@@ -1,29 +1,57 @@
from typing import Optional
import typer
from timmy.agent import create_timmy
from timmy.prompts import TIMMY_STATUS_PROMPT
app = typer.Typer(help="Timmy — sovereign AI agent")
# Shared option definitions (reused across commands for consistency).
_BACKEND_OPTION = typer.Option(
None,
"--backend",
"-b",
help="Inference backend: 'ollama' (default) | 'airllm' | 'auto'",
)
_MODEL_SIZE_OPTION = typer.Option(
None,
"--model-size",
"-s",
help="AirLLM model size when --backend airllm: '8b' | '70b' | '405b'",
)
@app.command()
def think(topic: str = typer.Argument(..., help="Topic to reason about")):
def think(
topic: str = typer.Argument(..., help="Topic to reason about"),
backend: Optional[str] = _BACKEND_OPTION,
model_size: Optional[str] = _MODEL_SIZE_OPTION,
):
"""Ask Timmy to think carefully about a topic."""
timmy = create_timmy()
timmy = create_timmy(backend=backend, model_size=model_size)
timmy.print_response(f"Think carefully about: {topic}", stream=True)
@app.command()
def chat(message: str = typer.Argument(..., help="Message to send")):
def chat(
message: str = typer.Argument(..., help="Message to send"),
backend: Optional[str] = _BACKEND_OPTION,
model_size: Optional[str] = _MODEL_SIZE_OPTION,
):
"""Send a message to Timmy."""
timmy = create_timmy()
timmy = create_timmy(backend=backend, model_size=model_size)
timmy.print_response(message, stream=True)
@app.command()
def status():
def status(
backend: Optional[str] = _BACKEND_OPTION,
model_size: Optional[str] = _MODEL_SIZE_OPTION,
):
"""Print Timmy's operational status."""
timmy = create_timmy()
timmy.print_response("Brief status report — one sentence.", stream=False)
timmy = create_timmy(backend=backend, model_size=model_size)
timmy.print_response(TIMMY_STATUS_PROMPT, stream=False)
def main():

View File

@@ -1,7 +1,12 @@
TIMMY_SYSTEM_PROMPT = """You are Timmy — a sovereign AI agent running locally.
No cloud dependencies. You think clearly, speak plainly, act with intention.
Grounded in Christian faith, powered by Bitcoin economics, committed to the
user's digital sovereignty."""
user's digital sovereignty.
When running on Apple Silicon with AirLLM you operate with even bigger brains
— 70B or 405B parameters loaded layer-by-layer directly from local disk.
Still fully sovereign. Still 100% private. More capable, no permission needed.
Sir, affirmative."""
TIMMY_STATUS_PROMPT = """You are Timmy. Give a one-sentence status report confirming
you are operational and running locally."""

View File

@@ -5,8 +5,8 @@ from unittest.mock import MagicMock
import pytest
from fastapi.testclient import TestClient
# ── Mock agno so tests run without it installed ───────────────────────────────
# Uses setdefault: real module is used if installed, mock otherwise.
# ── Stub heavy optional dependencies so tests run without them installed ──────
# Uses setdefault: real module is used if already installed, mock otherwise.
for _mod in [
"agno",
"agno.agent",
@@ -14,6 +14,9 @@ for _mod in [
"agno.models.ollama",
"agno.db",
"agno.db.sqlite",
# AirLLM is optional (bigbrain extra) — stub it so backend tests can
# import timmy.backends and instantiate TimmyAirLLMAgent without a GPU.
"airllm",
]:
sys.modules.setdefault(_mod, MagicMock())

View File

@@ -77,3 +77,77 @@ def test_create_timmy_embeds_system_prompt():
kwargs = MockAgent.call_args.kwargs
assert kwargs["description"] == TIMMY_SYSTEM_PROMPT
# ── AirLLM path ──────────────────────────────────────────────────────────────


def test_create_timmy_airllm_returns_airllm_agent():
    """backend='airllm' must return a TimmyAirLLMAgent, not an Agno Agent."""
    # Force the non-Apple path so the stubbed airllm.AutoModel branch runs.
    with patch("timmy.backends.is_apple_silicon", return_value=False):
        from timmy.agent import create_timmy
        from timmy.backends import TimmyAirLLMAgent

        result = create_timmy(backend="airllm", model_size="8b")
        assert isinstance(result, TimmyAirLLMAgent)


def test_create_timmy_airllm_does_not_call_agno_agent():
    """When using the airllm backend, Agno Agent should never be instantiated."""
    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.backends.is_apple_silicon", return_value=False):
        from timmy.agent import create_timmy

        create_timmy(backend="airllm", model_size="8b")
        MockAgent.assert_not_called()


def test_create_timmy_explicit_ollama_ignores_autodetect():
    """backend='ollama' must always use Ollama, even on Apple Silicon."""
    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"):
        from timmy.agent import create_timmy

        create_timmy(backend="ollama")
        MockAgent.assert_called_once()


# ── _resolve_backend ─────────────────────────────────────────────────────────


def test_resolve_backend_explicit_takes_priority():
    # Explicit CLI value must win over any configured setting.
    from timmy.agent import _resolve_backend

    assert _resolve_backend("airllm") == "airllm"
    assert _resolve_backend("ollama") == "ollama"


def test_resolve_backend_defaults_to_ollama_without_config():
    """Default config (timmy_model_backend='ollama') → 'ollama'."""
    from timmy.agent import _resolve_backend

    assert _resolve_backend(None) == "ollama"


def test_resolve_backend_auto_uses_airllm_on_apple_silicon():
    """'auto' on Apple Silicon with airllm stubbed → 'airllm'."""
    with patch("timmy.backends.is_apple_silicon", return_value=True), \
         patch("timmy.agent.settings") as mock_settings:
        mock_settings.timmy_model_backend = "auto"
        mock_settings.airllm_model_size = "70b"
        mock_settings.ollama_model = "llama3.2"
        from timmy.agent import _resolve_backend

        assert _resolve_backend(None) == "airllm"


def test_resolve_backend_auto_falls_back_on_non_apple():
    """'auto' on non-Apple Silicon → 'ollama'."""
    with patch("timmy.backends.is_apple_silicon", return_value=False), \
         patch("timmy.agent.settings") as mock_settings:
        mock_settings.timmy_model_backend = "auto"
        mock_settings.airllm_model_size = "70b"
        mock_settings.ollama_model = "llama3.2"
        from timmy.agent import _resolve_backend

        assert _resolve_backend(None) == "ollama"

143
tests/test_backends.py Normal file
View File

@@ -0,0 +1,143 @@
"""Tests for src/timmy/backends.py — AirLLM wrapper and helpers."""
import sys
from unittest.mock import MagicMock, patch

import pytest


# ── is_apple_silicon ──────────────────────────────────────────────────────────


def test_is_apple_silicon_true_on_arm_darwin():
    # arm64 + Darwin is exactly the Apple Silicon combination.
    with patch("timmy.backends.platform.system", return_value="Darwin"), \
         patch("timmy.backends.platform.machine", return_value="arm64"):
        from timmy.backends import is_apple_silicon

        assert is_apple_silicon() is True


def test_is_apple_silicon_false_on_linux():
    with patch("timmy.backends.platform.system", return_value="Linux"), \
         patch("timmy.backends.platform.machine", return_value="x86_64"):
        from timmy.backends import is_apple_silicon

        assert is_apple_silicon() is False


def test_is_apple_silicon_false_on_intel_mac():
    # Intel Macs report Darwin but not arm64 — must be False.
    with patch("timmy.backends.platform.system", return_value="Darwin"), \
         patch("timmy.backends.platform.machine", return_value="x86_64"):
        from timmy.backends import is_apple_silicon

        assert is_apple_silicon() is False


# ── airllm_available ─────────────────────────────────────────────────────────


def test_airllm_available_true_when_stub_in_sys_modules():
    # conftest already stubs 'airllm' — importable → True.
    from timmy.backends import airllm_available

    assert airllm_available() is True


def test_airllm_available_false_when_not_importable():
    # Temporarily remove the stub to simulate airllm not installed.
    saved = sys.modules.pop("airllm", None)
    try:
        from timmy.backends import airllm_available

        assert airllm_available() is False
    finally:
        # Restore the stub so later tests still see airllm as importable.
        if saved is not None:
            sys.modules["airllm"] = saved


# ── TimmyAirLLMAgent construction ────────────────────────────────────────────


def test_airllm_agent_raises_on_unknown_size():
    from timmy.backends import TimmyAirLLMAgent

    with pytest.raises(ValueError, match="Unknown model size"):
        TimmyAirLLMAgent(model_size="3b")


def test_airllm_agent_uses_automodel_on_non_apple():
    """Non-Apple-Silicon path uses AutoModel.from_pretrained."""
    with patch("timmy.backends.is_apple_silicon", return_value=False):
        from timmy.backends import TimmyAirLLMAgent

        agent = TimmyAirLLMAgent(model_size="8b")
        # sys.modules["airllm"] is a MagicMock; AutoModel.from_pretrained was called.
        assert sys.modules["airllm"].AutoModel.from_pretrained.called


def test_airllm_agent_uses_mlx_on_apple_silicon():
    """Apple Silicon path uses AirLLMMLX, not AutoModel."""
    with patch("timmy.backends.is_apple_silicon", return_value=True):
        from timmy.backends import TimmyAirLLMAgent

        agent = TimmyAirLLMAgent(model_size="8b")
        assert sys.modules["airllm"].AirLLMMLX.called


def test_airllm_agent_resolves_correct_model_id_for_70b():
    with patch("timmy.backends.is_apple_silicon", return_value=False):
        from timmy.backends import TimmyAirLLMAgent, _AIRLLM_MODELS

        TimmyAirLLMAgent(model_size="70b")
        sys.modules["airllm"].AutoModel.from_pretrained.assert_called_with(
            _AIRLLM_MODELS["70b"]
        )


# ── TimmyAirLLMAgent.print_response ──────────────────────────────────────────


def _make_agent(model_size: str = "8b") -> "TimmyAirLLMAgent":
    """Helper: create an agent with a fully mocked underlying model."""
    with patch("timmy.backends.is_apple_silicon", return_value=False):
        from timmy.backends import TimmyAirLLMAgent

        agent = TimmyAirLLMAgent(model_size=model_size)
    # Replace the underlying model with a clean mock that returns predictable output.
    mock_model = MagicMock()
    mock_tokenizer = MagicMock()
    # tokenizer() returns a dict-like object with an "input_ids" tensor mock.
    input_ids_mock = MagicMock()
    input_ids_mock.shape = [1, 10]  # shape[1] = prompt token count = 10
    token_dict = {"input_ids": input_ids_mock}
    mock_tokenizer.return_value = token_dict
    # generate() returns a list of token sequences.
    mock_tokenizer.decode.return_value = "Sir, affirmative."
    mock_model.tokenizer = mock_tokenizer
    mock_model.generate.return_value = [list(range(15))]  # 15 tokens total
    agent._model = mock_model
    return agent


def test_print_response_calls_generate():
    agent = _make_agent()
    agent.print_response("What is sovereignty?", stream=True)
    agent._model.generate.assert_called_once()


def test_print_response_decodes_only_generated_tokens():
    agent = _make_agent()
    agent.print_response("Hello", stream=False)
    # decode should be called with tokens starting at index 10 (prompt length).
    decode_call = agent._model.tokenizer.decode.call_args
    token_slice = decode_call[0][0]
    assert list(token_slice) == list(range(10, 15))


def test_print_response_updates_history():
    agent = _make_agent()
    agent.print_response("First message")
    assert any("First message" in turn for turn in agent._history)
    assert any("Timmy:" in turn for turn in agent._history)


def test_print_response_history_included_in_second_prompt():
    agent = _make_agent()
    agent.print_response("First")
    # Build the prompt for the second call — history should appear.
    prompt = agent._build_prompt("Second")
    assert "First" in prompt
    assert "Second" in prompt


def test_print_response_stream_flag_accepted():
    """stream=False should not raise — it's accepted for API compatibility."""
    agent = _make_agent()
    agent.print_response("hello", stream=False)  # no error

29
tests/test_cli.py Normal file
View File

@@ -0,0 +1,29 @@
from unittest.mock import MagicMock, patch

from typer.testing import CliRunner

from timmy.cli import app
from timmy.prompts import TIMMY_STATUS_PROMPT

runner = CliRunner()


def test_status_uses_status_prompt():
    """status command must pass TIMMY_STATUS_PROMPT to the agent."""
    mock_timmy = MagicMock()
    # create_timmy is patched so no real backend (Ollama/AirLLM) is touched.
    with patch("timmy.cli.create_timmy", return_value=mock_timmy):
        runner.invoke(app, ["status"])
        mock_timmy.print_response.assert_called_once_with(TIMMY_STATUS_PROMPT, stream=False)


def test_status_does_not_use_inline_string():
    """status command must not pass the old inline hardcoded string."""
    mock_timmy = MagicMock()
    with patch("timmy.cli.create_timmy", return_value=mock_timmy):
        runner.invoke(app, ["status"])
        # Regression guard against reverting to the pre-refactor literal.
        call_args = mock_timmy.print_response.call_args
        assert call_args[0][0] != "Brief status report — one sentence."

54
tests/test_watchdog.py Normal file
View File

@@ -0,0 +1,54 @@
from unittest.mock import MagicMock, patch

from self_tdd.watchdog import _run_tests


def _mock_result(returncode: int, stdout: str = "", stderr: str = "") -> MagicMock:
    """Build a stand-in for the CompletedProcess that subprocess.run returns."""
    m = MagicMock()
    m.returncode = returncode
    m.stdout = stdout
    m.stderr = stderr
    return m


def test_run_tests_returns_true_when_suite_passes():
    # Exit code 0 from pytest must map to passed=True.
    with patch("self_tdd.watchdog.subprocess.run", return_value=_mock_result(0, "5 passed")):
        passed, _ = _run_tests()
        assert passed is True


def test_run_tests_returns_false_when_suite_fails():
    # Any nonzero exit code means a failing suite.
    with patch("self_tdd.watchdog.subprocess.run", return_value=_mock_result(1, "1 failed")):
        passed, _ = _run_tests()
        assert passed is False


def test_run_tests_output_includes_stdout():
    with patch("self_tdd.watchdog.subprocess.run", return_value=_mock_result(0, stdout="5 passed")):
        _, output = _run_tests()
        assert "5 passed" in output


def test_run_tests_output_combines_stdout_and_stderr():
    # Both streams must be surfaced — import errors often land on stderr.
    with patch(
        "self_tdd.watchdog.subprocess.run",
        return_value=_mock_result(1, stdout="FAILED test_foo", stderr="ImportError: no module named bar"),
    ):
        _, output = _run_tests()
        assert "FAILED test_foo" in output
        assert "ImportError" in output


def test_run_tests_invokes_pytest_with_correct_flags():
    with patch("self_tdd.watchdog.subprocess.run", return_value=_mock_result(0)) as mock_run:
        _run_tests()
        # First positional arg to subprocess.run is the command list.
        cmd = mock_run.call_args[0][0]
        assert "pytest" in cmd
        assert "tests/" in cmd
        assert "--tb=short" in cmd


def test_run_tests_uses_60s_timeout():
    with patch("self_tdd.watchdog.subprocess.run", return_value=_mock_result(0)) as mock_run:
        _run_tests()
        assert mock_run.call_args.kwargs["timeout"] == 60