[loop-cycle-946] refactor: complete airllm removal (#486) (#545)

2026-03-19 20:46:20 -04:00
parent 88e59f7c17
commit 7da434c85b
10 changed files with 17 additions and 553 deletions
--- a/src/timmy/init.py
+++ b/src/timmy/init.py
@@ -1 +1 @@
-"""Timmy — Core AI agent (Ollama/AirLLM backends, CLI, prompts)."""
+"""Timmy — Core AI agent (Ollama/Grok/Claude backends, CLI, prompts)."""
--- a/src/timmy/agent.py
+++ b/src/timmy/agent.py
@@ -26,12 +26,12 @@ from timmy.prompts import get_system_prompt
 from timmy.tools import create_full_toolkit

 if TYPE_CHECKING:
-    from timmy.backends import ClaudeBackend, GrokBackend, TimmyAirLLMAgent
+    from timmy.backends import ClaudeBackend, GrokBackend

 logger = logging.getLogger(__name__)

 # Union type for callers that want to hint the return type.
-TimmyAgent = Union[Agent, "TimmyAirLLMAgent", "GrokBackend", "ClaudeBackend"]
+TimmyAgent = Union[Agent, "GrokBackend", "ClaudeBackend"]

 # Models known to be too small for reliable tool calling.
 # These hallucinate tool calls as text, invoke tools randomly,
@@ -172,29 +172,17 @@ def _warmup_model(model_name: str) -> bool:


 def _resolve_backend(requested: str | None) -> str:
-    """Return the backend name to use, resolving 'auto' and explicit overrides.
+    """Return the backend name to use.

-    Priority (highest → lowest):
+    Priority (highest -> lowest):
      1. CLI flag passed directly to create_timmy()
      2. TIMMY_MODEL_BACKEND env var / .env setting
-      3. 'ollama' (safe default — no surprises)
-
-    'auto' triggers Apple Silicon detection: uses AirLLM if both
-    is_apple_silicon() and airllm_available() return True.
+      3. 'ollama' (safe default -- no surprises)
    """
    if requested is not None:
        return requested

-    configured = settings.timmy_model_backend  # "ollama" | "airllm" | "grok" | "claude" | "auto"
-    if configured != "auto":
-        return configured
-
-    # "auto" path — lazy import to keep startup fast and tests clean.
-    from timmy.backends import airllm_available, is_apple_silicon
-
-    if is_apple_silicon() and airllm_available():
-        return "airllm"
-    return "ollama"
+    return settings.timmy_model_backend  # "ollama" | "grok" | "claude"


 def _build_tools_list(use_tools: bool, skip_mcp: bool, model_name: str) -> list:
@@ -284,17 +272,17 @@ def _create_ollama_agent(
 def create_timmy(
    db_file: str = "timmy.db",
    backend: str | None = None,
    model_size: str | None = None,
    *,
    skip_mcp: bool = False,
    session_id: str = "unknown",
 ) -> TimmyAgent:
-    """Instantiate the agent — Ollama or AirLLM, same public interface.
+    """Instantiate the agent — Ollama, Grok, or Claude.

    Args:
        db_file:    SQLite file for Agno conversation memory (Ollama path only).
-        backend:    "ollama" | "airllm" | "auto" | None (reads config/env).
-        model_size: AirLLM size — "8b" | "70b" | "405b" | None (reads config).
+        backend:    "ollama" | "grok" | "claude" | None (reads config/env).
+        model_size: Deprecated and ignored (AirLLM is gone); kept so callers don't break.
        skip_mcp:   If True, omit MCP tool servers (Gitea, filesystem).
                    Use for background tasks (thinking, QA) where MCP's
                    stdio cancel-scope lifecycle conflicts with asyncio
@@ -304,7 +292,6 @@ def create_timmy(
    print_response(message, stream).
    """
    resolved = _resolve_backend(backend)
-    size = model_size or "70b"

    if resolved == "claude":
        from timmy.backends import ClaudeBackend
@@ -316,11 +303,6 @@ def create_timmy(

        return GrokBackend()

-    if resolved == "airllm":
-        from timmy.backends import TimmyAirLLMAgent
-
-        return TimmyAirLLMAgent(model_size=size)
-
    # Default: Ollama via Agno.
    model_name, is_fallback = _resolve_model_with_fallback(
        requested_model=None,
--- a/src/timmy/backends.py
+++ b/src/timmy/backends.py
@@ -1,11 +1,10 @@
-"""LLM backends — AirLLM (local big models), Grok (xAI), and Claude (Anthropic).
+"""LLM backends — Grok (xAI) and Claude (Anthropic).

 Provides drop-in replacements for the Agno Agent that expose the same
 run(message, stream) → RunResult interface used by the dashboard and the
 print_response(message, stream) interface used by the CLI.

 Backends:
-  - TimmyAirLLMAgent: Local 8B/70B/405B via AirLLM (Apple Silicon or PyTorch)
  - GrokBackend: xAI Grok API via OpenAI-compatible SDK (opt-in premium)
  - ClaudeBackend: Anthropic Claude API — lightweight cloud fallback

@@ -16,21 +15,11 @@ import logging
 import platform
 import time
 from dataclasses import dataclass
-from typing import Literal

 from timmy.prompts import get_system_prompt

 logger = logging.getLogger(__name__)

-# HuggingFace model IDs for each supported size.
-_AIRLLM_MODELS: dict[str, str] = {
-    "8b": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-    "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct",
-}
-
-ModelSize = Literal["8b", "70b", "405b"]
-

@dataclass
 class RunResult:
@@ -45,108 +34,6 @@ def is_apple_silicon() -> bool:
    return platform.system() == "Darwin" and platform.machine() == "arm64"


-def airllm_available() -> bool:
-    """Return True when the airllm package is importable."""
-    try:
-        import airllm  # noqa: F401
-
-        return True
-    except ImportError:
-        return False
-
-
-class TimmyAirLLMAgent:
-    """Thin AirLLM wrapper compatible with both dashboard and CLI call sites.
-
-    Exposes:
-      run(message, stream)           → RunResult(content=...)  [dashboard]
-      print_response(message, stream) → None                   [CLI]
-
-    Maintains a rolling 10-turn in-memory history so Timmy remembers the
-    conversation within a session — no SQLite needed at this layer.
-    """
-
-    def __init__(self, model_size: str = "70b") -> None:
-        model_id = _AIRLLM_MODELS.get(model_size)
-        if model_id is None:
-            raise ValueError(
-                f"Unknown model size {model_size!r}. Choose from: {list(_AIRLLM_MODELS)}"
-            )
-
-        if is_apple_silicon():
-            from airllm import AirLLMMLX  # type: ignore[import]
-
-            self._model = AirLLMMLX(model_id)
-        else:
-            from airllm import AutoModel  # type: ignore[import]
-
-            self._model = AutoModel.from_pretrained(model_id)
-
-        self._history: list[str] = []
-        self._model_size = model_size
-
-    # ── public interface (mirrors Agno Agent) ────────────────────────────────
-
-    def run(self, message: str, *, stream: bool = False) -> RunResult:
-        """Run inference and return a structured result (matches Agno Agent.run()).
-
-        `stream` is accepted for API compatibility; AirLLM always generates
-        the full output in one pass.
-        """
-        prompt = self._build_prompt(message)
-
-        input_tokens = self._model.tokenizer(
-            [prompt],
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=2048,
-        )
-        output = self._model.generate(
-            **input_tokens,
-            max_new_tokens=512,
-            use_cache=True,
-            do_sample=True,
-            temperature=0.7,
-        )
-
-        # Decode only the newly generated tokens, not the prompt.
-        input_len = input_tokens["input_ids"].shape[1]
-        response = self._model.tokenizer.decode(
-            output[0][input_len:], skip_special_tokens=True
-        ).strip()
-
-        self._history.append(f"User: {message}")
-        self._history.append(f"Timmy: {response}")
-
-        return RunResult(content=response)
-
-    def print_response(self, message: str, *, stream: bool = True) -> None:
-        """Run inference and render the response to stdout (CLI interface)."""
-        result = self.run(message, stream=stream)
-        self._render(result.content)
-
-    # ── private helpers ──────────────────────────────────────────────────────
-
-    def _build_prompt(self, message: str) -> str:
-        context = get_system_prompt(tools_enabled=False, session_id="airllm") + "\n\n"
-        # Include the last 10 turns (5 exchanges) for continuity.
-        if self._history:
-            context += "\n".join(self._history[-10:]) + "\n\n"
-        return context + f"User: {message}\nTimmy:"
-
-    @staticmethod
-    def _render(text: str) -> None:
-        """Print response with rich markdown when available, plain text otherwise."""
-        try:
-            from rich.console import Console
-            from rich.markdown import Markdown
-
-            Console().print(Markdown(text))
-        except ImportError:
-            print(text)
-
-
 # ── Grok (xAI) Backend ─────────────────────────────────────────────────────
 # Premium cloud augmentation — opt-in only, never the default path.

@@ -187,7 +74,7 @@ class GrokBackend:
    Uses the OpenAI-compatible SDK to connect to xAI's API.
    Only activated when GROK_ENABLED=true and XAI_API_KEY is set.

-    Exposes the same interface as TimmyAirLLMAgent and Agno Agent:
+    Exposes the same interface as Agno Agent:
      run(message, stream)           → RunResult  [dashboard]
      print_response(message, stream) → None       [CLI]
      health_check()                 → dict        [monitoring]
@@ -437,8 +324,7 @@ CLAUDE_MODELS: dict[str, str] = {
 class ClaudeBackend:
    """Anthropic Claude backend — cloud fallback when local models are offline.

-    Uses the official Anthropic SDK.  Same interface as GrokBackend and
-    TimmyAirLLMAgent:
+    Uses the official Anthropic SDK.  Same interface as GrokBackend:
      run(message, stream)           → RunResult  [dashboard]
      print_response(message, stream) → None       [CLI]
      health_check()                 → dict        [monitoring]
--- a/src/timmy/cli.py
+++ b/src/timmy/cli.py
@@ -22,13 +22,13 @@ _BACKEND_OPTION = typer.Option(
    None,
    "--backend",
    "-b",
-    help="Inference backend: 'ollama' (default) | 'airllm' | 'auto'",
+    help="Inference backend: 'ollama' (default) | 'grok' | 'claude'",
 )
 _MODEL_SIZE_OPTION = typer.Option(
    None,
    "--model-size",
    "-s",
-    help="AirLLM model size when --backend airllm: '8b' | '70b' | '405b'",
+    help="Model size (reserved for future use).",
 )


--- a/src/timmy/tools_intro/init.py
+++ b/src/timmy/tools_intro/init.py
@@ -26,7 +26,7 @@ def get_system_info() -> dict[str, Any]:
        - python_version: Python version
        - platform: OS platform
        - model: Current Ollama model (queried from API)
-        - model_backend: Configured backend (ollama/airllm/grok)
+        - model_backend: Configured backend (ollama/grok/claude)
        - ollama_url: Ollama host URL
        - repo_root: Repository root path
        - grok_enabled: Whether GROK is enabled
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,7 +18,6 @@ except ImportError:
 # agno is a core dependency (always installed) — do NOT stub it, or its
 # internal import chains break under xdist parallel workers.
 for _mod in [
-    "airllm",
    "mcp",
    "mcp.client",
    "mcp.client.stdio",
--- a/tests/dashboard/test_mobile_scenarios.py
+++ b/tests/dashboard/test_mobile_scenarios.py
@@ -10,12 +10,10 @@ Categories:
  M3xx  iOS keyboard & zoom prevention
  M4xx  HTMX robustness (double-submit, sync)
  M5xx  Safe-area / notch support
-  M6xx  AirLLM backend interface contract
 """

 import re
 from pathlib import Path
-from unittest.mock import AsyncMock, MagicMock, patch

 # ── helpers ───────────────────────────────────────────────────────────────────

@@ -206,147 +204,79 @@ def test_M505_dvh_units_used():
    """Dynamic viewport height (dvh) accounts for collapsing browser chrome."""
    css = _css()
    assert "dvh" in css


-# ── M6xx — AirLLM backend interface contract ──────────────────────────────────
+# ── M6xx — Health status ──────────────────────────────────────────────────────


-def test_M601_airllm_agent_has_run_method():
-    """TimmyAirLLMAgent must expose run() so the dashboard route can call it."""
-    from timmy.backends import TimmyAirLLMAgent
-
-    assert hasattr(TimmyAirLLMAgent, "run"), (
-        "TimmyAirLLMAgent is missing run() — dashboard will fail with AirLLM backend"
-    )
-
-
-def test_M602_airllm_run_returns_content_attribute():
-    """run() must return an object with a .content attribute (Agno RunResponse compat)."""
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.backends import TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size="8b")
-
-    mock_model = MagicMock()
-    mock_tokenizer = MagicMock()
-    input_ids_mock = MagicMock()
-    input_ids_mock.shape = [1, 5]
-    mock_tokenizer.return_value = {"input_ids": input_ids_mock}
-    mock_tokenizer.decode.return_value = "Sir, affirmative."
-    mock_model.tokenizer = mock_tokenizer
-    mock_model.generate.return_value = [list(range(10))]
-    agent._model = mock_model
-
-    result = agent.run("test")
-    assert hasattr(result, "content"), "run() result must have a .content attribute"
-    assert isinstance(result.content, str)
-
-
-def test_M603_airllm_run_updates_history():
-    """run() must update _history so multi-turn context is preserved."""
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.backends import TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size="8b")
-
-    mock_model = MagicMock()
-    mock_tokenizer = MagicMock()
-    input_ids_mock = MagicMock()
-    input_ids_mock.shape = [1, 5]
-    mock_tokenizer.return_value = {"input_ids": input_ids_mock}
-    mock_tokenizer.decode.return_value = "Acknowledged."
-    mock_model.tokenizer = mock_tokenizer
-    mock_model.generate.return_value = [list(range(10))]
-    agent._model = mock_model
-
-    assert len(agent._history) == 0
-    agent.run("hello")
-    assert len(agent._history) == 2
-    assert any("hello" in h for h in agent._history)
-
-
-def test_M604_airllm_print_response_delegates_to_run():
-    """print_response must use run() so both interfaces share one inference path."""
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.backends import RunResult, TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size="8b")
-
-    with (
-        patch.object(agent, "run", return_value=RunResult(content="ok")) as mock_run,
-        patch.object(agent, "_render"),
-    ):
-        agent.print_response("hello", stream=True)
-
-    mock_run.assert_called_once_with("hello", stream=True)
-
-
 def test_M605_health_status_passes_model_to_template(client):
    """Health status partial must receive the configured model name, not a hardcoded string."""
+    from unittest.mock import AsyncMock, patch
+
    from config import settings

    with patch(
        "dashboard.routes.health.check_ollama",
        new_callable=AsyncMock,
        return_value=True,
    ):
        response = client.get("/health/status")
    # Model name should come from settings, not be hardcoded
    assert response.status_code == 200
    model_short = settings.ollama_model.split(":")[0]
    assert model_short in response.text


 # ── M7xx — XSS prevention ─────────────────────────────────────────────────────


 def _mobile_html() -> str:
    """Read the mobile template source."""
    path = Path(__file__).parent.parent.parent / "src" / "dashboard" / "templates" / "mobile.html"
    return path.read_text()


 def _swarm_live_html() -> str:
    """Read the swarm live template source."""
    path = (
        Path(__file__).parent.parent.parent / "src" / "dashboard" / "templates" / "swarm_live.html"
    )
    return path.read_text()


 def test_M701_mobile_chat_no_raw_message_interpolation():
    """mobile.html must not interpolate ${message} directly into innerHTML — XSS risk."""
    html = _mobile_html()
    # The vulnerable pattern is `${message}` inside a template literal assigned to innerHTML
    # After the fix, message must only appear via textContent assignment
    assert "textContent = message" in html or "textContent=message" in html, (
        "mobile.html still uses innerHTML + ${message} interpolation — XSS vulnerability"
    )


 def test_M702_mobile_chat_user_input_not_in_innerhtml_template_literal():
    """${message} must not appear inside a backtick string that is assigned to innerHTML."""
    html = _mobile_html()
    # Find all innerHTML += `...` blocks and verify none contain ${message}
    blocks = re.findall(r"innerHTML\s*\+=?\s*`([^`]*)`", html, re.DOTALL)
    for block in blocks:
        assert "${message}" not in block, (
            "innerHTML template literal still contains ${message} — XSS vulnerability"
        )


 def test_M703_swarm_live_agent_name_not_interpolated_in_innerhtml():
    """swarm_live.html must not put ${agent.name} inside innerHTML template literals."""
    html = _swarm_live_html()
    blocks = re.findall(r"innerHTML\s*=\s*agents\.map\([^;]+\)\.join\([^)]*\)", html, re.DOTALL)
    assert len(blocks) == 0, (
        "swarm_live.html still uses innerHTML=agents.map(…) with interpolated agent data — XSS vulnerability"
    )


 def test_M704_swarm_live_uses_textcontent_for_agent_data():
    """swarm_live.html must use textContent (not innerHTML) to set agent name/description."""
    html = _swarm_live_html()
    assert "textContent" in html, (
        "swarm_live.html does not use textContent — agent data may be raw-interpolated into DOM"
    )
--- a/tests/timmy/test_agent.py
+++ b/tests/timmy/test_agent.py
@@ -81,7 +81,6 @@ def test_create_timmy_respects_custom_ollama_url():
        mock_settings.ollama_url = custom_url
        mock_settings.ollama_num_ctx = 4096
        mock_settings.timmy_model_backend = "ollama"
-        mock_settings.airllm_model_size = "70b"

        from timmy.agent import create_timmy

@@ -91,33 +90,6 @@ def test_create_timmy_respects_custom_ollama_url():
        assert kwargs["host"] == custom_url


-# ── AirLLM path ──────────────────────────────────────────────────────────────
-
-
-def test_create_timmy_airllm_returns_airllm_agent():
-    """backend='airllm' must return a TimmyAirLLMAgent, not an Agno Agent."""
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.agent import create_timmy
-        from timmy.backends import TimmyAirLLMAgent
-
-        result = create_timmy(backend="airllm", model_size="8b")
-
-    assert isinstance(result, TimmyAirLLMAgent)
-
-
-def test_create_timmy_airllm_does_not_call_agno_agent():
-    """When using the airllm backend, Agno Agent should never be instantiated."""
-    with (
-        patch("timmy.agent.Agent") as MockAgent,
-        patch("timmy.backends.is_apple_silicon", return_value=False),
-    ):
-        from timmy.agent import create_timmy
-
-        create_timmy(backend="airllm", model_size="8b")
-
-    MockAgent.assert_not_called()
-
-
 def test_create_timmy_explicit_ollama_ignores_autodetect():
    """backend='ollama' must always use Ollama, even on Apple Silicon."""
    with (
@@ -141,7 +113,6 @@ def test_create_timmy_explicit_ollama_ignores_autodetect():
 def test_resolve_backend_explicit_takes_priority():
    from timmy.agent import _resolve_backend

-    assert _resolve_backend("airllm") == "airllm"
    assert _resolve_backend("ollama") == "ollama"


@@ -152,39 +123,9 @@ def test_resolve_backend_defaults_to_ollama_without_config():
    assert _resolve_backend(None) == "ollama"


-def test_resolve_backend_auto_uses_airllm_on_apple_silicon():
-    """'auto' on Apple Silicon with airllm stubbed → 'airllm'."""
-    with (
-        patch("timmy.backends.is_apple_silicon", return_value=True),
-        patch("timmy.agent.settings") as mock_settings,
-    ):
-        mock_settings.timmy_model_backend = "auto"
-        mock_settings.airllm_model_size = "70b"
-        mock_settings.ollama_model = "llama3.2"
-
-        from timmy.agent import _resolve_backend
-
-        assert _resolve_backend(None) == "airllm"
-
-
-def test_resolve_backend_auto_falls_back_on_non_apple():
-    """'auto' on non-Apple Silicon → 'ollama'."""
-    with (
-        patch("timmy.backends.is_apple_silicon", return_value=False),
-        patch("timmy.agent.settings") as mock_settings,
-    ):
-        mock_settings.timmy_model_backend = "auto"
-        mock_settings.airllm_model_size = "70b"
-        mock_settings.ollama_model = "llama3.2"
-
-        from timmy.agent import _resolve_backend
-
-        assert _resolve_backend(None) == "ollama"
-
-
 # ── _model_supports_tools ────────────────────────────────────────────────────


 def test_model_supports_tools_llama32_returns_false():
    """llama3.2 (3B) is too small for reliable tool calling."""
    from timmy.agent import _model_supports_tools
@@ -259,7 +200,6 @@ def test_create_timmy_includes_tools_for_large_model():
        mock_settings.ollama_url = "http://localhost:11434"
        mock_settings.ollama_num_ctx = 4096
        mock_settings.timmy_model_backend = "ollama"
-        mock_settings.airllm_model_size = "70b"
        mock_settings.telemetry_enabled = False

        from timmy.agent import create_timmy
--- a/tests/timmy/test_backends.py
+++ b/tests/timmy/test_backends.py
@@ -1,10 +1,7 @@
-"""Tests for src/timmy/backends.py — AirLLM wrapper and helpers."""
+"""Tests for src/timmy/backends.py — backend helpers and classes."""

-import sys
 from unittest.mock import MagicMock, patch

-import pytest
-
 # ── is_apple_silicon ──────────────────────────────────────────────────────────


@@ -38,183 +35,6 @@ def test_is_apple_silicon_false_on_intel_mac():
        assert is_apple_silicon() is False


-# ── airllm_available ─────────────────────────────────────────────────────────
-
-
-def test_airllm_available_true_when_stub_in_sys_modules():
-    # conftest already stubs 'airllm' — importable → True.
-    from timmy.backends import airllm_available
-
-    assert airllm_available() is True
-
-
-def test_airllm_available_false_when_not_importable():
-    # Temporarily remove the stub to simulate airllm not installed.
-    saved = sys.modules.pop("airllm", None)
-    try:
-        from timmy.backends import airllm_available
-
-        assert airllm_available() is False
-    finally:
-        if saved is not None:
-            sys.modules["airllm"] = saved
-
-
-# ── TimmyAirLLMAgent construction ────────────────────────────────────────────
-
-
-def test_airllm_agent_raises_on_unknown_size():
-    from timmy.backends import TimmyAirLLMAgent
-
-    with pytest.raises(ValueError, match="Unknown model size"):
-        TimmyAirLLMAgent(model_size="3b")
-
-
-def test_airllm_agent_uses_automodel_on_non_apple():
-    """Non-Apple-Silicon path uses AutoModel.from_pretrained."""
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.backends import TimmyAirLLMAgent
-
-        TimmyAirLLMAgent(model_size="8b")
-    # sys.modules["airllm"] is a MagicMock; AutoModel.from_pretrained was called.
-    assert sys.modules["airllm"].AutoModel.from_pretrained.called
-
-
-def test_airllm_agent_uses_mlx_on_apple_silicon():
-    """Apple Silicon path uses AirLLMMLX, not AutoModel."""
-    with patch("timmy.backends.is_apple_silicon", return_value=True):
-        from timmy.backends import TimmyAirLLMAgent
-
-        TimmyAirLLMAgent(model_size="8b")
-    assert sys.modules["airllm"].AirLLMMLX.called
-
-
-def test_airllm_agent_resolves_correct_model_id_for_70b():
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.backends import _AIRLLM_MODELS, TimmyAirLLMAgent
-
-        TimmyAirLLMAgent(model_size="70b")
-    sys.modules["airllm"].AutoModel.from_pretrained.assert_called_with(_AIRLLM_MODELS["70b"])
-
-
-# ── TimmyAirLLMAgent.print_response ──────────────────────────────────────────
-
-
-def _make_agent(model_size: str = "8b") -> "TimmyAirLLMAgent":  # noqa: F821
-    """Helper: create an agent with a fully mocked underlying model."""
-    with patch("timmy.backends.is_apple_silicon", return_value=False):
-        from timmy.backends import TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size=model_size)
-
-    # Replace the underlying model with a clean mock that returns predictable output.
-    mock_model = MagicMock()
-    mock_tokenizer = MagicMock()
-    # tokenizer() returns a dict-like object with an "input_ids" tensor mock.
-    input_ids_mock = MagicMock()
-    input_ids_mock.shape = [1, 10]  # shape[1] = prompt token count = 10
-    token_dict = {"input_ids": input_ids_mock}
-    mock_tokenizer.return_value = token_dict
-    # generate() returns a list of token sequences.
-    mock_tokenizer.decode.return_value = "Sir, affirmative."
-    mock_model.tokenizer = mock_tokenizer
-    mock_model.generate.return_value = [list(range(15))]  # 15 tokens total
-    agent._model = mock_model
-    return agent
-
-
-def test_print_response_calls_generate():
-    agent = _make_agent()
-    agent.print_response("What is sovereignty?", stream=True)
-    agent._model.generate.assert_called_once()
-
-
-def test_print_response_decodes_only_generated_tokens():
-    agent = _make_agent()
-    agent.print_response("Hello", stream=False)
-    # decode should be called with tokens starting at index 10 (prompt length).
-    decode_call = agent._model.tokenizer.decode.call_args
-    token_slice = decode_call[0][0]
-    assert list(token_slice) == list(range(10, 15))
-
-
-def test_print_response_updates_history():
-    agent = _make_agent()
-    agent.print_response("First message")
-    assert any("First message" in turn for turn in agent._history)
-    assert any("Timmy:" in turn for turn in agent._history)
-
-
-def test_print_response_history_included_in_second_prompt():
-    agent = _make_agent()
-    agent.print_response("First")
-    # Build the prompt for the second call — history should appear.
-    prompt = agent._build_prompt("Second")
-    assert "First" in prompt
-    assert "Second" in prompt
-
-
-def test_print_response_stream_flag_accepted():
-    """stream=False should not raise — it's accepted for API compatibility."""
-    agent = _make_agent()
-    agent.print_response("hello", stream=False)  # no error
-
-
-# ── Prompt formatting tests ────────────────────────────────────────────────
-
-
-def test_airllm_prompt_contains_formatted_model_name():
-    """AirLLM prompt should have actual model name, not literal {model_name}."""
-    with (
-        patch("timmy.backends.is_apple_silicon", return_value=False),
-        patch("config.settings") as mock_settings,
-    ):
-        mock_settings.ollama_model = "llama3.2:3b"
-        from timmy.backends import TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size="8b")
-        prompt = agent._build_prompt("test message")
-
-    # Should contain the actual model name, not the placeholder
-    assert "{model_name}" not in prompt
-    assert "llama3.2:3b" in prompt
-
-
-def test_airllm_prompt_gets_lite_tier():
-    """AirLLM should get LITE tier prompt (tools_enabled=False)."""
-    with (
-        patch("timmy.backends.is_apple_silicon", return_value=False),
-        patch("config.settings") as mock_settings,
-    ):
-        mock_settings.ollama_model = "test-model"
-        from timmy.backends import TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size="8b")
-        prompt = agent._build_prompt("test message")
-
-    # LITE tier should NOT have TOOL USAGE section
-    assert "TOOL USAGE" not in prompt
-    # LITE tier should have the basic rules
-    assert "Be brief by default" in prompt
-
-
-def test_airllm_prompt_contains_session_id():
-    """AirLLM prompt should have session_id formatted, not placeholder."""
-    with (
-        patch("timmy.backends.is_apple_silicon", return_value=False),
-        patch("config.settings") as mock_settings,
-    ):
-        mock_settings.ollama_model = "test-model"
-        from timmy.backends import TimmyAirLLMAgent
-
-        agent = TimmyAirLLMAgent(model_size="8b")
-        prompt = agent._build_prompt("test message")
-
-    # Should contain the session_id, not the placeholder
-    assert '{session_id}"' not in prompt
-    assert 'session "airllm"' in prompt
-
-
 # ── ClaudeBackend ─────────────────────────────────────────────────────────


--- a/tests/timmy/test_cli.py
+++ b/tests/timmy/test_cli.py
@@ -107,19 +107,19 @@ def test_chat_new_session_uses_unique_id():


 def test_chat_passes_backend_option():
-    """chat --backend airllm must forward the backend to create_timmy."""
+    """chat --backend grok must forward the backend to create_timmy."""
    mock_run_output = MagicMock()
    mock_run_output.content = "OK"
    mock_run_output.status = "COMPLETED"
    mock_run_output.active_requirements = []

    mock_timmy = MagicMock()
    mock_timmy.run.return_value = mock_run_output

    with patch("timmy.cli.create_timmy", return_value=mock_timmy) as mock_create:
-        runner.invoke(app, ["chat", "test", "--backend", "airllm"])
+        runner.invoke(app, ["chat", "test", "--backend", "grok"])

-    mock_create.assert_called_once_with(backend="airllm", model_size=None, session_id="cli")
+    mock_create.assert_called_once_with(backend="grok", model_size=None, session_id="cli")


 def test_chat_cleans_response():