diff --git a/.env.example b/.env.example index 42effa9..5f825e4 100644 --- a/.env.example +++ b/.env.example @@ -11,3 +11,13 @@ # Enable FastAPI interactive docs at /docs and /redoc (default: false) # DEBUG=true + +# ── AirLLM / big-brain backend ─────────────────────────────────────────────── +# Inference backend: "ollama" (default) | "airllm" | "auto" +# "auto" → uses AirLLM on Apple Silicon if installed, otherwise Ollama. +# Requires: pip install ".[bigbrain]" +# TIMMY_MODEL_BACKEND=ollama + +# AirLLM model size (default: 70b). +# 8b ~16 GB RAM | 70b ~140 GB RAM | 405b ~810 GB RAM +# AIRLLM_MODEL_SIZE=70b diff --git a/README.md b/README.md index ce078a7..6adb409 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,65 @@ timmy status --- +## Big Brain — AirLLM backend (Apple Silicon / large RAM) + +Run 70B or 405B models locally with no GPU required, using AirLLM's +layer-by-layer loading strategy. On M-series Macs the MLX backend is +selected automatically for maximum throughput. Everything stays local. +No cloud. No telemetry. Sats are sovereignty, boss. + +### One-line install + +```bash +pip install ".[bigbrain]" +# Apple Silicon only — adds the MLX-accelerated backend: +pip install "airllm[mlx]" +``` + +### Run with the big brain + +```bash +# Explicit flag — works anywhere airllm is installed: +timmy chat "Explain self-custody" --backend airllm --model-size 70b + +# Or set it once in .env and forget about it: +echo "TIMMY_MODEL_BACKEND=auto" >> .env +echo "AIRLLM_MODEL_SIZE=70b" >> .env +timmy chat "What is sovereignty?" +``` + +`--backend auto` (or `TIMMY_MODEL_BACKEND=auto`) selects AirLLM automatically +on Apple Silicon when the package is installed, and falls back to Ollama +everywhere else — so the same `.env` works on any machine. + +### Model sizes + +| Flag | Parameters | Approx. 
#!/usr/bin/env bash
# activate_self_tdd.sh — Timmy Time dev environment bootstrapper
#
# Usage:
#   bash activate_self_tdd.sh               # standard (Ollama) setup
#   bash activate_self_tdd.sh --big-brain   # install AirLLM extra too
#
# What it does:
#   1. Creates a Python venv (or reuses an existing one)
#   2. Installs Timmy Time (+ dev deps, optionally bigbrain)
#   3. Runs the full test suite — aborts if anything fails
#   4. Launches the self-TDD watchdog in the background
#   5. Starts the dashboard
#
# Everything stays local. No cloud. Sats are sovereignty, boss.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# Absolute directory of this script, so it can be invoked from any cwd.
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$REPO_DIR/.venv"
# 0 = standard install, 1 = also install the bigbrain (AirLLM) extra.
BIG_BRAIN=0

# Only --big-brain is recognised; anything else is a hard error (exit 1).
for arg in "$@"; do
    case $arg in
        --big-brain) BIG_BRAIN=1 ;;
        *) echo "Unknown argument: $arg"; exit 1 ;;
    esac
done

echo "==> Timmy Time — sovereign AI agent bootstrapper"
echo " Working directory: $REPO_DIR"

# ── 1. Virtual environment ────────────────────────────────────────────────────
# An existing .venv directory is reused as-is; it is never recreated.
if [[ ! -d "$VENV_DIR" ]]; then
    echo "==> Creating virtual environment..."
    python3 -m venv "$VENV_DIR"
fi

# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"
echo "==> Virtual environment active: $VENV_DIR"

# ── 2. Install dependencies ───────────────────────────────────────────────────
# Editable install (-e) of the repo itself with the selected extras.
if [[ $BIG_BRAIN -eq 1 ]]; then
    echo "==> Installing with bigbrain extra (AirLLM)..."
    pip install --quiet -e "$REPO_DIR[dev,bigbrain]"
    # On Apple Silicon, also install the MLX backend.
    if [[ "$(uname -m)" == "arm64" && "$(uname -s)" == "Darwin" ]]; then
        echo "==> Apple Silicon detected — installing AirLLM MLX backend..."
        pip install --quiet "airllm[mlx]"
    fi
else
    echo "==> Installing standard dependencies..."
    pip install --quiet -e "$REPO_DIR[dev]"
fi

# ── 3. Run tests ──────────────────────────────────────────────────────────────
# Because of `set -e`, a non-zero pytest exit stops the script here, so the
# watchdog and dashboard below only start on a green suite.
echo "==> Running test suite..."
python -m pytest "$REPO_DIR/tests/" -q --tb=short
echo "==> All tests passed."

# ── 4. Self-TDD watchdog (background) ────────────────────────────────────────
# Started with `&` and never waited on or trapped: it keeps running after this
# script ends, which is why its PID is printed for manual cleanup.
echo "==> Starting self-TDD watchdog (60s interval) in background..."
self-tdd watch --interval 60 &
WATCHDOG_PID=$!
echo " Watchdog PID: $WATCHDOG_PID"
echo " Kill with: kill $WATCHDOG_PID"

# ── 5. Dashboard ─────────────────────────────────────────────────────────────
# Runs in the foreground; this is the command Ctrl-C interrupts.
# NOTE(review): --host 0.0.0.0 listens on every network interface, while the
# banner below advertises localhost — confirm LAN exposure is intended.
echo ""
echo "==> Starting Timmy Time dashboard at http://localhost:8000"
echo " Ctrl-C stops the dashboard (watchdog continues until you kill it)"
echo ""
uvicorn dashboard.app:app --reload --host 0.0.0.0 --port 8000
+bigbrain = [ + "airllm>=2.9.0", +] [project.scripts] timmy = "timmy.cli:main" diff --git a/src/config.py b/src/config.py index ec387d3..6d527b9 100644 --- a/src/config.py +++ b/src/config.py @@ -1,3 +1,5 @@ +from typing import Literal + from pydantic_settings import BaseSettings, SettingsConfigDict @@ -11,6 +13,18 @@ class Settings(BaseSettings): # Set DEBUG=true to enable /docs and /redoc (disabled by default) debug: bool = False + # ── AirLLM / backend selection ─────────────────────────────────────────── + # "ollama" — always use Ollama (default, safe everywhere) + # "airllm" — always use AirLLM (requires pip install ".[bigbrain]") + # "auto" — use AirLLM on Apple Silicon if airllm is installed, + # fall back to Ollama otherwise + timmy_model_backend: Literal["ollama", "airllm", "auto"] = "ollama" + + # AirLLM model size when backend is airllm or auto. + # Larger = smarter, but needs more RAM / disk. + # 8b ~16 GB | 70b ~140 GB | 405b ~810 GB + airllm_model_size: Literal["8b", "70b", "405b"] = "70b" + model_config = SettingsConfigDict( env_file=".env", env_file_encoding="utf-8", diff --git a/src/timmy/agent.py b/src/timmy/agent.py index cc1aab3..dba4656 100644 --- a/src/timmy/agent.py +++ b/src/timmy/agent.py @@ -1,13 +1,67 @@ +from typing import TYPE_CHECKING, Union + from agno.agent import Agent -from agno.models.ollama import Ollama from agno.db.sqlite import SqliteDb +from agno.models.ollama import Ollama -from timmy.prompts import TIMMY_SYSTEM_PROMPT from config import settings +from timmy.prompts import TIMMY_SYSTEM_PROMPT + +if TYPE_CHECKING: + from timmy.backends import TimmyAirLLMAgent + +# Union type for callers that want to hint the return type. +TimmyAgent = Union[Agent, "TimmyAirLLMAgent"] -def create_timmy(db_file: str = "timmy.db") -> Agent: - """Instantiate Timmy with Agno + Ollama + SQLite memory.""" +def _resolve_backend(requested: str | None) -> str: + """Return the backend name to use, resolving 'auto' and explicit overrides. 
+ + Priority (highest → lowest): + 1. CLI flag passed directly to create_timmy() + 2. TIMMY_MODEL_BACKEND env var / .env setting + 3. 'ollama' (safe default — no surprises) + + 'auto' triggers Apple Silicon detection: uses AirLLM if both + is_apple_silicon() and airllm_available() return True. + """ + if requested is not None: + return requested + + configured = settings.timmy_model_backend # "ollama" | "airllm" | "auto" + if configured != "auto": + return configured + + # "auto" path — lazy import to keep startup fast and tests clean. + from timmy.backends import airllm_available, is_apple_silicon + if is_apple_silicon() and airllm_available(): + return "airllm" + return "ollama" + + +def create_timmy( + db_file: str = "timmy.db", + backend: str | None = None, + model_size: str | None = None, +) -> TimmyAgent: + """Instantiate Timmy — Ollama or AirLLM, same public interface either way. + + Args: + db_file: SQLite file for Agno conversation memory (Ollama path only). + backend: "ollama" | "airllm" | "auto" | None (reads config/env). + model_size: AirLLM size — "8b" | "70b" | "405b" | None (reads config). + + Returns an Agno Agent (Ollama) or TimmyAirLLMAgent — both expose + print_response(message, stream). + """ + resolved = _resolve_backend(backend) + size = model_size or settings.airllm_model_size + + if resolved == "airllm": + from timmy.backends import TimmyAirLLMAgent + return TimmyAirLLMAgent(model_size=size) + + # Default: Ollama via Agno. return Agent( name="Timmy", model=Ollama(id=settings.ollama_model), diff --git a/src/timmy/backends.py b/src/timmy/backends.py new file mode 100644 index 0000000..88d0308 --- /dev/null +++ b/src/timmy/backends.py @@ -0,0 +1,119 @@ +"""AirLLM backend — only imported when the airllm extra is installed. + +Provides TimmyAirLLMAgent: a drop-in replacement for an Agno Agent that +exposes the same print_response(message, stream) surface while routing +inference through AirLLM. 
class TimmyAirLLMAgent:
    """Thin AirLLM wrapper with the same print_response interface as Agno Agent.

    Keeps a rolling in-memory history of the most recent
    ``_MAX_HISTORY_ENTRIES`` lines (two lines per exchange) so Timmy remembers
    the conversation within a session — no SQLite needed at this layer.
    """

    # History lines kept and replayed into each prompt: 10 lines = 5 exchanges.
    _MAX_HISTORY_ENTRIES = 10

    def __init__(self, model_size: str = "70b") -> None:
        """Load the AirLLM model registered for ``model_size``.

        Args:
            model_size: Key into ``_AIRLLM_MODELS`` ("8b", "70b" or "405b").

        Raises:
            ValueError: If ``model_size`` is not a key of ``_AIRLLM_MODELS``.
        """
        model_id = _AIRLLM_MODELS.get(model_size)
        if model_id is None:
            raise ValueError(
                f"Unknown model size {model_size!r}. "
                f"Choose from: {list(_AIRLLM_MODELS)}"
            )

        # airllm is imported here, not at module top, so this module stays
        # importable when the optional package is absent.
        if is_apple_silicon():
            from airllm import AirLLMMLX  # type: ignore[import]
            self._model = AirLLMMLX(model_id)
        else:
            from airllm import AutoModel  # type: ignore[import]
            self._model = AutoModel.from_pretrained(model_id)

        self._history: list[str] = []
        self._model_size = model_size

    # ── public interface (mirrors Agno Agent) ────────────────────────────────

    def print_response(self, message: str, *, stream: bool = True) -> None:
        """Run inference, update history, and render the response to stdout.

        Args:
            message: The user's message for this turn.
            stream: Accepted for API compatibility only; it is not read.
                The full output is generated in one pass and printed once
                it is ready.
        """
        prompt = self._build_prompt(message)

        # padding=False: there is exactly one sequence, so padding is a no-op
        # at best — and tokenizers without a pad token (typical for Llama)
        # raise when asked to pad.
        # NOTE(review): return_tensors="pt" is used on both code paths —
        # confirm the MLX model accepts PyTorch tensors on Apple Silicon.
        input_tokens = self._model.tokenizer(
            [prompt],
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=2048,
        )
        output = self._model.generate(
            **input_tokens,
            max_new_tokens=512,
            use_cache=True,
            do_sample=True,
            temperature=0.7,
        )

        # Decode only the newly generated tokens, not the prompt.
        input_len = input_tokens["input_ids"].shape[1]
        response = self._model.tokenizer.decode(
            output[0][input_len:], skip_special_tokens=True
        ).strip()

        self._remember(message, response)
        self._render(response)

    # ── private helpers ──────────────────────────────────────────────────────

    def _remember(self, message: str, response: str) -> None:
        """Append one exchange to the history and drop entries beyond the cap."""
        self._history.append(f"User: {message}")
        self._history.append(f"Timmy: {response}")
        # Trim in place so the list cannot grow without bound over a long
        # session; only the newest _MAX_HISTORY_ENTRIES lines are ever used.
        del self._history[: -self._MAX_HISTORY_ENTRIES]

    def _build_prompt(self, message: str) -> str:
        """Return system prompt + recent history + the new user message."""
        context = TIMMY_SYSTEM_PROMPT + "\n\n"
        # Include the last 10 history lines (5 exchanges) for continuity.
        if self._history:
            recent = self._history[-self._MAX_HISTORY_ENTRIES:]
            context += "\n".join(recent) + "\n\n"
        return context + f"User: {message}\nTimmy:"

    @staticmethod
    def _render(text: str) -> None:
        """Print response with rich markdown when available, plain text otherwise."""
        try:
            from rich.console import Console
            from rich.markdown import Markdown
            Console().print(Markdown(text))
        except ImportError:
            print(text)
# Accepted flag values, mirroring the Literal types on Settings in
# src/config.py so the CLI rejects the same typos the config layer does.
_VALID_BACKENDS = ("ollama", "airllm", "auto")
_VALID_MODEL_SIZES = ("8b", "70b", "405b")


def _validate_backend(value: Optional[str]) -> Optional[str]:
    """Typer callback: reject unknown --backend values.

    None (flag omitted) passes through so the configured backend is used.
    """
    if value is not None and value not in _VALID_BACKENDS:
        raise typer.BadParameter(f"choose from: {', '.join(_VALID_BACKENDS)}")
    return value


def _validate_model_size(value: Optional[str]) -> Optional[str]:
    """Typer callback: reject unknown --model-size values.

    None (flag omitted) passes through so the configured size is used.
    """
    if value is not None and value not in _VALID_MODEL_SIZES:
        raise typer.BadParameter(f"choose from: {', '.join(_VALID_MODEL_SIZES)}")
    return value


_BACKEND_OPTION = typer.Option(
    None,
    "--backend",
    "-b",
    callback=_validate_backend,
    help="Inference backend: 'ollama' (default) | 'airllm' | 'auto'",
)
_MODEL_SIZE_OPTION = typer.Option(
    None,
    "--model-size",
    "-s",
    callback=_validate_model_size,
    help="AirLLM model size when --backend airllm: '8b' | '70b' | '405b'",
)


@app.command()
def think(
    topic: str = typer.Argument(..., help="Topic to reason about"),
    backend: Optional[str] = _BACKEND_OPTION,
    model_size: Optional[str] = _MODEL_SIZE_OPTION,
):
    """Ask Timmy to think carefully about a topic."""
    timmy = create_timmy(backend=backend, model_size=model_size)
    timmy.print_response(f"Think carefully about: {topic}", stream=True)


@app.command()
def chat(
    message: str = typer.Argument(..., help="Message to send"),
    backend: Optional[str] = _BACKEND_OPTION,
    model_size: Optional[str] = _MODEL_SIZE_OPTION,
):
    """Send a message to Timmy."""
    timmy = create_timmy(backend=backend, model_size=model_size)
    timmy.print_response(message, stream=True)


@app.command()
def status(
    backend: Optional[str] = _BACKEND_OPTION,
    model_size: Optional[str] = _MODEL_SIZE_OPTION,
):
    """Print Timmy's operational status."""
    timmy = create_timmy(backend=backend, model_size=model_size)
    # stream=False: the status report is requested as a single response.
    timmy.print_response(TIMMY_STATUS_PROMPT, stream=False)
+ +When running on Apple Silicon with AirLLM you operate with even bigger brains +— 70B or 405B parameters loaded layer-by-layer directly from local disk. +Still fully sovereign. Still 100% private. More capable, no permission needed. +Sir, affirmative.""" TIMMY_STATUS_PROMPT = """You are Timmy. Give a one-sentence status report confirming you are operational and running locally.""" diff --git a/tests/conftest.py b/tests/conftest.py index 5b603c1..ecd6b13 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,8 @@ from unittest.mock import MagicMock import pytest from fastapi.testclient import TestClient -# ── Mock agno so tests run without it installed ─────────────────────────────── -# Uses setdefault: real module is used if installed, mock otherwise. +# ── Stub heavy optional dependencies so tests run without them installed ────── +# Uses setdefault: real module is used if already installed, mock otherwise. for _mod in [ "agno", "agno.agent", @@ -14,6 +14,9 @@ for _mod in [ "agno.models.ollama", "agno.db", "agno.db.sqlite", + # AirLLM is optional (bigbrain extra) — stub it so backend tests can + # import timmy.backends and instantiate TimmyAirLLMAgent without a GPU. 
+ "airllm", ]: sys.modules.setdefault(_mod, MagicMock()) diff --git a/tests/test_agent.py b/tests/test_agent.py index db1c370..4cbed35 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -77,3 +77,77 @@ def test_create_timmy_embeds_system_prompt(): kwargs = MockAgent.call_args.kwargs assert kwargs["description"] == TIMMY_SYSTEM_PROMPT + + +# ── AirLLM path ────────────────────────────────────────────────────────────── + +def test_create_timmy_airllm_returns_airllm_agent(): + """backend='airllm' must return a TimmyAirLLMAgent, not an Agno Agent.""" + with patch("timmy.backends.is_apple_silicon", return_value=False): + from timmy.agent import create_timmy + from timmy.backends import TimmyAirLLMAgent + + result = create_timmy(backend="airllm", model_size="8b") + + assert isinstance(result, TimmyAirLLMAgent) + + +def test_create_timmy_airllm_does_not_call_agno_agent(): + """When using the airllm backend, Agno Agent should never be instantiated.""" + with patch("timmy.agent.Agent") as MockAgent, \ + patch("timmy.backends.is_apple_silicon", return_value=False): + + from timmy.agent import create_timmy + create_timmy(backend="airllm", model_size="8b") + + MockAgent.assert_not_called() + + +def test_create_timmy_explicit_ollama_ignores_autodetect(): + """backend='ollama' must always use Ollama, even on Apple Silicon.""" + with patch("timmy.agent.Agent") as MockAgent, \ + patch("timmy.agent.Ollama"), \ + patch("timmy.agent.SqliteDb"): + + from timmy.agent import create_timmy + create_timmy(backend="ollama") + + MockAgent.assert_called_once() + + +# ── _resolve_backend ───────────────────────────────────────────────────────── + +def test_resolve_backend_explicit_takes_priority(): + from timmy.agent import _resolve_backend + assert _resolve_backend("airllm") == "airllm" + assert _resolve_backend("ollama") == "ollama" + + +def test_resolve_backend_defaults_to_ollama_without_config(): + """Default config (timmy_model_backend='ollama') → 'ollama'.""" + from 
def test_resolve_backend_auto_uses_airllm_on_apple_silicon():
    """'auto' on Apple Silicon with airllm importable → 'airllm'."""
    # airllm_available is patched explicitly so the outcome does not hinge on
    # whether conftest happened to stub the 'airllm' module. Only
    # timmy_model_backend is set on the settings mock because that is the
    # single attribute _resolve_backend reads.
    with patch("timmy.backends.is_apple_silicon", return_value=True), \
         patch("timmy.backends.airllm_available", return_value=True), \
         patch("timmy.agent.settings") as mock_settings:
        mock_settings.timmy_model_backend = "auto"

        from timmy.agent import _resolve_backend
        assert _resolve_backend(None) == "airllm"
def test_airllm_agent_uses_automodel_on_non_apple():
    """Non-Apple-Silicon path uses AutoModel.from_pretrained."""
    from timmy.backends import TimmyAirLLMAgent, _AIRLLM_MODELS

    # Use a fresh stub for this test only: the conftest-level 'airllm' mock is
    # shared by every test in the process, so checking `.called` on it can
    # succeed because of an earlier test rather than this construction.
    fake_airllm = MagicMock()
    with patch.dict(sys.modules, {"airllm": fake_airllm}), \
         patch("timmy.backends.is_apple_silicon", return_value=False):
        TimmyAirLLMAgent(model_size="8b")

    fake_airllm.AutoModel.from_pretrained.assert_called_once_with(
        _AIRLLM_MODELS["8b"]
    )
    # The MLX class must not be touched on this path.
    fake_airllm.AirLLMMLX.assert_not_called()
+ mock_model.tokenizer = mock_tokenizer + mock_model.generate.return_value = [list(range(15))] # 15 tokens total + agent._model = mock_model + return agent + + +def test_print_response_calls_generate(): + agent = _make_agent() + agent.print_response("What is sovereignty?", stream=True) + agent._model.generate.assert_called_once() + + +def test_print_response_decodes_only_generated_tokens(): + agent = _make_agent() + agent.print_response("Hello", stream=False) + # decode should be called with tokens starting at index 10 (prompt length). + decode_call = agent._model.tokenizer.decode.call_args + token_slice = decode_call[0][0] + assert list(token_slice) == list(range(10, 15)) + + +def test_print_response_updates_history(): + agent = _make_agent() + agent.print_response("First message") + assert any("First message" in turn for turn in agent._history) + assert any("Timmy:" in turn for turn in agent._history) + + +def test_print_response_history_included_in_second_prompt(): + agent = _make_agent() + agent.print_response("First") + # Build the prompt for the second call — history should appear. + prompt = agent._build_prompt("Second") + assert "First" in prompt + assert "Second" in prompt + + +def test_print_response_stream_flag_accepted(): + """stream=False should not raise — it's accepted for API compatibility.""" + agent = _make_agent() + agent.print_response("hello", stream=False) # no error