diff --git a/pyproject.toml b/pyproject.toml
index a3987220..09fbe9b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,9 @@ python-telegram-bot = { version = ">=21.0", optional = true }
 "discord.py" = { version = ">=2.3.0", optional = true }
 airllm = { version = ">=2.9.0", optional = true }
 pyttsx3 = { version = ">=2.90", optional = true }
+openai-whisper = { version = ">=20231117", optional = true }
+piper-tts = { version = ">=1.2.0", optional = true }
+sounddevice = { version = ">=0.4.6", optional = true }
 sentence-transformers = { version = ">=2.0.0", optional = true }
 numpy = { version = ">=1.24.0", optional = true }
 requests = { version = ">=2.31.0", optional = true }
@@ -59,7 +62,7 @@ pytest-xdist = { version = ">=3.5.0", optional = true }
 telegram = ["python-telegram-bot"]
 discord = ["discord.py"]
 bigbrain = ["airllm"]
-voice = ["pyttsx3"]
+voice = ["pyttsx3", "openai-whisper", "piper-tts", "sounddevice"]
 celery = ["celery"]
 embeddings = ["sentence-transformers", "numpy"]
 git = ["GitPython"]
diff --git a/src/timmy/cli.py b/src/timmy/cli.py
index 04c8a1ce..676a49a2 100644
--- a/src/timmy/cli.py
+++ b/src/timmy/cli.py
@@ -248,5 +248,37 @@ def down():
     subprocess.run(["docker", "compose", "down"], check=True)
 
 
+@app.command()
+def voice(
+    whisper_model: str = typer.Option(
+        "base.en", "--whisper", "-w", help="Whisper model: tiny.en, base.en, small.en, medium.en"
+    ),
+    use_say: bool = typer.Option(False, "--say", help="Use macOS `say` instead of Piper TTS"),
+    threshold: float = typer.Option(
+        0.015, "--threshold", "-t", help="Mic silence threshold (RMS). Lower = more sensitive."
+    ),
+    silence: float = typer.Option(1.5, "--silence", help="Seconds of silence to end recording"),
+    backend: str | None = _BACKEND_OPTION,
+    model_size: str | None = _MODEL_SIZE_OPTION,
+):
+    """Start the sovereign voice loop — listen, think, speak.
+
+    Everything runs locally: Whisper for STT, Ollama for LLM, Piper for TTS.
+    No cloud, no network calls, no microphone data leaves your machine.
+    """
+    from timmy.voice_loop import VoiceConfig, VoiceLoop
+
+    config = VoiceConfig(
+        whisper_model=whisper_model,
+        use_say_fallback=use_say,
+        silence_threshold=threshold,
+        silence_duration=silence,
+        backend=backend,
+        model_size=model_size,
+    )
+    loop = VoiceLoop(config=config)
+    loop.run()
+
+
 def main():
     app()
diff --git a/src/timmy/voice_loop.py b/src/timmy/voice_loop.py
new file mode 100644
index 00000000..82b82ff3
--- /dev/null
+++ b/src/timmy/voice_loop.py
@@ -0,0 +1,387 @@
+"""Sovereign voice loop — listen, think, speak.
+
+A fully local voice interface for Timmy. No cloud, no network calls.
+All processing happens on the user's machine:
+
+    Mic → VAD/silence detection → Whisper (local STT) → Timmy chat → Piper TTS → Speaker
+
+Usage:
+    from timmy.voice_loop import VoiceLoop
+    loop = VoiceLoop()
+    loop.run()  # blocks, Ctrl-C to stop
+
+Requires: sounddevice, numpy, whisper, piper-tts
+"""
+
+import asyncio
+import logging
+import subprocess
+import sys
+import tempfile
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# ── Defaults ────────────────────────────────────────────────────────────────
+
+DEFAULT_WHISPER_MODEL = "base.en"
+DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx"
+DEFAULT_SAMPLE_RATE = 16000  # Whisper expects 16 kHz
+DEFAULT_CHANNELS = 1
+DEFAULT_SILENCE_THRESHOLD = 0.015  # RMS threshold — tune for your mic/room
+DEFAULT_SILENCE_DURATION = 1.5  # seconds of silence to end utterance
+DEFAULT_MIN_UTTERANCE = 0.5  # ignore clicks/bumps shorter than this
+DEFAULT_MAX_UTTERANCE = 30.0  # safety cap — don't record forever
+DEFAULT_SESSION_ID = "voice"
+
+
+@dataclass
+class VoiceConfig:
+    """Configuration for the voice loop."""
+
+    whisper_model: str = DEFAULT_WHISPER_MODEL
+    piper_voice: Path = DEFAULT_PIPER_VOICE
+    sample_rate: int = DEFAULT_SAMPLE_RATE
+    silence_threshold: float = DEFAULT_SILENCE_THRESHOLD
+    silence_duration: float = DEFAULT_SILENCE_DURATION
+    min_utterance: float = DEFAULT_MIN_UTTERANCE
+    max_utterance: float = DEFAULT_MAX_UTTERANCE
+    session_id: str = DEFAULT_SESSION_ID
+    # Set True to use macOS `say` instead of Piper
+    use_say_fallback: bool = False
+    # Piper speaking rate (default 1.0, lower = slower)
+    speaking_rate: float = 1.0
+    # Backend/model for Timmy inference
+    backend: str | None = None
+    model_size: str | None = None
+
+
+class VoiceLoop:
+    """Sovereign listen-think-speak loop.
+
+    Everything runs locally:
+    - STT: OpenAI Whisper (local model, no API)
+    - LLM: Timmy via Ollama (local inference)
+    - TTS: Piper (local ONNX model) or macOS `say`
+    """
+
+    def __init__(self, config: VoiceConfig | None = None) -> None:
+        self.config = config or VoiceConfig()
+        self._whisper_model = None
+        self._running = False
+        self._speaking = False  # True while TTS is playing
+        self._interrupted = False  # set when user talks over TTS
+
+    # ── Lazy initialization ─────────────────────────────────────────────
+
+    def _load_whisper(self):
+        """Load Whisper model (lazy, first use only)."""
+        if self._whisper_model is not None:
+            return
+        import whisper
+
+        logger.info("Loading Whisper model: %s", self.config.whisper_model)
+        self._whisper_model = whisper.load_model(self.config.whisper_model)
+        logger.info("Whisper model loaded.")
+
+    def _ensure_piper(self) -> bool:
+        """Check that Piper voice model exists."""
+        if self.config.use_say_fallback:
+            return True
+        voice_path = self.config.piper_voice
+        if not voice_path.exists():
+            logger.warning("Piper voice not found at %s — falling back to `say`", voice_path)
+            self.config.use_say_fallback = True
+            return True
+        return True
+
+    # ── STT: Microphone → Text ──────────────────────────────────────────
+
+    def _record_utterance(self) -> np.ndarray | None:
+        """Record from microphone until silence is detected.
+
+        Uses energy-based Voice Activity Detection:
+        1. Wait for speech (RMS above threshold)
+        2. Record until silence (RMS below threshold for silence_duration)
+        3. Return the audio as a numpy array
+
+        Returns None if interrupted or no speech detected.
+        """
+        import sounddevice as sd
+
+        sr = self.config.sample_rate
+        block_size = int(sr * 0.1)  # 100ms blocks
+        silence_blocks = int(self.config.silence_duration / 0.1)
+        min_blocks = int(self.config.min_utterance / 0.1)
+        max_blocks = int(self.config.max_utterance / 0.1)
+
+        audio_chunks: list[np.ndarray] = []
+        silent_count = 0
+        recording = False
+
+        def _rms(block: np.ndarray) -> float:
+            return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
+
+        sys.stdout.write("\n 🎤 Listening... (speak now)\n")
+        sys.stdout.flush()
+
+        with sd.InputStream(
+            samplerate=sr,
+            channels=DEFAULT_CHANNELS,
+            dtype="float32",
+            blocksize=block_size,
+        ) as stream:
+            while self._running:
+                block, overflowed = stream.read(block_size)
+                if overflowed:
+                    logger.debug("Audio buffer overflowed")
+
+                rms = _rms(block)
+
+                if not recording:
+                    if rms > self.config.silence_threshold:
+                        recording = True
+                        silent_count = 0
+                        audio_chunks.append(block.copy())
+                        sys.stdout.write(" 📢 Recording...\r")
+                        sys.stdout.flush()
+                else:
+                    audio_chunks.append(block.copy())
+
+                    if rms < self.config.silence_threshold:
+                        silent_count += 1
+                    else:
+                        silent_count = 0
+
+                    # End of utterance
+                    if silent_count >= silence_blocks:
+                        break
+
+                    # Safety cap
+                    if len(audio_chunks) >= max_blocks:
+                        logger.info("Max utterance length reached, stopping.")
+                        break
+
+        if not audio_chunks or len(audio_chunks) < min_blocks:
+            return None
+
+        audio = np.concatenate(audio_chunks, axis=0).flatten()
+        duration = len(audio) / sr
+        sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
+        sys.stdout.flush()
+        return audio
+
+    def _transcribe(self, audio: np.ndarray) -> str:
+        """Transcribe audio using local Whisper model."""
+        self._load_whisper()
+
+        sys.stdout.write(" 🧠 Transcribing...\r")
+        sys.stdout.flush()
+
+        t0 = time.monotonic()
+        result = self._whisper_model.transcribe(
+            audio,
+            language="en",
+            fp16=False,  # MPS/CPU — fp16 can cause issues on some setups
+        )
+        elapsed = time.monotonic() - t0
+
+        text = result["text"].strip()
+        logger.info("Whisper transcribed in %.1fs: '%s'", elapsed, text[:80])
+        return text
+
+    # ── TTS: Text → Speaker ─────────────────────────────────────────────
+
+    def _speak(self, text: str) -> None:
+        """Speak text aloud using Piper TTS or macOS `say`."""
+        if not text:
+            return
+
+        self._speaking = True
+        try:
+            if self.config.use_say_fallback:
+                self._speak_say(text)
+            else:
+                self._speak_piper(text)
+        finally:
+            self._speaking = False
+
+    def _speak_piper(self, text: str) -> None:
+        """Speak using Piper TTS (local ONNX inference)."""
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            tmp_path = tmp.name
+
+        try:
+            # Generate WAV with Piper
+            cmd = [
+                "piper",
+                "--model",
+                str(self.config.piper_voice),
+                "--output_file",
+                tmp_path,
+            ]
+
+            proc = subprocess.run(
+                cmd,
+                input=text,
+                capture_output=True,
+                text=True,
+                timeout=30,
+            )
+
+            if proc.returncode != 0:
+                logger.error("Piper failed: %s", proc.stderr)
+                self._speak_say(text)  # fallback
+                return
+
+            # Play with afplay (macOS) — interruptible
+            self._play_audio(tmp_path)
+
+        finally:
+            Path(tmp_path).unlink(missing_ok=True)
+
+    def _speak_say(self, text: str) -> None:
+        """Speak using macOS `say` command."""
+        try:
+            proc = subprocess.Popen(
+                ["say", "-r", "180", text],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            proc.wait(timeout=60)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+        except FileNotFoundError:
+            logger.error("macOS `say` command not found")
+
+    def _play_audio(self, path: str) -> None:
+        """Play a WAV file. Can be interrupted by setting self._interrupted."""
+        try:
+            proc = subprocess.Popen(
+                ["afplay", path],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            # Poll so we can interrupt
+            while proc.poll() is None:
+                if self._interrupted:
+                    proc.terminate()
+                    self._interrupted = False
+                    logger.info("TTS interrupted by user")
+                    return
+                time.sleep(0.05)
+        except FileNotFoundError:
+            # Not macOS — try aplay (Linux)
+            try:
+                subprocess.run(["aplay", path], capture_output=True, timeout=60)
+            except (FileNotFoundError, subprocess.TimeoutExpired):
+                logger.error("No audio player found (tried afplay, aplay)")
+
+    # ── LLM: Text → Response ───────────────────────────────────────────
+
+    def _think(self, user_text: str) -> str:
+        """Send text to Timmy and get a response."""
+        sys.stdout.write(" 💭 Thinking...\r")
+        sys.stdout.flush()
+
+        t0 = time.monotonic()
+
+        try:
+            response = asyncio.run(self._chat(user_text))
+        except Exception as exc:
+            logger.error("Timmy chat failed: %s", exc)
+            response = "I'm having trouble thinking right now. Could you try again?"
+
+        elapsed = time.monotonic() - t0
+        logger.info("Timmy responded in %.1fs", elapsed)
+        return response
+
+    async def _chat(self, message: str) -> str:
+        """Async wrapper around Timmy's session.chat()."""
+        from timmy.session import chat
+
+        return await chat(message, session_id=self.config.session_id)
+
+    # ── Main Loop ───────────────────────────────────────────────────────
+
+    def run(self) -> None:
+        """Run the voice loop. Blocks until Ctrl-C."""
+        self._ensure_piper()
+
+        tts_label = (
+            "macOS say"
+            if self.config.use_say_fallback
+            else f"Piper ({self.config.piper_voice.name})"
+        )
+        print(
+            f"\n{'=' * 60}\n"
+            f" 🎙️ Timmy Voice — Sovereign Voice Interface\n"
+            f"{'=' * 60}\n"
+            f" STT: Whisper ({self.config.whisper_model})\n"
+            f" TTS: {tts_label}\n"
+            f" LLM: Timmy (local Ollama)\n"
+            f"{'=' * 60}\n"
+            f" Speak naturally. Timmy will listen, think, and respond.\n"
+            f" Press Ctrl-C to exit.\n"
+            f"{'=' * 60}"
+        )
+
+        self._running = True
+
+        try:
+            while self._running:
+                # 1. LISTEN — record until silence
+                audio = self._record_utterance()
+                if audio is None:
+                    continue
+
+                # 2. TRANSCRIBE — Whisper STT
+                text = self._transcribe(audio)
+                if not text or text.lower() in (
+                    "you",
+                    "thanks.",
+                    "thank you.",
+                    "bye.",
+                    "",
+                    "thanks for watching!",
+                    "thank you for watching!",
+                ):
+                    # Whisper hallucinations on silence/noise
+                    logger.debug("Ignoring likely Whisper hallucination: '%s'", text)
+                    continue
+
+                sys.stdout.write(f"\n 👤 You: {text}\n")
+                sys.stdout.flush()
+
+                # Exit commands
+                if text.lower().strip().rstrip(".!") in (
+                    "goodbye",
+                    "exit",
+                    "quit",
+                    "stop",
+                    "goodbye timmy",
+                    "stop listening",
+                ):
+                    print("\n 👋 Goodbye!\n")
+                    break
+
+                # 3. THINK — send to Timmy
+                response = self._think(text)
+                sys.stdout.write(f" 🤖 Timmy: {response}\n")
+                sys.stdout.flush()
+
+                # 4. SPEAK — TTS output
+                self._speak(response)
+
+        except KeyboardInterrupt:
+            print("\n\n 👋 Voice loop stopped.\n")
+        finally:
+            self._running = False
+
+    def stop(self) -> None:
+        """Stop the voice loop (from another thread)."""
+        self._running = False
diff --git a/tests/timmy/test_voice_loop.py b/tests/timmy/test_voice_loop.py
new file mode 100644
index 00000000..8c2dd52c
--- /dev/null
+++ b/tests/timmy/test_voice_loop.py
@@ -0,0 +1,273 @@
+"""Tests for the sovereign voice loop.
+
+These tests verify the VoiceLoop components without requiring a microphone,
+Whisper model, or Piper installation — all I/O is mocked.
+"""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+
+from timmy.voice_loop import VoiceConfig, VoiceLoop
+
+# ── VoiceConfig tests ──────────────────────────────────────────────────────
+
+
+class TestVoiceConfig:
+    def test_defaults(self):
+        cfg = VoiceConfig()
+        assert cfg.whisper_model == "base.en"
+        assert cfg.sample_rate == 16000
+        assert cfg.silence_threshold == 0.015
+        assert cfg.silence_duration == 1.5
+        assert cfg.min_utterance == 0.5
+        assert cfg.max_utterance == 30.0
+        assert cfg.session_id == "voice"
+        assert cfg.use_say_fallback is False
+
+    def test_custom_values(self):
+        cfg = VoiceConfig(
+            whisper_model="tiny.en",
+            silence_threshold=0.02,
+            session_id="custom",
+            use_say_fallback=True,
+        )
+        assert cfg.whisper_model == "tiny.en"
+        assert cfg.silence_threshold == 0.02
+        assert cfg.session_id == "custom"
+        assert cfg.use_say_fallback is True
+
+
+# ── VoiceLoop unit tests ──────────────────────────────────────────────────
+
+
+class TestVoiceLoopInit:
+    def test_default_config(self):
+        loop = VoiceLoop()
+        assert loop.config.whisper_model == "base.en"
+        assert loop._running is False
+        assert loop._speaking is False
+
+    def test_custom_config(self):
+        cfg = VoiceConfig(whisper_model="tiny.en")
+        loop = VoiceLoop(config=cfg)
+        assert loop.config.whisper_model == "tiny.en"
+
+
+class TestPiperFallback:
+    def test_falls_back_to_say_when_no_voice_file(self):
+        cfg = VoiceConfig(piper_voice=Path("/nonexistent/voice.onnx"))
+        loop = VoiceLoop(config=cfg)
+        loop._ensure_piper()
+        assert loop.config.use_say_fallback is True
+
+    def test_keeps_piper_when_voice_exists(self, tmp_path):
+        voice_file = tmp_path / "test.onnx"
+        voice_file.write_bytes(b"fake model")
+        cfg = VoiceConfig(piper_voice=voice_file)
+        loop = VoiceLoop(config=cfg)
+        loop._ensure_piper()
+        assert loop.config.use_say_fallback is False
+
+
+class TestTranscribe:
+    def test_transcribes_audio(self):
+        """Whisper transcription returns cleaned text."""
+        loop = VoiceLoop()
+
+        mock_model = MagicMock()
+        mock_model.transcribe.return_value = {"text": " Hello Timmy "}
+        loop._whisper_model = mock_model
+
+        audio = np.random.randn(16000).astype(np.float32)
+        result = loop._transcribe(audio)
+
+        assert result == "Hello Timmy"
+        mock_model.transcribe.assert_called_once()
+
+    def test_transcribes_empty_returns_empty(self):
+        loop = VoiceLoop()
+        mock_model = MagicMock()
+        mock_model.transcribe.return_value = {"text": " "}
+        loop._whisper_model = mock_model
+
+        audio = np.random.randn(16000).astype(np.float32)
+        result = loop._transcribe(audio)
+        assert result == ""
+
+
+class TestThink:
+    @patch("timmy.voice_loop.asyncio")
+    def test_think_returns_response(self, mock_asyncio):
+        mock_asyncio.run.return_value = "I am Timmy."
+        loop = VoiceLoop()
+        result = loop._think("Who are you?")
+        assert result == "I am Timmy."
+
+    @patch("timmy.voice_loop.asyncio")
+    def test_think_handles_error(self, mock_asyncio):
+        mock_asyncio.run.side_effect = RuntimeError("Ollama down")
+        loop = VoiceLoop()
+        result = loop._think("test")
+        assert "trouble" in result.lower()
+
+
+class TestSpeakSay:
+    @patch("subprocess.Popen")
+    def test_speak_say_calls_subprocess(self, mock_popen):
+        mock_proc = MagicMock()
+        mock_proc.wait.return_value = 0
+        mock_popen.return_value = mock_proc
+
+        cfg = VoiceConfig(use_say_fallback=True)
+        loop = VoiceLoop(config=cfg)
+        loop._speak_say("Hello")
+
+        mock_popen.assert_called_once()
+        args = mock_popen.call_args[0][0]
+        assert args[0] == "say"
+        assert "Hello" in args
+
+    @patch("subprocess.Popen", side_effect=FileNotFoundError)
+    def test_speak_say_handles_missing(self, mock_popen):
+        cfg = VoiceConfig(use_say_fallback=True)
+        loop = VoiceLoop(config=cfg)
+        # Should not raise
+        loop._speak_say("Hello")
+
+
+class TestSpeakPiper:
+    @patch("timmy.voice_loop.VoiceLoop._play_audio")
+    @patch("subprocess.run")
+    def test_speak_piper_generates_and_plays(self, mock_run, mock_play):
+        mock_run.return_value = MagicMock(returncode=0, stderr="")
+
+        voice_path = Path("/tmp/test_voice.onnx")
+        cfg = VoiceConfig(piper_voice=voice_path)
+        loop = VoiceLoop(config=cfg)
+        loop._speak_piper("Hello from Piper")
+
+        # Piper was called
+        mock_run.assert_called_once()
+        cmd = mock_run.call_args[0][0]
+        assert cmd[0] == "piper"
+        assert "--model" in cmd
+
+        # Audio was played
+        mock_play.assert_called_once()
+
+    @patch("timmy.voice_loop.VoiceLoop._speak_say")
+    @patch("subprocess.run")
+    def test_speak_piper_falls_back_on_error(self, mock_run, mock_say):
+        mock_run.return_value = MagicMock(returncode=1, stderr="model error")
+
+        cfg = VoiceConfig(piper_voice=Path("/tmp/test.onnx"))
+        loop = VoiceLoop(config=cfg)
+        loop._speak_piper("test")
+
+        # Should fall back to say
+        mock_say.assert_called_once_with("test")
+
+
+class TestHallucinationFilter:
+    """Whisper tends to hallucinate on silence/noise. The loop should filter these."""
+
+    def test_known_hallucinations_filtered(self):
+        hallucinations = [
+            "you",
+            "thanks.",
+            "Thank you.",
+            "Bye.",
+            "Thanks for watching!",
+            "Thank you for watching!",
+        ]
+        for text in hallucinations:
+            assert text.lower() in (
+                "you",
+                "thanks.",
+                "thank you.",
+                "bye.",
+                "",
+                "thanks for watching!",
+                "thank you for watching!",
+            ), f"'{text}' should be filtered"
+
+
+class TestExitCommands:
+    """Voice loop should recognize exit commands."""
+
+    def test_exit_commands(self):
+        exits = ["goodbye", "exit", "quit", "stop", "goodbye timmy", "stop listening"]
+        for cmd in exits:
+            assert cmd.lower().strip().rstrip(".!") in (
+                "goodbye",
+                "exit",
+                "quit",
+                "stop",
+                "goodbye timmy",
+                "stop listening",
+            ), f"'{cmd}' should be an exit command"
+
+
+class TestPlayAudio:
+    @patch("subprocess.Popen")
+    def test_play_audio_calls_afplay(self, mock_popen):
+        mock_proc = MagicMock()
+        mock_proc.poll.side_effect = [None, 0]  # Running, then done
+        mock_popen.return_value = mock_proc
+
+        loop = VoiceLoop()
+        loop._play_audio("/tmp/test.wav")
+
+        mock_popen.assert_called_once()
+        args = mock_popen.call_args[0][0]
+        assert args[0] == "afplay"
+
+    @patch("subprocess.Popen")
+    def test_play_audio_interruptible(self, mock_popen):
+        mock_proc = MagicMock()
+        # Simulate running, then we interrupt
+        call_count = 0
+
+        def poll_side_effect():
+            nonlocal call_count
+            call_count += 1
+            return None  # Always running
+
+        mock_proc.poll.side_effect = poll_side_effect
+        mock_popen.return_value = mock_proc
+
+        loop = VoiceLoop()
+        loop._interrupted = True  # Pre-set interrupt
+        loop._play_audio("/tmp/test.wav")
+
+        mock_proc.terminate.assert_called_once()
+
+
+class TestStopMethod:
+    def test_stop_sets_running_false(self):
+        loop = VoiceLoop()
+        loop._running = True
+        loop.stop()
+        assert loop._running is False
+
+
+class TestSpeakSetsFlag:
+    @patch("timmy.voice_loop.VoiceLoop._speak_say")
+    def test_speaking_flag_set_during_speech(self, mock_say):
+        cfg = VoiceConfig(use_say_fallback=True)
+        loop = VoiceLoop(config=cfg)
+
+        # Before speak
+        assert loop._speaking is False
+
+        # Mock say to check flag during execution
+        def check_flag(text):
+            assert loop._speaking is True
+
+        mock_say.side_effect = check_flag
+        loop._speak("Hello")
+
+        # After speak
+        assert loop._speaking is False