
feat: sovereign voice loop — timmy voice command

Adds a fully local listen-think-speak voice interface.
STT: Whisper, LLM: Ollama, TTS: Piper. No cloud, no network.

- src/timmy/voice_loop.py: VoiceLoop with VAD, Whisper, Piper
- src/timmy/cli.py: new voice command
- pyproject.toml: voice extras updated
- 20 new tests
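
A minimal way to drive the loop from Python (mirrors the module docstring below; `use_say_fallback=True` avoids needing a Piper voice on macOS):

    from timmy.voice_loop import VoiceConfig, VoiceLoop

    loop = VoiceLoop(VoiceConfig(use_say_fallback=True))  # use macOS `say` instead of Piper
    loop.run()  # blocks; Ctrl-C to stop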
2026-03-14 13:58:56 -04:00
parent d770d66150
commit dbadfc425d
4 changed files with 696 additions and 1 deletion

src/timmy/voice_loop.py — new file, 387 lines

@@ -0,0 +1,387 @@
"""Sovereign voice loop — listen, think, speak.
A fully local voice interface for Timmy. No cloud, no network calls.
All processing happens on the user's machine:
Mic → VAD/silence detection → Whisper (local STT) → Timmy chat → Piper TTS → Speaker
Usage:
from timmy.voice_loop import VoiceLoop
loop = VoiceLoop()
loop.run() # blocks, Ctrl-C to stop
Requires: sounddevice, numpy, whisper, piper-tts
"""
import asyncio
import logging
import subprocess
import sys
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path

import numpy as np

logger = logging.getLogger(__name__)

# ── Defaults ────────────────────────────────────────────────────────────────
DEFAULT_WHISPER_MODEL = "base.en"
DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx"
DEFAULT_SAMPLE_RATE = 16000 # Whisper expects 16 kHz
DEFAULT_CHANNELS = 1
DEFAULT_SILENCE_THRESHOLD = 0.015 # RMS threshold — tune for your mic/room
DEFAULT_SILENCE_DURATION = 1.5 # seconds of silence to end utterance
DEFAULT_MIN_UTTERANCE = 0.5 # ignore clicks/bumps shorter than this
DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever
DEFAULT_SESSION_ID = "voice"


@dataclass
class VoiceConfig:
    """Configuration for the voice loop."""

    whisper_model: str = DEFAULT_WHISPER_MODEL
    piper_voice: Path = DEFAULT_PIPER_VOICE
    sample_rate: int = DEFAULT_SAMPLE_RATE
    silence_threshold: float = DEFAULT_SILENCE_THRESHOLD
    silence_duration: float = DEFAULT_SILENCE_DURATION
    min_utterance: float = DEFAULT_MIN_UTTERANCE
    max_utterance: float = DEFAULT_MAX_UTTERANCE
    session_id: str = DEFAULT_SESSION_ID
    # Set True to use macOS `say` instead of Piper
    use_say_fallback: bool = False
    # Piper speaking rate (default 1.0, lower = slower)
    speaking_rate: float = 1.0
    # Backend/model for Timmy inference
    backend: str | None = None
    model_size: str | None = None


class VoiceLoop:
    """Sovereign listen-think-speak loop.

    Everything runs locally:
      - STT: OpenAI Whisper (local model, no API)
      - LLM: Timmy via Ollama (local inference)
      - TTS: Piper (local ONNX model) or macOS `say`
    """

    def __init__(self, config: VoiceConfig | None = None) -> None:
        self.config = config or VoiceConfig()
        self._whisper_model = None
        self._running = False
        self._speaking = False  # True while TTS is playing
        self._interrupted = False  # set when user talks over TTS

    # ── Lazy initialization ─────────────────────────────────────────────

    def _load_whisper(self):
        """Load Whisper model (lazy, first use only)."""
        if self._whisper_model is not None:
            return
        import whisper

        logger.info("Loading Whisper model: %s", self.config.whisper_model)
        self._whisper_model = whisper.load_model(self.config.whisper_model)
        logger.info("Whisper model loaded.")

    def _ensure_piper(self) -> bool:
        """Check that the Piper voice model exists; fall back to `say` if not."""
        if self.config.use_say_fallback:
            return True
        voice_path = self.config.piper_voice
        if not voice_path.exists():
            logger.warning("Piper voice not found at %s — falling back to `say`", voice_path)
            self.config.use_say_fallback = True
        return True

    # ── STT: Microphone → Text ──────────────────────────────────────────

    def _record_utterance(self) -> np.ndarray | None:
        """Record from the microphone until silence is detected.

        Uses energy-based Voice Activity Detection:
          1. Wait for speech (RMS above threshold).
          2. Record until silence (RMS below threshold for silence_duration).
          3. Return the audio as a numpy array.

        Returns None if interrupted or no speech was detected.
        """
        import sounddevice as sd

        sr = self.config.sample_rate
        block_size = int(sr * 0.1)  # 100ms blocks
        silence_blocks = int(self.config.silence_duration / 0.1)
        min_blocks = int(self.config.min_utterance / 0.1)
        max_blocks = int(self.config.max_utterance / 0.1)
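        # With the defaults: 100 ms blocks, so silence_blocks = 15 (1.5 s of
        # silence ends the utterance), min_blocks = 5 (0.5 s) and
        # max_blocks = 300 (the 30 s safety cap).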
        audio_chunks: list[np.ndarray] = []
        silent_count = 0
        recording = False

        def _rms(block: np.ndarray) -> float:
            # Root-mean-square energy of one audio block
            return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))

        sys.stdout.write("\n 🎤 Listening... (speak now)\n")
        sys.stdout.flush()

        with sd.InputStream(
            samplerate=sr,
            channels=DEFAULT_CHANNELS,
            dtype="float32",
            blocksize=block_size,
        ) as stream:
            while self._running:
                block, overflowed = stream.read(block_size)
                if overflowed:
                    logger.debug("Audio buffer overflowed")
                rms = _rms(block)
                if not recording:
                    if rms > self.config.silence_threshold:
                        recording = True
                        silent_count = 0
                        audio_chunks.append(block.copy())
                        sys.stdout.write(" 📢 Recording...\r")
                        sys.stdout.flush()
                else:
                    audio_chunks.append(block.copy())
                    if rms < self.config.silence_threshold:
                        silent_count += 1
                    else:
                        silent_count = 0
                    # End of utterance
                    if silent_count >= silence_blocks:
                        break
                    # Safety cap
                    if len(audio_chunks) >= max_blocks:
                        logger.info("Max utterance length reached, stopping.")
                        break

        if not audio_chunks or len(audio_chunks) < min_blocks:
            return None
        audio = np.concatenate(audio_chunks, axis=0).flatten()
        duration = len(audio) / sr
        sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
        sys.stdout.flush()
        return audio

    def _transcribe(self, audio: np.ndarray) -> str:
        """Transcribe audio using the local Whisper model."""
        self._load_whisper()
        sys.stdout.write(" 🧠 Transcribing...\r")
        sys.stdout.flush()
        t0 = time.monotonic()
        # Whisper's transcribe() accepts a float32 numpy array sampled at 16 kHz
        result = self._whisper_model.transcribe(
            audio,
            language="en",
            fp16=False,  # MPS/CPU — fp16 can cause issues on some setups
        )
        elapsed = time.monotonic() - t0
        text = result["text"].strip()
        logger.info("Whisper transcribed in %.1fs: '%s'", elapsed, text[:80])
        return text

    # ── TTS: Text → Speaker ─────────────────────────────────────────────

    def _speak(self, text: str) -> None:
        """Speak text aloud using Piper TTS or macOS `say`."""
        if not text:
            return
        self._speaking = True
        try:
            if self.config.use_say_fallback:
                self._speak_say(text)
            else:
                self._speak_piper(text)
        finally:
            self._speaking = False

    def _speak_piper(self, text: str) -> None:
        """Speak using Piper TTS (local ONNX inference)."""
        # Only used to reserve a unique temp path; Piper writes the WAV
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            # Generate WAV with Piper (text is passed on stdin)
            cmd = [
                "piper",
                "--model",
                str(self.config.piper_voice),
                "--output_file",
                tmp_path,
            ]
            proc = subprocess.run(
                cmd,
                input=text,
                capture_output=True,
                text=True,
                timeout=30,
            )
            if proc.returncode != 0:
                logger.error("Piper failed: %s", proc.stderr)
                self._speak_say(text)  # fallback
                return
            # Play with afplay (macOS) — interruptible
            self._play_audio(tmp_path)
        finally:
            Path(tmp_path).unlink(missing_ok=True)

    def _speak_say(self, text: str) -> None:
        """Speak using the macOS `say` command."""
        try:
            proc = subprocess.Popen(
                ["say", "-r", "180", text],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            proc.wait(timeout=60)
        except subprocess.TimeoutExpired:
            proc.kill()
        except FileNotFoundError:
            logger.error("macOS `say` command not found")

    def _play_audio(self, path: str) -> None:
        """Play a WAV file. Can be interrupted by setting self._interrupted."""
        try:
            proc = subprocess.Popen(
                ["afplay", path],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            # Poll so we can interrupt
            while proc.poll() is None:
                if self._interrupted:
                    proc.terminate()
                    self._interrupted = False
                    logger.info("TTS interrupted by user")
                    return
                time.sleep(0.05)
        except FileNotFoundError:
            # Not macOS — try aplay (Linux). Note: this path is blocking
            # and not interruptible like afplay above.
            try:
                subprocess.run(["aplay", path], capture_output=True, timeout=60)
            except (FileNotFoundError, subprocess.TimeoutExpired):
                logger.error("No audio player found (tried afplay, aplay)")

    # ── LLM: Text → Response ───────────────────────────────────────────

    def _think(self, user_text: str) -> str:
        """Send text to Timmy and get a response."""
        sys.stdout.write(" 💭 Thinking...\r")
        sys.stdout.flush()
        t0 = time.monotonic()
        try:
            # Each turn runs Timmy's async chat in a fresh event loop
            response = asyncio.run(self._chat(user_text))
        except Exception as exc:
            logger.error("Timmy chat failed: %s", exc)
            response = "I'm having trouble thinking right now. Could you try again?"
        elapsed = time.monotonic() - t0
        logger.info("Timmy responded in %.1fs", elapsed)
        return response

    async def _chat(self, message: str) -> str:
        """Async wrapper around Timmy's session.chat()."""
        from timmy.session import chat

        return await chat(message, session_id=self.config.session_id)

    # ── Main Loop ───────────────────────────────────────────────────────

    def run(self) -> None:
        """Run the voice loop. Blocks until Ctrl-C."""
        self._ensure_piper()
        tts_label = (
            "macOS say"
            if self.config.use_say_fallback
            else f"Piper ({self.config.piper_voice.name})"
        )
        print(
            f"\n{'=' * 60}\n"
            f" 🎙️ Timmy Voice — Sovereign Voice Interface\n"
            f"{'=' * 60}\n"
            f" STT: Whisper ({self.config.whisper_model})\n"
            f" TTS: {tts_label}\n"
            f" LLM: Timmy (local Ollama)\n"
            f"{'=' * 60}\n"
            f" Speak naturally. Timmy will listen, think, and respond.\n"
            f" Press Ctrl-C to exit.\n"
            f"{'=' * 60}"
        )
        self._running = True
        try:
            while self._running:
                # 1. LISTEN — record until silence
                audio = self._record_utterance()
                if audio is None:
                    continue

                # 2. TRANSCRIBE — Whisper STT
                text = self._transcribe(audio)
                if not text or text.lower() in (
                    "you",
                    "thanks.",
                    "thank you.",
                    "bye.",
                    "",
                    "thanks for watching!",
                    "thank you for watching!",
                ):
                    # Whisper hallucinations on silence/noise
                    logger.debug("Ignoring likely Whisper hallucination: '%s'", text)
                    continue
                sys.stdout.write(f"\n 👤 You: {text}\n")
                sys.stdout.flush()

                # Exit commands
                if text.lower().strip().rstrip(".!") in (
                    "goodbye",
                    "exit",
                    "quit",
                    "stop",
                    "goodbye timmy",
                    "stop listening",
                ):
                    print("\n 👋 Goodbye!\n")
                    break

                # 3. THINK — send to Timmy
                response = self._think(text)
                sys.stdout.write(f" 🤖 Timmy: {response}\n")
                sys.stdout.flush()

                # 4. SPEAK — TTS output
                self._speak(response)
        except KeyboardInterrupt:
            print("\n\n 👋 Voice loop stopped.\n")
        finally:
            self._running = False

    def stop(self) -> None:
        """Stop the voice loop (from another thread)."""
        self._running = False
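
The 20 new tests are not shown in this diff. As a hypothetical illustration (test name and structure are assumptions, pytest-style), the fallback logic can be exercised without any audio hardware:

    from timmy.voice_loop import VoiceConfig, VoiceLoop

    def test_missing_piper_voice_falls_back_to_say(tmp_path):
        # Hypothetical test, not from the commit: a missing voice model
        # should flip use_say_fallback so _speak() uses macOS `say`.
        cfg = VoiceConfig(piper_voice=tmp_path / "missing.onnx")
        assert VoiceLoop(cfg)._ensure_piper() is True
        assert cfg.use_say_fallback is True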