From f32077ae8d7bd91b29f9d9923c3b03bd0d22a5c6 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Tue, 24 Mar 2026 15:26:01 -0400 Subject: [PATCH] refactor: split voice_loop.py into voice/ subpackage (#1379) --- src/dashboard/routes/world/__init__.py | 11 +- src/infrastructure/router/cascade.py | 5 +- src/infrastructure/router/health.py | 4 +- src/timmy/voice/__init__.py | 50 +++ src/timmy/voice/activation.py | 38 ++ src/timmy/voice/audio_io.py | 19 + src/timmy/voice/helpers.py | 53 +++ src/timmy/voice/llm.py | 68 ++++ src/timmy/voice/speech_engines.py | 48 +++ src/timmy/voice/stt.py | 119 +++++++ src/timmy/voice/tts.py | 78 ++++ src/timmy/voice_loop.py | 470 ++----------------------- 12 files changed, 517 insertions(+), 446 deletions(-) create mode 100644 src/timmy/voice/__init__.py create mode 100644 src/timmy/voice/activation.py create mode 100644 src/timmy/voice/audio_io.py create mode 100644 src/timmy/voice/helpers.py create mode 100644 src/timmy/voice/llm.py create mode 100644 src/timmy/voice/speech_engines.py create mode 100644 src/timmy/voice/stt.py create mode 100644 src/timmy/voice/tts.py diff --git a/src/dashboard/routes/world/__init__.py b/src/dashboard/routes/world/__init__.py index 914c8203..807e789b 100644 --- a/src/dashboard/routes/world/__init__.py +++ b/src/dashboard/routes/world/__init__.py @@ -50,17 +50,12 @@ for route in _matrix_matrix_router.routes: # --------------------------------------------------------------------------- # Used by src/dashboard/app.py -from .websocket import broadcast_world_state # noqa: E402, F401 - -# Used by src/infrastructure/presence.py -from .websocket import _ws_clients # noqa: E402, F401 - # Used by tests from .bark import ( # noqa: E402, F401 - BarkRequest, _BARK_RATE_LIMIT_SECONDS, _GROUND_TTL, _MAX_EXCHANGES, + BarkRequest, _bark_and_broadcast, _bark_last_request, _conversation, @@ -116,9 +111,13 @@ from .utils import ( # noqa: E402, F401 _get_agent_shape, _get_client_ip, ) + +# Used by src/infrastructure/presence.py from .websocket import ( # noqa: E402, F401 _authenticate_ws, _broadcast, _heartbeat, + _ws_clients, # noqa: E402, F401 + broadcast_world_state, # noqa: E402, F401 world_ws, ) diff --git a/src/infrastructure/router/cascade.py b/src/infrastructure/router/cascade.py index c34ce17c..9a9e8431 100644 --- a/src/infrastructure/router/cascade.py +++ b/src/infrastructure/router/cascade.py @@ -29,6 +29,8 @@ except ImportError: requests = None # type: ignore # Re-export data models so existing ``from …cascade import X`` keeps working. +# Mixins +from .health import HealthMixin from .models import ( # noqa: F401 – re-exports CircuitState, ContentType, @@ -38,9 +40,6 @@ from .models import ( # noqa: F401 – re-exports ProviderStatus, RouterConfig, ) - -# Mixins -from .health import HealthMixin from .providers import ProviderCallsMixin logger = logging.getLogger(__name__) diff --git a/src/infrastructure/router/health.py b/src/infrastructure/router/health.py index 7b77318f..5be6ec0d 100644 --- a/src/infrastructure/router/health.py +++ b/src/infrastructure/router/health.py @@ -10,7 +10,7 @@ import logging import time from datetime import UTC, datetime -from .models import CircuitState, Provider, ProviderMetrics, ProviderStatus +from .models import CircuitState, Provider, ProviderStatus logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) try: from infrastructure.claude_quota import QuotaMonitor, get_quota_monitor - _quota_monitor: "QuotaMonitor | None" = get_quota_monitor() + _quota_monitor: QuotaMonitor | None = get_quota_monitor() except Exception as _exc: # pragma: no cover logger.debug("Quota monitor not available: %s", _exc) _quota_monitor = None diff --git a/src/timmy/voice/__init__.py b/src/timmy/voice/__init__.py new file mode 100644 index 00000000..8cc847ac --- /dev/null +++ b/src/timmy/voice/__init__.py @@ -0,0 +1,50 @@ +"""Voice subpackage — re-exports for convenience.""" + +from timmy.voice.activation import ( + EXIT_COMMANDS, + WHISPER_HALLUCINATIONS, + is_exit_command, + is_hallucination, +) +from timmy.voice.audio_io import ( + DEFAULT_CHANNELS, + DEFAULT_MAX_UTTERANCE, + DEFAULT_MIN_UTTERANCE, + DEFAULT_SAMPLE_RATE, + DEFAULT_SILENCE_DURATION, + DEFAULT_SILENCE_THRESHOLD, + _rms, +) +from timmy.voice.helpers import _install_quiet_asyncgen_hooks, _suppress_mcp_noise +from timmy.voice.llm import LLMMixin +from timmy.voice.speech_engines import ( + _VOICE_PREAMBLE, + DEFAULT_PIPER_VOICE, + DEFAULT_WHISPER_MODEL, + _strip_markdown, +) +from timmy.voice.stt import STTMixin +from timmy.voice.tts import TTSMixin + +__all__ = [ + "DEFAULT_CHANNELS", + "DEFAULT_MAX_UTTERANCE", + "DEFAULT_MIN_UTTERANCE", + "DEFAULT_PIPER_VOICE", + "DEFAULT_SAMPLE_RATE", + "DEFAULT_SILENCE_DURATION", + "DEFAULT_SILENCE_THRESHOLD", + "DEFAULT_WHISPER_MODEL", + "EXIT_COMMANDS", + "LLMMixin", + "STTMixin", + "TTSMixin", + "WHISPER_HALLUCINATIONS", + "_VOICE_PREAMBLE", + "_install_quiet_asyncgen_hooks", + "_rms", + "_strip_markdown", + "_suppress_mcp_noise", + "is_exit_command", + "is_hallucination", +] diff --git a/src/timmy/voice/activation.py b/src/timmy/voice/activation.py new file mode 100644 index 00000000..0054fc61 --- /dev/null +++ b/src/timmy/voice/activation.py @@ -0,0 +1,38 @@ +"""Voice activation detection — hallucination filtering and exit commands.""" + +from __future__ import annotations + +# Whisper hallucinates these on silence/noise — skip them. +WHISPER_HALLUCINATIONS = frozenset( + { + "you", + "thanks.", + "thank you.", + "bye.", + "", + "thanks for watching!", + "thank you for watching!", + } +) + +# Spoken phrases that end the voice session. +EXIT_COMMANDS = frozenset( + { + "goodbye", + "exit", + "quit", + "stop", + "goodbye timmy", + "stop listening", + } +) + + +def is_hallucination(text: str) -> bool: + """Return True if *text* is a known Whisper hallucination.""" + return not text or text.lower() in WHISPER_HALLUCINATIONS + + +def is_exit_command(text: str) -> bool: + """Return True if the user asked to stop the voice session.""" + return text.lower().strip().rstrip(".!") in EXIT_COMMANDS diff --git a/src/timmy/voice/audio_io.py b/src/timmy/voice/audio_io.py new file mode 100644 index 00000000..74035baa --- /dev/null +++ b/src/timmy/voice/audio_io.py @@ -0,0 +1,19 @@ +"""Audio capture and playback utilities for the voice loop.""" + +from __future__ import annotations + +import numpy as np + +# ── Defaults ──────────────────────────────────────────────────────────────── + +DEFAULT_SAMPLE_RATE = 16000 # Whisper expects 16 kHz +DEFAULT_CHANNELS = 1 +DEFAULT_SILENCE_THRESHOLD = 0.015 # RMS threshold — tune for your mic/room +DEFAULT_SILENCE_DURATION = 1.5 # seconds of silence to end utterance +DEFAULT_MIN_UTTERANCE = 0.5 # ignore clicks/bumps shorter than this +DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever + + +def _rms(block: np.ndarray) -> float: + """Compute root-mean-square energy of an audio block.""" + return float(np.sqrt(np.mean(block.astype(np.float32) ** 2))) diff --git a/src/timmy/voice/helpers.py b/src/timmy/voice/helpers.py new file mode 100644 index 00000000..ef2fa706 --- /dev/null +++ b/src/timmy/voice/helpers.py @@ -0,0 +1,53 @@ +"""Miscellaneous helpers for the voice loop runtime.""" + +from __future__ import annotations + +import logging +import sys + + +def _suppress_mcp_noise() -> None: + """Quiet down noisy MCP/Agno loggers during voice mode. + + Sets specific loggers to WARNING so the terminal stays clean + for the voice transcript. + """ + for name in ( + "mcp", + "mcp.server", + "mcp.client", + "agno", + "agno.mcp", + "httpx", + "httpcore", + ): + logging.getLogger(name).setLevel(logging.WARNING) + + +def _install_quiet_asyncgen_hooks() -> None: + """Silence MCP stdio_client async-generator teardown noise. + + When the voice loop exits, Python GC finalizes Agno's MCP + stdio_client async generators. anyio's cancel-scope teardown + prints ugly tracebacks to stderr. These are harmless — the + MCP subprocesses die with the loop. We intercept them here. + """ + _orig_hook = getattr(sys, "unraisablehook", None) + + def _quiet_hook(args): + # Swallow RuntimeError from anyio cancel-scope teardown + # and BaseExceptionGroup from MCP stdio_client generators + if args.exc_type in (RuntimeError, BaseExceptionGroup): + msg = str(args.exc_value) if args.exc_value else "" + if "cancel scope" in msg or "unhandled errors" in msg: + return + # Also swallow GeneratorExit from stdio_client + if args.exc_type is GeneratorExit: + return + # Everything else: forward to original hook + if _orig_hook: + _orig_hook(args) + else: + sys.__unraisablehook__(args) + + sys.unraisablehook = _quiet_hook diff --git a/src/timmy/voice/llm.py b/src/timmy/voice/llm.py new file mode 100644 index 00000000..aca3d134 --- /dev/null +++ b/src/timmy/voice/llm.py @@ -0,0 +1,68 @@ +"""LLM integration mixin — async chat and event-loop management.""" + +from __future__ import annotations + +import asyncio +import logging +import sys +import time +import warnings + +from timmy.voice.speech_engines import _VOICE_PREAMBLE, _strip_markdown + +logger = logging.getLogger(__name__) + + +class LLMMixin: + """Mixin providing LLM chat methods for :class:`VoiceLoop`.""" + + def _get_loop(self) -> asyncio.AbstractEventLoop: + """Return a persistent event loop, creating one if needed.""" + if self._loop is None or self._loop.is_closed(): + self._loop = asyncio.new_event_loop() + return self._loop + + def _think(self, user_text: str) -> str: + """Send text to Timmy and get a response.""" + sys.stdout.write(" 💭 Thinking...\r") + sys.stdout.flush() + t0 = time.monotonic() + try: + loop = self._get_loop() + response = loop.run_until_complete(self._chat(user_text)) + except (ConnectionError, RuntimeError, ValueError) as exc: + logger.error("Timmy chat failed: %s", exc) + response = "I'm having trouble thinking right now. Could you try again?" + elapsed = time.monotonic() - t0 + logger.info("Timmy responded in %.1fs", elapsed) + response = _strip_markdown(response) + return response + + async def _chat(self, message: str) -> str: + """Async wrapper around Timmy's session.chat().""" + from timmy.session import chat + + voiced = f"{_VOICE_PREAMBLE}\n\nUser said: {message}" + return await chat(voiced, session_id=self.config.session_id) + + def _cleanup_loop(self) -> None: + """Shut down the persistent event loop cleanly.""" + if self._loop is None or self._loop.is_closed(): + return + + self._loop.set_exception_handler(lambda loop, ctx: None) + try: + self._loop.run_until_complete(self._loop.shutdown_asyncgens()) + except RuntimeError as exc: + logger.debug("Shutdown asyncgens failed: %s", exc) + pass + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + try: + self._loop.close() + except RuntimeError as exc: + logger.debug("Loop close failed: %s", exc) + pass + + self._loop = None diff --git a/src/timmy/voice/speech_engines.py b/src/timmy/voice/speech_engines.py new file mode 100644 index 00000000..b525da3a --- /dev/null +++ b/src/timmy/voice/speech_engines.py @@ -0,0 +1,48 @@ +"""Speech engine constants and text-processing utilities.""" + +from __future__ import annotations + +import re +from pathlib import Path + +# ── Defaults ──────────────────────────────────────────────────────────────── + +DEFAULT_WHISPER_MODEL = "base.en" +DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx" + +# ── Voice-mode system instruction ─────────────────────────────────────────── +# Prepended to user messages so Timmy responds naturally for TTS. +_VOICE_PREAMBLE = ( + "[VOICE MODE] You are speaking aloud through a text-to-speech system. " + "Respond in short, natural spoken sentences. No markdown, no bullet points, " + "no asterisks, no numbered lists, no headers, no bold/italic formatting. " + "Talk like a person in a conversation — concise, warm, direct. " + "Keep responses under 3-4 sentences unless the user asks for detail." +) + + +def _strip_markdown(text: str) -> str: + """Remove markdown formatting so TTS reads naturally. + + Strips: **bold**, *italic*, `code`, # headers, - bullets, + numbered lists, [links](url), etc. + """ + if not text: + return text + # Remove bold/italic markers + text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text) + # Remove inline code + text = re.sub(r"`([^`]+)`", r"\1", text) + # Remove headers (# Header) + text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) + # Remove bullet points (-, *, +) at start of line + text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE) + # Remove numbered lists (1. 2. etc) + text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE) + # Remove link syntax [text](url) → text + text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) + # Remove horizontal rules + text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE) + # Collapse multiple newlines + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() diff --git a/src/timmy/voice/stt.py b/src/timmy/voice/stt.py new file mode 100644 index 00000000..4b3ea4e8 --- /dev/null +++ b/src/timmy/voice/stt.py @@ -0,0 +1,119 @@ +"""Speech-to-text mixin — microphone capture and Whisper transcription.""" + +from __future__ import annotations + +import logging +import sys +import time + +import numpy as np + +from timmy.voice.audio_io import DEFAULT_CHANNELS, _rms + +logger = logging.getLogger(__name__) + + +class STTMixin: + """Mixin providing STT methods for :class:`VoiceLoop`.""" + + def _load_whisper(self): + """Load Whisper model (lazy, first use only).""" + if self._whisper_model is not None: + return + import whisper + + logger.info("Loading Whisper model: %s", self.config.whisper_model) + self._whisper_model = whisper.load_model(self.config.whisper_model) + logger.info("Whisper model loaded.") + + def _record_utterance(self) -> np.ndarray | None: + """Record from microphone until silence is detected.""" + import sounddevice as sd + + sr = self.config.sample_rate + block_size = int(sr * 0.1) + silence_blocks = int(self.config.silence_duration / 0.1) + min_blocks = int(self.config.min_utterance / 0.1) + max_blocks = int(self.config.max_utterance / 0.1) + + sys.stdout.write("\n 🎤 Listening... (speak now)\n") + sys.stdout.flush() + + with sd.InputStream( + samplerate=sr, + channels=DEFAULT_CHANNELS, + dtype="float32", + blocksize=block_size, + ) as stream: + chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks) + + return self._finalize_utterance(chunks, min_blocks, sr) + + def _capture_audio_blocks( + self, + stream, + block_size: int, + silence_blocks: int, + max_blocks: int, + ) -> list[np.ndarray]: + """Read audio blocks from *stream* until silence or max length.""" + chunks: list[np.ndarray] = [] + silent_count = 0 + recording = False + + while self._running: + block, overflowed = stream.read(block_size) + if overflowed: + logger.debug("Audio buffer overflowed") + + rms = _rms(block) + + if not recording: + if rms > self.config.silence_threshold: + recording = True + silent_count = 0 + chunks.append(block.copy()) + sys.stdout.write(" 📢 Recording...\r") + sys.stdout.flush() + else: + chunks.append(block.copy()) + if rms < self.config.silence_threshold: + silent_count += 1 + else: + silent_count = 0 + if silent_count >= silence_blocks: + break + if len(chunks) >= max_blocks: + logger.info("Max utterance length reached, stopping.") + break + + return chunks + + @staticmethod + def _finalize_utterance( + chunks: list[np.ndarray], min_blocks: int, sample_rate: int + ) -> np.ndarray | None: + """Concatenate recorded chunks and report duration.""" + if not chunks or len(chunks) < min_blocks: + return None + + audio = np.concatenate(chunks, axis=0).flatten() + duration = len(audio) / sample_rate + sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n") + sys.stdout.flush() + return audio + + def _transcribe(self, audio: np.ndarray) -> str: + """Transcribe audio using local Whisper model.""" + self._load_whisper() + + sys.stdout.write(" 🧠 Transcribing...\r") + sys.stdout.flush() + + t0 = time.monotonic() + result = self._whisper_model.transcribe(audio, language="en", fp16=False) + elapsed = time.monotonic() - t0 + + text = result["text"].strip() + logger.info("Whisper transcribed in %.1fs: '%s'", elapsed, text[:80]) + return text diff --git a/src/timmy/voice/tts.py b/src/timmy/voice/tts.py new file mode 100644 index 00000000..1bc3f95a --- /dev/null +++ b/src/timmy/voice/tts.py @@ -0,0 +1,78 @@ +"""Text-to-speech mixin — Piper TTS and macOS ``say`` fallback.""" + +from __future__ import annotations + +import logging +import subprocess +import tempfile +import time +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class TTSMixin: + """Mixin providing TTS methods for :class:`VoiceLoop`.""" + + def _speak(self, text: str) -> None: + """Speak text aloud using Piper TTS or macOS `say`.""" + if not text: + return + self._speaking = True + try: + if self.config.use_say_fallback: + self._speak_say(text) + else: + self._speak_piper(text) + finally: + self._speaking = False + + def _speak_piper(self, text: str) -> None: + """Speak using Piper TTS (local ONNX inference).""" + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp_path = tmp.name + try: + cmd = ["piper", "--model", str(self.config.piper_voice), "--output_file", tmp_path] + proc = subprocess.run(cmd, input=text, capture_output=True, text=True, timeout=30) + if proc.returncode != 0: + logger.error("Piper failed: %s", proc.stderr) + self._speak_say(text) + return + self._play_audio(tmp_path) + finally: + Path(tmp_path).unlink(missing_ok=True) + + def _speak_say(self, text: str) -> None: + """Speak using macOS `say` command.""" + try: + proc = subprocess.Popen( + ["say", "-r", "180", text], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + proc.wait(timeout=60) + except subprocess.TimeoutExpired: + proc.kill() + except FileNotFoundError: + logger.error("macOS `say` command not found") + + def _play_audio(self, path: str) -> None: + """Play a WAV file. Can be interrupted by setting self._interrupted.""" + try: + proc = subprocess.Popen( + ["afplay", path], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + while proc.poll() is None: + if self._interrupted: + proc.terminate() + self._interrupted = False + logger.info("TTS interrupted by user") + return + time.sleep(0.05) + except FileNotFoundError: + try: + subprocess.run(["aplay", path], capture_output=True, timeout=60) + except (FileNotFoundError, subprocess.TimeoutExpired): + logger.error("No audio player found (tried afplay, aplay)") diff --git a/src/timmy/voice_loop.py b/src/timmy/voice_loop.py index e2fcfb25..d57f6a8c 100644 --- a/src/timmy/voice_loop.py +++ b/src/timmy/voice_loop.py @@ -13,76 +13,41 @@ Usage: Requires: sounddevice, numpy, whisper, piper-tts """ +from __future__ import annotations + import asyncio import logging -import re -import subprocess import sys -import tempfile -import time from dataclasses import dataclass from pathlib import Path -import numpy as np +from timmy.voice.activation import ( + EXIT_COMMANDS, + WHISPER_HALLUCINATIONS, + is_exit_command, + is_hallucination, +) +from timmy.voice.audio_io import ( + DEFAULT_MAX_UTTERANCE, + DEFAULT_MIN_UTTERANCE, + DEFAULT_SAMPLE_RATE, + DEFAULT_SILENCE_DURATION, + DEFAULT_SILENCE_THRESHOLD, +) +from timmy.voice.helpers import _install_quiet_asyncgen_hooks, _suppress_mcp_noise +from timmy.voice.llm import LLMMixin +from timmy.voice.speech_engines import ( + DEFAULT_PIPER_VOICE, + DEFAULT_WHISPER_MODEL, +) +from timmy.voice.stt import STTMixin +from timmy.voice.tts import TTSMixin logger = logging.getLogger(__name__) -# ── Voice-mode system instruction ─────────────────────────────────────────── -# Prepended to user messages so Timmy responds naturally for TTS. -_VOICE_PREAMBLE = ( - "[VOICE MODE] You are speaking aloud through a text-to-speech system. " - "Respond in short, natural spoken sentences. No markdown, no bullet points, " - "no asterisks, no numbered lists, no headers, no bold/italic formatting. " - "Talk like a person in a conversation — concise, warm, direct. " - "Keep responses under 3-4 sentences unless the user asks for detail." -) - - -def _strip_markdown(text: str) -> str: - """Remove markdown formatting so TTS reads naturally. - - Strips: **bold**, *italic*, `code`, # headers, - bullets, - numbered lists, [links](url), etc. - """ - if not text: - return text - # Remove bold/italic markers - text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text) - # Remove inline code - text = re.sub(r"`([^`]+)`", r"\1", text) - # Remove headers (# Header) - text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) - # Remove bullet points (-, *, +) at start of line - text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE) - # Remove numbered lists (1. 2. etc) - text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE) - # Remove link syntax [text](url) → text - text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) - # Remove horizontal rules - text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE) - # Collapse multiple newlines - text = re.sub(r"\n{3,}", "\n\n", text) - return text.strip() - - -# ── Defaults ──────────────────────────────────────────────────────────────── - -DEFAULT_WHISPER_MODEL = "base.en" -DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx" -DEFAULT_SAMPLE_RATE = 16000 # Whisper expects 16 kHz -DEFAULT_CHANNELS = 1 -DEFAULT_SILENCE_THRESHOLD = 0.015 # RMS threshold — tune for your mic/room -DEFAULT_SILENCE_DURATION = 1.5 # seconds of silence to end utterance -DEFAULT_MIN_UTTERANCE = 0.5 # ignore clicks/bumps shorter than this -DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever DEFAULT_SESSION_ID = "voice" -def _rms(block: np.ndarray) -> float: - """Compute root-mean-square energy of an audio block.""" - return float(np.sqrt(np.mean(block.astype(np.float32) ** 2))) - - @dataclass class VoiceConfig: """Configuration for the voice loop.""" @@ -104,7 +69,7 @@ class VoiceConfig: model_size: str | None = None -class VoiceLoop: +class VoiceLoop(STTMixin, TTSMixin, LLMMixin): """Sovereign listen-think-speak loop. Everything runs locally: @@ -113,312 +78,35 @@ class VoiceLoop: - TTS: Piper (local ONNX model) or macOS `say` """ + # Class-level constants delegate to the activation module. + _WHISPER_HALLUCINATIONS = WHISPER_HALLUCINATIONS + _EXIT_COMMANDS = EXIT_COMMANDS + def __init__(self, config: VoiceConfig | None = None) -> None: self.config = config or VoiceConfig() self._whisper_model = None self._running = False - self._speaking = False # True while TTS is playing - self._interrupted = False # set when user talks over TTS - # Persistent event loop — reused across all chat calls so Agno's - # MCP sessions don't die when the loop closes. + self._speaking = False + self._interrupted = False self._loop: asyncio.AbstractEventLoop | None = None # ── Lazy initialization ───────────────────────────────────────────── - def _load_whisper(self): - """Load Whisper model (lazy, first use only).""" - if self._whisper_model is not None: - return - import whisper - - logger.info("Loading Whisper model: %s", self.config.whisper_model) - self._whisper_model = whisper.load_model(self.config.whisper_model) - logger.info("Whisper model loaded.") - def _ensure_piper(self) -> bool: """Check that Piper voice model exists.""" if self.config.use_say_fallback: return True voice_path = self.config.piper_voice if not voice_path.exists(): - logger.warning("Piper voice not found at %s — falling back to `say`", voice_path) + logger.warning( + "Piper voice not found at %s — falling back to `say`", voice_path + ) self.config.use_say_fallback = True return True return True - # ── STT: Microphone → Text ────────────────────────────────────────── - - def _record_utterance(self) -> np.ndarray | None: - """Record from microphone until silence is detected. - - Uses energy-based Voice Activity Detection: - 1. Wait for speech (RMS above threshold) - 2. Record until silence (RMS below threshold for silence_duration) - 3. Return the audio as a numpy array - - Returns None if interrupted or no speech detected. - """ - import sounddevice as sd - - sr = self.config.sample_rate - block_size = int(sr * 0.1) # 100ms blocks - silence_blocks = int(self.config.silence_duration / 0.1) - min_blocks = int(self.config.min_utterance / 0.1) - max_blocks = int(self.config.max_utterance / 0.1) - - sys.stdout.write("\n 🎤 Listening... (speak now)\n") - sys.stdout.flush() - - with sd.InputStream( - samplerate=sr, - channels=DEFAULT_CHANNELS, - dtype="float32", - blocksize=block_size, - ) as stream: - chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks) - - return self._finalize_utterance(chunks, min_blocks, sr) - - def _capture_audio_blocks( - self, - stream, - block_size: int, - silence_blocks: int, - max_blocks: int, - ) -> list[np.ndarray]: - """Read audio blocks from *stream* until silence or max length. - - Returns the list of captured audio chunks (may be empty). - """ - chunks: list[np.ndarray] = [] - silent_count = 0 - recording = False - - while self._running: - block, overflowed = stream.read(block_size) - if overflowed: - logger.debug("Audio buffer overflowed") - - rms = _rms(block) - - if not recording: - if rms > self.config.silence_threshold: - recording = True - silent_count = 0 - chunks.append(block.copy()) - sys.stdout.write(" 📢 Recording...\r") - sys.stdout.flush() - else: - chunks.append(block.copy()) - - if rms < self.config.silence_threshold: - silent_count += 1 - else: - silent_count = 0 - - if silent_count >= silence_blocks: - break - - if len(chunks) >= max_blocks: - logger.info("Max utterance length reached, stopping.") - break - - return chunks - - @staticmethod - def _finalize_utterance( - chunks: list[np.ndarray], min_blocks: int, sample_rate: int - ) -> np.ndarray | None: - """Concatenate recorded chunks and report duration. - - Returns ``None`` if the utterance is too short to be meaningful. - """ - if not chunks or len(chunks) < min_blocks: - return None - - audio = np.concatenate(chunks, axis=0).flatten() - duration = len(audio) / sample_rate - sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n") - sys.stdout.flush() - return audio - - def _transcribe(self, audio: np.ndarray) -> str: - """Transcribe audio using local Whisper model.""" - self._load_whisper() - - sys.stdout.write(" 🧠 Transcribing...\r") - sys.stdout.flush() - - t0 = time.monotonic() - result = self._whisper_model.transcribe( - audio, - language="en", - fp16=False, # MPS/CPU — fp16 can cause issues on some setups - ) - elapsed = time.monotonic() - t0 - - text = result["text"].strip() - logger.info("Whisper transcribed in %.1fs: '%s'", elapsed, text[:80]) - return text - - # ── TTS: Text → Speaker ───────────────────────────────────────────── - - def _speak(self, text: str) -> None: - """Speak text aloud using Piper TTS or macOS `say`.""" - if not text: - return - - self._speaking = True - try: - if self.config.use_say_fallback: - self._speak_say(text) - else: - self._speak_piper(text) - finally: - self._speaking = False - - def _speak_piper(self, text: str) -> None: - """Speak using Piper TTS (local ONNX inference).""" - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: - tmp_path = tmp.name - - try: - # Generate WAV with Piper - cmd = [ - "piper", - "--model", - str(self.config.piper_voice), - "--output_file", - tmp_path, - ] - - proc = subprocess.run( - cmd, - input=text, - capture_output=True, - text=True, - timeout=30, - ) - - if proc.returncode != 0: - logger.error("Piper failed: %s", proc.stderr) - self._speak_say(text) # fallback - return - - # Play with afplay (macOS) — interruptible - self._play_audio(tmp_path) - - finally: - Path(tmp_path).unlink(missing_ok=True) - - def _speak_say(self, text: str) -> None: - """Speak using macOS `say` command.""" - try: - proc = subprocess.Popen( - ["say", "-r", "180", text], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - proc.wait(timeout=60) - except subprocess.TimeoutExpired: - proc.kill() - except FileNotFoundError: - logger.error("macOS `say` command not found") - - def _play_audio(self, path: str) -> None: - """Play a WAV file. Can be interrupted by setting self._interrupted.""" - try: - proc = subprocess.Popen( - ["afplay", path], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - # Poll so we can interrupt - while proc.poll() is None: - if self._interrupted: - proc.terminate() - self._interrupted = False - logger.info("TTS interrupted by user") - return - time.sleep(0.05) - except FileNotFoundError: - # Not macOS — try aplay (Linux) - try: - subprocess.run(["aplay", path], capture_output=True, timeout=60) - except (FileNotFoundError, subprocess.TimeoutExpired): - logger.error("No audio player found (tried afplay, aplay)") - - # ── LLM: Text → Response ─────────────────────────────────────────── - - def _get_loop(self) -> asyncio.AbstractEventLoop: - """Return a persistent event loop, creating one if needed. - - A single loop is reused for the entire voice session so Agno's - MCP tool-server connections survive across turns. - """ - if self._loop is None or self._loop.is_closed(): - self._loop = asyncio.new_event_loop() - return self._loop - - def _think(self, user_text: str) -> str: - """Send text to Timmy and get a response.""" - sys.stdout.write(" 💭 Thinking...\r") - sys.stdout.flush() - - t0 = time.monotonic() - - try: - loop = self._get_loop() - response = loop.run_until_complete(self._chat(user_text)) - except (ConnectionError, RuntimeError, ValueError) as exc: - logger.error("Timmy chat failed: %s", exc) - response = "I'm having trouble thinking right now. Could you try again?" - - elapsed = time.monotonic() - t0 - logger.info("Timmy responded in %.1fs", elapsed) - - # Strip markdown so TTS doesn't read asterisks, bullets, etc. - response = _strip_markdown(response) - return response - - async def _chat(self, message: str) -> str: - """Async wrapper around Timmy's session.chat(). - - Prepends the voice-mode instruction so Timmy responds in - natural spoken language rather than markdown. - """ - from timmy.session import chat - - voiced = f"{_VOICE_PREAMBLE}\n\nUser said: {message}" - return await chat(voiced, session_id=self.config.session_id) - # ── Main Loop ─────────────────────────────────────────────────────── - # Whisper hallucinates these on silence/noise — skip them. - _WHISPER_HALLUCINATIONS = frozenset( - { - "you", - "thanks.", - "thank you.", - "bye.", - "", - "thanks for watching!", - "thank you for watching!", - } - ) - - # Spoken phrases that end the voice session. - _EXIT_COMMANDS = frozenset( - { - "goodbye", - "exit", - "quit", - "stop", - "goodbye timmy", - "stop listening", - } - ) - def _log_banner(self) -> None: """Log the startup banner with STT/TTS/LLM configuration.""" tts_label = ( @@ -438,21 +126,19 @@ class VoiceLoop: def _is_hallucination(self, text: str) -> bool: """Return True if *text* is a known Whisper hallucination.""" - return not text or text.lower() in self._WHISPER_HALLUCINATIONS + return is_hallucination(text) def _is_exit_command(self, text: str) -> bool: """Return True if the user asked to stop the voice session.""" - return text.lower().strip().rstrip(".!") in self._EXIT_COMMANDS + return is_exit_command(text) def _process_turn(self, text: str) -> None: """Handle a single listen-think-speak turn after transcription.""" sys.stdout.write(f"\n 👤 You: {text}\n") sys.stdout.flush() - response = self._think(text) sys.stdout.write(f" 🤖 Timmy: {response}\n") sys.stdout.flush() - self._speak(response) def run(self) -> None: @@ -461,112 +147,26 @@ class VoiceLoop: _suppress_mcp_noise() _install_quiet_asyncgen_hooks() self._log_banner() - self._running = True - try: while self._running: audio = self._record_utterance() if audio is None: continue - text = self._transcribe(audio) if self._is_hallucination(text): logger.debug("Ignoring likely Whisper hallucination: '%s'", text) continue - if self._is_exit_command(text): logger.info("👋 Goodbye!") break - self._process_turn(text) - except KeyboardInterrupt: logger.info("👋 Voice loop stopped.") finally: self._running = False self._cleanup_loop() - def _cleanup_loop(self) -> None: - """Shut down the persistent event loop cleanly. - - Agno's MCP stdio sessions leave async generators (stdio_client) - that complain loudly when torn down from a different task. - We swallow those errors — they're harmless, the subprocesses - die with the loop anyway. - """ - if self._loop is None or self._loop.is_closed(): - return - - # Silence "error during closing of asynchronous generator" warnings - # from MCP's anyio/asyncio cancel-scope teardown. - import warnings - - self._loop.set_exception_handler(lambda loop, ctx: None) - - try: - self._loop.run_until_complete(self._loop.shutdown_asyncgens()) - except RuntimeError as exc: - logger.debug("Shutdown asyncgens failed: %s", exc) - pass - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - try: - self._loop.close() - except RuntimeError as exc: - logger.debug("Loop close failed: %s", exc) - pass - - self._loop = None - def stop(self) -> None: """Stop the voice loop (from another thread).""" self._running = False - - -def _suppress_mcp_noise() -> None: - """Quiet down noisy MCP/Agno loggers during voice mode. - - Sets specific loggers to WARNING so the terminal stays clean - for the voice transcript. - """ - for name in ( - "mcp", - "mcp.server", - "mcp.client", - "agno", - "agno.mcp", - "httpx", - "httpcore", - ): - logging.getLogger(name).setLevel(logging.WARNING) - - -def _install_quiet_asyncgen_hooks() -> None: - """Silence MCP stdio_client async-generator teardown noise. - - When the voice loop exits, Python GC finalizes Agno's MCP - stdio_client async generators. anyio's cancel-scope teardown - prints ugly tracebacks to stderr. These are harmless — the - MCP subprocesses die with the loop. We intercept them here. - """ - _orig_hook = getattr(sys, "unraisablehook", None) - - def _quiet_hook(args): - # Swallow RuntimeError from anyio cancel-scope teardown - # and BaseExceptionGroup from MCP stdio_client generators - if args.exc_type in (RuntimeError, BaseExceptionGroup): - msg = str(args.exc_value) if args.exc_value else "" - if "cancel scope" in msg or "unhandled errors" in msg: - return - # Also swallow GeneratorExit from stdio_client - if args.exc_type is GeneratorExit: - return - # Everything else: forward to original hook - if _orig_hook: - _orig_hook(args) - else: - sys.__unraisablehook__(args) - - sys.unraisablehook = _quiet_hook