1
0
This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/src/timmy/voice_loop.py
2026-03-19 21:37:32 -04:00

573 lines
20 KiB
Python

"""Sovereign voice loop — listen, think, speak.
A fully local voice interface for Timmy. No cloud, no network calls.
All processing happens on the user's machine:
Mic → VAD/silence detection → Whisper (local STT) → Timmy chat → Piper TTS → Speaker
Usage:
from timmy.voice_loop import VoiceLoop
loop = VoiceLoop()
loop.run() # blocks, Ctrl-C to stop
Requires: sounddevice, numpy, whisper, piper-tts
"""
import asyncio
import logging
import re
import subprocess
import sys
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
import numpy as np
logger = logging.getLogger(__name__)
# ── Voice-mode system instruction ───────────────────────────────────────────
# Prepended to user messages so Timmy responds naturally for TTS.
_VOICE_PREAMBLE = (
"[VOICE MODE] You are speaking aloud through a text-to-speech system. "
"Respond in short, natural spoken sentences. No markdown, no bullet points, "
"no asterisks, no numbered lists, no headers, no bold/italic formatting. "
"Talk like a person in a conversation — concise, warm, direct. "
"Keep responses under 3-4 sentences unless the user asks for detail."
)
def _strip_markdown(text: str) -> str:
"""Remove markdown formatting so TTS reads naturally.
Strips: **bold**, *italic*, `code`, # headers, - bullets,
numbered lists, [links](url), etc.
"""
if not text:
return text
# Remove bold/italic markers
text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
# Remove inline code
text = re.sub(r"`([^`]+)`", r"\1", text)
# Remove headers (# Header)
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
# Remove bullet points (-, *, +) at start of line
text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
# Remove numbered lists (1. 2. etc)
text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
# Remove link syntax [text](url) → text
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
# Remove horizontal rules
text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
# Collapse multiple newlines
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
# ── Defaults ────────────────────────────────────────────────────────────────
DEFAULT_WHISPER_MODEL = "base.en"
DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx"
DEFAULT_SAMPLE_RATE = 16000 # Whisper expects 16 kHz
DEFAULT_CHANNELS = 1
DEFAULT_SILENCE_THRESHOLD = 0.015 # RMS threshold — tune for your mic/room
DEFAULT_SILENCE_DURATION = 1.5 # seconds of silence to end utterance
DEFAULT_MIN_UTTERANCE = 0.5 # ignore clicks/bumps shorter than this
DEFAULT_MAX_UTTERANCE = 30.0 # safety cap — don't record forever
DEFAULT_SESSION_ID = "voice"
def _rms(block: np.ndarray) -> float:
"""Compute root-mean-square energy of an audio block."""
return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
@dataclass
class VoiceConfig:
"""Configuration for the voice loop."""
whisper_model: str = DEFAULT_WHISPER_MODEL
piper_voice: Path = DEFAULT_PIPER_VOICE
sample_rate: int = DEFAULT_SAMPLE_RATE
silence_threshold: float = DEFAULT_SILENCE_THRESHOLD
silence_duration: float = DEFAULT_SILENCE_DURATION
min_utterance: float = DEFAULT_MIN_UTTERANCE
max_utterance: float = DEFAULT_MAX_UTTERANCE
session_id: str = DEFAULT_SESSION_ID
# Set True to use macOS `say` instead of Piper
use_say_fallback: bool = False
# Piper speaking rate (default 1.0, lower = slower)
speaking_rate: float = 1.0
# Backend/model for Timmy inference
backend: str | None = None
model_size: str | None = None
class VoiceLoop:
"""Sovereign listen-think-speak loop.
Everything runs locally:
- STT: OpenAI Whisper (local model, no API)
- LLM: Timmy via Ollama (local inference)
- TTS: Piper (local ONNX model) or macOS `say`
"""
def __init__(self, config: VoiceConfig | None = None) -> None:
self.config = config or VoiceConfig()
self._whisper_model = None
self._running = False
self._speaking = False # True while TTS is playing
self._interrupted = False # set when user talks over TTS
# Persistent event loop — reused across all chat calls so Agno's
# MCP sessions don't die when the loop closes.
self._loop: asyncio.AbstractEventLoop | None = None
# ── Lazy initialization ─────────────────────────────────────────────
def _load_whisper(self):
"""Load Whisper model (lazy, first use only)."""
if self._whisper_model is not None:
return
import whisper
logger.info("Loading Whisper model: %s", self.config.whisper_model)
self._whisper_model = whisper.load_model(self.config.whisper_model)
logger.info("Whisper model loaded.")
def _ensure_piper(self) -> bool:
"""Check that Piper voice model exists."""
if self.config.use_say_fallback:
return True
voice_path = self.config.piper_voice
if not voice_path.exists():
logger.warning("Piper voice not found at %s — falling back to `say`", voice_path)
self.config.use_say_fallback = True
return True
return True
# ── STT: Microphone → Text ──────────────────────────────────────────
def _record_utterance(self) -> np.ndarray | None:
"""Record from microphone until silence is detected.
Uses energy-based Voice Activity Detection:
1. Wait for speech (RMS above threshold)
2. Record until silence (RMS below threshold for silence_duration)
3. Return the audio as a numpy array
Returns None if interrupted or no speech detected.
"""
import sounddevice as sd
sr = self.config.sample_rate
block_size = int(sr * 0.1) # 100ms blocks
silence_blocks = int(self.config.silence_duration / 0.1)
min_blocks = int(self.config.min_utterance / 0.1)
max_blocks = int(self.config.max_utterance / 0.1)
sys.stdout.write("\n 🎤 Listening... (speak now)\n")
sys.stdout.flush()
with sd.InputStream(
samplerate=sr,
channels=DEFAULT_CHANNELS,
dtype="float32",
blocksize=block_size,
) as stream:
chunks = self._capture_audio_blocks(stream, block_size, silence_blocks, max_blocks)
return self._finalize_utterance(chunks, min_blocks, sr)
def _capture_audio_blocks(
self,
stream,
block_size: int,
silence_blocks: int,
max_blocks: int,
) -> list[np.ndarray]:
"""Read audio blocks from *stream* until silence or max length.
Returns the list of captured audio chunks (may be empty).
"""
chunks: list[np.ndarray] = []
silent_count = 0
recording = False
while self._running:
block, overflowed = stream.read(block_size)
if overflowed:
logger.debug("Audio buffer overflowed")
rms = _rms(block)
if not recording:
if rms > self.config.silence_threshold:
recording = True
silent_count = 0
chunks.append(block.copy())
sys.stdout.write(" 📢 Recording...\r")
sys.stdout.flush()
else:
chunks.append(block.copy())
if rms < self.config.silence_threshold:
silent_count += 1
else:
silent_count = 0
if silent_count >= silence_blocks:
break
if len(chunks) >= max_blocks:
logger.info("Max utterance length reached, stopping.")
break
return chunks
@staticmethod
def _finalize_utterance(
chunks: list[np.ndarray], min_blocks: int, sample_rate: int
) -> np.ndarray | None:
"""Concatenate recorded chunks and report duration.
Returns ``None`` if the utterance is too short to be meaningful.
"""
if not chunks or len(chunks) < min_blocks:
return None
audio = np.concatenate(chunks, axis=0).flatten()
duration = len(audio) / sample_rate
sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
sys.stdout.flush()
return audio
def _transcribe(self, audio: np.ndarray) -> str:
"""Transcribe audio using local Whisper model."""
self._load_whisper()
sys.stdout.write(" 🧠 Transcribing...\r")
sys.stdout.flush()
t0 = time.monotonic()
result = self._whisper_model.transcribe(
audio,
language="en",
fp16=False, # MPS/CPU — fp16 can cause issues on some setups
)
elapsed = time.monotonic() - t0
text = result["text"].strip()
logger.info("Whisper transcribed in %.1fs: '%s'", elapsed, text[:80])
return text
# ── TTS: Text → Speaker ─────────────────────────────────────────────
def _speak(self, text: str) -> None:
"""Speak text aloud using Piper TTS or macOS `say`."""
if not text:
return
self._speaking = True
try:
if self.config.use_say_fallback:
self._speak_say(text)
else:
self._speak_piper(text)
finally:
self._speaking = False
def _speak_piper(self, text: str) -> None:
"""Speak using Piper TTS (local ONNX inference)."""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp_path = tmp.name
try:
# Generate WAV with Piper
cmd = [
"piper",
"--model",
str(self.config.piper_voice),
"--output_file",
tmp_path,
]
proc = subprocess.run(
cmd,
input=text,
capture_output=True,
text=True,
timeout=30,
)
if proc.returncode != 0:
logger.error("Piper failed: %s", proc.stderr)
self._speak_say(text) # fallback
return
# Play with afplay (macOS) — interruptible
self._play_audio(tmp_path)
finally:
Path(tmp_path).unlink(missing_ok=True)
def _speak_say(self, text: str) -> None:
"""Speak using macOS `say` command."""
try:
proc = subprocess.Popen(
["say", "-r", "180", text],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
proc.wait(timeout=60)
except subprocess.TimeoutExpired:
proc.kill()
except FileNotFoundError:
logger.error("macOS `say` command not found")
def _play_audio(self, path: str) -> None:
"""Play a WAV file. Can be interrupted by setting self._interrupted."""
try:
proc = subprocess.Popen(
["afplay", path],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
# Poll so we can interrupt
while proc.poll() is None:
if self._interrupted:
proc.terminate()
self._interrupted = False
logger.info("TTS interrupted by user")
return
time.sleep(0.05)
except FileNotFoundError:
# Not macOS — try aplay (Linux)
try:
subprocess.run(["aplay", path], capture_output=True, timeout=60)
except (FileNotFoundError, subprocess.TimeoutExpired):
logger.error("No audio player found (tried afplay, aplay)")
# ── LLM: Text → Response ───────────────────────────────────────────
def _get_loop(self) -> asyncio.AbstractEventLoop:
"""Return a persistent event loop, creating one if needed.
A single loop is reused for the entire voice session so Agno's
MCP tool-server connections survive across turns.
"""
if self._loop is None or self._loop.is_closed():
self._loop = asyncio.new_event_loop()
return self._loop
def _think(self, user_text: str) -> str:
"""Send text to Timmy and get a response."""
sys.stdout.write(" 💭 Thinking...\r")
sys.stdout.flush()
t0 = time.monotonic()
try:
loop = self._get_loop()
response = loop.run_until_complete(self._chat(user_text))
except (ConnectionError, RuntimeError, ValueError) as exc:
logger.error("Timmy chat failed: %s", exc)
response = "I'm having trouble thinking right now. Could you try again?"
elapsed = time.monotonic() - t0
logger.info("Timmy responded in %.1fs", elapsed)
# Strip markdown so TTS doesn't read asterisks, bullets, etc.
response = _strip_markdown(response)
return response
async def _chat(self, message: str) -> str:
"""Async wrapper around Timmy's session.chat().
Prepends the voice-mode instruction so Timmy responds in
natural spoken language rather than markdown.
"""
from timmy.session import chat
voiced = f"{_VOICE_PREAMBLE}\n\nUser said: {message}"
return await chat(voiced, session_id=self.config.session_id)
# ── Main Loop ───────────────────────────────────────────────────────
# Whisper hallucinates these on silence/noise — skip them.
_WHISPER_HALLUCINATIONS = frozenset(
{
"you",
"thanks.",
"thank you.",
"bye.",
"",
"thanks for watching!",
"thank you for watching!",
}
)
# Spoken phrases that end the voice session.
_EXIT_COMMANDS = frozenset(
{
"goodbye",
"exit",
"quit",
"stop",
"goodbye timmy",
"stop listening",
}
)
def _log_banner(self) -> None:
"""Log the startup banner with STT/TTS/LLM configuration."""
tts_label = (
"macOS say"
if self.config.use_say_fallback
else f"Piper ({self.config.piper_voice.name})"
)
logger.info(
"\n" + "=" * 60 + "\n"
" 🎙️ Timmy Voice — Sovereign Voice Interface\n" + "=" * 60 + "\n"
f" STT: Whisper ({self.config.whisper_model})\n"
f" TTS: {tts_label}\n"
" LLM: Timmy (local Ollama)\n" + "=" * 60 + "\n"
" Speak naturally. Timmy will listen, think, and respond.\n"
" Press Ctrl-C to exit.\n" + "=" * 60
)
def _is_hallucination(self, text: str) -> bool:
"""Return True if *text* is a known Whisper hallucination."""
return not text or text.lower() in self._WHISPER_HALLUCINATIONS
def _is_exit_command(self, text: str) -> bool:
"""Return True if the user asked to stop the voice session."""
return text.lower().strip().rstrip(".!") in self._EXIT_COMMANDS
def _process_turn(self, text: str) -> None:
"""Handle a single listen-think-speak turn after transcription."""
sys.stdout.write(f"\n 👤 You: {text}\n")
sys.stdout.flush()
response = self._think(text)
sys.stdout.write(f" 🤖 Timmy: {response}\n")
sys.stdout.flush()
self._speak(response)
def run(self) -> None:
"""Run the voice loop. Blocks until Ctrl-C."""
self._ensure_piper()
_suppress_mcp_noise()
_install_quiet_asyncgen_hooks()
self._log_banner()
self._running = True
try:
while self._running:
audio = self._record_utterance()
if audio is None:
continue
text = self._transcribe(audio)
if self._is_hallucination(text):
logger.debug("Ignoring likely Whisper hallucination: '%s'", text)
continue
if self._is_exit_command(text):
logger.info("👋 Goodbye!")
break
self._process_turn(text)
except KeyboardInterrupt:
logger.info("👋 Voice loop stopped.")
finally:
self._running = False
self._cleanup_loop()
def _cleanup_loop(self) -> None:
"""Shut down the persistent event loop cleanly.
Agno's MCP stdio sessions leave async generators (stdio_client)
that complain loudly when torn down from a different task.
We swallow those errors — they're harmless, the subprocesses
die with the loop anyway.
"""
if self._loop is None or self._loop.is_closed():
return
# Silence "error during closing of asynchronous generator" warnings
# from MCP's anyio/asyncio cancel-scope teardown.
import warnings
self._loop.set_exception_handler(lambda loop, ctx: None)
try:
self._loop.run_until_complete(self._loop.shutdown_asyncgens())
except RuntimeError as exc:
logger.debug("Shutdown asyncgens failed: %s", exc)
pass
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
try:
self._loop.close()
except RuntimeError as exc:
logger.debug("Loop close failed: %s", exc)
pass
self._loop = None
def stop(self) -> None:
"""Stop the voice loop (from another thread)."""
self._running = False
def _suppress_mcp_noise() -> None:
"""Quiet down noisy MCP/Agno loggers during voice mode.
Sets specific loggers to WARNING so the terminal stays clean
for the voice transcript.
"""
for name in (
"mcp",
"mcp.server",
"mcp.client",
"agno",
"agno.mcp",
"httpx",
"httpcore",
):
logging.getLogger(name).setLevel(logging.WARNING)
def _install_quiet_asyncgen_hooks() -> None:
"""Silence MCP stdio_client async-generator teardown noise.
When the voice loop exits, Python GC finalizes Agno's MCP
stdio_client async generators. anyio's cancel-scope teardown
prints ugly tracebacks to stderr. These are harmless — the
MCP subprocesses die with the loop. We intercept them here.
"""
_orig_hook = getattr(sys, "unraisablehook", None)
def _quiet_hook(args):
# Swallow RuntimeError from anyio cancel-scope teardown
# and BaseExceptionGroup from MCP stdio_client generators
if args.exc_type in (RuntimeError, BaseExceptionGroup):
msg = str(args.exc_value) if args.exc_value else ""
if "cancel scope" in msg or "unhandled errors" in msg:
return
# Also swallow GeneratorExit from stdio_client
if args.exc_type is GeneratorExit:
return
# Everything else: forward to original hook
if _orig_hook:
_orig_hook(args)
else:
sys.__unraisablehook__(args)
sys.unraisablehook = _quiet_hook