1
0

fix: voice loop — persistent event loop, markdown stripping, MCP noise

Three fixes from real-world testing:

1. Event loop: replaced asyncio.run() with a persistent loop so
   Agno's MCP sessions survive across conversation turns. No more
   'Event loop is closed' errors on turn 2+.

2. Markdown stripping: voice preamble tells Timmy to respond in
   natural spoken language, plus _strip_markdown() as a safety net
   removes **bold**, *italic*, bullets, headers, code fences, etc.
   TTS no longer reads 'asterisk asterisk'.

3. MCP noise: _suppress_mcp_noise() quiets mcp/agno/httpx loggers
   during voice mode so the terminal shows clean transcript only.

32 tests (12 new for markdown stripping + persistent loop).
This commit is contained in:
2026-03-14 14:05:24 -04:00
parent dbadfc425d
commit 782218aa2c
2 changed files with 156 additions and 10 deletions

View File

@@ -15,6 +15,7 @@ Requires: sounddevice, numpy, whisper, piper-tts
import asyncio
import logging
import re
import subprocess
import sys
import tempfile
@@ -26,6 +27,44 @@ import numpy as np
logger = logging.getLogger(__name__)
# ── Voice-mode system instruction ───────────────────────────────────────────
# Prepended to user messages so Timmy responds naturally for TTS.
_VOICE_PREAMBLE = (
"[VOICE MODE] You are speaking aloud through a text-to-speech system. "
"Respond in short, natural spoken sentences. No markdown, no bullet points, "
"no asterisks, no numbered lists, no headers, no bold/italic formatting. "
"Talk like a person in a conversation — concise, warm, direct. "
"Keep responses under 3-4 sentences unless the user asks for detail."
)
def _strip_markdown(text: str) -> str:
"""Remove markdown formatting so TTS reads naturally.
Strips: **bold**, *italic*, `code`, # headers, - bullets,
numbered lists, [links](url), etc.
"""
if not text:
return text
# Remove bold/italic markers
text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
# Remove inline code
text = re.sub(r"`([^`]+)`", r"\1", text)
# Remove headers (# Header)
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
# Remove bullet points (-, *, +) at start of line
text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
# Remove numbered lists (1. 2. etc)
text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
# Remove link syntax [text](url) → text
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
# Remove horizontal rules
text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
# Collapse multiple newlines
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
# ── Defaults ────────────────────────────────────────────────────────────────
DEFAULT_WHISPER_MODEL = "base.en"
@@ -75,6 +114,9 @@ class VoiceLoop:
self._running = False
self._speaking = False # True while TTS is playing
self._interrupted = False # set when user talks over TTS
# Persistent event loop — reused across all chat calls so Agno's
# MCP sessions don't die when the loop closes.
self._loop: asyncio.AbstractEventLoop | None = None
# ── Lazy initialization ─────────────────────────────────────────────
@@ -283,6 +325,16 @@ class VoiceLoop:
# ── LLM: Text → Response ───────────────────────────────────────────
def _get_loop(self) -> asyncio.AbstractEventLoop:
"""Return a persistent event loop, creating one if needed.
A single loop is reused for the entire voice session so Agno's
MCP tool-server connections survive across turns.
"""
if self._loop is None or self._loop.is_closed():
self._loop = asyncio.new_event_loop()
return self._loop
def _think(self, user_text: str) -> str:
"""Send text to Timmy and get a response."""
sys.stdout.write(" 💭 Thinking...\r")
@@ -291,20 +343,29 @@ class VoiceLoop:
t0 = time.monotonic()
try:
response = asyncio.run(self._chat(user_text))
loop = self._get_loop()
response = loop.run_until_complete(self._chat(user_text))
except Exception as exc:
logger.error("Timmy chat failed: %s", exc)
response = "I'm having trouble thinking right now. Could you try again?"
elapsed = time.monotonic() - t0
logger.info("Timmy responded in %.1fs", elapsed)
# Strip markdown so TTS doesn't read asterisks, bullets, etc.
response = _strip_markdown(response)
return response
async def _chat(self, message: str) -> str:
"""Async wrapper around Timmy's session.chat()."""
"""Async wrapper around Timmy's session.chat().
Prepends the voice-mode instruction so Timmy responds in
natural spoken language rather than markdown.
"""
from timmy.session import chat
return await chat(message, session_id=self.config.session_id)
voiced = f"{_VOICE_PREAMBLE}\n\nUser said: {message}"
return await chat(voiced, session_id=self.config.session_id)
# ── Main Loop ───────────────────────────────────────────────────────
@@ -312,6 +373,11 @@ class VoiceLoop:
"""Run the voice loop. Blocks until Ctrl-C."""
self._ensure_piper()
# Suppress MCP / Agno stderr noise during voice mode.
# The "Secure MCP Filesystem Server running on stdio" messages
# are distracting in a voice session.
_suppress_mcp_noise()
tts_label = (
"macOS say"
if self.config.use_say_fallback
@@ -381,7 +447,36 @@ class VoiceLoop:
print("\n\n 👋 Voice loop stopped.\n")
finally:
self._running = False
self._cleanup_loop()
def _cleanup_loop(self) -> None:
"""Shut down the persistent event loop cleanly."""
if self._loop is not None and not self._loop.is_closed():
try:
self._loop.run_until_complete(self._loop.shutdown_asyncgens())
except Exception:
pass
self._loop.close()
self._loop = None
def stop(self) -> None:
"""Stop the voice loop (from another thread)."""
self._running = False
def _suppress_mcp_noise() -> None:
"""Quiet down noisy MCP/Agno loggers during voice mode.
Sets specific loggers to WARNING so the terminal stays clean
for the voice transcript.
"""
for name in (
"mcp",
"mcp.server",
"mcp.client",
"agno",
"agno.mcp",
"httpx",
"httpcore",
):
logging.getLogger(name).setLevel(logging.WARNING)