Files
the-nexus/bin/deepdive_tts.py
Alexander Whitestone ef74536e33
Some checks failed
CI / test (pull_request) Failing after 33s
CI / validate (pull_request) Failing after 26s
Review Approval Gate / verify-review (pull_request) Failing after 5s
feat: add edge-tts as zero-cost voice output provider
- Add EdgeTTSAdapter to bin/deepdive_tts.py (provider key: "edge-tts")
  default voice: en-US-GuyNeural, no API key required
- Add EdgeTTS class to intelligence/deepdive/tts_engine.py
- Update HybridTTS to try edge-tts as fallback between piper and elevenlabs
- Add --voice-memo flag to bin/night_watch.py for spoken nightly reports
- Add edge-tts>=6.1.9 to requirements.txt
- Create docs/voice-output.md documenting all providers and fallback chain
- Add tests/test_edge_tts.py with 17 unit tests (all mocked, no network)

Fixes #1126

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 06:29:26 -04:00

274 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.
Issue: #830 (the-nexus)
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
"""
import argparse
import json
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import os
import urllib.request
@dataclass
class TTSConfig:
    """Runtime configuration shared by all TTS adapters in this module."""
    provider: str  # "piper", "elevenlabs", "openai", or "edge-tts" (keys of ADAPTERS)
    voice_id: str  # Voice name/key; cloud adapters translate it via their VOICE_MAP
    output_dir: Path  # Directory where synthesized audio files are written
    # Provider-specific
    api_key: Optional[str] = None  # Cloud providers only; adapters fall back to env vars
    model: Optional[str] = None  # e.g., "eleven_turbo_v2" or "tts-1"; Piper reuses it as a model path
class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: TTSConfig):
        self.config = config
        # config.model doubles as an override for the on-disk ONNX model path.
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Render *text* to MP3 next to *output_path*; return the MP3 path.

        Raises:
            RuntimeError: if the Piper model file is missing.
            subprocess.CalledProcessError: if piper-tts or lame exits non-zero.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")
        wav_path = output_path.with_suffix(".wav")
        cmd = [
            "piper-tts",
            "--model", str(self.model_path),
            "--output_file", str(wav_path)
        ]
        subprocess.run(cmd, input=text.encode(), check=True)
        # Convert to MP3 for smaller size
        mp3_path = output_path.with_suffix(".mp3")
        try:
            subprocess.run([
                "lame", "-V2", str(wav_path), str(mp3_path)
            ], check=True, capture_output=True)
        finally:
            # BUG FIX: previously the intermediate WAV was only deleted on success,
            # so a failed lame conversion leaked a large temp file. Always clean up.
            wav_path.unlink(missing_ok=True)
        return mp3_path
class ElevenLabsAdapter:
    """Cloud TTS via the ElevenLabs HTTP API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # Friendly lowercase keys -> ElevenLabs voice identifiers.
    # NOTE(review): "Mathew" looks like a typo for "Matthew" — confirm against the
    # ElevenLabs voice library before changing; reproduced as-is here.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        # Explicit key wins; otherwise read the environment.
        self.api_key = config.api_key
        if not self.api_key:
            self.api_key = os.environ.get("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to ElevenLabs and write the returned MP3 bytes; return the MP3 path."""
        resolved_voice = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{resolved_voice}"
        payload = {
            "text": text[:5000],  # ElevenLabs limit
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }
        request = urllib.request.Request(endpoint, data=json.dumps(payload).encode(), method="POST")
        request.add_header("xi-api-key", self.api_key)
        request.add_header("Content-Type", "application/json")
        destination = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(request, timeout=120) as response:
            destination.write_bytes(response.read())
        return destination
class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    # OpenAI voice names are used verbatim; the map doubles as the allow-list.
    VOICE_MAP = {name: name for name in ("alloy", "echo", "fable", "onyx", "nova", "shimmer")}

    def __init__(self, config: TTSConfig):
        self.config = config
        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to OpenAI's speech endpoint; write and return the MP3 path."""
        body = {
            "model": self.config.model or "tts-1",
            "input": text[:4096],  # OpenAI limit
            "voice": self.VOICE_MAP.get(self.config.voice_id, "alloy"),
            "response_format": "mp3"
        }
        req = urllib.request.Request(
            "https://api.openai.com/v1/audio/speech",
            data=json.dumps(body).encode(),
            method="POST",
        )
        for header, value in (
            ("Authorization", f"Bearer {self.api_key}"),
            ("Content-Type", "application/json"),
        ):
            req.add_header(header, value)
        mp3_file = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(req, timeout=60) as resp:
            mp3_file.write_bytes(resp.read())
        return mp3_file
class EdgeTTSAdapter:
    """Zero-cost TTS using Microsoft Edge neural voices (no API key required).

    Requires: pip install edge-tts>=6.1.9
    Voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support
    """

    DEFAULT_VOICE = "en-US-GuyNeural"

    def __init__(self, config: TTSConfig):
        self.config = config
        # Empty/None voice_id falls back to the default neural voice.
        self.voice = config.voice_id or self.DEFAULT_VOICE

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Save *text* as an MP3 via edge-tts; return the MP3 path.

        Raises:
            RuntimeError: if the optional edge-tts package is not installed.
        """
        try:
            import edge_tts
        except ImportError as exc:
            # BUG FIX: chain the original ImportError so tracebacks show the real cause
            # instead of the misleading "During handling of the above exception" noise.
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts") from exc
        import asyncio

        mp3_path = output_path.with_suffix(".mp3")

        async def _run():
            communicate = edge_tts.Communicate(text, self.voice)
            await communicate.save(str(mp3_path))

        # NOTE(review): asyncio.run() raises if an event loop is already running in
        # this thread — callers in async contexts need a different entry point.
        asyncio.run(_run())
        return mp3_path
# Registry mapping a provider key (TTSConfig.provider / the --provider CLI flag)
# to its adapter class. Every adapter exposes synthesize(text, output_path) -> Path.
ADAPTERS = {
    "piper": PiperAdapter,
    "elevenlabs": ElevenLabsAdapter,
    "openai": OpenAITTSAdapter,
    "edge-tts": EdgeTTSAdapter,
}
def get_provider_config() -> TTSConfig:
    """Load TTS configuration from environment.

    Reads DEEPDIVE_TTS_PROVIDER (default "openai"), DEEPDIVE_TTS_VOICE, and
    DEEPDIVE_OUTPUT_DIR; picks up the matching cloud API key when relevant.
    """
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    # Per-provider default voice; anything unlisted falls back to "matthew".
    default_voices = {
        "openai": "alloy",
        "edge-tts": EdgeTTSAdapter.DEFAULT_VOICE,
    }
    voice = os.environ.get("DEEPDIVE_TTS_VOICE", default_voices.get(provider, "matthew"))
    # Only the two cloud providers carry an API key in the config.
    key_env = {"elevenlabs": "ELEVENLABS_API_KEY", "openai": "OPENAI_API_KEY"}.get(provider)
    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive")),
        api_key=os.environ.get(key_env) if key_env else None,
    )
def main():
    """CLI entry point: parse args, resolve config, synthesize, print a JSON summary."""
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config, then let CLI flags override the environment-derived values.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice
    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        from datetime import datetime
        # BUG FIX: the strftime format previously reused double quotes inside a
        # double-quoted f-string — a SyntaxError on Python < 3.12 (PEP 701 only
        # allows quote reuse from 3.12 onward). Single quotes work everywhere.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"
    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Text source priority: --input-file, then --text, then stdin.
    if args.input_file:
        text = Path(args.input_file).read_text()
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()
    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")
    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)
    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)
    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        "duration_estimate_min": len(text) // 150  # rough estimate: ~150 chars/min narration
    }))


if __name__ == "__main__":
    main()