#!/usr/bin/env python3
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.

Issue: #830 (the-nexus)
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
"""

import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional


@dataclass
class TTSConfig:
    """Runtime configuration shared by every TTS adapter."""

    provider: str  # "piper", "elevenlabs", "openai"
    voice_id: str
    output_dir: Path
    # Provider-specific
    api_key: Optional[str] = None
    model: Optional[str] = None  # e.g., "eleven_turbo_v2" or "tts-1"


class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: TTSConfig):
        self.config = config
        # For the local provider, config.model doubles as a filesystem path
        # to the .onnx voice model rather than an API model name.
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Render *text* to MP3 via piper + lame.

        Args:
            text: The text to speak (piped to piper's stdin).
            output_path: Target path; the suffix is replaced with .wav/.mp3.

        Returns:
            Path to the final MP3 file.

        Raises:
            RuntimeError: If the Piper voice model is missing.
            subprocess.CalledProcessError: If piper or lame fails.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(
                f"Piper model not found: {self.model_path}. "
                f"Download from https://huggingface.co/rhasspy/piper-voices"
            )

        wav_path = output_path.with_suffix(".wav")
        cmd = [
            "piper-tts",
            "--model", str(self.model_path),
            "--output_file", str(wav_path),
        ]

        # Piper reads the text to synthesize from stdin.
        subprocess.run(cmd, input=text.encode(), check=True)

        # Convert to MP3 for smaller size.
        mp3_path = output_path.with_suffix(".mp3")
        try:
            subprocess.run(
                ["lame", "-V2", str(wav_path), str(mp3_path)],
                check=True, capture_output=True,
            )
        finally:
            # Remove the intermediate WAV even if the lame conversion fails,
            # so a failed run doesn't leak large temp files.
            wav_path.unlink(missing_ok=True)

        return mp3_path


class ElevenLabsAdapter:
    """Cloud TTS using ElevenLabs API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # ElevenLabs per-request text limit (characters).
    MAX_CHARS = 5000

    # NOTE(review): the /v1/text-to-speech/{voice_id} endpoint expects a
    # voice *ID* (an opaque token), not a display name — verify these mapped
    # values resolve against the account's voice library.
    VOICE_MAP = {
        "matthew": "Matthew",  # Professional narrator (was misspelled "Mathew")
        "josh": "Josh",        # Young male
        "rachel": "Rachel",    # Professional female
        "bella": "Bella",      # Warm female
        "adam": "Adam",        # Deep male
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        self.api_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise RuntimeError(
                "ElevenLabs API key required. Set ELEVENLABS_API_KEY env var."
            )

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to the ElevenLabs TTS endpoint and save the MP3.

        Text longer than MAX_CHARS is truncated (with a stderr warning).

        Raises:
            RuntimeError: On an HTTP error from the API, with status/context.
        """
        voice_id = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)

        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"

        if len(text) > self.MAX_CHARS:
            print(
                f"[TTS] Warning: text truncated to {self.MAX_CHARS} chars "
                f"(ElevenLabs limit)",
                file=sys.stderr,
            )

        data = json.dumps({
            "text": text[:self.MAX_CHARS],  # ElevenLabs limit
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }).encode()

        req = urllib.request.Request(url, data=data, method="POST")
        req.add_header("xi-api-key", self.api_key)
        req.add_header("Content-Type", "application/json")

        mp3_path = output_path.with_suffix(".mp3")

        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                mp3_path.write_bytes(resp.read())
        except urllib.error.HTTPError as e:
            # Surface the API's error payload instead of a bare traceback.
            raise RuntimeError(
                f"ElevenLabs API error {e.code}: {e.read()[:200]!r}"
            ) from e

        return mp3_path


class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    # OpenAI /v1/audio/speech input limit (characters).
    MAX_CHARS = 4096

    VOICE_MAP = {
        "alloy": "alloy",
        "echo": "echo",
        "fable": "fable",
        "onyx": "onyx",
        "nova": "nova",
        "shimmer": "shimmer",
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError(
                "OpenAI API key required. Set OPENAI_API_KEY env var."
            )

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to OpenAI's speech endpoint and save the MP3.

        Unknown voice IDs fall back to "alloy". Text longer than MAX_CHARS
        is truncated (with a stderr warning).

        Raises:
            RuntimeError: On an HTTP error from the API, with status/context.
        """
        voice = self.VOICE_MAP.get(self.config.voice_id, "alloy")

        url = "https://api.openai.com/v1/audio/speech"

        if len(text) > self.MAX_CHARS:
            print(
                f"[TTS] Warning: text truncated to {self.MAX_CHARS} chars "
                f"(OpenAI limit)",
                file=sys.stderr,
            )

        data = json.dumps({
            "model": self.config.model or "tts-1",
            "input": text[:self.MAX_CHARS],  # OpenAI limit
            "voice": voice,
            "response_format": "mp3"
        }).encode()

        req = urllib.request.Request(url, data=data, method="POST")
        req.add_header("Authorization", f"Bearer {self.api_key}")
        req.add_header("Content-Type", "application/json")

        mp3_path = output_path.with_suffix(".mp3")

        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                mp3_path.write_bytes(resp.read())
        except urllib.error.HTTPError as e:
            raise RuntimeError(
                f"OpenAI API error {e.code}: {e.read()[:200]!r}"
            ) from e

        return mp3_path


# Registry mapping provider names (as used in config/CLI) to adapter classes.
ADAPTERS = {
    "piper": PiperAdapter,
    "elevenlabs": ElevenLabsAdapter,
    "openai": OpenAITTSAdapter,
}


def get_provider_config() -> TTSConfig:
    """Load TTS configuration from environment.

    Env vars: DEEPDIVE_TTS_PROVIDER (default "openai"), DEEPDIVE_TTS_VOICE,
    DEEPDIVE_OUTPUT_DIR, and the provider's API key variable.
    """
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    voice = os.environ.get("DEEPDIVE_TTS_VOICE", "alloy" if provider == "openai" else "matthew")

    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive")),
        api_key=os.environ.get("ELEVENLABS_API_KEY") if provider == "elevenlabs"
        else os.environ.get("OPENAI_API_KEY") if provider == "openai"
        else None
    )


def main():
    """CLI entry point: read text, synthesize audio, print a JSON summary."""
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config; CLI flags override environment values.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice
    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        # Single-quoted format spec avoids the pre-3.12 SyntaxError from
        # nesting double quotes inside a double-quoted f-string.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"

    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Get text: --input-file wins, then --text, then stdin.
    if args.input_file:
        text = Path(args.input_file).read_text(encoding="utf-8")
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()

    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")

    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)

    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)

    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        # ~150 spoken words per minute. (The original divided *characters*
        # by 150, overstating duration roughly fivefold.)
        "duration_estimate_min": len(text.split()) // 150
    }))


if __name__ == "__main__":
    main()