#!/usr/bin/env python3
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.

Issue: #830 (the-nexus)

Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI)
providers.
"""

import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class TTSConfig:
    """Runtime configuration shared by every TTS adapter."""

    provider: str  # "piper", "elevenlabs", "openai"
    voice_id: str
    output_dir: Path
    # Provider-specific
    api_key: Optional[str] = None
    model: Optional[str] = None  # e.g., "eleven_turbo_v2" or "tts-1"


def _http_post_json(url: str, payload: dict, headers: dict, timeout: int) -> bytes:
    """POST *payload* as JSON to *url* and return the raw response body.

    Shared by the cloud adapters so both surface API failures the same way.

    Raises:
        RuntimeError: on an HTTP error status, with the (truncated) response
            body included — far more debuggable than a bare HTTPError.
    """
    req = urllib.request.Request(url, data=json.dumps(payload).encode(), method="POST")
    req.add_header("Content-Type", "application/json")
    for name, value in headers.items():
        req.add_header(name, value)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except urllib.error.HTTPError as err:
        body = err.read().decode(errors="replace")[:500]
        raise RuntimeError(f"TTS API error {err.code} from {url}: {body}") from err


class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: TTSConfig):
        self.config = config
        # config.model doubles as a filesystem path for this adapter.
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Render *text* to speech; returns the path of the written MP3."""
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")
        wav_path = output_path.with_suffix(".wav")
        cmd = [
            "piper-tts",
            "--model", str(self.model_path),
            "--output_file", str(wav_path)
        ]
        subprocess.run(cmd, input=text.encode(), check=True)
        # Convert to MP3 for smaller size. Remove the intermediate WAV even
        # when the lame conversion fails (the original leaked it on error).
        mp3_path = output_path.with_suffix(".mp3")
        try:
            subprocess.run([
                "lame", "-V2", str(wav_path), str(mp3_path)
            ], check=True, capture_output=True)
        finally:
            wav_path.unlink(missing_ok=True)
        return mp3_path


class ElevenLabsAdapter:
    """Cloud TTS using ElevenLabs API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # NOTE(review): the ElevenLabs endpoint path takes an opaque *voice ID*,
    # not a display name — these values (incl. "Mathew", possibly a typo for
    # "Matthew") should be verified against the account's voice library.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        self.api_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize *text* via ElevenLabs; returns the written MP3 path."""
        # Unknown voice keys fall through unchanged so raw voice IDs also work.
        voice_id = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
        payload = {
            "text": text[:5000],  # ElevenLabs per-request limit (silently truncates)
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }
        audio = _http_post_json(url, payload, {"xi-api-key": self.api_key}, timeout=120)
        mp3_path = output_path.with_suffix(".mp3")
        mp3_path.write_bytes(audio)
        return mp3_path


class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    VOICE_MAP = {
        "alloy": "alloy",
        "echo": "echo",
        "fable": "fable",
        "onyx": "onyx",
        "nova": "nova",
        "shimmer": "shimmer",
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize *text* via OpenAI; returns the written MP3 path."""
        # Unlike ElevenLabs, unknown voices are coerced to a safe default.
        voice = self.VOICE_MAP.get(self.config.voice_id, "alloy")
        url = "https://api.openai.com/v1/audio/speech"
        payload = {
            "model": self.config.model or "tts-1",
            "input": text[:4096],  # OpenAI per-request limit (silently truncates)
            "voice": voice,
            "response_format": "mp3"
        }
        audio = _http_post_json(
            url, payload, {"Authorization": f"Bearer {self.api_key}"}, timeout=60
        )
        mp3_path = output_path.with_suffix(".mp3")
        mp3_path.write_bytes(audio)
        return mp3_path


# Provider name -> adapter class dispatch table.
ADAPTERS = {
    "piper": PiperAdapter,
    "elevenlabs": ElevenLabsAdapter,
    "openai": OpenAITTSAdapter,
}


def get_provider_config() -> TTSConfig:
    """Load TTS configuration from environment variables (defaults: OpenAI)."""
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    voice = os.environ.get(
        "DEEPDIVE_TTS_VOICE", "alloy" if provider == "openai" else "matthew"
    )
    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive")),
        api_key=os.environ.get("ELEVENLABS_API_KEY") if provider == "elevenlabs"
        else os.environ.get("OPENAI_API_KEY") if provider == "openai"
        else None,
    )


def main():
    """CLI entry point: parse args, pick an adapter, synthesize, report JSON."""
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()),
                        help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config; CLI flags override environment-derived values.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice
    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        from datetime import datetime
        # BUGFIX: nested double quotes inside a double-quoted f-string is a
        # SyntaxError on Python < 3.12 — use single quotes for the format.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"

    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Get text: file beats inline flag; fall back to stdin.
    if args.input_file:
        text = Path(args.input_file).read_text(encoding="utf-8")
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()
    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")
    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)
    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)
    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        "duration_estimate_min": len(text) // 150  # ~150 chars/min
    }))


if __name__ == "__main__":
    main()