[ezra] Add Phase 4 TTS pipeline with multi-adapter support #830

2026-04-05 03:45:06 +00:00
parent 75fa66344d
commit dde9c74fa7
1 changed files with 235 additions and 0 deletions
--- a/bin/deepdive_tts.py
+++ b/bin/deepdive_tts.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.
+
+Issue: #830 (the-nexus)
+Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
+"""
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import urllib.parse
+import urllib.request
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class TTSConfig:
+    """Settings handed to whichever TTS adapter is selected."""
+
+    # Adapter key looked up in ADAPTERS: "piper", "elevenlabs" or "openai".
+    provider: str
+    # Voice name; the ElevenLabs and OpenAI adapters resolve it through their
+    # VOICE_MAP, the Piper adapter does not read it.
+    voice_id: str
+    # Directory that main() creates and writes the audio file into.
+    output_dir: Path
+    # Provider-specific
+    # API key for the cloud adapters; when None they fall back to the
+    # ELEVENLABS_API_KEY / OPENAI_API_KEY environment variable.
+    api_key: Optional[str] = None
+    # Cloud adapters: model id, e.g. "eleven_turbo_v2" or "tts-1".
+    # Piper adapter: used as the path to the voice model file.
+    model: Optional[str] = None
+
+
+class PiperAdapter:
+    """Local TTS using Piper (offline, free, medium quality).
+
+    Requires: pip install piper-tts (and the ``lame`` encoder on PATH for the
+    WAV -> MP3 step)
+    Model download: https://huggingface.co/rhasspy/piper-voices
+    """
+
+    # Executable names probed in order. "piper-tts" is the name this adapter
+    # has always invoked; "piper" is the console script documented for the
+    # piper-tts pip package.
+    EXECUTABLES = ("piper-tts", "piper")
+
+    def __init__(self, config: TTSConfig):
+        """Store *config* and resolve the voice model path.
+
+        ``config.model`` is treated as the path to a Piper model file; when it
+        is unset the lessac-medium model under ``~/.local/share/piper`` is
+        used.
+        """
+        self.config = config
+        self.model_path = (
+            Path(config.model)
+            if config.model
+            else Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"
+        )
+
+    @classmethod
+    def _find_executable(cls) -> str:
+        """Return the first Piper executable found on PATH.
+
+        Falls back to the original name so that a missing install still
+        surfaces as the same ``FileNotFoundError`` from ``subprocess.run``.
+        """
+        for name in cls.EXECUTABLES:
+            if shutil.which(name):
+                return name
+        return cls.EXECUTABLES[0]
+
+    def synthesize(self, text: str, output_path: Path) -> Path:
+        """Render *text* to an MP3 next to *output_path*.
+
+        Args:
+            text: Text piped to Piper on stdin.
+            output_path: Target path; its suffix is replaced by ``.mp3``.
+
+        Returns:
+            Path of the MP3 file that was written.
+
+        Raises:
+            RuntimeError: The model file does not exist.
+            subprocess.CalledProcessError: Piper or ``lame`` exited non-zero.
+            FileNotFoundError: Piper or ``lame`` is not installed.
+        """
+        if not self.model_path.exists():
+            raise RuntimeError(f"Piper model not found: {self.model_path}. "
+                             f"Download from https://huggingface.co/rhasspy/piper-voices")
+
+        wav_path = output_path.with_suffix(".wav")
+        mp3_path = output_path.with_suffix(".mp3")
+
+        try:
+            subprocess.run(
+                [
+                    self._find_executable(),
+                    "--model", str(self.model_path),
+                    "--output_file", str(wav_path),
+                ],
+                input=text.encode(),
+                check=True,
+            )
+
+            # Convert to MP3 for smaller size
+            subprocess.run(
+                ["lame", "-V2", str(wav_path), str(mp3_path)],
+                check=True,
+                capture_output=True,
+            )
+        finally:
+            # Remove the intermediate WAV even when Piper or lame fails, so a
+            # failed run does not leave a large temporary file behind.
+            wav_path.unlink(missing_ok=True)
+
+        return mp3_path
+
+
+class ElevenLabsAdapter:
+    """Cloud TTS using ElevenLabs API (high quality, paid).
+
+    Requires: ELEVENLABS_API_KEY environment variable
+    Voices: https://elevenlabs.io/voice-library
+    """
+
+    # NOTE(review): the value is placed in the URL path where the API expects
+    # a voice id, but these look like display names (and "Mathew" is spelled
+    # differently from its key) - confirm against the account's voice list.
+    VOICE_MAP = {
+        "matthew": "Mathew",  # Professional narrator
+        "josh": "Josh",       # Young male
+        "rachel": "Rachel",   # Professional female
+        "bella": "Bella",     # Warm female
+        "adam": "Adam",       # Deep male
+    }
+
+    # Maximum number of characters sent per request (ElevenLabs limit).
+    MAX_CHARS = 5000
+
+    def __init__(self, config: TTSConfig):
+        """Store *config* and resolve the API key.
+
+        Raises:
+            RuntimeError: Neither ``config.api_key`` nor the
+                ELEVENLABS_API_KEY environment variable is set.
+        """
+        self.config = config
+        self.api_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
+        if not self.api_key:
+            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")
+
+    def synthesize(self, text: str, output_path: Path) -> Path:
+        """Request speech for *text* and save the response as an MP3.
+
+        Args:
+            text: Text to speak; anything past ``MAX_CHARS`` is dropped and a
+                warning is printed to stderr.
+            output_path: Target path; its suffix is replaced by ``.mp3``.
+
+        Returns:
+            Path of the MP3 file that was written.
+
+        Raises:
+            urllib.error.URLError: The request failed (including HTTP errors).
+        """
+        voice_id = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
+
+        # Percent-encode the voice so spaces, "/" or "?" in a user-supplied
+        # value cannot produce an invalid or different request path.
+        url = (
+            "https://api.elevenlabs.io/v1/text-to-speech/"
+            + urllib.parse.quote(voice_id, safe="")
+        )
+
+        if len(text) > self.MAX_CHARS:
+            print(
+                f"[TTS] Warning: text is {len(text)} characters; only the first "
+                f"{self.MAX_CHARS} will be synthesized",
+                file=sys.stderr,
+            )
+
+        data = json.dumps({
+            "text": text[:self.MAX_CHARS],
+            "model_id": self.config.model or "eleven_turbo_v2",
+            "voice_settings": {
+                "stability": 0.5,
+                "similarity_boost": 0.75
+            }
+        }).encode()
+
+        req = urllib.request.Request(url, data=data, method="POST")
+        req.add_header("xi-api-key", self.api_key)
+        req.add_header("Content-Type", "application/json")
+
+        mp3_path = output_path.with_suffix(".mp3")
+
+        with urllib.request.urlopen(req, timeout=120) as resp:
+            mp3_path.write_bytes(resp.read())
+
+        return mp3_path
+
+
+class OpenAITTSAdapter:
+    """Cloud TTS using OpenAI API (good quality, usage-based pricing).
+
+    Requires: OPENAI_API_KEY environment variable
+    """
+
+    # Voices this adapter accepts; anything else falls back to DEFAULT_VOICE.
+    VOICE_MAP = {
+        "alloy": "alloy",
+        "echo": "echo",
+        "fable": "fable",
+        "onyx": "onyx",
+        "nova": "nova",
+        "shimmer": "shimmer",
+    }
+
+    DEFAULT_VOICE = "alloy"
+
+    # Maximum number of characters sent per request (OpenAI limit).
+    MAX_CHARS = 4096
+
+    def __init__(self, config: TTSConfig):
+        """Store *config* and resolve the API key.
+
+        Raises:
+            RuntimeError: Neither ``config.api_key`` nor the OPENAI_API_KEY
+                environment variable is set.
+        """
+        self.config = config
+        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
+        if not self.api_key:
+            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")
+
+    def synthesize(self, text: str, output_path: Path) -> Path:
+        """Request speech for *text* and save the response as an MP3.
+
+        Args:
+            text: Text to speak; anything past ``MAX_CHARS`` is dropped and a
+                warning is printed to stderr.
+            output_path: Target path; its suffix is replaced by ``.mp3``.
+
+        Returns:
+            Path of the MP3 file that was written.
+
+        Raises:
+            urllib.error.URLError: The request failed (including HTTP errors).
+        """
+        requested = self.config.voice_id
+        voice = self.VOICE_MAP.get(requested, self.DEFAULT_VOICE)
+        if requested not in self.VOICE_MAP:
+            # Keep the fallback, but tell the user their choice was replaced.
+            print(
+                f"[TTS] Warning: unknown OpenAI voice {requested!r}; "
+                f"using {voice!r}",
+                file=sys.stderr,
+            )
+
+        if len(text) > self.MAX_CHARS:
+            print(
+                f"[TTS] Warning: text is {len(text)} characters; only the first "
+                f"{self.MAX_CHARS} will be synthesized",
+                file=sys.stderr,
+            )
+
+        url = "https://api.openai.com/v1/audio/speech"
+
+        data = json.dumps({
+            "model": self.config.model or "tts-1",
+            "input": text[:self.MAX_CHARS],
+            "voice": voice,
+            "response_format": "mp3"
+        }).encode()
+
+        req = urllib.request.Request(url, data=data, method="POST")
+        req.add_header("Authorization", f"Bearer {self.api_key}")
+        req.add_header("Content-Type", "application/json")
+
+        mp3_path = output_path.with_suffix(".mp3")
+
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            mp3_path.write_bytes(resp.read())
+
+        return mp3_path
+
+
+# Registry of provider name -> adapter class; main() uses the keys as the
+# allowed --provider choices and the values to build the adapter.
+ADAPTERS = dict(
+    piper=PiperAdapter,
+    elevenlabs=ElevenLabsAdapter,
+    openai=OpenAITTSAdapter,
+)
+
+
+def get_provider_config() -> TTSConfig:
+    """Build a TTSConfig from environment variables.
+
+    Reads DEEPDIVE_TTS_PROVIDER (default "openai"), DEEPDIVE_TTS_VOICE
+    (default "alloy" for openai, otherwise "matthew") and DEEPDIVE_OUTPUT_DIR
+    (default "/tmp/deepdive"). The API key is taken from the environment
+    variable belonging to the selected cloud provider, or left as None.
+    """
+    env = os.environ
+    chosen = env.get("DEEPDIVE_TTS_PROVIDER", "openai")
+
+    if chosen == "openai":
+        fallback_voice = "alloy"
+    else:
+        fallback_voice = "matthew"
+
+    # Which environment variable holds the key for each cloud provider.
+    key_variables = {
+        "elevenlabs": "ELEVENLABS_API_KEY",
+        "openai": "OPENAI_API_KEY",
+    }
+    key_variable = key_variables.get(chosen)
+    key = env.get(key_variable) if key_variable is not None else None
+
+    out_dir = Path(env.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive"))
+    return TTSConfig(
+        provider=chosen,
+        voice_id=env.get("DEEPDIVE_TTS_VOICE", fallback_voice),
+        output_dir=out_dir,
+        api_key=key,
+    )
+
+
+def main():
+    """Command-line entry point: synthesize text and report the result.
+
+    Text comes from --input-file, then --text, then stdin. Configuration is
+    loaded from the environment and may be overridden by --provider, --voice
+    and --output. On success two lines are printed to stdout: a status line
+    and a JSON summary. Exits with status 1 when no text is supplied or the
+    provider is unknown.
+    """
+    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
+    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
+    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
+    parser.add_argument("--output", "-o", help="Output file path (without extension)")
+    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
+    parser.add_argument("--voice", help="Voice ID override")
+    args = parser.parse_args()
+
+    # Load config
+    config = get_provider_config()
+    if args.provider and args.provider != config.provider:
+        config.provider = args.provider
+        # The key loaded from the environment belongs to the provider that was
+        # selected there; clear it so the adapter reads its own variable
+        # instead of sending another vendor's key.
+        config.api_key = None
+        # Likewise re-derive the default voice, unless one was set explicitly.
+        if "DEEPDIVE_TTS_VOICE" not in os.environ:
+            config.voice_id = "alloy" if args.provider == "openai" else "matthew"
+    if args.voice:
+        config.voice_id = args.voice
+    if args.output:
+        target = Path(args.output)
+        config.output_dir = target.parent
+        output_name = target.stem
+    else:
+        from datetime import datetime
+        # Plain strftime: reusing double quotes inside an f-string is a
+        # SyntaxError before Python 3.12.
+        output_name = datetime.now().strftime("briefing_%Y%m%d_%H%M")
+
+    config.output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = config.output_dir / output_name
+
+    # Get text
+    if args.input_file:
+        text = Path(args.input_file).read_text()
+    elif args.text:
+        text = args.text
+    else:
+        text = sys.stdin.read()
+
+    if not text.strip():
+        print("Error: No text provided", file=sys.stderr)
+        sys.exit(1)
+
+    # Synthesize
+    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")
+
+    adapter_class = ADAPTERS.get(config.provider)
+    if not adapter_class:
+        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
+        sys.exit(1)
+
+    adapter = adapter_class(config)
+    result_path = adapter.synthesize(text, output_path)
+
+    print(f"[TTS] Audio saved: {result_path}")
+    print(json.dumps({
+        "provider": config.provider,
+        "voice": config.voice_id,
+        "output_path": str(result_path),
+        # Rough estimate at ~150 spoken words per minute.
+        "duration_estimate_min": len(text.split()) // 150
+    }))
+
+
+if __name__ == "__main__":
+    main()