236 lines
7.5 KiB
Python
236 lines
7.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.
|
||
|
|
|
||
|
|
Issue: #830 (the-nexus)
|
||
|
|
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
import os
|
||
|
|
import urllib.request
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class TTSConfig:
    """Configuration for one TTS run, consumed by every adapter in ADAPTERS."""

    provider: str  # "piper", "elevenlabs", "openai"
    voice_id: str  # friendly voice name or raw provider voice ID (see adapter VOICE_MAPs)
    output_dir: Path  # directory where synthesized audio is written
    # Provider-specific
    api_key: Optional[str] = None  # cloud providers only; adapters also fall back to env vars
    model: Optional[str] = None  # e.g., "eleven_turbo_v2" or "tts-1"
|
||
|
|
|
||
|
|
|
||
|
|
class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: "TTSConfig"):
        self.config = config
        # Fall back to the default English voice under the user's data dir.
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize *text* to <output_path>.mp3 and return the MP3 path.

        Raises:
            RuntimeError: if the Piper voice model file is missing.
            subprocess.CalledProcessError: if piper or lame exits non-zero.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")

        wav_path = output_path.with_suffix(".wav")
        cmd = [
            "piper-tts",  # NOTE(review): some installs expose the CLI as "piper" — confirm
            "--model", str(self.model_path),
            "--output_file", str(wav_path)
        ]

        # Piper reads the text to speak from stdin.
        subprocess.run(cmd, input=text.encode(), check=True)

        # Convert to MP3 for smaller size. The intermediate WAV is always
        # removed, even when lame fails (the original leaked it on error).
        mp3_path = output_path.with_suffix(".mp3")
        try:
            subprocess.run([
                "lame", "-V2", str(wav_path), str(mp3_path)
            ], check=True, capture_output=True)
        finally:
            if wav_path.exists():
                wav_path.unlink()

        return mp3_path
|
||
|
|
|
||
|
|
|
||
|
|
class ElevenLabsAdapter:
    """Cloud TTS using ElevenLabs API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # Friendly aliases resolved before building the request URL; unknown names
    # pass through unchanged so callers can supply raw voice IDs directly.
    # NOTE(review): these values look like display names rather than ElevenLabs
    # voice IDs — the API path expects a voice ID; confirm they resolve.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    def __init__(self, config: "TTSConfig"):
        self.config = config
        resolved_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not resolved_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")
        self.api_key = resolved_key

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to ElevenLabs and write the MP3 response next to *output_path*."""
        voice = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{voice}"

        payload = {
            "text": text[:5000],  # ElevenLabs limit
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }

        request = urllib.request.Request(
            endpoint,
            data=json.dumps(payload).encode(),
            method="POST",
            headers={
                "xi-api-key": self.api_key,
                "Content-Type": "application/json",
            },
        )

        mp3_path = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(request, timeout=120) as resp:
            mp3_path.write_bytes(resp.read())
        return mp3_path
|
||
|
|
|
||
|
|
|
||
|
|
class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    # Supported OpenAI voices. Identity map kept for symmetry with the other
    # adapters; unknown voice names fall back to "alloy" in synthesize().
    VOICE_MAP = {
        "alloy": "alloy",
        "echo": "echo",
        "fable": "fable",
        "onyx": "onyx",
        "nova": "nova",
        "shimmer": "shimmer",
    }

    def __init__(self, config: "TTSConfig"):
        self.config = config
        resolved_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not resolved_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")
        self.api_key = resolved_key

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to OpenAI's speech endpoint and write the MP3 reply."""
        chosen_voice = self.VOICE_MAP.get(self.config.voice_id, "alloy")

        body = json.dumps({
            "model": self.config.model or "tts-1",
            "input": text[:4096],  # OpenAI limit
            "voice": chosen_voice,
            "response_format": "mp3"
        }).encode()

        request = urllib.request.Request(
            "https://api.openai.com/v1/audio/speech",
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
        )

        mp3_path = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(request, timeout=60) as resp:
            mp3_path.write_bytes(resp.read())
        return mp3_path
|
||
|
|
|
||
|
|
|
||
|
|
# Registry mapping a TTSConfig.provider value to its adapter class.
ADAPTERS = {
    "piper": PiperAdapter,
    "elevenlabs": ElevenLabsAdapter,
    "openai": OpenAITTSAdapter,
}
|
||
|
|
|
||
|
|
|
||
|
|
def get_provider_config() -> "TTSConfig":
    """Load TTS configuration from environment.

    Reads DEEPDIVE_TTS_PROVIDER / DEEPDIVE_TTS_VOICE / DEEPDIVE_OUTPUT_DIR,
    plus the API-key variable matching the chosen cloud provider.
    """
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    default_voice = "alloy" if provider == "openai" else "matthew"
    voice = os.environ.get("DEEPDIVE_TTS_VOICE", default_voice)

    # Only cloud providers carry an API key; piper runs fully local.
    key_env = {"elevenlabs": "ELEVENLABS_API_KEY", "openai": "OPENAI_API_KEY"}.get(provider)
    api_key = os.environ.get(key_env) if key_env else None

    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive")),
        api_key=api_key
    )
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: read text, pick an adapter, synthesize, print a JSON summary."""
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config; CLI flags override environment-derived values.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice
    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        from datetime import datetime
        # BUG FIX: the original nested double quotes inside a double-quoted
        # f-string, a SyntaxError on Python < 3.12 (PEP 701 only relaxed this
        # in 3.12). Single quotes inside keep it valid everywhere.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"

    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Get text: --input-file beats --text beats stdin.
    if args.input_file:
        # Explicit encoding so behavior doesn't depend on the locale default.
        text = Path(args.input_file).read_text(encoding="utf-8")
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()

    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")

    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)

    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)

    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        "duration_estimate_min": len(text) // 150  # ~150 chars/min
    }))
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|