Files
the-nexus/bin/deepdive_tts.py
Alexander Whitestone ef74536e33
Some checks failed
CI / test (pull_request) Failing after 33s
CI / validate (pull_request) Failing after 26s
Review Approval Gate / verify-review (pull_request) Failing after 5s
feat: add edge-tts as zero-cost voice output provider
- Add EdgeTTSAdapter to bin/deepdive_tts.py (provider key: "edge-tts")
  default voice: en-US-GuyNeural, no API key required
- Add EdgeTTS class to intelligence/deepdive/tts_engine.py
- Update HybridTTS to try edge-tts as fallback between piper and elevenlabs
- Add --voice-memo flag to bin/night_watch.py for spoken nightly reports
- Add edge-tts>=6.1.9 to requirements.txt
- Create docs/voice-output.md documenting all providers and fallback chain
- Add tests/test_edge_tts.py with 17 unit tests (all mocked, no network)

Fixes #1126

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 06:29:26 -04:00

274 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.
Issue: #830 (the-nexus)
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
"""
import argparse
import json
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import os
import urllib.request
@dataclass
class TTSConfig:
    """Runtime configuration shared by all TTS adapters in this module."""
    provider: str  # "piper", "elevenlabs", "openai", or "edge-tts" (keys of ADAPTERS)
    voice_id: str  # Voice name/key; cloud adapters translate it via their VOICE_MAP
    output_dir: Path  # Directory where synthesized audio files are written
    # Provider-specific
    api_key: Optional[str] = None  # Cloud providers only; adapters fall back to env vars
    model: Optional[str] = None  # e.g., "eleven_turbo_v2" or "tts-1"; Piper reuses it as a model path
class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: TTSConfig):
        self.config = config
        # config.model doubles as an override for the on-disk ONNX model path.
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Render *text* to MP3 next to *output_path*; return the MP3 path.

        Raises:
            RuntimeError: if the Piper model file is missing.
            subprocess.CalledProcessError: if piper-tts or lame exits non-zero.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")
        wav_path = output_path.with_suffix(".wav")
        cmd = [
            "piper-tts",
            "--model", str(self.model_path),
            "--output_file", str(wav_path)
        ]
        subprocess.run(cmd, input=text.encode(), check=True)
        # Convert to MP3 for smaller size
        mp3_path = output_path.with_suffix(".mp3")
        try:
            subprocess.run([
                "lame", "-V2", str(wav_path), str(mp3_path)
            ], check=True, capture_output=True)
        finally:
            # BUG FIX: previously the intermediate WAV was only deleted on success,
            # so a failed lame conversion leaked a large temp file. Always clean up.
            wav_path.unlink(missing_ok=True)
        return mp3_path
class ElevenLabsAdapter:
    """Cloud TTS via the ElevenLabs HTTP API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # Friendly lowercase keys -> ElevenLabs voice identifiers.
    # NOTE(review): "Mathew" looks like a typo for "Matthew" — confirm against the
    # ElevenLabs voice library before changing; reproduced as-is here.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        # Explicit key wins; otherwise read the environment.
        self.api_key = config.api_key
        if not self.api_key:
            self.api_key = os.environ.get("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to ElevenLabs and write the returned MP3 bytes; return the MP3 path."""
        resolved_voice = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{resolved_voice}"
        payload = {
            "text": text[:5000],  # ElevenLabs limit
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }
        request = urllib.request.Request(endpoint, data=json.dumps(payload).encode(), method="POST")
        request.add_header("xi-api-key", self.api_key)
        request.add_header("Content-Type", "application/json")
        destination = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(request, timeout=120) as response:
            destination.write_bytes(response.read())
        return destination
class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    # OpenAI voice names are used verbatim; the map doubles as the allow-list.
    VOICE_MAP = {name: name for name in ("alloy", "echo", "fable", "onyx", "nova", "shimmer")}

    def __init__(self, config: TTSConfig):
        self.config = config
        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to OpenAI's speech endpoint; write and return the MP3 path."""
        body = {
            "model": self.config.model or "tts-1",
            "input": text[:4096],  # OpenAI limit
            "voice": self.VOICE_MAP.get(self.config.voice_id, "alloy"),
            "response_format": "mp3"
        }
        req = urllib.request.Request(
            "https://api.openai.com/v1/audio/speech",
            data=json.dumps(body).encode(),
            method="POST",
        )
        for header, value in (
            ("Authorization", f"Bearer {self.api_key}"),
            ("Content-Type", "application/json"),
        ):
            req.add_header(header, value)
        mp3_file = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(req, timeout=60) as resp:
            mp3_file.write_bytes(resp.read())
        return mp3_file
class EdgeTTSAdapter:
    """Zero-cost TTS using Microsoft Edge neural voices (no API key required).

    Requires: pip install edge-tts>=6.1.9
    Voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support
    """

    DEFAULT_VOICE = "en-US-GuyNeural"

    def __init__(self, config: TTSConfig):
        self.config = config
        # Empty/None voice_id falls back to the default neural voice.
        self.voice = config.voice_id or self.DEFAULT_VOICE

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Save *text* as an MP3 via edge-tts; return the MP3 path.

        Raises:
            RuntimeError: if the optional edge-tts package is not installed.
        """
        try:
            import edge_tts
        except ImportError as exc:
            # BUG FIX: chain the original ImportError so tracebacks show the real cause
            # instead of the misleading "During handling of the above exception" noise.
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts") from exc
        import asyncio

        mp3_path = output_path.with_suffix(".mp3")

        async def _run():
            communicate = edge_tts.Communicate(text, self.voice)
            await communicate.save(str(mp3_path))

        # NOTE(review): asyncio.run() raises if an event loop is already running in
        # this thread — callers in async contexts need a different entry point.
        asyncio.run(_run())
        return mp3_path
# Registry mapping a provider key (TTSConfig.provider / the --provider CLI flag)
# to its adapter class. Every adapter exposes synthesize(text, output_path) -> Path.
ADAPTERS = {
    "piper": PiperAdapter,
    "elevenlabs": ElevenLabsAdapter,
    "openai": OpenAITTSAdapter,
    "edge-tts": EdgeTTSAdapter,
}
def get_provider_config() -> TTSConfig:
    """Load TTS configuration from environment.

    Reads DEEPDIVE_TTS_PROVIDER (default "openai"), DEEPDIVE_TTS_VOICE, and
    DEEPDIVE_OUTPUT_DIR; picks up the matching cloud API key when relevant.
    """
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    # Per-provider default voice; anything unlisted falls back to "matthew".
    default_voices = {
        "openai": "alloy",
        "edge-tts": EdgeTTSAdapter.DEFAULT_VOICE,
    }
    voice = os.environ.get("DEEPDIVE_TTS_VOICE", default_voices.get(provider, "matthew"))
    # Only the two cloud providers carry an API key in the config.
    key_env = {"elevenlabs": "ELEVENLABS_API_KEY", "openai": "OPENAI_API_KEY"}.get(provider)
    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive")),
        api_key=os.environ.get(key_env) if key_env else None,
    )
def main():
    """CLI entry point: parse args, resolve config, synthesize, print a JSON summary."""
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config, then let CLI flags override the environment-derived values.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice
    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        from datetime import datetime
        # BUG FIX: the strftime format previously reused double quotes inside a
        # double-quoted f-string — a SyntaxError on Python < 3.12 (PEP 701 only
        # allows quote reuse from 3.12 onward). Single quotes work everywhere.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"
    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Text source priority: --input-file, then --text, then stdin.
    if args.input_file:
        text = Path(args.input_file).read_text()
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()
    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")
    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)
    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)
    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        "duration_estimate_min": len(text) // 150  # rough estimate: ~150 chars/min narration
    }))


if __name__ == "__main__":
    main()