# the-nexus/intelligence/deepdive/tts_engine.py

#!/usr/bin/env python3
"""
TTS Engine for Deep Dive — Phase 4 Implementation
Issue #830 — Sovereign NotebookLM Daily Briefing
"""

import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional, List

import requests


class PiperTTS:
    """Local text-to-speech backed by the ``piper`` command-line tool.

    Synthesis shells out to ``piper`` (text -> WAV) and ``ffmpeg``
    (WAV chunks -> MP3).  The only network access in this class is the
    ``wget`` download of the voice model into ``~/.local/share/piper`` when
    the model files are not already cached there.
    """

    DEFAULT_MODEL = "en_US-lessac-medium"
    # The locale is baked into this URL, so only en_US voices resolve against it.
    MODEL_BASE_URL = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US"

    # A sentence ends at '.', '!' or '?' followed by whitespace (newlines included).
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

    def __init__(self, model_name: Optional[str] = None):
        """Select a voice model and make sure its files are cached locally.

        Args:
            model_name: Piper voice name such as ``"en_US-lessac-medium"``.
                ``DEFAULT_MODEL`` is used when omitted or empty.

        Raises:
            subprocess.CalledProcessError: If a required ``wget`` download
                exits with a non-zero status.
            OSError: If ``wget`` cannot be launched or the cache directory
                cannot be created.
        """
        self.model_name = model_name or self.DEFAULT_MODEL
        self.model_path = None
        self.config_path = None
        self._ensure_model()

    @staticmethod
    def _is_cached(path: Path) -> bool:
        """Return True when *path* is an existing, non-empty file."""
        return path.is_file() and path.stat().st_size > 0

    def _ensure_model(self):
        """Resolve the model/config paths and download whichever is missing."""
        model_dir = Path.home() / ".local/share/piper"
        model_dir.mkdir(parents=True, exist_ok=True)

        self.model_path = model_dir / f"{self.model_name}.onnx"
        self.config_path = model_dir / f"{self.model_name}.onnx.json"

        # Both files are passed to piper, so both must be present.  An empty
        # file (e.g. left behind by an earlier failed download) counts as
        # missing.
        if not (self._is_cached(self.model_path) and self._is_cached(self.config_path)):
            self._download_model(model_dir)

    def _download_model(self, model_dir: Path):
        """Download the voice model and its JSON config if not already cached.

        Args:
            model_dir: Cache directory; used only for the completion message
                (the target paths come from ``self.model_path`` and
                ``self.config_path``).

        Raises:
            subprocess.CalledProcessError: If ``wget`` fails for either file.
        """
        print(f"Downloading Piper model: {self.model_name}")

        # "en_US-lessac-medium" -> voice "lessac", quality "medium".
        quality = self.model_name.split("-")[-1]
        voice = self.model_name.replace("en_US-", "").replace(f"-{quality}", "")
        base = f"{self.MODEL_BASE_URL}/{voice}/{quality}"

        targets = (
            (self.model_path, f"{self.model_name}.onnx"),
            (self.config_path, f"{self.model_name}.onnx.json"),
        )
        for destination, remote_name in targets:
            if not self._is_cached(destination):
                self._fetch(f"{base}/{remote_name}", destination)

        print(f"Model downloaded to {model_dir}")

    @staticmethod
    def _fetch(url: str, destination: Path):
        """Download *url* to *destination* without leaving a partial file.

        The data is written to a sibling ``.part`` file and moved into place
        only after ``wget`` succeeds, so an interrupted download is never
        mistaken for a cached model on the next run.
        """
        partial = destination.with_name(destination.name + ".part")
        try:
            subprocess.run([
                "wget", "-q", "--show-progress",
                "-O", str(partial),
                url
            ], check=True)
            os.replace(partial, destination)
        finally:
            if partial.exists():
                partial.unlink()

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert *text* to an MP3 file at *output_path*.

        The text is split into chunks, each chunk is rendered to a WAV file
        in a temporary directory, and ffmpeg's concat demuxer joins and
        encodes them to MP3.

        Args:
            text: Text to speak.
            output_path: Destination MP3 path (overwritten if it exists).

        Returns:
            ``output_path``.

        Raises:
            ValueError: If *text* contains nothing but whitespace.
            subprocess.CalledProcessError: If ``piper`` or ``ffmpeg`` fails.
        """
        chunks = self._chunk_text(text)
        if not chunks:
            raise ValueError("No text to synthesize")

        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            chunk_files = []

            for i, chunk in enumerate(chunks):
                chunk_wav = workdir / f"chunk_{i:03d}.wav"
                self._synthesize_chunk(chunk, str(chunk_wav))
                chunk_files.append(chunk_wav)

            # Concatenate.  Inside a single-quoted concat entry a literal
            # quote must be written as '\'' or the list fails to parse.
            concat_list = workdir / "concat.txt"
            with open(concat_list, "w", encoding="utf-8") as f:
                for cf in chunk_files:
                    escaped = str(cf).replace("'", "'\\''")
                    f.write(f"file '{escaped}'\n")

            subprocess.run([
                "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
                "-f", "concat", "-safe", "0", "-i", str(concat_list),
                "-c:a", "libmp3lame", "-q:a", "4", output_path
            ], check=True)

        return output_path

    def _chunk_text(self, text: str, max_chars: int = 400) -> List[str]:
        """Split *text* into chunks of at most *max_chars* characters.

        Chunks break at sentence boundaries where possible.  A sentence that
        is itself longer than *max_chars* is broken between words, and a
        single word longer than *max_chars* is sliced.  Runs of whitespace
        (including newlines) are collapsed to single spaces, so every chunk
        is one line.

        Args:
            text: Text to split.
            max_chars: Upper bound on the length of each returned chunk.

        Returns:
            The chunks in order; an empty list when *text* is blank.

        Raises:
            ValueError: If *max_chars* is less than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        # Pass 1: reduce the text to units that each fit within max_chars.
        units = []
        for sentence in self._SENTENCE_BOUNDARY.split(text.strip()):
            words = sentence.split()
            if not words:
                continue
            sentence = " ".join(words)
            if len(sentence) <= max_chars:
                units.append(sentence)
                continue
            # Oversized sentence: fall back to words, slicing any word that
            # is itself too long.
            for word in words:
                units.extend(
                    word[start:start + max_chars]
                    for start in range(0, len(word), max_chars)
                )

        # Pass 2: greedily pack consecutive units into chunks.
        chunks = []
        current = ""
        for unit in units:
            candidate = f"{current} {unit}" if current else unit
            if len(candidate) <= max_chars:
                current = candidate
            else:
                chunks.append(current)
                current = unit

        if current:
            chunks.append(current)

        return chunks

    def _synthesize_chunk(self, text: str, output_wav: str):
        """Render one chunk of text to *output_wav* by piping it to piper.

        Raises:
            subprocess.CalledProcessError: If ``piper`` exits non-zero.
        """
        subprocess.run([
            "piper", "--quiet",
            "--model", str(self.model_path),
            "--config", str(self.config_path),
            "--output_file", output_wav
        ], input=text.encode("utf-8"), check=True)


class ElevenLabsTTS:
    """Cloud TTS using the ElevenLabs text-to-speech HTTP API."""

    API_BASE = "https://api.elevenlabs.io/v1"
    DEFAULT_VOICE = "21m00Tcm4TlvDq8ikWAM"  # Rachel
    # Maximum number of characters sent per request; longer text is cut to
    # this length (the original code labels this the "ElevenLabs limit").
    MAX_CHARS = 5000

    def __init__(self, api_key: Optional[str] = None, voice_id: Optional[str] = None):
        """Configure API credentials and the voice to use.

        Args:
            api_key: ElevenLabs API key; read from the
                ``ELEVENLABS_API_KEY`` environment variable when omitted.
            voice_id: Voice identifier; ``DEFAULT_VOICE`` when omitted.

        Raises:
            ValueError: If no API key is given and the environment variable
                is unset or empty.
        """
        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ELEVENLABS_API_KEY required")
        self.voice_id = voice_id or self.DEFAULT_VOICE

    def _limit_text(self, text: str) -> str:
        """Return *text* cut to ``MAX_CHARS``, announcing any truncation."""
        if len(text) <= self.MAX_CHARS:
            return text
        # Truncation drops content from the audio, so never do it silently.
        print(
            f"ElevenLabs: text is {len(text)} chars; "
            f"truncating to {self.MAX_CHARS}"
        )
        return text[:self.MAX_CHARS]

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert *text* to speech via the API and write it to *output_path*.

        Args:
            text: Text to speak.  Anything beyond ``MAX_CHARS`` characters is
                dropped, with a printed warning.
            output_path: File that receives the response body (the request
                asks for ``audio/mpeg``).

        Returns:
            ``output_path``.

        Raises:
            ValueError: If *text* is empty or only whitespace.
            requests.HTTPError: If the API responds with an error status.
        """
        if not text or not text.strip():
            raise ValueError("No text to synthesize")

        url = f"{self.API_BASE}/text-to-speech/{self.voice_id}"

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key
        }

        data = {
            "text": self._limit_text(text),
            "model_id": "eleven_monolingual_v1",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }

        response = requests.post(url, json=data, headers=headers, timeout=120)
        response.raise_for_status()

        with open(output_path, 'wb') as f:
            f.write(response.content)

        return output_path


class HybridTTS:
    """TTS front-end that tries a preferred engine, then falls back to the other.

    By default the local Piper engine is preferred and ElevenLabs is the
    fallback; ``prefer_cloud=True`` reverses the order.  The fallback engine
    is constructed lazily, the first time the primary engine fails during
    ``synthesize``.
    """

    def __init__(self, prefer_cloud: bool = False):
        """Construct the preferred engine, or the other one if that fails.

        Args:
            prefer_cloud: When True, ElevenLabs is the preferred engine and
                Piper the alternate; otherwise the reverse.
        """
        self.primary = None
        self.fallback = None
        self.prefer_cloud = prefer_cloud
        # True once the alternate engine has been constructed (or has failed
        # to construct), so it is never initialised twice.
        self._fallback_attempted = False

        # Try preferred engine
        if prefer_cloud:
            self._init_elevenlabs()
        else:
            self._init_piper()

        if not self.primary:
            # Preferred engine is unusable: promote the alternate to primary.
            # Nothing is left to serve as a fallback after that.
            self._fallback_attempted = True
            if prefer_cloud:
                self._init_piper()
            else:
                self._init_elevenlabs()

    def _init_piper(self, as_fallback: bool = False):
        """Try to construct Piper; store it as primary (or fallback) on success."""
        try:
            engine = PiperTTS()
        except Exception as e:
            print(f"Piper init failed: {e}")
            return
        if as_fallback:
            self.fallback = engine
        else:
            self.primary = engine

    def _init_elevenlabs(self, as_fallback: bool = False):
        """Try to construct ElevenLabs; store it as primary (or fallback) on success."""
        try:
            engine = ElevenLabsTTS()
        except Exception as e:
            print(f"ElevenLabs init failed: {e}")
            return
        if as_fallback:
            self.fallback = engine
        else:
            self.primary = engine

    def _load_fallback(self):
        """Construct the alternate engine on first use; return it or None."""
        if not self._fallback_attempted:
            self._fallback_attempted = True
            # The primary is the preferred engine here, so the fallback is
            # whichever engine was not preferred.
            if self.prefer_cloud:
                self._init_piper(as_fallback=True)
            else:
                self._init_elevenlabs(as_fallback=True)
        return self.fallback

    def synthesize(self, text: str, output_path: str) -> str:
        """Synthesize *text* to *output_path*, falling back if the primary fails.

        Args:
            text: Text to speak.
            output_path: Destination audio file.

        Returns:
            The path returned by whichever engine succeeded.

        Raises:
            RuntimeError: If no engine could be constructed, or every
                available engine failed (the last failure is chained as
                ``__cause__``).
        """
        last_error = None

        if self.primary:
            try:
                return self.primary.synthesize(text, output_path)
            except Exception as e:
                print(f"Primary failed: {e}")
                last_error = e

        fallback = self.fallback or self._load_fallback()
        if fallback:
            try:
                return fallback.synthesize(text, output_path)
            except Exception as e:
                print(f"Fallback failed: {e}")
                last_error = e

        if last_error is not None:
            raise RuntimeError("All TTS engines failed") from last_error
        raise RuntimeError("No TTS engine available")


def phase4_generate_audio(briefing_text: str, output_dir: str = "/tmp/deepdive",
                         prefer_cloud: bool = False) -> str:
    """Phase 4: render the briefing text to a timestamped MP3.

    Args:
        briefing_text: Text of the briefing to speak.
        output_dir: Directory for the audio file; created if missing.
        prefer_cloud: Passed through to ``HybridTTS`` to pick the preferred
            engine.

    Returns:
        Whatever ``HybridTTS.synthesize`` returns for the generated
        ``deepdive_<YYYYmmdd_HHMMSS>.mp3`` path inside ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Local wall-clock time, second resolution, forms the file name.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    mp3_name = f"deepdive_{stamp}.mp3"
    destination = f"{output_dir}/{mp3_name}"

    engine = HybridTTS(prefer_cloud=prefer_cloud)
    return engine.synthesize(briefing_text, destination)


if __name__ == "__main__":
    # Manual smoke test: synthesize a canned briefing with the default
    # settings and print the path that phase4_generate_audio returns.
    sample_briefing = """
    Good morning. This is your Deep Dive daily briefing for April 5th, 2026.
    Three papers from arXiv caught our attention today.
    First, researchers at Stanford propose a new method for efficient fine-tuning
    of large language models using gradient checkpointing.
    Second, a team from DeepMind releases a comprehensive survey on multi-agent
    reinforcement learning in open-ended environments.
    Third, an interesting approach to speculative decoding that promises 3x speedup
    for transformer inference without quality degradation.
    That concludes today's briefing. Stay sovereign.
    """

    audio_path = phase4_generate_audio(sample_briefing)
    print(f"Generated: {audio_path}")