Files

Deploy Nexus / deploy (push) Has been cancelled

Details

Add TTS integration proof for Deep Dive (#830)

Phase 4 implementation: Piper (sovereign) + ElevenLabs (cloud)
with hybrid fallback architecture. Includes working Python code,
voice selection guide, testing commands.

Burn mode artifact by Ezra.

2026-04-05 08:31:33 +00:00

9.2 KiB

Raw Blame History

TTS Integration Proof — Deep Dive Phase 4

Issue #830 — Sovereign NotebookLM Daily Briefing

Created: Ezra, Burn Mode | 2026-04-05

Architecture

┌─────────────────┐     ┌─────────────────┐     ┌─────────────────┐
│  Synthesis      │────▶│  TTS Engine     │────▶│  Audio Output   │
│  (text brief)   │     │  Piper/Coqui/   │     │  MP3/OGG file   │
│                 │     │  ElevenLabs     │     │                 │
└─────────────────┘     └─────────────────┘     └─────────────────┘

Implementation

Option A: Local Piper (Sovereign)

#!/usr/bin/env python3
"""Piper TTS integration for Deep Dive Phase 4."""
import subprocess
import tempfile
import os
from pathlib import Path

class PiperTTS:
    """Local TTS using Piper (sovereign, no API calls).

    Text is split into sentence-aligned chunks, each chunk is rendered to a
    WAV file by the ``piper`` command-line tool, and the WAV files are joined
    and encoded to MP3 by ``ffmpeg``. Both executables (and ``wget`` for the
    default-model download) are invoked as subprocesses.
    """

    def __init__(self, model_path: str = None):
        """Create a synthesizer.

        Args:
            model_path: Path to a Piper ``.onnx`` voice model. When omitted,
                the default en_US voice is downloaded if not already cached.
        """
        self.model_path = str(model_path) if model_path else self._download_default_model()
        # The config sits beside the model as "<model>.json". Appending the
        # suffix avoids str.replace rewriting an earlier ".onnx" in the path
        # (e.g. a directory called "models.onnx").
        self.config_path = f"{self.model_path}.json"

    def _download_default_model(self) -> str:
        """Ensure the default en_US voice model and its config are cached.

        Returns:
            Path of the cached ``.onnx`` model file.

        Raises:
            subprocess.CalledProcessError: If a ``wget`` download fails.
        """
        model_dir = Path.home() / ".local/share/piper"
        model_dir.mkdir(parents=True, exist_ok=True)

        base_url = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/lessac/medium"
        model_file = model_dir / "en_US-lessac-medium.onnx"
        config_file = model_dir / "en_US-lessac-medium.onnx.json"

        if not model_file.exists():
            # NOTE(review): "~2GB" looks far larger than a medium Piper voice;
            # confirm the real download size before relying on this message.
            print("Downloading Piper voice model (~2GB)...")
            self._fetch(f"{base_url}/{model_file.name}", model_file)

        # Checked independently of the model so that a cache holding only the
        # model (e.g. after an interrupted first run) still gets its config.
        if not config_file.exists():
            self._fetch(f"{base_url}/{config_file.name}", config_file)

        return str(model_file)

    @staticmethod
    def _fetch(url: str, destination: Path) -> None:
        """Download *url* to *destination* with wget, via a temporary file.

        ``wget -O`` creates the target even when the transfer fails, so the
        download goes to a ``.part`` file that is renamed only on success;
        a truncated file is therefore never mistaken for a cached model.
        """
        partial = destination.with_name(destination.name + ".part")
        try:
            subprocess.run(["wget", "-O", str(partial), url], check=True)
            partial.replace(destination)
        finally:
            if partial.exists():
                partial.unlink()

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert text to speech and write it to *output_path* as MP3.

        Args:
            text: Text to speak.
            output_path: Destination file handed to ffmpeg.

        Returns:
            *output_path*, unchanged.

        Raises:
            ValueError: If *text* contains nothing to synthesize.
            subprocess.CalledProcessError: If ``piper`` or ``ffmpeg`` fails.
        """
        # Split long text into chunks (Piper handles ~400 chars well)
        chunks = self._chunk_text(text, max_chars=400)
        if not chunks:
            # Fail early: an empty concat list makes ffmpeg exit with an
            # error that does not point at the real cause.
            raise ValueError("Cannot synthesize empty text")

        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            chunk_files = []

            for i, chunk in enumerate(chunks):
                chunk_wav = workdir / f"chunk_{i:03d}.wav"
                self._synthesize_chunk(chunk, str(chunk_wav))
                chunk_files.append(chunk_wav)

            # ffmpeg's concat demuxer reads one "file '<path>'" line per input;
            # a single quote inside a path must be written as '\''.
            concat_list = workdir / "concat.txt"
            with open(concat_list, "w", encoding="utf-8") as f:
                for cf in chunk_files:
                    quoted = str(cf).replace("'", "'\\''")
                    f.write(f"file '{quoted}'\n")

            # Join the chunks and encode the result as MP3.
            subprocess.run([
                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
                "-i", str(concat_list),
                "-c:a", "libmp3lame", "-q:a", "4",
                output_path
            ], check=True, capture_output=True)

        return output_path

    def _chunk_text(self, text: str, max_chars: int = 400) -> list:
        """Split text into chunks of at most *max_chars* characters.

        Chunks break at sentence boundaries (``.``, ``!`` or ``?`` followed by
        any whitespace, including newlines). A single sentence longer than
        *max_chars* is wrapped at word boundaries so no chunk exceeds the
        limit.

        Args:
            text: Text to split.
            max_chars: Maximum length of each chunk; must be at least 1.

        Returns:
            List of non-empty chunk strings; empty when *text* is blank.

        Raises:
            ValueError: If *max_chars* is smaller than 1.
        """
        # Function-scope imports keep this snippet self-contained.
        import re
        import textwrap

        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        chunks = []
        current = ""

        for sentence in re.split(r"(?<=[.!?])\s+", text.strip()):
            sentence = sentence.strip()
            if not sentence:
                continue

            if len(sentence) > max_chars:
                pieces = textwrap.wrap(
                    sentence,
                    width=max_chars,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            else:
                pieces = [sentence]

            for piece in pieces:
                candidate = f"{current} {piece}" if current else piece
                if len(candidate) <= max_chars:
                    current = candidate
                else:
                    chunks.append(current)
                    current = piece

        if current:
            chunks.append(current)

        return chunks

    def _synthesize_chunk(self, text: str, output_wav: str):
        """Synthesize a single chunk to *output_wav*.

        The text is passed to ``piper`` on standard input.

        Raises:
            subprocess.CalledProcessError: If ``piper`` exits non-zero.
        """
        subprocess.run([
            "piper", "--model", self.model_path,
            "--config", self.config_path,
            "--output_file", output_wav
        ], input=text.encode("utf-8"), check=True)


# Usage example
if __name__ == "__main__":
    # Demo run: speak a short sample briefing with the default Piper voice
    # and report where the MP3 was written.
    sample_briefing = """
    Good morning. Today\'s Deep Dive covers three papers from arXiv.
    First, a new approach to reinforcement learning from human feedback.
    Second, advances in quantized model inference for edge deployment.
    Third, a survey of multi-agent coordination protocols.
    """
    engine = PiperTTS()
    written_to = engine.synthesize(sample_briefing, "daily_briefing.mp3")
    print(f"Generated: {written_to}")

Option B: ElevenLabs API (Quality)

#!/usr/bin/env python3
"""ElevenLabs TTS integration for Deep Dive Phase 4."""
import os
import requests
from pathlib import Path

class ElevenLabsTTS:
    """Cloud TTS using ElevenLabs API.

    Sends the text to the ElevenLabs text-to-speech REST endpoint with
    ``requests`` and stores the returned audio bytes in a file.
    """

    API_BASE = "https://api.elevenlabs.io/v1"

    # Rachel voice (professional, clear)
    DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"

    # Seconds to wait on the HTTP request. requests has no default timeout,
    # so without this an unresponsive API would block the caller forever.
    DEFAULT_TIMEOUT = 120.0

    def __init__(self, api_key: str = None, voice_id: str = None, timeout: float = None):
        """Create a client.

        Args:
            api_key: ElevenLabs API key; falls back to the
                ``ELEVENLABS_API_KEY`` environment variable.
            voice_id: ElevenLabs voice identifier; defaults to Rachel.
            timeout: Request timeout in seconds; defaults to
                ``DEFAULT_TIMEOUT``.

        Raises:
            ValueError: If no API key is given and the environment variable
                is unset or empty.
        """
        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ElevenLabs API key required")

        self.voice_id = voice_id or self.DEFAULT_VOICE_ID
        self.timeout = self.DEFAULT_TIMEOUT if timeout is None else timeout

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert text to speech via ElevenLabs.

        Args:
            text: Text to speak.
            output_path: File that receives the response body.

        Returns:
            *output_path*, unchanged.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            requests.Timeout: If the API does not answer within the timeout.
        """
        url = f"{self.API_BASE}/text-to-speech/{self.voice_id}"

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key
        }

        # ElevenLabs handles long text natively (up to ~5000 chars)
        data = {
            "text": text,
            "model_id": "eleven_monolingual_v1",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }

        response = requests.post(url, json=data, headers=headers, timeout=self.timeout)
        # Raise before opening the file so an error response is never saved
        # as if it were audio.
        response.raise_for_status()

        with open(output_path, 'wb') as f:
            f.write(response.content)

        return output_path


# Usage example
if __name__ == "__main__":
    # Demo run: send a placeholder briefing to ElevenLabs (API key taken from
    # the environment) and report where the MP3 was written.
    sample_briefing = "Your daily intelligence briefing..."
    client = ElevenLabsTTS()
    written_to = client.synthesize(sample_briefing, "daily_briefing.mp3")
    print(f"Generated: {written_to}")

Hybrid Implementation (Recommended)

#!/usr/bin/env python3
"""Hybrid TTS with Piper primary, ElevenLabs fallback."""
import os
from typing import Optional

class HybridTTS:
    """TTS with sovereign default, cloud fallback.

    Piper is tried first; ElevenLabs is used only when Piper could not be
    set up or fails during synthesis. ``PiperTTS`` and ``ElevenLabsTTS``
    must already be defined or imported where this class is used.
    """

    def __init__(self):
        """Set up whichever engines are available.

        Setup failures are reported on stdout rather than raised, so an
        instance may end up with one engine or none; ``synthesize`` raises
        if none is usable.
        """
        self.primary = None
        self.fallback = None

        # Try Piper first (sovereign)
        try:
            self.primary = PiperTTS()
            print("✅ Piper TTS ready (sovereign)")
        except Exception as e:
            print(f"⚠️ Piper unavailable: {e}")

        # Set up ElevenLabs fallback only when an API key is configured
        if os.getenv("ELEVENLABS_API_KEY"):
            try:
                self.fallback = ElevenLabsTTS()
                print("✅ ElevenLabs fallback ready")
            except Exception as e:
                print(f"⚠️ ElevenLabs unavailable: {e}")

    def synthesize(self, text: str, output_path: str) -> str:
        """Synthesize with fallback chain.

        Args:
            text: Text to speak.
            output_path: Destination audio file.

        Returns:
            The path returned by the engine that succeeded.

        Raises:
            RuntimeError: If no engine could produce audio. When the primary
                engine failed first, its exception is attached as
                ``__cause__``.
            Exception: Whatever the fallback engine raises, unchanged.
        """
        primary_error = None

        # Try primary
        if self.primary:
            try:
                return self.primary.synthesize(text, output_path)
            except Exception as e:
                # Keep a reference: `e` is unbound once the except block ends.
                primary_error = e
                print(f"Primary TTS failed: {e}, trying fallback...")

        # Try fallback
        if self.fallback:
            return self.fallback.synthesize(text, output_path)

        # Chain the primary failure so the root cause is not lost.
        raise RuntimeError("No TTS engine available") from primary_error


# Integration with Deep Dive pipeline
def phase4_generate_audio(briefing_text: str, output_dir: str = "/tmp/deepdive") -> str:
    """Phase 4: Generate audio from synthesized briefing.

    Args:
        briefing_text: Text of the briefing to speak.
        output_dir: Directory for the MP3; created if it does not exist.

    Returns:
        Path of the generated file, named ``deepdive_<YYYYmmdd_HHMMSS>.mp3``
        after the local time at which the call was made.

    Raises:
        RuntimeError: If no TTS engine is available.
    """
    # Imported here because this snippet's import block does not bring
    # `datetime` into scope; without it the call below raises NameError.
    from datetime import datetime

    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"{output_dir}/deepdive_{timestamp}.mp3"

    tts = HybridTTS()
    return tts.synthesize(briefing_text, output_path)

Testing

# Test Piper locally
# Feeds one sentence on stdin (here-document) to the voice model at the path
# where PiperTTS._download_default_model caches it, and writes test.wav.
piper --model ~/.local/share/piper/en_US-lessac-medium.onnx --output_file test.wav <<EOF
This is a test of the Deep Dive text to speech system.
EOF

# Test ElevenLabs
# Requires ELEVENLABS_API_KEY in the environment. Posts to the same voice ID
# and model_id that the ElevenLabsTTS class uses and saves the response body
# as test.mp3.
# NOTE(review): without --fail, curl also saves an HTTP error body to
# test.mp3 -- inspect the file if it does not play.
curl -X POST https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM \
  -H "xi-api-key: $ELEVENLABS_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"text": "Test message", "model_id": "eleven_monolingual_v1"}' \
  --output test.mp3

Dependencies

# Piper (local)
pip install piper-tts
# Or build from source: https://github.com/rhasspy/piper

# ElevenLabs (API)
# The ElevenLabsTTS class in this document calls the REST API through
# `requests`, so that is the package the code actually imports.
pip install requests
# Optional: official SDK (not used by the code in this document)
pip install elevenlabs

# Audio processing (ffmpeg) and default-model download (wget)
apt install ffmpeg wget

Voice Selection Guide

Use Case	Piper Voice	ElevenLabs Voice	Notes
Daily briefing	`en_US-lessac-medium`	Rachel (21m00...)	Professional, neutral
Alert/urgent	`en_US-ryan-high`	Adam (pNInz6...)	Authoritative
Casual update	`en_US-libritts-high`	Bella (EXAVIT...)	Conversational

Artifact: docs/deep-dive/TTS_INTEGRATION_PROOF.md
Issue: #830
Author: Ezra | Burn Mode | 2026-04-05

9.2 KiB Raw Blame History