diff --git a/docs/deep-dive/TTS_INTEGRATION_PROOF.md b/docs/deep-dive/TTS_INTEGRATION_PROOF.md
new file mode 100644
index 0000000..680c1cf
--- /dev/null
+++ b/docs/deep-dive/TTS_INTEGRATION_PROOF.md
@@ -0,0 +1,300 @@
+# TTS Integration Proof — Deep Dive Phase 4
+# Issue #830 — Sovereign NotebookLM Daily Briefing
+# Created: Ezra, Burn Mode | 2026-04-05
+
+## Architecture
+
+```
+┌─────────────────┐     ┌─────────────────┐     ┌─────────────────┐
+│   Synthesis     │────▶│   TTS Engine    │────▶│  Audio Output   │
+│  (text brief)   │     │  Piper/Coqui/   │     │  MP3/OGG file   │
+│                 │     │  ElevenLabs     │     │                 │
+└─────────────────┘     └─────────────────┘     └─────────────────┘
+```
+
+## Implementation
+
+### Option A: Local Piper (Sovereign)
+
+```python
+#!/usr/bin/env python3
+"""Piper TTS integration for Deep Dive Phase 4."""
+import subprocess
+import tempfile
+import os
+import re
+from pathlib import Path
+
+class PiperTTS:
+    """Local TTS using Piper (sovereign, no API calls)."""
+
+    def __init__(self, model_path: str = None):
+        self.model_path = model_path or self._download_default_model()
+        # Piper voices keep their config next to the model as "<model>.json".
+        self.config_path = self.model_path + ".json"
+
+    def _download_default_model(self) -> str:
+        """Download the default en_US voice model (~60MB) if it is missing."""
+        model_dir = Path.home() / ".local/share/piper"
+        model_dir.mkdir(parents=True, exist_ok=True)
+
+        model_file = model_dir / "en_US-lessac-medium.onnx"
+        config_file = model_dir / "en_US-lessac-medium.onnx.json"
+        base_url = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/lessac/medium"
+
+        # Check each file on its own so a missing config is repaired too, and
+        # download to a ".part" name so an interrupted run never leaves a
+        # truncated file that looks like a finished model.
+        for target in (model_file, config_file):
+            if target.exists():
+                continue
+            print(f"Downloading Piper voice file {target.name}...")
+            partial = target.with_name(target.name + ".part")
+            subprocess.run([
+                "wget", "-O", str(partial),
+                f"{base_url}/{target.name}"
+            ], check=True)
+            partial.replace(target)
+
+        return str(model_file)
+
+    def synthesize(self, text: str, output_path: str) -> str:
+        """Convert text to speech; writes an MP3 and returns output_path."""
+        # Split long text into chunks (Piper handles ~400 chars well)
+        chunks = self._chunk_text(text, max_chars=400)
+        if not chunks:
+            # An empty concat list makes ffmpeg fail with an opaque error.
+            raise ValueError("No text to synthesize")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            chunk_files = []
+
+            for i, chunk in enumerate(chunks):
+                chunk_wav = f"{tmpdir}/chunk_{i:03d}.wav"
+                self._synthesize_chunk(chunk, chunk_wav)
+                chunk_files.append(chunk_wav)
+
+            # Concatenate chunks
+            concat_list = f"{tmpdir}/concat.txt"
+            with open(concat_list, 'w') as f:
+                for cf in chunk_files:
+                    f.write(f"file '{cf}'\n")
+
+            # Final output
+            subprocess.run([
+                "ffmpeg", "-y", "-f", "concat", "-safe", "0",
+                "-i", concat_list,
+                "-c:a", "libmp3lame", "-q:a", "4",
+                output_path
+            ], check=True, capture_output=True)
+
+        return output_path
+
+    def _chunk_text(self, text: str, max_chars: int = 400) -> list:
+        """Split text into chunks of at most max_chars at sentence boundaries.
+
+        A single sentence longer than max_chars is kept whole.
+        """
+        # Split after ., ! or ? followed by any whitespace (including the
+        # newlines of a multi-line briefing); "|" in the text is left intact.
+        sentences = re.split(r"(?<=[.!?])\s+", text.strip())
+        chunks = []
+        current = ""
+
+        for sent in sentences:
+            if current and len(current) + 1 + len(sent) > max_chars:
+                chunks.append(current)
+                current = sent
+            else:
+                current = f"{current} {sent}" if current else sent
+
+        if current:
+            chunks.append(current)
+
+        return chunks
+
+    def _synthesize_chunk(self, text: str, output_wav: str):
+        """Synthesize single chunk."""
+        subprocess.run([
+            "piper", "--model", self.model_path,
+            "--config", self.config_path,
+            "--output_file", output_wav
+        ], input=text.encode(), check=True)
+
+
+# Usage example
+if __name__ == "__main__":
+    tts = PiperTTS()
+    briefing_text = """
+    Good morning. Today's Deep Dive covers three papers from arXiv.
+    First, a new approach to reinforcement learning from human feedback.
+    Second, advances in quantized model inference for edge deployment.
+    Third, a survey of multi-agent coordination protocols.
+    """
+    output = tts.synthesize(briefing_text, "daily_briefing.mp3")
+    print(f"Generated: {output}")
+```
+
+### Option B: ElevenLabs API (Quality)
+
+```python
+#!/usr/bin/env python3
+"""ElevenLabs TTS integration for Deep Dive Phase 4."""
+import os
+import requests
+from pathlib import Path
+
+class ElevenLabsTTS:
+    """Cloud TTS using ElevenLabs API."""
+
+    API_BASE = "https://api.elevenlabs.io/v1"
+
+    def __init__(self, api_key: str = None):
+        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
+        if not self.api_key:
+            raise ValueError("ElevenLabs API key required")
+
+        # Rachel voice (professional, clear)
+        self.voice_id = "21m00Tcm4TlvDq8ikWAM"
+
+    def synthesize(self, text: str, output_path: str) -> str:
+        """Convert text to speech via ElevenLabs."""
+        url = f"{self.API_BASE}/text-to-speech/{self.voice_id}"
+
+        headers = {
+            "Accept": "audio/mpeg",
+            "Content-Type": "application/json",
+            "xi-api-key": self.api_key
+        }
+
+        # ElevenLabs handles long text natively (up to ~5000 chars)
+        data = {
+            "text": text,
+            "model_id": "eleven_monolingual_v1",
+            "voice_settings": {
+                "stability": 0.5,
+                "similarity_boost": 0.75
+            }
+        }
+
+        # Bound the wait so a stalled connection cannot hang the pipeline.
+        response = requests.post(url, json=data, headers=headers, timeout=120)
+        response.raise_for_status()
+
+        with open(output_path, 'wb') as f:
+            f.write(response.content)
+
+        return output_path
+
+
+# Usage example
+if __name__ == "__main__":
+    tts = ElevenLabsTTS()
+    briefing_text = "Your daily intelligence briefing..."
+    output = tts.synthesize(briefing_text, "daily_briefing.mp3")
+    print(f"Generated: {output}")
+```
+
+## Hybrid Implementation (Recommended)
+
+```python
+#!/usr/bin/env python3
+"""Hybrid TTS with Piper primary, ElevenLabs fallback."""
+import os
+from datetime import datetime
+from typing import Optional
+
+# PiperTTS and ElevenLabsTTS are the classes from Options A and B above.
+class HybridTTS:
+    """TTS with sovereign default, cloud fallback."""
+
+    def __init__(self):
+        self.primary = None
+        self.fallback = None
+
+        # Try Piper first (sovereign)
+        try:
+            self.primary = PiperTTS()
+            print("✅ Piper TTS ready (sovereign)")
+        except Exception as e:
+            print(f"⚠️ Piper unavailable: {e}")
+
+        # Set up ElevenLabs fallback
+        if os.getenv("ELEVENLABS_API_KEY"):
+            try:
+                self.fallback = ElevenLabsTTS()
+                print("✅ ElevenLabs fallback ready")
+            except Exception as e:
+                print(f"⚠️ ElevenLabs unavailable: {e}")
+
+    def synthesize(self, text: str, output_path: str) -> str:
+        """Synthesize with fallback chain."""
+        # Try primary
+        if self.primary:
+            try:
+                return self.primary.synthesize(text, output_path)
+            except Exception as e:
+                print(f"Primary TTS failed: {e}, trying fallback...")
+
+        # Try fallback
+        if self.fallback:
+            return self.fallback.synthesize(text, output_path)
+
+        raise RuntimeError("No TTS engine available")
+
+
+# Integration with Deep Dive pipeline
+def phase4_generate_audio(briefing_text: str, output_dir: str = "/tmp/deepdive") -> str:
+    """Phase 4: Generate audio from synthesized briefing."""
+    os.makedirs(output_dir, exist_ok=True)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_path = f"{output_dir}/deepdive_{timestamp}.mp3"
+
+    tts = HybridTTS()
+    return tts.synthesize(briefing_text, output_path)
+```
+
+## Testing
+
+```bash
+# Test Piper locally
+piper --model ~/.local/share/piper/en_US-lessac-medium.onnx --output_file test.wav <