#!/usr/bin/env python3
"""
TTS Engine for Deep Dive — Phase 4 Implementation
Issue #830 — Sovereign NotebookLM Daily Briefing

Engines:
  * PiperTTS       — local/offline synthesis via the ``piper`` CLI (sovereign).
  * ElevenLabsTTS  — cloud synthesis via the ElevenLabs HTTP API.
  * HybridTTS      — preferred engine with automatic fallback to the other.

External tools expected on PATH for the local path: ``piper``, ``ffmpeg``,
``wget`` (model download).
"""

import os
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime
from typing import Optional, List


class PiperTTS:
    """Local TTS using Piper (sovereign, no API calls)."""

    DEFAULT_MODEL = "en_US-lessac-medium"
    MODEL_BASE_URL = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US"

    def __init__(self, model_name: Optional[str] = None):
        """Resolve (and if necessary download) the voice model.

        Args:
            model_name: Piper voice id like ``en_US-lessac-medium``.
                Defaults to :data:`DEFAULT_MODEL`.
        """
        self.model_name = model_name or self.DEFAULT_MODEL
        self.model_path: Optional[Path] = None
        self.config_path: Optional[Path] = None
        self._ensure_model()

    def _ensure_model(self) -> None:
        """Download the model files if either the .onnx or its config is missing."""
        model_dir = Path.home() / ".local/share/piper"
        model_dir.mkdir(parents=True, exist_ok=True)

        self.model_path = model_dir / f"{self.model_name}.onnx"
        self.config_path = model_dir / f"{self.model_name}.onnx.json"

        # BUGFIX: previously only the .onnx file was checked, so a missing
        # .onnx.json config would never be fetched.
        if not (self.model_path.exists() and self.config_path.exists()):
            self._download_model(model_dir)

    def _download_model(self, model_dir: Path) -> None:
        """Download the voice model + config (tens of MB for "medium" voices).

        Raises:
            subprocess.CalledProcessError: if either wget download fails.
        """
        print(f"Downloading Piper model: {self.model_name}")

        # Model ids look like "en_US-<voice>-<quality>"; the HF repo lays
        # files out as .../<voice>/<quality>/<model_id>.onnx
        voice_quality = self.model_name.split("-")[-1]  # e.g. "medium" / "high"
        voice_name = self.model_name.replace("en_US-", "").replace(f"-{voice_quality}", "")
        base = f"{self.MODEL_BASE_URL}/{voice_name}/{voice_quality}"

        subprocess.run([
            "wget", "-q", "--show-progress",
            "-O", str(self.model_path),
            f"{base}/{self.model_name}.onnx"
        ], check=True)

        subprocess.run([
            "wget", "-q", "--show-progress",
            "-O", str(self.config_path),
            f"{base}/{self.model_name}.onnx.json"
        ], check=True)

        print(f"Model downloaded to {model_dir}")

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert *text* to an MP3 at *output_path*.

        Text is split into sentence-sized chunks (piper handles short inputs
        more reliably), each rendered to WAV, then concatenated and encoded
        to MP3 with ffmpeg.

        Returns:
            The *output_path* that was written.

        Raises:
            subprocess.CalledProcessError: if piper or ffmpeg fails.
        """
        chunks = self._chunk_text(text)

        with tempfile.TemporaryDirectory() as tmpdir:
            chunk_files = []

            for i, chunk in enumerate(chunks):
                chunk_wav = f"{tmpdir}/chunk_{i:03d}.wav"
                self._synthesize_chunk(chunk, chunk_wav)
                chunk_files.append(chunk_wav)

            # Concatenate via ffmpeg's concat demuxer (list-file protocol).
            concat_list = f"{tmpdir}/concat.txt"
            with open(concat_list, 'w') as f:
                for cf in chunk_files:
                    f.write(f"file '{cf}'\n")

            subprocess.run([
                "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
                "-f", "concat", "-safe", "0", "-i", concat_list,
                "-c:a", "libmp3lame", "-q:a", "4", output_path
            ], check=True)

        return output_path

    def _chunk_text(self, text: str, max_chars: int = 400) -> List[str]:
        """Split *text* at sentence boundaries into chunks < *max_chars*.

        A single sentence longer than *max_chars* becomes its own
        (oversized) chunk. Returns at least one chunk.
        """
        # Mark sentence ends with a sentinel, then split on it.
        text = text.replace('. ', '.|').replace('! ', '!|').replace('? ', '?|')
        sentences = text.split('|')

        chunks: List[str] = []
        current = ""

        for sent in sentences:
            sent = sent.strip()
            if not sent:
                continue
            if len(current) + len(sent) < max_chars:
                current += sent + " "
            else:
                if current:
                    chunks.append(current.strip())
                current = sent + " "

        if current:
            chunks.append(current.strip())

        return chunks or [text[:max_chars]]

    def _synthesize_chunk(self, text: str, output_wav: str) -> None:
        """Render a single chunk to WAV via the piper CLI (text on stdin)."""
        subprocess.run([
            "piper", "--quiet",
            "--model", str(self.model_path),
            "--config", str(self.config_path),
            "--output_file", output_wav
        ], input=text.encode(), check=True)


class ElevenLabsTTS:
    """Cloud TTS using the ElevenLabs API."""

    API_BASE = "https://api.elevenlabs.io/v1"
    DEFAULT_VOICE = "21m00Tcm4TlvDq8ikWAM"  # Rachel

    def __init__(self, api_key: Optional[str] = None, voice_id: Optional[str] = None):
        """Store credentials; no network I/O happens until :meth:`synthesize`.

        Args:
            api_key: ElevenLabs key; falls back to $ELEVENLABS_API_KEY.
            voice_id: voice to use; defaults to :data:`DEFAULT_VOICE`.

        Raises:
            ValueError: if no API key is available.
        """
        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ELEVENLABS_API_KEY required")
        self.voice_id = voice_id or self.DEFAULT_VOICE

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert text to speech via the API and write MP3 bytes to *output_path*.

        Returns:
            The *output_path* that was written.

        Raises:
            requests.HTTPError: on a non-2xx API response.
        """
        # Deferred import: `requests` is only needed for the cloud path, so
        # sovereign (Piper-only) installs don't require it.
        import requests

        url = f"{self.API_BASE}/text-to-speech/{self.voice_id}"

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key
        }

        data = {
            "text": text[:5000],  # ElevenLabs limit
            "model_id": "eleven_monolingual_v1",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }

        response = requests.post(url, json=data, headers=headers, timeout=120)
        response.raise_for_status()

        with open(output_path, 'wb') as f:
            f.write(response.content)

        return output_path


class HybridTTS:
    """TTS with a preferred primary engine and automatic fallback.

    Default is sovereign-first (Piper primary, ElevenLabs fallback);
    pass ``prefer_cloud=True`` to invert the order.
    """

    def __init__(self, prefer_cloud: bool = False):
        self.prefer_cloud = prefer_cloud

        # BUGFIX: the original assigned only self.primary from both init
        # helpers and never populated self.fallback, so the advertised
        # fallback path could never run. Initialize both slots explicitly.
        if prefer_cloud:
            self.primary = self._init_elevenlabs()
            self.fallback = self._init_piper()
        else:
            self.primary = self._init_piper()
            self.fallback = self._init_elevenlabs()

        # If the preferred engine failed to initialize, promote the fallback.
        if self.primary is None:
            self.primary, self.fallback = self.fallback, None

    def _init_piper(self) -> Optional[PiperTTS]:
        """Best-effort PiperTTS construction; returns None on failure."""
        try:
            return PiperTTS()
        except Exception as e:
            print(f"Piper init failed: {e}")
            return None

    def _init_elevenlabs(self) -> Optional[ElevenLabsTTS]:
        """Best-effort ElevenLabsTTS construction; returns None on failure."""
        try:
            return ElevenLabsTTS()
        except Exception as e:
            print(f"ElevenLabs init failed: {e}")
            return None

    def synthesize(self, text: str, output_path: str) -> str:
        """Synthesize with the primary engine, falling back on failure.

        Raises:
            RuntimeError: if no engine is available or all engines fail
                (chained to the last underlying error, if any).
        """
        last_error: Optional[Exception] = None

        for engine in (self.primary, self.fallback):
            if engine is None:
                continue
            try:
                return engine.synthesize(text, output_path)
            except Exception as e:
                last_error = e
                print(f"Primary failed: {e}")

        raise RuntimeError("No TTS engine available") from last_error


def phase4_generate_audio(briefing_text: str, output_dir: str = "/tmp/deepdive",
                          prefer_cloud: bool = False) -> str:
    """Phase 4: Generate a timestamped MP3 from briefing text.

    Args:
        briefing_text: the script to read aloud.
        output_dir: directory for the MP3 (created if missing).
        prefer_cloud: use ElevenLabs first instead of Piper.

    Returns:
        Path to the generated MP3.
    """
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"{output_dir}/deepdive_{timestamp}.mp3"

    tts = HybridTTS(prefer_cloud=prefer_cloud)
    return tts.synthesize(briefing_text, output_path)


if __name__ == "__main__":
    # Test
    test_text = """
    Good morning. This is your Deep Dive daily briefing for April 5th, 2026.
    Three papers from arXiv caught our attention today.
    First, researchers at Stanford propose a new method for efficient fine-tuning
    of large language models using gradient checkpointing.
    Second, a team from DeepMind releases a comprehensive survey on multi-agent
    reinforcement learning in open-ended environments.
    Third, an interesting approach to speculative decoding that promises 3x speedup
    for transformer inference without quality degradation.
    That concludes today's briefing. Stay sovereign.
    """

    output = phase4_generate_audio(test_text)
    print(f"Generated: {output}")