#!/usr/bin/env python3
"""
TTS Engine for Deep Dive — Phase 4 Implementation
Issue #830 — Sovereign NotebookLM Daily Briefing
"""
import os
import subprocess
import tempfile
import urllib.request
from pathlib import Path
from datetime import datetime
from typing import List, Optional


class PiperTTS:
    """Local TTS using Piper (sovereign, no API calls)."""

    DEFAULT_MODEL = "en_US-lessac-medium"
    MODEL_BASE_URL = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US"

    def __init__(self, model_name: Optional[str] = None):
        """Resolve model paths and download the voice model if missing."""
        self.model_name = model_name or self.DEFAULT_MODEL
        self.model_path: Optional[Path] = None
        self.config_path: Optional[Path] = None
        self._ensure_model()

    def _ensure_model(self):
        """Download the model if either the .onnx or its .json config is absent.

        BUG FIX: the original only checked the .onnx file, so a missing
        config was never re-fetched.
        """
        model_dir = Path.home() / ".local/share/piper"
        model_dir.mkdir(parents=True, exist_ok=True)
        self.model_path = model_dir / f"{self.model_name}.onnx"
        self.config_path = model_dir / f"{self.model_name}.onnx.json"
        if not (self.model_path.exists() and self.config_path.exists()):
            self._download_model(model_dir)

    def _download_model(self, model_dir: Path):
        """Download the voice model and its JSON config from HuggingFace."""
        print(f"Downloading Piper model: {self.model_name}")
        # Model names look like "en_US-lessac-medium":
        # locale "en_US", voice "lessac", quality "medium".
        voice_type = self.model_name.split("-")[-1]
        voice = self.model_name.replace("en_US-", "").replace(f"-{voice_type}", "")
        base = f"{self.MODEL_BASE_URL}/{voice}/{voice_type}"
        self._fetch(f"{base}/{self.model_name}.onnx", self.model_path)
        self._fetch(f"{base}/{self.model_name}.onnx.json", self.config_path)
        print(f"Model downloaded to {model_dir}")

    @staticmethod
    def _fetch(url: str, dest: Path):
        """Download *url* to *dest* atomically (temp file, then rename).

        BUG FIX: the original shelled out to ``wget -O``, which creates the
        destination file even when the download fails; _ensure_model would
        then see the partial file and never retry. Using urllib also drops
        the dependency on an external wget binary.
        """
        tmp = dest.with_suffix(dest.suffix + ".part")
        try:
            urllib.request.urlretrieve(url, tmp)
            tmp.replace(dest)
        finally:
            # Remove any leftover partial file; no-op after a successful rename.
            tmp.unlink(missing_ok=True)

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert *text* to an MP3 at *output_path*; returns *output_path*.

        Text is synthesized in sentence-sized chunks (Piper handles short
        inputs better), then the chunk WAVs are concatenated and encoded
        to MP3 with ffmpeg.
        """
        chunks = self._chunk_text(text)
        with tempfile.TemporaryDirectory() as tmpdir:
            chunk_files = []
            for i, chunk in enumerate(chunks):
                chunk_wav = f"{tmpdir}/chunk_{i:03d}.wav"
                self._synthesize_chunk(chunk, chunk_wav)
                chunk_files.append(chunk_wav)
            # ffmpeg concat demuxer needs a list file of inputs.
            concat_list = f"{tmpdir}/concat.txt"
            with open(concat_list, 'w') as f:
                for cf in chunk_files:
                    f.write(f"file '{cf}'\n")
            subprocess.run([
                "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
                "-f", "concat", "-safe", "0",
                "-i", concat_list,
                "-c:a", "libmp3lame", "-q:a", "4",
                output_path
            ], check=True)
        return output_path

    def _chunk_text(self, text: str, max_chars: int = 400) -> List[str]:
        """Split *text* at sentence boundaries into chunks under *max_chars*.

        A single sentence longer than *max_chars* becomes its own
        (oversized) chunk. Always returns at least one element.
        """
        # Mark sentence boundaries with a sentinel, then split on it.
        text = text.replace('. ', '.|').replace('! ', '!|').replace('? ', '?|')
        sentences = text.split('|')
        chunks = []
        current = ""
        for sent in sentences:
            sent = sent.strip()
            if not sent:
                continue
            if len(current) + len(sent) < max_chars:
                current += sent + " "
            else:
                if current:
                    chunks.append(current.strip())
                current = sent + " "
        if current:
            chunks.append(current.strip())
        return chunks or [text[:max_chars]]

    def _synthesize_chunk(self, text: str, output_wav: str):
        """Synthesize a single chunk to WAV via the piper CLI (text on stdin)."""
        subprocess.run([
            "piper", "--quiet",
            "--model", str(self.model_path),
            "--config", str(self.config_path),
            "--output_file", output_wav
        ], input=text.encode(), check=True)


class ElevenLabsTTS:
    """Cloud TTS using ElevenLabs API."""

    API_BASE = "https://api.elevenlabs.io/v1"
    DEFAULT_VOICE = "21m00Tcm4TlvDq8ikWAM"  # Rachel

    def __init__(self, api_key: Optional[str] = None, voice_id: Optional[str] = None):
        """Read the API key from the argument or ELEVENLABS_API_KEY.

        Raises:
            ValueError: if no API key is available.
        """
        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ELEVENLABS_API_KEY required")
        self.voice_id = voice_id or self.DEFAULT_VOICE

    def synthesize(self, text: str, output_path: str) -> str:
        """Convert text to speech via the API; writes MP3 to *output_path*.

        NOTE: text beyond 5000 characters is silently truncated
        (ElevenLabs request limit).
        """
        # Lazy import: requests is only needed for the cloud engine, so a
        # sovereign (Piper-only) install does not require it.
        import requests
        url = f"{self.API_BASE}/text-to-speech/{self.voice_id}"
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key
        }
        data = {
            "text": text[:5000],  # ElevenLabs limit
            "model_id": "eleven_monolingual_v1",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }
        response = requests.post(url, json=data, headers=headers, timeout=120)
        response.raise_for_status()
        with open(output_path, 'wb') as f:
            f.write(response.content)
        return output_path


class HybridTTS:
    """TTS with sovereign primary, cloud fallback.

    BUG FIX: the original never assigned ``self.fallback`` (both init
    helpers wrote to ``self.primary``) and ``synthesize`` raised instead of
    falling back, so the advertised fallback could never happen.
    """

    def __init__(self, prefer_cloud: bool = False):
        self.prefer_cloud = prefer_cloud
        # Build engines in preference order; either may be None on failure.
        if prefer_cloud:
            self.primary = self._make_elevenlabs()
            self.fallback = self._make_piper()
        else:
            self.primary = self._make_piper()
            self.fallback = self._make_elevenlabs()
        # If the preferred engine failed to initialize, promote the other.
        if self.primary is None:
            self.primary, self.fallback = self.fallback, None

    @staticmethod
    def _make_piper():
        """Return a PiperTTS engine, or None if it cannot initialize."""
        try:
            return PiperTTS()
        except Exception as e:
            print(f"Piper init failed: {e}")
            return None

    @staticmethod
    def _make_elevenlabs():
        """Return an ElevenLabsTTS engine, or None if it cannot initialize."""
        try:
            return ElevenLabsTTS()
        except Exception as e:
            print(f"ElevenLabs init failed: {e}")
            return None

    def synthesize(self, text: str, output_path: str) -> str:
        """Synthesize with the primary engine, falling back on failure.

        Raises:
            RuntimeError: if no engine is available or every engine fails.
        """
        last_error = None
        for engine in (self.primary, self.fallback):
            if engine is None:
                continue
            try:
                return engine.synthesize(text, output_path)
            except Exception as e:
                last_error = e
                print(f"{type(engine).__name__} failed: {e}")
        raise RuntimeError("No TTS engine available") from last_error


def phase4_generate_audio(briefing_text: str,
                          output_dir: str = "/tmp/deepdive",
                          prefer_cloud: bool = False) -> str:
    """Phase 4: Generate audio from briefing text.

    Returns the path of the generated, timestamped MP3 file.
    """
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"{output_dir}/deepdive_{timestamp}.mp3"
    tts = HybridTTS(prefer_cloud=prefer_cloud)
    return tts.synthesize(briefing_text, output_path)


if __name__ == "__main__":
    # Test
    test_text = """
    Good morning. This is your Deep Dive daily briefing for April 5th, 2026.

    Three papers from arXiv caught our attention today.

    First, researchers at Stanford propose a new method for efficient
    fine-tuning of large language models using gradient checkpointing.

    Second, a team from DeepMind releases a comprehensive survey on
    multi-agent reinforcement learning in open-ended environments.

    Third, an interesting approach to speculative decoding that promises
    3x speedup for transformer inference without quality degradation.

    That concludes today's briefing. Stay sovereign.
    """
    output = phase4_generate_audio(test_text)
    print(f"Generated: {output}")