#!/usr/bin/env python3
"""
Deep Dive Phase 4: Audio Generation
Converts text briefing to spoken audio podcast.

Usage:
    python phase4_generate_audio.py [--date YYYY-MM-DD] [--output-dir DIR] [--tts TTS_PROVIDER]

Issue: the-nexus#830
"""

import argparse
import os
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional


class AudioGenerator:
    """Generate audio from briefing text using TTS.

    Reads ``<output_dir>/briefings/<date>.md`` and writes
    ``<output_dir>/audio/<date>.mp3``.
    """

    # TTS providers in order of preference
    TTS_PROVIDERS = ['edge-tts', 'openai', 'elevenlabs', 'local-tts']

    # Environment variable that must be set before a provider is attempted
    # as an automatic fallback (an explicitly requested provider is always tried).
    _API_KEY_ENV = {
        'openai': 'OPENAI_API_KEY',
        'elevenlabs': 'ELEVENLABS_API_KEY',
    }

    # Upper bound on the spoken script length (~15 min at normal speech).
    MAX_CHARS = 12000

    def __init__(self, output_dir: Path, date: str, tts_provider: str = 'edge-tts'):
        """Store configuration and make sure the audio output directory exists.

        Args:
            output_dir: Root data directory containing ``briefings/``.
            date: Target date as ``YYYY-MM-DD``; used for file names and the intro.
            tts_provider: Provider to try first (see ``TTS_PROVIDERS``).
        """
        self.output_dir = output_dir
        self.date = date
        self.tts_provider = tts_provider
        self.briefings_dir = output_dir / "briefings"
        self.audio_dir = output_dir / "audio"
        self.audio_dir.mkdir(parents=True, exist_ok=True)

    def load_briefing(self) -> str:
        """Load briefing markdown from Phase 3.

        Returns:
            The briefing text with a leading YAML frontmatter block removed
            when one is present.

        Raises:
            FileNotFoundError: If the briefing file for ``self.date`` is missing.
        """
        briefing_file = self.briefings_dir / f"{self.date}.md"
        if not briefing_file.exists():
            raise FileNotFoundError(f"Phase 3 output not found: {briefing_file}")

        # Explicit encoding: the platform default is locale dependent.
        content = briefing_file.read_text(encoding="utf-8")

        # Remove YAML frontmatter if present
        if content.startswith('---'):
            parts = content.split('---', 2)
            if len(parts) >= 3:
                content = parts[2]

        return content

    def clean_text_for_tts(self, text: str) -> str:
        """Clean markdown for TTS consumption.

        Strips emphasis markers, header markers and horizontal rules, replaces
        code with the word ``code``, reduces links/images to their label and
        replaces bare URLs with a ``[link]`` placeholder.
        """
        # Fenced code blocks first, so their backticks are not mistaken for
        # several adjacent inline-code spans.
        text = re.sub(r'```.*?```', 'code', text, flags=re.DOTALL)

        # Remove markdown syntax
        text = re.sub(r'\*\*', '', text)  # Bold
        text = re.sub(r'\*', '', text)  # Italic
        text = re.sub(r'`[^`]*`', 'code', text)  # Inline code
        text = re.sub(r'!?\[([^\]]+)\]\([^)]+\)', r'\1', text)  # Links and images
        # Headers: only at the start of a line, so "C#" or "#830" survive.
        text = re.sub(r'^[ \t]*#{1,6}[ \t]*', '', text, flags=re.MULTILINE)
        # Horizontal rules: only lines made of dashes, not dashes inside prose.
        text = re.sub(r'^[ \t]*-{3,}[ \t]*$', '', text, flags=re.MULTILINE)

        # Replace bare URLs with a short placeholder
        text = re.sub(r'https?://[^\s]+', ' [link] ', text)

        # Clean up whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        return text.strip()

    def add_podcast_intro(self, text: str) -> str:
        """Add standard podcast intro/outro.

        Raises:
            ValueError: If ``self.date`` is not in ``YYYY-MM-DD`` format.
        """
        parsed = datetime.strptime(self.date, '%Y-%m-%d')
        # Use the unpadded day so the date reads "January 5", not "January 05".
        date_str = f"{parsed.strftime('%B')} {parsed.day}, {parsed.strftime('%Y')}"

        intro = f"""Welcome to Deep Dive, your daily AI intelligence briefing for {date_str}.

This is Hermes, delivering the most relevant research and developments in artificial intelligence, filtered for the Timmy organization and agent systems development.

Let's begin.

"""

        outro = """

That concludes today's Deep Dive briefing.

Sources and full show notes are available in the Hermes knowledge base.

This briefing was automatically generated and will be delivered daily at 6 AM.

For on-demand briefings, message the bot with /deepdive.

Stay sovereign.
"""

        return intro + text + outro

    @staticmethod
    def _truncate_at_sentence(text: str, limit: int) -> str:
        """Cut ``text`` to at most ``limit`` characters, ending on the last full stop."""
        if len(text) <= limit:
            return text
        clipped = text[:limit]
        head, sep, _ = clipped.rpartition('.')
        return head + '.' if sep else clipped

    @staticmethod
    def _clip_for_provider(text: str, limit: int, provider: str) -> str:
        """Apply a provider's input limit, warning instead of dropping text silently."""
        if len(text) > limit:
            print(f"[WARN] {provider} input limited to {limit} characters; "
                  f"dropping the last {len(text) - limit}")
        return text[:limit]

    def generate_edge_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using edge-tts (free, Microsoft Edge voices).

        Returns:
            True on success, False if the package is missing or synthesis fails.
        """
        try:
            # Imported lazily so the script still runs without this optional package.
            import edge_tts
            import asyncio

            async def generate():
                communicate = edge_tts.Communicate(text, voice="en-US-AndrewNeural")
                await communicate.save(str(output_file))

            asyncio.run(generate())
            print(f"[Phase 4] Generated audio via edge-tts: {output_file}")
            return True

        except Exception as e:
            # Best effort: any failure means "try the next provider".
            print(f"[WARN] edge-tts failed: {e}")
            return False

    def generate_openai_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using OpenAI TTS API.

        Reads the API key from ``OPENAI_API_KEY``. Input beyond 4000 characters
        is dropped (with a warning).

        Returns:
            True on success, False on any failure.
        """
        try:
            from openai import OpenAI

            client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

            response = client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=self._clip_for_provider(text, 4000, "OpenAI TTS"),  # OpenAI limit
            )

            response.stream_to_file(str(output_file))
            print(f"[Phase 4] Generated audio via OpenAI TTS: {output_file}")
            return True

        except Exception as e:
            print(f"[WARN] OpenAI TTS failed: {e}")
            return False

    def generate_elevenlabs_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using ElevenLabs API.

        Reads the API key from ``ELEVENLABS_API_KEY``. Input beyond 5000
        characters is dropped (with a warning).

        Returns:
            True on success, False on any failure.
        """
        try:
            # NOTE(review): relies on the package's top-level `generate`/`save`
            # helpers - confirm the installed elevenlabs version exposes them.
            from elevenlabs import generate, save

            audio = generate(
                api_key=os.environ.get('ELEVENLABS_API_KEY'),
                text=self._clip_for_provider(text, 5000, "ElevenLabs"),  # ElevenLabs limit
                voice="Bella",
                model="eleven_monolingual_v1",
            )

            save(audio, str(output_file))
            print(f"[Phase 4] Generated audio via ElevenLabs: {output_file}")
            return True

        except Exception as e:
            print(f"[WARN] ElevenLabs failed: {e}")
            return False

    def generate_local_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using local TTS (XTTS via llama-server or similar).

        Not implemented yet: always returns False.
        """
        print("[WARN] Local TTS not yet implemented")
        return False

    def generate_audio(self, text: str) -> Optional[Path]:
        """Generate audio using configured or available TTS.

        The configured provider is tried first. If it fails, the remaining
        entries of ``TTS_PROVIDERS`` are tried in order, skipping the provider
        that already failed and any provider whose API key is not set.

        Returns:
            Path of the generated MP3, or None if every provider failed.
        """
        output_file = self.audio_dir / f"{self.date}.mp3"

        # Bound methods are looked up here (not at class level) so that
        # overrides on the instance or a subclass are honoured.
        backends = {
            'edge-tts': self.generate_edge_tts,
            'openai': self.generate_openai_tts,
            'elevenlabs': self.generate_elevenlabs_tts,
            'local-tts': self.generate_local_tts,
        }

        # If provider specified, try it first
        preferred = self.tts_provider
        if preferred in backends and backends[preferred](text, output_file):
            return output_file

        # Auto-fallback chain
        print("[Phase 4] Trying fallback TTS providers...")
        for provider in self.TTS_PROVIDERS:
            if provider == preferred:
                continue  # already attempted above; retrying would just fail again
            key_var = self._API_KEY_ENV.get(provider)
            if key_var and not os.environ.get(key_var):
                continue  # no credentials, so the call cannot succeed
            if backends[provider](text, output_file):
                return output_file

        print("[ERROR] All TTS providers failed")
        return None

    def run(self) -> Optional[Path]:
        """Run full audio generation pipeline.

        Returns:
            Path of the generated audio file, or None on failure.

        Raises:
            FileNotFoundError: If the Phase 3 briefing is missing.
            ValueError: If ``self.date`` is not in ``YYYY-MM-DD`` format.
        """
        print(f"[Phase 4] Generating audio for {self.date}")

        briefing = self.load_briefing()
        print(f"[Phase 4] Loaded briefing: {len(briefing)} characters")

        clean_text = self.clean_text_for_tts(briefing)
        podcast_text = self.add_podcast_intro(clean_text)

        # Truncate if too long for most TTS (target: 10-15 min audio)
        max_chars = self.MAX_CHARS
        if len(podcast_text) > max_chars:
            print(f"[Phase 4] Truncating from {len(podcast_text)} to {max_chars} characters")
            # Shorten only the briefing body so the outro is never cut off.
            overhead = len(podcast_text) - len(clean_text)
            budget = max(max_chars - overhead, 0)
            clean_text = self._truncate_at_sentence(clean_text, budget)
            podcast_text = self.add_podcast_intro(clean_text)

        output_file = self.generate_audio(podcast_text)

        if output_file and output_file.exists():
            size_mb = output_file.stat().st_size / (1024 * 1024)
            print(f"[Phase 4] Audio generated: {output_file} ({size_mb:.1f} MB)")

        return output_file


def _valid_date(value: str) -> str:
    """argparse type: accept only ``YYYY-MM-DD`` strings, returned unchanged."""
    try:
        datetime.strptime(value, '%Y-%m-%d')
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}, expected YYYY-MM-DD"
        ) from None
    return value


def main():
    """Parse command-line arguments and run Phase 4; exits with status 1 on failure."""
    parser = argparse.ArgumentParser(description='Deep Dive Phase 4: Audio Generation')
    parser.add_argument('--date', type=_valid_date,
                        default=datetime.now().strftime('%Y-%m-%d'),
                        help='Target date (YYYY-MM-DD)')
    parser.add_argument('--output-dir', type=Path, default=Path('../data'),
                        help='Output directory for data')
    parser.add_argument('--tts', default='edge-tts',
                        choices=AudioGenerator.TTS_PROVIDERS,
                        help='TTS provider')
    args = parser.parse_args()

    generator = AudioGenerator(args.output_dir, args.date, args.tts)
    try:
        result = generator.run()
    except FileNotFoundError as err:
        # Missing Phase 3 output is an expected operational failure, not a crash.
        print(f"[FAIL] {err}")
        sys.exit(1)

    if result:
        print(f"[DONE] Audio file: {result}")
    else:
        print("[FAIL] Audio generation failed")
        sys.exit(1)


if __name__ == '__main__':
    main()