Files
ezra-environment/the-nexus/deepdive/bin/phase4_generate_audio.py
Ezra 9f010ad044 [BURN] Deep Dive scaffold: 5-phase sovereign NotebookLM (#830)
Complete production-ready scaffold for automated daily AI intelligence briefings:

- Phase 1: Source aggregation (arXiv + lab blogs)
- Phase 2: Relevance ranking (keyword + source authority scoring)
- Phase 3: LLM synthesis (Hermes-context briefing generation)
- Phase 4: TTS audio (edge-tts/OpenAI/ElevenLabs)
- Phase 5: Telegram delivery (voice message)

Deliverables:
- docs/ARCHITECTURE.md (9000+ lines) - system design
- docs/OPERATIONS.md - runbook and troubleshooting
- 5 executable phase scripts (bin/)
- Full pipeline orchestrator (run_full_pipeline.py)
- requirements.txt, README.md

Addresses all 9 acceptance criteria from #830.
Ready for host selection, credential config, and cron activation.

Author: Ezra | Burn mode | 2026-04-05
2026-04-05 05:48:12 +00:00

229 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""
Deep Dive Phase 4: Audio Generation
Converts text briefing to spoken audio podcast.
Usage:
python phase4_generate_audio.py [--date YYYY-MM-DD] [--output-dir DIR] [--tts TTS_PROVIDER]
Issue: the-nexus#830
"""
import argparse
import os
import re
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional
class AudioGenerator:
    """Generate audio from briefing text using TTS.

    Loads the Phase 3 briefing markdown for a given date, strips markdown
    syntax so the text reads naturally when spoken, wraps it in a standard
    podcast intro/outro, and synthesizes an MP3 via the first TTS provider
    that succeeds.
    """

    # TTS providers in order of preference
    TTS_PROVIDERS = ['edge-tts', 'openai', 'elevenlabs', 'local-tts']

    def __init__(self, output_dir: Path, date: str, tts_provider: str = 'edge-tts'):
        """
        Args:
            output_dir: Root data directory; reads ``briefings/<date>.md``
                (Phase 3 output) and writes ``audio/<date>.mp3``.
            date: Target date in ``YYYY-MM-DD`` form.
            tts_provider: Preferred provider (one of ``TTS_PROVIDERS``);
                other providers are tried as fallbacks if it fails.
        """
        self.output_dir = output_dir
        self.date = date
        self.tts_provider = tts_provider
        self.briefings_dir = output_dir / "briefings"
        self.audio_dir = output_dir / "audio"
        self.audio_dir.mkdir(parents=True, exist_ok=True)

    def load_briefing(self) -> str:
        """Load briefing markdown from Phase 3.

        Returns:
            Briefing text with any leading YAML frontmatter removed.

        Raises:
            FileNotFoundError: If Phase 3 has not produced a briefing
                for ``self.date``.
        """
        briefing_file = self.briefings_dir / f"{self.date}.md"
        if not briefing_file.exists():
            raise FileNotFoundError(f"Phase 3 output not found: {briefing_file}")
        # Explicit encoding: do not depend on the platform default codec.
        content = briefing_file.read_text(encoding="utf-8")
        # Remove YAML frontmatter if present ("---\n...\n---" at the top).
        if content.startswith('---'):
            parts = content.split('---', 2)
            if len(parts) >= 3:
                content = parts[2]
        return content

    def clean_text_for_tts(self, text: str) -> str:
        """Clean markdown for TTS consumption.

        Strips markdown syntax (bold/italic markers, inline code, links,
        headers, horizontal rules), replaces raw URLs with a spoken
        "[link]" placeholder, and collapses blank-line runs.
        """
        # Remove markdown syntax
        text = re.sub(r'\*\*', '', text)                      # Bold
        text = re.sub(r'\*', '', text)                        # Italic
        text = re.sub(r'`[^`]*`', 'code', text)               # Inline code
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # Links
        text = re.sub(r'#{1,6}\s*', '', text)                 # Headers
        text = re.sub(r'---', '', text)                       # Horizontal rules
        # Remove URLs (keep domain for context)
        text = re.sub(r'https?://[^\s]+', ' [link] ', text)
        # Clean up whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        text = text.strip()
        return text

    def add_podcast_intro(self, text: str) -> str:
        """Add standard podcast intro/outro around *text*.

        Raises:
            ValueError: If ``self.date`` is not a valid ``YYYY-MM-DD`` date.
        """
        date_str = datetime.strptime(self.date, '%Y-%m-%d').strftime('%B %d, %Y')
        intro = f"""Welcome to Deep Dive, your daily AI intelligence briefing for {date_str}. This is Hermes, delivering the most relevant research and developments in artificial intelligence, filtered for the Timmy organization and agent systems development. Let's begin.
"""
        outro = """
That concludes today's Deep Dive briefing. Sources and full show notes are available in the Hermes knowledge base. This briefing was automatically generated and will be delivered daily at 6 AM. For on-demand briefings, message the bot with /deepdive. Stay sovereign.
"""
        return intro + text + outro

    def generate_edge_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using edge-tts (free, Microsoft Edge voices).

        Returns True on success; logs a warning and returns False on any
        failure (missing package, network error, etc.).
        """
        try:
            import edge_tts
            import asyncio

            async def generate():
                communicate = edge_tts.Communicate(text, voice="en-US-AndrewNeural")
                await communicate.save(str(output_file))

            asyncio.run(generate())
            print(f"[Phase 4] Generated audio via edge-tts: {output_file}")
            return True
        except Exception as e:
            print(f"[WARN] edge-tts failed: {e}")
            return False

    def generate_openai_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using OpenAI TTS API.

        Requires OPENAI_API_KEY in the environment. Returns True on success.
        """
        try:
            from openai import OpenAI
            client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
            response = client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text[:4000]  # OpenAI limit
            )
            response.stream_to_file(str(output_file))
            print(f"[Phase 4] Generated audio via OpenAI TTS: {output_file}")
            return True
        except Exception as e:
            print(f"[WARN] OpenAI TTS failed: {e}")
            return False

    def generate_elevenlabs_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using ElevenLabs API.

        Requires ELEVENLABS_API_KEY in the environment. Returns True on success.
        """
        try:
            from elevenlabs import generate, save
            audio = generate(
                api_key=os.environ.get('ELEVENLABS_API_KEY'),
                text=text[:5000],  # ElevenLabs limit
                voice="Bella",
                model="eleven_monolingual_v1"
            )
            save(audio, str(output_file))
            print(f"[Phase 4] Generated audio via ElevenLabs: {output_file}")
            return True
        except Exception as e:
            print(f"[WARN] ElevenLabs failed: {e}")
            return False

    def generate_local_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using local TTS (XTTS via llama-server or similar)."""
        print("[WARN] Local TTS not yet implemented")
        return False

    def generate_audio(self, text: str) -> Optional[Path]:
        """Generate audio using configured or available TTS.

        Tries the configured provider first, then falls back through the
        remaining providers (edge-tts unconditionally; OpenAI/ElevenLabs
        only when their API key is present). Returns the MP3 path, or
        None if every provider failed.
        """
        output_file = self.audio_dir / f"{self.date}.mp3"
        # Dispatch table covers all providers, including 'local-tts'
        # (previously an explicit 'local-tts' selection was never tried).
        generators = {
            'edge-tts': self.generate_edge_tts,
            'openai': self.generate_openai_tts,
            'elevenlabs': self.generate_elevenlabs_tts,
            'local-tts': self.generate_local_tts,
        }
        # If provider specified, try it first
        requested = generators.get(self.tts_provider)
        if requested is not None and requested(text, output_file):
            return output_file
        # Auto-fallback chain; skip the provider we already tried above.
        print("[Phase 4] Trying fallback TTS providers...")
        # Try edge-tts first (free, no API key)
        if self.tts_provider != 'edge-tts':
            if self.generate_edge_tts(text, output_file):
                return output_file
        # Try OpenAI if key available
        if self.tts_provider != 'openai' and os.environ.get('OPENAI_API_KEY'):
            if self.generate_openai_tts(text, output_file):
                return output_file
        # Try ElevenLabs if key available
        if self.tts_provider != 'elevenlabs' and os.environ.get('ELEVENLABS_API_KEY'):
            if self.generate_elevenlabs_tts(text, output_file):
                return output_file
        print("[ERROR] All TTS providers failed")
        return None

    def run(self) -> Optional[Path]:
        """Run full audio generation pipeline.

        Returns the generated MP3 path, or None if TTS failed.

        Raises:
            FileNotFoundError: If the Phase 3 briefing is missing.
        """
        print(f"[Phase 4] Generating audio for {self.date}")
        briefing = self.load_briefing()
        print(f"[Phase 4] Loaded briefing: {len(briefing)} characters")
        clean_text = self.clean_text_for_tts(briefing)
        podcast_text = self.add_podcast_intro(clean_text)
        # Truncate if too long for most TTS (target: 10-15 min audio)
        max_chars = 12000  # ~15 min at normal speech
        if len(podcast_text) > max_chars:
            print(f"[Phase 4] Truncating from {len(podcast_text)} to {max_chars} characters")
            # Cut at the last sentence boundary before the limit.
            podcast_text = podcast_text[:max_chars].rsplit('.', 1)[0] + '.'
        output_file = self.generate_audio(podcast_text)
        if output_file and output_file.exists():
            size_mb = output_file.stat().st_size / (1024 * 1024)
            print(f"[Phase 4] Audio generated: {output_file} ({size_mb:.1f} MB)")
        return output_file
def main():
    """CLI entry point for Phase 4.

    Parses --date/--output-dir/--tts, runs the audio generation pipeline,
    and exits with status 1 if no provider produced audio.
    """
    parser = argparse.ArgumentParser(description='Deep Dive Phase 4: Audio Generation')
    parser.add_argument('--date', default=datetime.now().strftime('%Y-%m-%d'),
                        help='Target date (YYYY-MM-DD)')
    parser.add_argument('--output-dir', type=Path, default=Path('../data'),
                        help='Output directory for data')
    parser.add_argument('--tts', default='edge-tts',
                        choices=['edge-tts', 'openai', 'elevenlabs', 'local-tts'],
                        help='TTS provider')
    args = parser.parse_args()

    generator = AudioGenerator(args.output_dir, args.date, args.tts)
    result = generator.run()
    if result:
        print(f"[DONE] Audio file: {result}")
    else:
        print("[FAIL] Audio generation failed")
        # raise SystemExit rather than the interactive `exit()` helper, which
        # is injected by the `site` module and unavailable under `python -S`
        # or in frozen/embedded interpreters.
        raise SystemExit(1)


if __name__ == '__main__':
    main()