229 lines
8.5 KiB
Python
229 lines
8.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Deep Dive Phase 4: Audio Generation
|
||
|
|
Converts text briefing to spoken audio podcast.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python phase4_generate_audio.py [--date YYYY-MM-DD] [--output-dir DIR] [--tts TTS_PROVIDER]
|
||
|
|
|
||
|
|
Issue: the-nexus#830
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
|
||
|
|
class AudioGenerator:
    """Generate a spoken-audio podcast from a Phase 3 text briefing.

    Reads ``briefings/<date>.md`` under *output_dir*, strips markdown so the
    text reads naturally when spoken, wraps it in a standard podcast
    intro/outro, and writes ``audio/<date>.mp3`` via the first TTS provider
    that succeeds.
    """

    # TTS providers in order of preference
    TTS_PROVIDERS = ['edge-tts', 'openai', 'elevenlabs', 'local-tts']

    def __init__(self, output_dir: Path, date: str, tts_provider: str = 'edge-tts'):
        """
        Args:
            output_dir: Root data directory (contains ``briefings/``; ``audio/``
                is created here if missing).
            date: Target date in ``YYYY-MM-DD`` form.
            tts_provider: Preferred TTS backend; one of ``TTS_PROVIDERS``.
        """
        self.output_dir = output_dir
        self.date = date
        self.tts_provider = tts_provider
        self.briefings_dir = output_dir / "briefings"
        self.audio_dir = output_dir / "audio"
        self.audio_dir.mkdir(parents=True, exist_ok=True)

    def load_briefing(self) -> str:
        """Load briefing markdown from Phase 3.

        Returns:
            Briefing body with any leading YAML frontmatter removed.

        Raises:
            FileNotFoundError: If Phase 3 has not produced a briefing
                for ``self.date``.
        """
        briefing_file = self.briefings_dir / f"{self.date}.md"
        if not briefing_file.exists():
            raise FileNotFoundError(f"Phase 3 output not found: {briefing_file}")

        # Explicit encoding: briefings are written as UTF-8; the platform
        # default encoding may differ and would corrupt non-ASCII text.
        with open(briefing_file, encoding='utf-8') as f:
            content = f.read()

        # Remove YAML frontmatter if present ('---\n...\n---\nbody')
        if content.startswith('---'):
            parts = content.split('---', 2)
            if len(parts) >= 3:
                content = parts[2]

        return content

    def clean_text_for_tts(self, text: str) -> str:
        """Clean markdown for TTS consumption.

        Strips formatting tokens that would otherwise be read aloud and
        replaces unreadable constructs (code, URLs) with short spoken
        placeholders.
        """
        # Remove markdown syntax
        text = re.sub(r'\*\*', '', text)  # Bold
        text = re.sub(r'\*', '', text)  # Italic
        text = re.sub(r'`[^`]*`', 'code', text)  # Inline code
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # Links: keep label
        text = re.sub(r'#{1,6}\s*', '', text)  # Headers
        # Horizontal rules: anchor to whole lines so '---' used as an
        # em-dash inside a sentence is not silently deleted.
        text = re.sub(r'^-{3,}\s*$', '', text, flags=re.MULTILINE)

        # Replace raw URLs with a short spoken placeholder
        text = re.sub(r'https?://[^\s]+', ' [link] ', text)

        # Clean up whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        text = text.strip()

        return text

    def add_podcast_intro(self, text: str) -> str:
        """Add standard podcast intro/outro around *text*."""
        date_str = datetime.strptime(self.date, '%Y-%m-%d').strftime('%B %d, %Y')

        intro = f"""Welcome to Deep Dive, your daily AI intelligence briefing for {date_str}. This is Hermes, delivering the most relevant research and developments in artificial intelligence, filtered for the Timmy organization and agent systems development. Let's begin.

"""

        outro = """

That concludes today's Deep Dive briefing. Sources and full show notes are available in the Hermes knowledge base. This briefing was automatically generated and will be delivered daily at 6 AM. For on-demand briefings, message the bot with /deepdive. Stay sovereign.
"""

        return intro + text + outro

    def generate_edge_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using edge-tts (free, Microsoft Edge voices).

        Returns:
            True on success, False on any failure (missing package,
            network error, ...). Never raises.
        """
        try:
            import edge_tts
            import asyncio

            async def generate():
                communicate = edge_tts.Communicate(text, voice="en-US-AndrewNeural")
                await communicate.save(str(output_file))

            asyncio.run(generate())
            print(f"[Phase 4] Generated audio via edge-tts: {output_file}")
            return True
        except Exception as e:
            print(f"[WARN] edge-tts failed: {e}")
            return False

    def generate_openai_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using OpenAI TTS API.

        Requires OPENAI_API_KEY in the environment. Input is truncated to
        the API's character limit. Returns True on success, False otherwise.
        """
        try:
            from openai import OpenAI
            client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

            response = client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text[:4000]  # OpenAI limit
            )

            response.stream_to_file(str(output_file))
            print(f"[Phase 4] Generated audio via OpenAI TTS: {output_file}")
            return True
        except Exception as e:
            print(f"[WARN] OpenAI TTS failed: {e}")
            return False

    def generate_elevenlabs_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using ElevenLabs API.

        Requires ELEVENLABS_API_KEY in the environment. Input is truncated
        to the API's character limit. Returns True on success, False otherwise.
        """
        try:
            from elevenlabs import generate, save

            audio = generate(
                api_key=os.environ.get('ELEVENLABS_API_KEY'),
                text=text[:5000],  # ElevenLabs limit
                voice="Bella",
                model="eleven_monolingual_v1"
            )

            save(audio, str(output_file))
            print(f"[Phase 4] Generated audio via ElevenLabs: {output_file}")
            return True
        except Exception as e:
            print(f"[WARN] ElevenLabs failed: {e}")
            return False

    def generate_local_tts(self, text: str, output_file: Path) -> bool:
        """Generate audio using local TTS (XTTS via llama-server or similar).

        Placeholder: always returns False until implemented.
        """
        print("[WARN] Local TTS not yet implemented")
        return False

    def generate_audio(self, text: str) -> Optional[Path]:
        """Generate audio using configured or available TTS.

        Tries the configured provider first (including 'local-tts', which the
        original dispatch omitted), then falls back through the remaining
        providers without re-trying the one that already failed.

        Returns:
            Path to the generated MP3, or None if every provider failed.
        """
        output_file = self.audio_dir / f"{self.date}.mp3"

        generators = {
            'edge-tts': self.generate_edge_tts,
            'openai': self.generate_openai_tts,
            'elevenlabs': self.generate_elevenlabs_tts,
            'local-tts': self.generate_local_tts,
        }

        # If provider specified, try it first
        preferred = generators.get(self.tts_provider)
        if preferred is not None and preferred(text, output_file):
            return output_file

        # Auto-fallback chain (skip the provider that was already tried)
        print("[Phase 4] Trying fallback TTS providers...")

        # Try edge-tts first (free, no API key)
        if self.tts_provider != 'edge-tts':
            if self.generate_edge_tts(text, output_file):
                return output_file

        # Try OpenAI if key available
        if self.tts_provider != 'openai' and os.environ.get('OPENAI_API_KEY'):
            if self.generate_openai_tts(text, output_file):
                return output_file

        # Try ElevenLabs if key available
        if self.tts_provider != 'elevenlabs' and os.environ.get('ELEVENLABS_API_KEY'):
            if self.generate_elevenlabs_tts(text, output_file):
                return output_file

        print("[ERROR] All TTS providers failed")
        return None

    def run(self) -> Optional[Path]:
        """Run full audio generation pipeline.

        Returns:
            Path to the generated audio file, or None on failure.
        """
        print(f"[Phase 4] Generating audio for {self.date}")

        briefing = self.load_briefing()
        print(f"[Phase 4] Loaded briefing: {len(briefing)} characters")

        clean_text = self.clean_text_for_tts(briefing)
        podcast_text = self.add_podcast_intro(clean_text)

        # Truncate if too long for most TTS (target: 10-15 min audio)
        max_chars = 12000  # ~15 min at normal speech
        if len(podcast_text) > max_chars:
            print(f"[Phase 4] Truncating from {len(podcast_text)} to {max_chars} characters")
            # Cut at the last sentence boundary so speech doesn't stop mid-word
            podcast_text = podcast_text[:max_chars].rsplit('.', 1)[0] + '.'

        output_file = self.generate_audio(podcast_text)

        if output_file and output_file.exists():
            size_mb = output_file.stat().st_size / (1024 * 1024)
            print(f"[Phase 4] Audio generated: {output_file} ({size_mb:.1f} MB)")

        return output_file
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: parse arguments and run the audio pipeline.

    Exits with status 1 if audio generation fails.
    """
    parser = argparse.ArgumentParser(description='Deep Dive Phase 4: Audio Generation')
    parser.add_argument('--date', default=datetime.now().strftime('%Y-%m-%d'),
                        help='Target date (YYYY-MM-DD)')
    parser.add_argument('--output-dir', type=Path, default=Path('../data'),
                        help='Output directory for data')
    parser.add_argument('--tts', default='edge-tts',
                        choices=['edge-tts', 'openai', 'elevenlabs', 'local-tts'],
                        help='TTS provider')
    args = parser.parse_args()

    generator = AudioGenerator(args.output_dir, args.date, args.tts)
    result = generator.run()

    if result:
        print(f"[DONE] Audio file: {result}")
    else:
        print("[FAIL] Audio generation failed")
        # raise SystemExit instead of the site-module exit() helper, which
        # is not guaranteed to be available in all runtime contexts.
        raise SystemExit(1)
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()
|