[ezra] Add Phase 4 TTS pipeline with multi-adapter support #830
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit is contained in:
235
bin/deepdive_tts.py
Normal file
235
bin/deepdive_tts.py
Normal file
@@ -0,0 +1,235 @@
|
||||
#!/usr/bin/env python3
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.

Issue: #830 (the-nexus)
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
|
||||
@dataclass
class TTSConfig:
    """Settings shared by every TTS adapter.

    The first three fields are required; the remaining two are optional and
    interpreted by the selected adapter.
    """

    # Which adapter to use: "piper", "elevenlabs", or "openai".
    provider: str
    # Voice alias or provider-specific voice identifier.
    voice_id: str
    # Directory where generated audio files are written.
    output_dir: Path

    # --- Provider-specific ---
    # API key for cloud providers; adapters fall back to their env var if unset.
    api_key: Optional[str] = None
    # Model name, e.g., "eleven_turbo_v2" or "tts-1"; the Piper adapter treats
    # this value as the path to its voice model file.
    model: Optional[str] = None
|
||||
|
||||
|
||||
class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts, plus the ``lame`` encoder on PATH
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: "TTSConfig"):
        """Store the config and choose the voice model file.

        ``config.model`` is used as the path to the voice model; when it is
        unset, ``~/.local/share/piper/en_US-lessac-medium.onnx`` is used.
        """
        self.config = config
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize ``text`` with Piper and return the path of the MP3.

        Piper writes a WAV next to ``output_path`` (suffix replaced by
        ``.wav``); ``lame`` then encodes it to ``.mp3`` and the WAV is removed.

        Args:
            text: Text to speak; sent to Piper on stdin.
            output_path: Target path; its suffix is replaced as needed.

        Returns:
            Path of the generated ``.mp3`` file.

        Raises:
            RuntimeError: The model file does not exist.
            subprocess.CalledProcessError: ``piper-tts`` or ``lame`` exited
                with a non-zero status.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")

        wav_path = output_path.with_suffix(".wav")
        mp3_path = output_path.with_suffix(".mp3")

        try:
            # NOTE(review): the piper-tts package may install its executable
            # as `piper` rather than `piper-tts` — confirm on the target host.
            subprocess.run(
                [
                    "piper-tts",
                    "--model", str(self.model_path),
                    "--output_file", str(wav_path),
                ],
                input=text.encode(),
                check=True,
            )

            # Convert to MP3 for smaller size
            subprocess.run(
                ["lame", "-V2", str(wav_path), str(mp3_path)],
                check=True,
                capture_output=True,
            )
        finally:
            # Always drop the intermediate WAV, even when a step above fails,
            # so failed runs do not leave large temporary files behind.
            if wav_path.exists():
                wav_path.unlink()

        return mp3_path
|
||||
|
||||
|
||||
class ElevenLabsAdapter:
    """Cloud TTS using ElevenLabs API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable (or ``config.api_key``)
    Voices: https://elevenlabs.io/voice-library
    """

    # Friendly alias -> value placed in the request URL. Unknown aliases are
    # passed through unchanged, so a raw voice ID can be supplied directly.
    # NOTE(review): the text-to-speech endpoint is addressed by voice ID; these
    # values look like display names ("Mathew" also looks misspelled) —
    # confirm and replace with the account's real voice IDs.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    # Maximum number of characters sent per request (ElevenLabs limit).
    MAX_CHARS = 5000

    def __init__(self, config: "TTSConfig"):
        """Store the config and resolve the API key.

        Raises:
            RuntimeError: Neither ``config.api_key`` nor the
                ELEVENLABS_API_KEY environment variable is set.
        """
        self.config = config
        self.api_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")

    def _build_request(self, text: str) -> urllib.request.Request:
        """Build the POST request for ``text`` without sending it."""
        from urllib.parse import quote

        voice_id = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        # Escape the voice so spaces or slashes cannot alter the request path.
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{quote(voice_id, safe='')}"

        if len(text) > self.MAX_CHARS:
            # Truncation loses audio content, so make it visible to the user.
            print(
                f"[TTS] Warning: text truncated from {len(text)} to "
                f"{self.MAX_CHARS} characters (ElevenLabs limit)",
                file=sys.stderr,
            )

        data = json.dumps({
            "text": text[:self.MAX_CHARS],
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75,
            },
        }).encode()

        req = urllib.request.Request(url, data=data, method="POST")
        req.add_header("xi-api-key", self.api_key)
        req.add_header("Content-Type", "application/json")
        return req

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize ``text`` via ElevenLabs and return the path of the MP3.

        Args:
            text: Text to speak; only the first ``MAX_CHARS`` characters are
                sent, with a warning on stderr when text is dropped.
            output_path: Target path; its suffix is replaced by ``.mp3``.

        Returns:
            Path of the written ``.mp3`` file.

        Raises:
            urllib.error.URLError: The request failed (``HTTPError`` for
                non-success HTTP statuses).
        """
        req = self._build_request(text)
        mp3_path = output_path.with_suffix(".mp3")

        with urllib.request.urlopen(req, timeout=120) as resp:
            mp3_path.write_bytes(resp.read())

        return mp3_path
|
||||
|
||||
|
||||
class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable (or ``config.api_key``)
    """

    # Voices accepted by this adapter; anything else falls back to DEFAULT_VOICE.
    VOICE_MAP = {
        "alloy": "alloy",
        "echo": "echo",
        "fable": "fable",
        "onyx": "onyx",
        "nova": "nova",
        "shimmer": "shimmer",
    }

    DEFAULT_VOICE = "alloy"

    # Maximum number of characters sent per request (OpenAI limit).
    MAX_CHARS = 4096

    def __init__(self, config: "TTSConfig"):
        """Store the config and resolve the API key.

        Raises:
            RuntimeError: Neither ``config.api_key`` nor the OPENAI_API_KEY
                environment variable is set.
        """
        self.config = config
        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")

    def _build_request(self, text: str) -> urllib.request.Request:
        """Build the POST request for ``text`` without sending it."""
        requested = self.config.voice_id
        voice = self.VOICE_MAP.get(requested, self.DEFAULT_VOICE)
        if requested not in self.VOICE_MAP:
            # Keep the fallback, but do not substitute a voice silently.
            print(
                f"[TTS] Warning: unknown OpenAI voice {requested!r}, "
                f"using {self.DEFAULT_VOICE!r}",
                file=sys.stderr,
            )

        if len(text) > self.MAX_CHARS:
            # Truncation loses audio content, so make it visible to the user.
            print(
                f"[TTS] Warning: text truncated from {len(text)} to "
                f"{self.MAX_CHARS} characters (OpenAI limit)",
                file=sys.stderr,
            )

        url = "https://api.openai.com/v1/audio/speech"
        data = json.dumps({
            "model": self.config.model or "tts-1",
            "input": text[:self.MAX_CHARS],
            "voice": voice,
            "response_format": "mp3",
        }).encode()

        req = urllib.request.Request(url, data=data, method="POST")
        req.add_header("Authorization", f"Bearer {self.api_key}")
        req.add_header("Content-Type", "application/json")
        return req

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize ``text`` via OpenAI and return the path of the MP3.

        Args:
            text: Text to speak; only the first ``MAX_CHARS`` characters are
                sent, with a warning on stderr when text is dropped.
            output_path: Target path; its suffix is replaced by ``.mp3``.

        Returns:
            Path of the written ``.mp3`` file.

        Raises:
            urllib.error.URLError: The request failed (``HTTPError`` for
                non-success HTTP statuses).
        """
        req = self._build_request(text)
        mp3_path = output_path.with_suffix(".mp3")

        with urllib.request.urlopen(req, timeout=60) as resp:
            mp3_path.write_bytes(resp.read())

        return mp3_path
|
||||
|
||||
|
||||
# Registry mapping each provider name to the adapter class that implements it.
# The keys double as the valid values for the --provider command-line option.
ADAPTERS = dict(
    piper=PiperAdapter,
    elevenlabs=ElevenLabsAdapter,
    openai=OpenAITTSAdapter,
)
|
||||
|
||||
|
||||
def get_provider_config() -> TTSConfig:
    """Build a TTSConfig from environment variables.

    Reads DEEPDIVE_TTS_PROVIDER (default "openai"), DEEPDIVE_TTS_VOICE
    (default "alloy" for openai, otherwise "matthew") and DEEPDIVE_OUTPUT_DIR
    (default "/tmp/deepdive"). The API key comes from ELEVENLABS_API_KEY or
    OPENAI_API_KEY depending on the provider; other providers get no key.
    """
    env = os.environ
    provider = env.get("DEEPDIVE_TTS_PROVIDER", "openai")

    # Default voice depends on the selected provider.
    if provider == "openai":
        fallback_voice = "alloy"
    else:
        fallback_voice = "matthew"
    voice = env.get("DEEPDIVE_TTS_VOICE", fallback_voice)

    # Only the cloud providers need an API key.
    if provider == "elevenlabs":
        key = env.get("ELEVENLABS_API_KEY")
    elif provider == "openai":
        key = env.get("OPENAI_API_KEY")
    else:
        key = None

    target_dir = Path(env.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive"))

    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=target_dir,
        api_key=key,
    )
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: synthesize text to an audio file.

    Text comes from --input-file, then --text, then stdin. Configuration is
    loaded from the environment and may be overridden with --provider,
    --voice and --output. On success a status line and a JSON summary are
    printed to stdout; the process exits with status 1 when no text is
    provided or the provider is unknown.
    """
    from datetime import datetime

    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config
    config = get_provider_config()
    if args.provider and args.provider != config.provider:
        config.provider = args.provider
        # The key loaded from the environment belongs to the previous
        # provider; clear it so the adapter reads its own environment
        # variable instead of sending the wrong credential.
        config.api_key = None
        # Likewise re-derive the default voice unless the user chose one.
        if "DEEPDIVE_TTS_VOICE" not in os.environ:
            config.voice_id = "alloy" if args.provider == "openai" else "matthew"
    if args.voice:
        config.voice_id = args.voice

    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        # Format spec instead of a nested strftime("...") call: reusing double
        # quotes inside an f-string is a syntax error before Python 3.12.
        output_name = f"briefing_{datetime.now():%Y%m%d_%H%M}"

    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Get text
    if args.input_file:
        # Read as UTF-8 explicitly rather than relying on the locale default.
        text = Path(args.input_file).read_text(encoding="utf-8")
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()

    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")

    # Reachable when DEEPDIVE_TTS_PROVIDER holds an unsupported name.
    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)

    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)

    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        # Rough estimate assuming ~150 spoken words per minute.
        "duration_estimate_min": len(text.split()) // 150,
    }))
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user