#!/usr/bin/env python3
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.
Issue: #830 (the-nexus)
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
"""
import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional
@dataclass
class TTSConfig:
    """Configuration for one TTS synthesis run.

    The first three fields apply to every provider; ``api_key`` and
    ``model`` are only meaningful for specific providers.
    """

    # One of "piper", "elevenlabs", "openai" (the ADAPTERS registry keys).
    provider: str
    voice_id: str
    output_dir: Path
    # Provider-specific settings.
    api_key: Optional[str] = None
    # e.g. "eleven_turbo_v2" (ElevenLabs) or "tts-1" (OpenAI); Piper reuses
    # this field as the path to its .onnx voice model.
    model: Optional[str] = None
class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: TTSConfig):
        self.config = config
        # config.model doubles as the .onnx model path for Piper; normalize
        # to Path (previously this attribute was str-or-Path depending on
        # which branch ran). Falls back to the default lessac voice.
        if config.model:
            self.model_path = Path(config.model)
        else:
            self.model_path = Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize ``text`` to ``<output_path>.mp3`` via piper + lame.

        Pipes the text into the piper-tts CLI to produce a WAV, then
        converts it to MP3 with lame and deletes the intermediate WAV.

        Raises:
            RuntimeError: if the voice model file is missing.
            subprocess.CalledProcessError: if either subprocess fails.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")
        wav_path = output_path.with_suffix(".wav")
        mp3_path = output_path.with_suffix(".mp3")
        cmd = [
            "piper-tts",
            "--model", str(self.model_path),
            "--output_file", str(wav_path)
        ]
        subprocess.run(cmd, input=text.encode(), check=True)
        try:
            # Convert to MP3 for smaller size.
            subprocess.run([
                "lame", "-V2", str(wav_path), str(mp3_path)
            ], check=True, capture_output=True)
        finally:
            # Fix: the intermediate WAV previously leaked when lame failed.
            if wav_path.exists():
                wav_path.unlink()
        return mp3_path
class ElevenLabsAdapter:
    """Cloud TTS using ElevenLabs API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # NOTE(review): these values look like display names, but the ElevenLabs
    # text-to-speech endpoint takes a voice *ID* in the URL path — confirm
    # these resolve. "Mathew" also looks like a typo for "Matthew"; left
    # unchanged since it may be a deliberate ID.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        # Explicit config key wins over the environment.
        self.api_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST ``text`` to the ElevenLabs TTS endpoint; write the MP3.

        Text is silently truncated to 5000 characters (API limit).

        Returns:
            Path to the written ``.mp3`` file.

        Raises:
            RuntimeError: on an HTTP error response, including the API's
                error payload for debuggability.
        """
        # Unknown friendly names fall through as raw voice IDs.
        voice_id = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
        data = json.dumps({
            "text": text[:5000],  # ElevenLabs limit
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }).encode()
        req = urllib.request.Request(url, data=data, method="POST")
        req.add_header("xi-api-key", self.api_key)
        req.add_header("Content-Type", "application/json")
        mp3_path = output_path.with_suffix(".mp3")
        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                mp3_path.write_bytes(resp.read())
        except urllib.error.HTTPError as err:
            # Fix: previously a 4xx/5xx surfaced as an opaque HTTPError and
            # the API's JSON error body was discarded.
            detail = err.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"ElevenLabs API error {err.code}: {detail}") from err
        return mp3_path
class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    # The six voices the OpenAI speech endpoint accepts; identity map kept
    # so the lookup mirrors the ElevenLabs adapter.
    VOICE_MAP = {
        "alloy": "alloy",
        "echo": "echo",
        "fable": "fable",
        "onyx": "onyx",
        "nova": "nova",
        "shimmer": "shimmer",
    }

    def __init__(self, config: TTSConfig):
        self.config = config
        # Explicit config key wins over the environment.
        self.api_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST ``text`` to the OpenAI speech endpoint; write the MP3.

        Text is silently truncated to 4096 characters (API limit); unknown
        voice names fall back to "alloy".

        Returns:
            Path to the written ``.mp3`` file.

        Raises:
            RuntimeError: on an HTTP error response, including the API's
                error payload for debuggability.
        """
        voice = self.VOICE_MAP.get(self.config.voice_id, "alloy")
        url = "https://api.openai.com/v1/audio/speech"
        data = json.dumps({
            "model": self.config.model or "tts-1",
            "input": text[:4096],  # OpenAI limit
            "voice": voice,
            "response_format": "mp3"
        }).encode()
        req = urllib.request.Request(url, data=data, method="POST")
        req.add_header("Authorization", f"Bearer {self.api_key}")
        req.add_header("Content-Type", "application/json")
        mp3_path = output_path.with_suffix(".mp3")
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                mp3_path.write_bytes(resp.read())
        except urllib.error.HTTPError as err:
            # Fix: previously a 4xx/5xx surfaced as an opaque HTTPError and
            # the API's JSON error body was discarded. Consistent with the
            # ElevenLabs adapter.
            detail = err.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"OpenAI TTS API error {err.code}: {detail}") from err
        return mp3_path
# Registry mapping a provider name (also the --provider CLI choice and the
# DEEPDIVE_TTS_PROVIDER env value) to its adapter class.
ADAPTERS = {
"piper": PiperAdapter,
"elevenlabs": ElevenLabsAdapter,
"openai": OpenAITTSAdapter,
}
def get_provider_config() -> TTSConfig:
    """Build a TTSConfig from DEEPDIVE_* environment variables.

    Defaults: provider "openai"; voice "alloy" for openai, otherwise
    "matthew"; output directory /tmp/deepdive. The provider's API key is
    read from the matching *_API_KEY variable when one applies.
    """
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    default_voice = "alloy" if provider == "openai" else "matthew"
    voice = os.environ.get("DEEPDIVE_TTS_VOICE", default_voice)
    # Only the cloud providers have an associated key variable.
    key_var = {"elevenlabs": "ELEVENLABS_API_KEY", "openai": "OPENAI_API_KEY"}.get(provider)
    api_key = os.environ.get(key_var) if key_var else None
    out_dir = Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive"))
    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=out_dir,
        api_key=api_key,
    )
def main():
    """CLI entry point: read text, synthesize with the configured provider.

    Prints a human-readable progress line plus a final JSON summary to
    stdout; exits with status 1 on empty text or an unknown provider.
    """
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Environment config with CLI flags taking precedence.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice

    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        # Fix: the original nested double quotes inside a double-quoted
        # f-string (strftime("%Y%m%d_%H%M")), a SyntaxError on Python < 3.12.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"
    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Text source priority: --input-file, then --text, then stdin.
    if args.input_file:
        text = Path(args.input_file).read_text()
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()
    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")
    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)
    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)
    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        "duration_estimate_min": len(text) // 150  # ~150 chars/min
    }))


if __name__ == "__main__":
    main()