236 lines
7.5 KiB
Python
236 lines
7.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""deepdive_tts.py — Phase 4: Text-to-Speech pipeline for Deep Dive.
|
||
|
|
|
||
|
|
Issue: #830 (the-nexus)
|
||
|
|
Multi-adapter TTS supporting local (Piper) and cloud (ElevenLabs, OpenAI) providers.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
import os
|
||
|
|
import urllib.request
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class TTSConfig:
    """Configuration for one TTS run, consumed by every adapter in ADAPTERS."""

    provider: str  # "piper", "elevenlabs", "openai"
    voice_id: str  # friendly voice name or raw provider voice ID (see adapter VOICE_MAPs)
    output_dir: Path  # directory where synthesized audio is written
    # Provider-specific
    api_key: Optional[str] = None  # cloud providers only; adapters also fall back to env vars
    model: Optional[str] = None  # e.g., "eleven_turbo_v2" or "tts-1"
|
||
|
|
|
||
|
|
|
||
|
|
class PiperAdapter:
    """Local TTS using Piper (offline, free, medium quality).

    Requires: pip install piper-tts
    Model download: https://huggingface.co/rhasspy/piper-voices
    """

    def __init__(self, config: "TTSConfig"):
        self.config = config
        # Fall back to the default English voice under the user's data dir.
        self.model_path = config.model or Path.home() / ".local/share/piper/en_US-lessac-medium.onnx"

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Synthesize *text* to <output_path>.mp3 and return the MP3 path.

        Raises:
            RuntimeError: if the Piper voice model file is missing.
            subprocess.CalledProcessError: if piper or lame exits non-zero.
        """
        if not Path(self.model_path).exists():
            raise RuntimeError(f"Piper model not found: {self.model_path}. "
                               f"Download from https://huggingface.co/rhasspy/piper-voices")

        wav_path = output_path.with_suffix(".wav")
        cmd = [
            "piper-tts",  # NOTE(review): some installs expose the CLI as "piper" — confirm
            "--model", str(self.model_path),
            "--output_file", str(wav_path)
        ]

        # Piper reads the text to speak from stdin.
        subprocess.run(cmd, input=text.encode(), check=True)

        # Convert to MP3 for smaller size. The intermediate WAV is always
        # removed, even when lame fails (the original leaked it on error).
        mp3_path = output_path.with_suffix(".mp3")
        try:
            subprocess.run([
                "lame", "-V2", str(wav_path), str(mp3_path)
            ], check=True, capture_output=True)
        finally:
            if wav_path.exists():
                wav_path.unlink()

        return mp3_path
|
||
|
|
|
||
|
|
|
||
|
|
class ElevenLabsAdapter:
    """Cloud TTS using ElevenLabs API (high quality, paid).

    Requires: ELEVENLABS_API_KEY environment variable
    Voices: https://elevenlabs.io/voice-library
    """

    # Friendly aliases resolved before building the request URL; unknown names
    # pass through unchanged so callers can supply raw voice IDs directly.
    # NOTE(review): these values look like display names rather than ElevenLabs
    # voice IDs — the API path expects a voice ID; confirm they resolve.
    VOICE_MAP = {
        "matthew": "Mathew",  # Professional narrator
        "josh": "Josh",  # Young male
        "rachel": "Rachel",  # Professional female
        "bella": "Bella",  # Warm female
        "adam": "Adam",  # Deep male
    }

    def __init__(self, config: "TTSConfig"):
        self.config = config
        resolved_key = config.api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not resolved_key:
            raise RuntimeError("ElevenLabs API key required. Set ELEVENLABS_API_KEY env var.")
        self.api_key = resolved_key

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to ElevenLabs and write the MP3 response next to *output_path*."""
        voice = self.VOICE_MAP.get(self.config.voice_id, self.config.voice_id)
        endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{voice}"

        payload = {
            "text": text[:5000],  # ElevenLabs limit
            "model_id": self.config.model or "eleven_turbo_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75
            }
        }

        request = urllib.request.Request(
            endpoint,
            data=json.dumps(payload).encode(),
            method="POST",
            headers={
                "xi-api-key": self.api_key,
                "Content-Type": "application/json",
            },
        )

        mp3_path = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(request, timeout=120) as resp:
            mp3_path.write_bytes(resp.read())
        return mp3_path
|
||
|
|
|
||
|
|
|
||
|
|
class OpenAITTSAdapter:
    """Cloud TTS using OpenAI API (good quality, usage-based pricing).

    Requires: OPENAI_API_KEY environment variable
    """

    # Supported OpenAI voices. Identity map kept for symmetry with the other
    # adapters; unknown voice names fall back to "alloy" in synthesize().
    VOICE_MAP = {
        "alloy": "alloy",
        "echo": "echo",
        "fable": "fable",
        "onyx": "onyx",
        "nova": "nova",
        "shimmer": "shimmer",
    }

    def __init__(self, config: "TTSConfig"):
        self.config = config
        resolved_key = config.api_key or os.environ.get("OPENAI_API_KEY")
        if not resolved_key:
            raise RuntimeError("OpenAI API key required. Set OPENAI_API_KEY env var.")
        self.api_key = resolved_key

    def synthesize(self, text: str, output_path: Path) -> Path:
        """POST *text* to OpenAI's speech endpoint and write the MP3 reply."""
        chosen_voice = self.VOICE_MAP.get(self.config.voice_id, "alloy")

        body = json.dumps({
            "model": self.config.model or "tts-1",
            "input": text[:4096],  # OpenAI limit
            "voice": chosen_voice,
            "response_format": "mp3"
        }).encode()

        request = urllib.request.Request(
            "https://api.openai.com/v1/audio/speech",
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
        )

        mp3_path = output_path.with_suffix(".mp3")
        with urllib.request.urlopen(request, timeout=60) as resp:
            mp3_path.write_bytes(resp.read())
        return mp3_path
|
||
|
|
|
||
|
|
|
||
|
|
# Registry mapping a TTSConfig.provider value to its adapter class.
ADAPTERS = {
    "piper": PiperAdapter,
    "elevenlabs": ElevenLabsAdapter,
    "openai": OpenAITTSAdapter,
}
|
||
|
|
|
||
|
|
|
||
|
|
def get_provider_config() -> "TTSConfig":
    """Load TTS configuration from environment.

    Reads DEEPDIVE_TTS_PROVIDER / DEEPDIVE_TTS_VOICE / DEEPDIVE_OUTPUT_DIR,
    plus the API-key variable matching the chosen cloud provider.
    """
    provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
    default_voice = "alloy" if provider == "openai" else "matthew"
    voice = os.environ.get("DEEPDIVE_TTS_VOICE", default_voice)

    # Only cloud providers carry an API key; piper runs fully local.
    key_env = {"elevenlabs": "ELEVENLABS_API_KEY", "openai": "OPENAI_API_KEY"}.get(provider)
    api_key = os.environ.get(key_env) if key_env else None

    return TTSConfig(
        provider=provider,
        voice_id=voice,
        output_dir=Path(os.environ.get("DEEPDIVE_OUTPUT_DIR", "/tmp/deepdive")),
        api_key=api_key
    )
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: read text, pick an adapter, synthesize, print a JSON summary."""
    parser = argparse.ArgumentParser(description="Deep Dive TTS Pipeline")
    parser.add_argument("--text", help="Text to synthesize (or read from stdin)")
    parser.add_argument("--input-file", "-i", help="Text file to synthesize")
    parser.add_argument("--output", "-o", help="Output file path (without extension)")
    parser.add_argument("--provider", choices=list(ADAPTERS.keys()), help="TTS provider override")
    parser.add_argument("--voice", help="Voice ID override")
    args = parser.parse_args()

    # Load config; CLI flags override environment-derived values.
    config = get_provider_config()
    if args.provider:
        config.provider = args.provider
    if args.voice:
        config.voice_id = args.voice
    if args.output:
        config.output_dir = Path(args.output).parent
        output_name = Path(args.output).stem
    else:
        from datetime import datetime
        # BUG FIX: the original nested double quotes inside a double-quoted
        # f-string, a SyntaxError on Python < 3.12 (PEP 701 only relaxed this
        # in 3.12). Single quotes inside keep it valid everywhere.
        output_name = f"briefing_{datetime.now().strftime('%Y%m%d_%H%M')}"

    config.output_dir.mkdir(parents=True, exist_ok=True)
    output_path = config.output_dir / output_name

    # Get text: --input-file beats --text beats stdin.
    if args.input_file:
        # Explicit encoding so behavior doesn't depend on the locale default.
        text = Path(args.input_file).read_text(encoding="utf-8")
    elif args.text:
        text = args.text
    else:
        text = sys.stdin.read()

    if not text.strip():
        print("Error: No text provided", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    print(f"[TTS] Using provider: {config.provider}, voice: {config.voice_id}")

    adapter_class = ADAPTERS.get(config.provider)
    if not adapter_class:
        print(f"Error: Unknown provider {config.provider}", file=sys.stderr)
        sys.exit(1)

    adapter = adapter_class(config)
    result_path = adapter.synthesize(text, output_path)

    print(f"[TTS] Audio saved: {result_path}")
    print(json.dumps({
        "provider": config.provider,
        "voice": config.voice_id,
        "output_path": str(result_path),
        "duration_estimate_min": len(text) // 150  # ~150 chars/min
    }))
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|