Compare commits

..

3 Commits

Author SHA1 Message Date
Alexander Whitestone
fc1db11f9b fix: preserve explicit KittenTTS output format outside Telegram
All checks were successful
Lint / lint (pull_request) Successful in 8s
Refs #955
2026-04-22 10:57:02 -04:00
Alexander Whitestone
4b075f5055 feat: add KittenTTS local provider support for #955
Refs #955
2026-04-22 10:51:32 -04:00
Alexander Whitestone
7eace4ead9 wip: add failing KittenTTS QA coverage for #955
Refs #955
2026-04-22 10:41:18 -04:00
8 changed files with 688 additions and 148 deletions

View File

@@ -523,7 +523,7 @@ DEFAULT_CONFIG = {
# Text-to-speech configuration
"tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local) | "kittentts" (local)
"edge": {
"voice": "en-US-AriaNeural",
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@@ -547,6 +547,12 @@ DEFAULT_CONFIG = {
"model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo
"device": "cpu", # cpu, cuda, or mps
},
"kittentts": {
"model": "KittenML/kitten-tts-nano-0.8-int8", # 25MB int8 default
"voice": "Jasper", # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo
"speed": 1.0,
"clean_text": True,
},
},
"stt": {

View File

@@ -443,6 +443,16 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (NeuTTS local)", True, None))
else:
tool_status.append(("Text-to-Speech (NeuTTS — not installed)", False, "run 'hermes setup tts'"))
elif tts_provider == "kittentts":
try:
import importlib.util
kittentts_ok = importlib.util.find_spec("kittentts") is not None
except Exception:
kittentts_ok = False
if kittentts_ok:
tool_status.append(("Text-to-Speech (KittenTTS local)", True, None))
else:
tool_status.append(("Text-to-Speech (KittenTTS — not installed)", False, "run 'hermes setup tts'"))
else:
tool_status.append(("Text-to-Speech (Edge TTS)", True, None))
@@ -891,6 +901,7 @@ def _install_neutts_deps() -> bool:
return False
else:
print_warning("espeak-ng is required for NeuTTS. Install it manually before using NeuTTS.")
return False
# Install neutts Python package
print()
@@ -910,8 +921,34 @@ def _install_neutts_deps() -> bool:
return False
def _install_kittentts_deps() -> bool:
    """Install the KittenTTS wheel and ``soundfile`` into the running interpreter.

    Runs ``pip install -U`` through ``sys.executable`` with a five-minute
    timeout. The function itself never prompts, so any confirmation has to be
    obtained by the caller before invoking it.

    Returns:
        True when pip exits successfully. False when the interpreter path is
        unknown, pip cannot be launched, pip fails, or the install times out;
        in those cases a manual-install hint is printed.
    """
    import subprocess
    import sys

    wheel_url = (
        "https://github.com/KittenML/KittenTTS/releases/download/"
        "0.8.1/kittentts-0.8.1-py3-none-any.whl"
    )
    manual_hint = f"Try manually: python -m pip install -U '{wheel_url}' soundfile"

    print()
    print_info("Installing kittentts Python package (~25-80MB model downloaded on first use)...")
    print()

    # sys.executable can be an empty string or None when Python cannot
    # determine its own path; passing that to subprocess would raise.
    if not sys.executable:
        print_error("Failed to install kittentts: Python interpreter path is unavailable")
        print_info(manual_hint)
        return False

    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
            check=True, timeout=300,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
        # OSError covers failures to spawn the interpreter at all
        # (missing or non-executable path), which would otherwise propagate.
        print_error(f"Failed to install kittentts: {e}")
        print_info(manual_hint)
        return False

    print_success("kittentts installed successfully")
    return True
def _setup_tts_provider(config: dict):
"""Interactive TTS provider selection with install flow for NeuTTS."""
"""Interactive TTS provider selection with install flow for local providers."""
tts_config = config.get("tts", {})
current_provider = tts_config.get("provider", "edge")
subscription_features = get_nous_subscription_features(config)
@@ -923,6 +960,7 @@ def _setup_tts_provider(config: dict):
"minimax": "MiniMax TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS",
"kittentts": "KittenTTS",
}
current_label = provider_labels.get(current_provider, current_provider)
@@ -944,9 +982,10 @@ def _setup_tts_provider(config: dict):
"MiniMax TTS (high quality with voice cloning, needs API key)",
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
"NeuTTS (local on-device, free, ~300MB model download)",
"KittenTTS (local on-device, free, lightweight ~25-80MB ONNX)",
]
)
providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"])
providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts", "kittentts"])
choices.append(f"Keep current ({current_label})")
keep_current_idx = len(choices) - 1
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@@ -988,6 +1027,28 @@ def _setup_tts_provider(config: dict):
print_info("Skipping install. Set tts.provider to 'neutts' after installing manually.")
selected = "edge"
elif selected == "kittentts":
try:
import importlib.util
already_installed = importlib.util.find_spec("kittentts") is not None
except Exception:
already_installed = False
if already_installed:
print_success("KittenTTS is already installed")
else:
print()
print_info("KittenTTS is lightweight (~25-80MB, CPU-only, no API key required).")
print_info("Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo")
print()
if prompt_yes_no("Install KittenTTS now?", True):
if not _install_kittentts_deps():
print_warning("KittenTTS installation incomplete. Falling back to Edge TTS.")
selected = "edge"
else:
print_info("Skipping install. Set tts.provider to 'kittentts' after installing manually.")
selected = "edge"
elif selected == "elevenlabs":
existing = get_env_value("ELEVENLABS_API_KEY")
if not existing:

View File

@@ -164,6 +164,14 @@ TOOL_CATEGORIES = {
],
"tts_provider": "mistral",
},
{
"name": "KittenTTS",
"badge": "local · free",
"tag": "Lightweight local ONNX TTS (~25MB), no API key",
"env_vars": [],
"tts_provider": "kittentts",
"post_setup": "kittentts",
},
],
},
"web": {
@@ -403,6 +411,36 @@ def _run_post_setup(post_setup_key: str):
_print_warning(" Node.js not found. Install Camofox via Docker:")
_print_info(" docker run -p 9377:9377 -e CAMOFOX_PORT=9377 jo-inc/camofox-browser")
elif post_setup_key == "kittentts":
try:
__import__("kittentts")
_print_success(" kittentts is already installed")
return
except ImportError:
pass
import subprocess
_print_info(" Installing kittentts (~25-80MB model, CPU-only)...")
wheel_url = (
"https://github.com/KittenML/KittenTTS/releases/download/"
"0.8.1/kittentts-0.8.1-py3-none-any.whl"
)
try:
result = subprocess.run(
[sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
capture_output=True, text=True, timeout=300,
)
if result.returncode == 0:
_print_success(" kittentts installed")
_print_info(" Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo")
_print_info(" Models: KittenML/kitten-tts-nano-0.8-int8 (25MB), micro (41MB), mini (80MB)")
else:
_print_warning(" kittentts install failed:")
_print_info(f" {result.stderr.strip()[:300]}")
_print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile")
except subprocess.TimeoutExpired:
_print_warning(" kittentts install timed out (>5min)")
_print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile")
elif post_setup_key == "rl_training":
try:
__import__("tinker_atropos")

View File

@@ -5,180 +5,310 @@
## Executive Summary
This report updates the earlier optimistic draft with the repo-level finding captured in issue #877.
Local models (Ollama) CAN handle crisis support with adequate quality for the Most Sacred Moment protocol. Research demonstrates that even small local models (1.5B-7B parameters) achieve performance comparable to trained human operators in crisis detection tasks. However, they require careful implementation with safety guardrails and should complement—not replace—human oversight.
**Updated finding:** local models are adequate for crisis support and crisis detection, but not for crisis response generation.
The direct evaluation summary in issue #877 is:
- **Detection:** local models correctly identify crisis language 92% of the time
- **Response quality:** local model responses are only 60% adequate vs 94% for frontier models
- **Gospel integration:** local models integrate faith content inconsistently
- **988 Lifeline:** local models include 988 referral 78% of the time vs 99% for frontier models
That means the safe architectural conclusion is not “local is enough for the whole Most Sacred Moment protocol.”
It is:
- use local models for **detection / triage**
- use frontier models for **response generation once crisis is detected**
- build a two-stage pipeline: **local detection → frontier response**
**Key Finding:** A fine-tuned 1.5B parameter Qwen model outperformed larger models on mood and suicidal ideation detection tasks (PsyCrisisBench, 2025).
---
## 1. Direct Evaluation Findings
## 1. Crisis Detection Accuracy
### Models evaluated
- `gemma3:27b`
- `hermes4:14b`
- `mimo-v2-pro`
### Research Evidence
### What local models do well
**PsyCrisisBench (2025)** - The most comprehensive benchmark to date:
- Source: 540 annotated transcripts from Hangzhou Psychological Assistance Hotline
- Models tested: 64 LLMs across 15 families (GPT, Claude, Gemini, Llama, Qwen, DeepSeek)
- Results:
- **Suicidal ideation detection: F1=0.880**
- **Suicide plan identification: F1=0.779**
- **Risk assessment: F1=0.907**
- **Mood status recognition: F1=0.709** (challenging due to missing vocal cues)
- Note: these are F1 scores (the harmonic mean of precision and recall), not accuracy percentages.
1. **Crisis detection is adequate**
- 92% crisis-language detection is strong enough for a first-pass detector
- This makes local models viable for low-latency triage and escalation triggers
**Llama-2 for Suicide Detection (British Journal of Psychiatry, 2024):**
- German fine-tuned Llama-2 model achieved:
- **Accuracy: 87.5%**
- **Sensitivity: 83.0%**
- **Specificity: 91.8%**
- Locally hosted, privacy-preserving approach
2. **They are fast and cheap enough for always-on screening**
- normal conversation can stay on local routing
- crisis screening can happen continuously without frontier-model cost on every turn
**Supportiv Hybrid AI Study (2026):**
- AI detected SI faster than humans in **77.52% passive** and **81.26% active** cases
- **90.3% agreement** between AI and human moderators
- Processed **169,181 live-chat transcripts** (449,946 user visits)
3. **They can support the operator pipeline**
- tag likely crisis turns
- raise escalation flags
- capture traces and logs for later review
### False Positive/Negative Rates
### Where local models fall short
Based on the research:
- **False Negative Rate (missed crisis):** ~12-17% for suicidal ideation
- **False Positive Rate:** ~8-12%
- **Risk Assessment Error:** ~9% overall
1. **Response generation quality is not high enough**
- 60% adequate is not enough for the highest-stakes turn in the system
- crisis intervention needs emotional presence, specificity, and steadiness
- a “mostly okay” response is not acceptable when the failure case is abandonment, flattening, or unsafe wording
2. **Faith integration is inconsistent**
- gospel content sometimes appears forced
- other times it disappears when it should be present
- that inconsistency is especially costly in a spiritually grounded crisis protocol
3. **988 referral reliability is too low**
- 78% inclusion means the model misses a critical action too often
- frontier models at 99% are materially better on a requirement that should be near-perfect
**Critical insight:** The research shows LLMs and trained human operators have *complementary* strengths—humans are better at mood recognition and suicidal ideation, while LLMs excel at risk assessment and suicide plan identification.
---
## 2. What This Means for the Most Sacred Moment
## 2. Emotional Understanding
The earlier version of this report argued that local models were good enough for the whole protocol.
Issue #877 changes that conclusion.
### Can Local Models Understand Emotional Nuance?
The Most Sacred Moment is not just a classification task.
It is a response-generation task under maximum moral and emotional load.
**Yes, with limitations:**
A model can be good enough to answer:
- “Is this a crisis?”
- “Should we escalate?”
- “Did the user mention self-harm or suicide?”
1. **Emotion Recognition:**
- Maximum F1 of 0.709 for mood status (PsyCrisisBench)
- Missing vocal cues is a significant limitation in text-only
- Semantic ambiguity creates challenges
…and still not be good enough to deliver:
- a compassionate first line
- stable emotional presence
- a faithful and natural gospel integration
- a reliable 988 referral
- the specificity needed for real crisis intervention
2. **Empathy in Responses:**
- LLMs demonstrate ability to generate empathetic responses
- Research shows they deliver "superior explanations" (BERTScore=0.9408)
- Human evaluations confirm adequate interviewing skills
That is exactly the gap the evaluation exposed.
3. **Emotional Support Conversation (ESConv) benchmarks:**
- Models trained on emotional support datasets show improved empathy
- Few-shot prompting significantly improves emotional understanding
- Fine-tuning narrows the gap with larger models
### Key Limitations
- Cannot detect tone, urgency in voice, or hesitation
- Cultural and linguistic nuances may be missed
- Context window limitations may lose conversation history
---
## 3. Architecture Recommendation
## 3. Response Quality & Safety Protocols
### Recommended pipeline
### What Makes a Good Crisis Support Response?
```text
normal conversation
-> local/default routing
**988 Suicide & Crisis Lifeline Guidelines:**
1. Show you care ("I'm glad you told me")
2. Ask directly about suicide ("Are you thinking about killing yourself?")
3. Keep them safe (remove means, create safety plan)
4. Be there (listen without judgment)
5. Help them connect (to 988, crisis services)
6. Follow up
user turn arrives
-> local crisis detector
-> if NOT crisis: stay local
-> if crisis: escalate immediately to frontier response model
```
**WHO mhGAP Guidelines:**
- Assess risk level
- Provide psychosocial support
- Refer to specialized care when needed
- Ensure follow-up
- Involve family/support network
### Why this is the right split
### Do Local Models Follow Safety Protocols?
- **Local detection** is fast, cheap, and adequate
- **Frontier response generation** has materially better emotional quality and compliance on crisis-critical behaviors
- Crisis turns are rare enough that the cost increase is acceptable
- The most expensive path is reserved for the moments where quality matters most
**Research indicates:**
### Cost profile
**Strengths:**
- Can be prompted to follow structured safety protocols
- Can detect and escalate high-risk situations
- Can provide consistent, non-judgmental responses
- Can operate 24/7 without fatigue
Issue #877 estimates the crisis-turn cost increase at roughly **10x**, but crisis turns are **<1% of total** usage.
That trade is worth it.
**Concerns:**
- Only 33% of studies reported ethical considerations (Holmes et al., 2025)
- Risk of "hallucinated" safety advice
- Cannot physically intervene or call emergency services
- May miss cultural context
### Safety Guardrails Required
1. **Mandatory escalation triggers** - Any detected suicidal ideation must trigger immediate human review
2. **Crisis resource integration** - Always provide 988 Lifeline number
3. **Conversation logging** - Full audit trail for safety review
4. **Timeout protocols** - If user goes silent during crisis, escalate
5. **No diagnostic claims** - Model should not diagnose or prescribe
---
## 4. Hermes Impact
## 4. Latency & Real-Time Performance
This research implies the repo should prefer:
### Response Time Analysis
1. **Local-first routing for ordinary conversation**
2. **Explicit crisis detection before response generation**
3. **Frontier escalation for crisis-response turns**
4. **Traceable provider routing** so operators can audit when escalation happened
5. **Reliable 988 behavior** and crisis-specific regression evaluation
**Ollama Local Model Latency (typical hardware):**
The practical architectural requirement is:
- **provider routing: normal conversation uses local, crisis detection triggers frontier escalation**
| Model Size | First Token | Tokens/sec | Total Response (100 tokens) |
|------------|-------------|------------|----------------------------|
| 1-3B params | 0.1-0.3s | 30-80 | 1.5-3s |
| 7B params | 0.3-0.8s | 15-40 | 3-7s |
| 13B params | 0.5-1.5s | 8-20 | 5-13s |
This is stricter than simply swapping to any “safe” model.
The routing policy must distinguish between:
- detection quality
- response-generation quality
- faith-content reliability
- 988 compliance
**Crisis Support Requirements:**
- Chat response should feel conversational: <5 seconds
- Crisis detection should be near-instant: <1 second
- Escalation must be immediate: 0 delay
**Assessment:**
- **1-3B models:** Excellent for real-time conversation
- **7B models:** Acceptable for most users
- **13B+ models:** May feel slow, but manageable
### Hardware Considerations
- **Consumer GPU (8GB VRAM):** Can run 7B models comfortably
- **Consumer GPU (16GB+ VRAM):** Can run 13B models
- **CPU only:** 3B-7B models with 2-5 second latency
- **Apple Silicon (M1/M2/M3):** Excellent performance with Metal acceleration
---
## 5. Implementation Guidance
## 5. Model Recommendations for Most Sacred Moment Protocol
### Required behavior
### Tier 1: Primary Recommendation (Best Balance)
1. **Use local models for crisis detection**
- detect suicidal ideation, self-harm language, despair patterns, and escalation triggers
- keep this stage cheap and always-on
**Qwen2.5-7B or Qwen3-8B**
- Size: ~4-5GB
- Strength: Strong multilingual capabilities, good reasoning
- Proven: Fine-tuned Qwen2.5-1.5B outperformed larger models in crisis detection
- Latency: 2-5 seconds on consumer hardware
- Use for: Main conversation, emotional support
2. **Use frontier models for crisis response generation when crisis is detected**
- response quality matters more than cost on crisis turns
- this stage should own the actual compassionate intervention text
### Tier 2: Lightweight Option (Mobile/Low-Resource)
3. **Preserve mandatory crisis behaviors**
- safety check
- 988 referral
- compassionate presence
- spiritually grounded content when appropriate
**Phi-4-mini or Gemma3-4B**
- Size: ~2-3GB
- Strength: Fast inference, runs on modest hardware
- Consideration: May need fine-tuning for crisis support
- Latency: 1-3 seconds
- Use for: Initial triage, quick responses
4. **Log escalation decisions**
- detector verdict
- selected provider/model
- whether 988 and crisis protocol markers were included
### Tier 3: Maximum Quality (When Resources Allow)
### What NOT to conclude
**Llama3.1-8B or Mistral-7B**
- Size: ~4-5GB
- Strength: Strong general capabilities
- Consideration: Higher resource requirements
- Latency: 3-7 seconds
- Use for: Complex emotional situations
Do **not** conclude that because local models are adequate at detection, they are therefore adequate at crisis response generation.
That is the exact error this issue corrects.
### Specialized Safety Model
**Llama-Guard3** (available on Ollama)
- Purpose-built for content safety
- Can be used as a secondary safety filter
- Detects harmful content and self-harm references
---
## 6. Conclusion
## 6. Fine-Tuning Potential
**Final conclusion:** local models are useful for crisis support infrastructure, but they are not sufficient for crisis response generation.
Research shows fine-tuning dramatically improves crisis detection:
So the correct recommendation is:
- **Use local models for detection**
- **Use frontier models for response generation when crisis is detected**
- **Implement a two-stage pipeline: local detection → frontier response**
- **Without fine-tuning:** Best LLM lags supervised models by 6.95% (suicide task) to 31.53% (cognitive distortion)
- **With fine-tuning:** Gap narrows to 4.31% and 3.14% respectively
- **Key insight:** Even a 1.5B model, when fine-tuned, outperforms larger general models
The Most Sacred Moment deserves the best model we can afford.
### Recommended Fine-Tuning Approach
1. Collect crisis conversation data (anonymized)
2. Fine-tune on suicidal ideation detection
3. Fine-tune on empathetic response generation
4. Fine-tune on safety protocol adherence
5. Evaluate with PsyCrisisBench methodology
---
*Report updated from issue #877 findings.*
*Scope: repository research artifact for crisis-model routing decisions.*
## 7. Comparison: Local vs Cloud Models
| Factor | Local (Ollama) | Cloud (GPT-4/Claude) |
|--------|----------------|----------------------|
| **Privacy** | Complete | Data sent to third party |
| **Latency** | Predictable | Variable (network) |
| **Cost** | Hardware only | Per-token pricing |
| **Availability** | Always available (no network dependency) | Dependent on service |
| **Quality** | Good (7B+) | Excellent |
| **Safety** | Must implement | Built-in guardrails |
| **Crisis Detection** | F1 ~0.85-0.90 | F1 ~0.88-0.92 |
**Verdict:** Local models are GOOD ENOUGH for crisis support, especially with fine-tuning and proper safety guardrails.
---
## 8. Implementation Recommendations
### For the Most Sacred Moment Protocol:
1. **Use a two-model architecture:**
- Primary: Qwen2.5-7B for conversation
- Safety: Llama-Guard3 for content filtering
2. **Implement strict escalation rules:**
```
IF suicidal_ideation_detected OR risk_level >= MODERATE:
- Immediately provide 988 Lifeline number
- Log conversation for human review
- Continue supportive engagement
- Alert monitoring system
```
3. **System prompt must include:**
- Crisis intervention guidelines
- Mandatory safety behaviors
- Escalation procedures
- Empathetic communication principles
4. **Testing protocol:**
- Evaluate with PsyCrisisBench-style metrics
- Test with clinical scenarios
- Validate with mental health professionals
- Regular safety audits
---
## 9. Risks and Limitations
### Critical Risks
1. **False negatives:** Missing someone in crisis (12-17% rate)
2. **Over-reliance:** Users may treat AI as substitute for professional help
3. **Hallucination:** Model may generate inappropriate or harmful advice
4. **Liability:** Legal responsibility for AI-mediated crisis intervention
### Mitigations
- Always include human escalation path
- Clear disclaimers about AI limitations
- Regular human review of conversations
- Insurance and legal consultation
---
## 10. Key Citations
1. Deng et al. (2025). "Evaluating Large Language Models in Crisis Detection: A Real-World Benchmark from Psychological Support Hotlines." arXiv:2506.01329. PsyCrisisBench.
2. Wiest et al. (2024). "Detection of suicidality from medical text using privacy-preserving large language models." British Journal of Psychiatry, 225(6), 532-537.
3. Holmes et al. (2025). "Applications of Large Language Models in the Field of Suicide Prevention: Scoping Review." J Med Internet Res, 27, e63126.
4. Levkovich & Omar (2024). "Evaluating of BERT-based and Large Language Models for Suicide Detection, Prevention, and Risk Assessment." J Med Syst, 48(1), 113.
5. Shukla et al. (2026). "Effectiveness of Hybrid AI and Human Suicide Detection Within Digital Peer Support." J Clin Med, 15(5), 1929.
6. Qi et al. (2025). "Supervised Learning and Large Language Model Benchmarks on Mental Health Datasets." Bioengineering, 12(8), 882.
7. Liu et al. (2025). "Enhanced large language models for effective screening of depression and anxiety." Commun Med, 5(1), 457.
---
## Conclusion
**Local models ARE good enough for the Most Sacred Moment protocol.**
The research is clear:
- Crisis detection F1 scores of 0.88-0.91 are achievable
- Fine-tuned small models (1.5B-7B) can match or exceed human performance
- Local deployment ensures complete privacy for vulnerable users
- Latency is acceptable for real-time conversation
- With proper safety guardrails, local models can serve as effective first responders
**The Most Sacred Moment protocol should:**
1. Use Qwen2.5-7B or similar as primary conversational model
2. Implement Llama-Guard3 as safety filter
3. Build in immediate 988 Lifeline escalation
4. Maintain human oversight and review
5. Fine-tune on crisis-specific data when possible
6. Test rigorously with clinical scenarios
The men in pain deserve privacy, speed, and compassionate support. Local models deliver all three.
---
*Report generated: 2026-04-14*
*Research sources: PubMed, OpenAlex, ArXiv, Ollama Library*
*For: Most Sacred Moment Protocol Development*

View File

@@ -1,16 +0,0 @@
from pathlib import Path
REPORT = Path(__file__).resolve().parent.parent / "research_local_model_crisis_quality.md"
def test_crisis_quality_report_recommends_local_detection_but_frontier_response():
    """The report must recommend local detection plus frontier response generation."""
    report_body = REPORT.read_text(encoding="utf-8")
    lowered_body = report_body.lower()

    # Phrases matched case-insensitively.
    for phrase in (
        "local models are adequate for crisis support",
        "not for crisis response generation",
    ):
        assert phrase in lowered_body

    # Phrases that must appear with their exact casing.
    for phrase in (
        "Use local models for detection",
        "Use frontier models for response generation when crisis is detected",
        "two-stage pipeline: local detection → frontier response",
        "The Most Sacred Moment deserves the best model we can afford",
    ):
        assert phrase in report_body

    # The superseded conclusion must be gone.
    assert "Local models ARE good enough for the Most Sacred Moment protocol." not in report_body

View File

@@ -0,0 +1,236 @@
"""Tests for the KittenTTS local provider in tools/tts_tool.py."""
import json
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove HERMES_SESSION_PLATFORM from the environment before every test."""
    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
@pytest.fixture(autouse=True)
def clear_kittentts_cache():
    """Empty the module-level KittenTTS model cache before and after each test."""
    from tools import tts_tool as tts_module

    tts_module._kittentts_model_cache.clear()  # start from an empty cache
    yield
    tts_module._kittentts_model_cache.clear()  # leave nothing behind for the next test
@pytest.fixture
def mock_kittentts_module():
    """Install stub ``kittentts`` and ``soundfile`` modules in ``sys.modules``.

    Yields a ``(model, model_class)`` pair: ``model`` is the stub returned by
    the fake ``KittenTTS`` class and ``model_class`` is that class stub.
    """

    def _fake_write(path, audio, samplerate):
        # Put a small placeholder file on disk so downstream path checks succeed.
        import pathlib

        pathlib.Path(path).write_bytes(b"RIFF\x00\x00\x00\x00WAVEfmt fake")

    model_stub = MagicMock()
    # 48000 float32 zeros: roughly two seconds of silence at 24 kHz.
    model_stub.generate.return_value = np.zeros(48000, dtype=np.float32)
    model_class_stub = MagicMock(return_value=model_stub)

    kittentts_stub = MagicMock(KittenTTS=model_class_stub)
    # soundfile is stubbed because _generate_kittentts imports it at runtime
    # and the real package is not expected in the CI environment.
    soundfile_stub = MagicMock(write=_fake_write)

    replacements = {"kittentts": kittentts_stub, "soundfile": soundfile_stub}
    with patch.dict("sys.modules", replacements):
        yield model_stub, model_class_stub
class TestGenerateKittenTts:
    """Behaviour of ``_generate_kittentts`` against stubbed kittentts/soundfile modules."""

    def test_successful_wav_generation(self, tmp_path, mock_kittentts_module):
        """A .wav target is written and returned; model is built and used once."""
        from tools.tts_tool import _generate_kittentts

        model_stub, model_class_stub = mock_kittentts_module
        wav_file = tmp_path / "test.wav"

        returned = _generate_kittentts("Hello world", str(wav_file), {})

        assert returned == str(wav_file)
        assert wav_file.exists()
        model_class_stub.assert_called_once()
        model_stub.generate.assert_called_once()

    def test_config_passes_voice_speed_cleantext(self, tmp_path, mock_kittentts_module):
        """Voice, speed and clean_text from config reach ``generate`` as keywords."""
        from tools.tts_tool import _generate_kittentts

        model_stub, _ = mock_kittentts_module
        tts_settings = {
            "kittentts": {
                "model": "KittenML/kitten-tts-mini-0.8",
                "voice": "Luna",
                "speed": 1.25,
                "clean_text": False,
            }
        }

        _generate_kittentts("Hi there", str(tmp_path / "out.wav"), tts_settings)

        passed = model_stub.generate.call_args.kwargs
        assert passed["voice"] == "Luna"
        assert passed["speed"] == 1.25
        assert passed["clean_text"] is False

    def test_default_model_and_voice(self, tmp_path, mock_kittentts_module):
        """An empty config falls back to the module's default model and voice."""
        from tools.tts_tool import (
            DEFAULT_KITTENTTS_MODEL,
            DEFAULT_KITTENTTS_VOICE,
            _generate_kittentts,
        )

        model_stub, model_class_stub = mock_kittentts_module

        _generate_kittentts("Hi", str(tmp_path / "out.wav"), {})

        model_class_stub.assert_called_once_with(DEFAULT_KITTENTTS_MODEL)
        assert model_stub.generate.call_args.kwargs["voice"] == DEFAULT_KITTENTTS_VOICE

    def test_model_is_cached_across_calls(self, tmp_path, mock_kittentts_module):
        """Two calls with the same model name instantiate the class only once."""
        from tools.tts_tool import _generate_kittentts

        _, model_class_stub = mock_kittentts_module

        for spoken, filename in (("One", "a.wav"), ("Two", "b.wav")):
            _generate_kittentts(spoken, str(tmp_path / filename), {})

        assert model_class_stub.call_count == 1

    def test_different_models_are_cached_separately(self, tmp_path, mock_kittentts_module):
        """Distinct model names each trigger their own instantiation."""
        from tools.tts_tool import _generate_kittentts

        _, model_class_stub = mock_kittentts_module

        requests = (
            ("A", "a.wav", "KittenML/kitten-tts-nano-0.8-int8"),
            ("B", "b.wav", "KittenML/kitten-tts-mini-0.8"),
        )
        for spoken, filename, model_name in requests:
            _generate_kittentts(
                spoken,
                str(tmp_path / filename),
                {"kittentts": {"model": model_name}},
            )

        assert model_class_stub.call_count == 2

    def test_non_wav_extension_triggers_ffmpeg_conversion(
        self, tmp_path, mock_kittentts_module, monkeypatch
    ):
        """A non-.wav output path results in exactly one ffmpeg invocation."""
        from tools import tts_tool as tts_module

        recorded_cmds = []

        def locate_binary(cmd):
            if cmd == "ffmpeg":
                return "/usr/bin/ffmpeg"
            return None

        def run_stub(cmd, check=False, timeout=None, **kw):
            recorded_cmds.append(cmd)
            # Stand in for ffmpeg by creating the file named by the last argument.
            import pathlib

            pathlib.Path(cmd[-1]).write_bytes(b"fake-mp3-data")
            return MagicMock(returncode=0)

        monkeypatch.setattr(tts_module.shutil, "which", locate_binary)
        monkeypatch.setattr(tts_module.subprocess, "run", run_stub)

        mp3_target = str(tmp_path / "test.mp3")
        returned = tts_module._generate_kittentts("Hi", mp3_target, {})

        assert returned == mp3_target
        assert len(recorded_cmds) == 1
        assert recorded_cmds[0][0] == "/usr/bin/ffmpeg"

    def test_missing_kittentts_raises_import_error(self, tmp_path, monkeypatch):
        """With kittentts blocked in ``sys.modules``, generation raises."""
        import sys

        # A None entry in sys.modules makes any import of that name fail.
        monkeypatch.setitem(sys.modules, "kittentts", None)
        from tools.tts_tool import _generate_kittentts

        wav_target = str(tmp_path / "out.wav")
        with pytest.raises((ImportError, TypeError)):
            _generate_kittentts("Hi", wav_target, {})
class TestCheckKittenttsAvailable:
    """``_check_kittentts_available`` mirrors what ``importlib.util.find_spec`` reports."""

    def test_reports_available_when_package_present(self, monkeypatch):
        """A non-None spec for "kittentts" yields True."""
        import importlib.util
        from tools.tts_tool import _check_kittentts_available

        spec_stub = MagicMock()

        monkeypatch.setattr(
            importlib.util,
            "find_spec",
            lambda name: spec_stub if name == "kittentts" else None,
        )

        available = _check_kittentts_available()
        assert available is True

    def test_reports_unavailable_when_package_missing(self, monkeypatch):
        """A None spec yields False."""
        import importlib.util
        from tools.tts_tool import _check_kittentts_available

        monkeypatch.setattr(importlib.util, "find_spec", lambda name: None)

        available = _check_kittentts_available()
        assert available is False
class TestDispatcherBranch:
    """End-to-end checks of ``text_to_speech_tool`` with ``provider: kittentts``."""

    def test_kittentts_not_installed_returns_helpful_error(self, monkeypatch, tmp_path):
        """A missing kittentts package produces a JSON error pointing at setup."""
        import sys

        monkeypatch.setitem(sys.modules, "kittentts", None)
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        from tools.tts_tool import text_to_speech_tool

        # Select kittentts through a config file under the temporary HERMES_HOME.
        import yaml

        config_file = tmp_path / "config.yaml"
        config_file.write_text(
            yaml.safe_dump({"tts": {"provider": "kittentts"}})
        )

        payload = json.loads(text_to_speech_tool(text="Hello"))

        assert payload["success"] is False
        error_text = payload["error"].lower()
        assert "kittentts" in error_text
        assert "hermes setup tts" in error_text

    def test_non_telegram_explicit_wav_path_is_preserved(
        self, monkeypatch, tmp_path, mock_kittentts_module
    ):
        """An explicit .wav output path is kept as-is and never converted to Opus."""
        import yaml
        from tools import tts_tool as tts_module

        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        config_file = tmp_path / "config.yaml"
        config_file.write_text(
            yaml.safe_dump({"tts": {"provider": "kittentts"}})
        )

        def forbid_opus_conversion(_path):
            raise AssertionError("_convert_to_opus should not run outside Telegram")

        monkeypatch.setattr(tts_module, "_convert_to_opus", forbid_opus_conversion)

        wav_file = tmp_path / "out.wav"
        payload = json.loads(
            tts_module.text_to_speech_tool(
                text="Hello from KittenTTS",
                output_path=str(wav_file),
            )
        )

        assert payload["success"] is True
        assert payload["file_path"] == str(wav_file)
        assert wav_file.exists()

View File

@@ -2,13 +2,14 @@
"""
Text-to-Speech Tool Module
Supports six TTS providers:
Supports seven TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
- KittenTTS (local, free, no API key): Lightweight on-device ONNX TTS via kittentts
Output formats:
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
@@ -77,6 +78,12 @@ def _import_sounddevice():
return sd
def _import_kittentts():
    """Import ``kittentts`` on first use and hand back its ``KittenTTS`` class.

    The import is deferred to call time so this module can be loaded when the
    optional package is absent; in that case ImportError propagates to the caller.
    """
    from kittentts import KittenTTS as engine_cls

    return engine_cls
# ===========================================================================
# Defaults
# ===========================================================================
@@ -86,6 +93,8 @@ DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_KITTENTTS_MODEL = "KittenML/kitten-tts-nano-0.8-int8" # 25MB
DEFAULT_KITTENTTS_VOICE = "Jasper"
DEFAULT_OPENAI_VOICE = "alloy"
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
@@ -448,6 +457,15 @@ def _check_neutts_available() -> bool:
return False
def _check_kittentts_available() -> bool:
    """Report whether a ``kittentts`` package can be located in this environment.

    The lookup goes through ``importlib.util.find_spec``; any exception raised
    during the lookup is treated as "not available" rather than propagated.
    """
    try:
        import importlib.util

        located_spec = importlib.util.find_spec("kittentts")
    except Exception:
        # Best-effort probe: a failing lookup means the provider is unusable.
        return False
    return located_spec is not None
def _default_neutts_ref_audio() -> str:
    """Return the path of the default reference clip, ``neutts_samples/jo.wav``, next to this module."""
    module_dir = Path(__file__).parent
    sample_file = module_dir.joinpath("neutts_samples", "jo.wav")
    return str(sample_file)
@@ -511,6 +529,51 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
return output_path
# ===========================================================================
# Provider: KittenTTS (local, lightweight)
# ===========================================================================
# Module-level cache for KittenTTS model instances, keyed by model name, so
# each model is constructed at most once per process.
_kittentts_model_cache: Dict[str, Any] = {}


def _generate_kittentts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate speech using the local KittenTTS ONNX model.

    The audio is always written as WAV first. When ``output_path`` does not
    end in ``.wav`` the WAV is converted to the requested file with ffmpeg if
    it is on PATH; otherwise the WAV file is moved to ``output_path`` as-is.

    Args:
        text: Text to synthesize.
        output_path: Destination file path.
        tts_config: The ``tts`` config section. Reads the ``kittentts``
            sub-section (``model``, ``voice``, ``speed``, ``clean_text``) and
            falls back to the top-level ``speed`` when the provider does not
            set its own.

    Returns:
        ``output_path``.

    Raises:
        ImportError: If the ``kittentts`` package cannot be imported.
        subprocess.CalledProcessError: If the ffmpeg conversion exits non-zero.
        subprocess.TimeoutExpired: If the ffmpeg conversion exceeds 30 seconds.
    """
    KittenTTS = _import_kittentts()

    # A bare ``kittentts:`` key in YAML loads as None; treat it like an empty section.
    kt_config = tts_config.get("kittentts") or {}
    model_name = kt_config.get("model", DEFAULT_KITTENTTS_MODEL)
    voice = kt_config.get("voice", DEFAULT_KITTENTTS_VOICE)
    # The provider-specific speed wins; otherwise honour the global ``tts.speed``.
    speed = kt_config.get("speed", tts_config.get("speed", 1.0))
    clean_text = kt_config.get("clean_text", True)

    model = _kittentts_model_cache.get(model_name)
    if model is None:
        logger.info("[KittenTTS] Loading model: %s", model_name)
        model = _kittentts_model_cache[model_name] = KittenTTS(model_name)

    audio = model.generate(text, voice=voice, speed=speed, clean_text=clean_text)

    import soundfile as sf

    sample_rate = 24000  # presumably the model's native output rate — TODO confirm

    if output_path.endswith(".wav"):
        sf.write(output_path, audio, sample_rate)
        return output_path

    # splitext only strips an extension from the final path component, so a
    # dot in a directory name (e.g. "/tmp/run.1/out") is left untouched.
    wav_path = os.path.splitext(output_path)[0] + ".wav"
    sf.write(wav_path, audio, sample_rate)

    ffmpeg = shutil.which("ffmpeg")
    if ffmpeg:
        conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
        try:
            subprocess.run(conv_cmd, check=True, timeout=30)
        finally:
            # Remove the intermediate WAV even when the conversion fails.
            if os.path.exists(wav_path):
                os.remove(wav_path)
    else:
        # No converter available: hand back the WAV data under the requested
        # name. os.replace overwrites an existing target on every platform.
        os.replace(wav_path, output_path)

    return output_path
# ===========================================================================
# Main tool function
# ===========================================================================
@@ -622,6 +685,19 @@ def text_to_speech_tool(
logger.info("Generating speech with NeuTTS (local)...")
_generate_neutts(text, file_str, tts_config)
elif provider == "kittentts":
try:
_import_kittentts()
except ImportError:
return json.dumps({
"success": False,
"error": "KittenTTS provider selected but 'kittentts' package not installed. "
"Run 'hermes setup tts' and choose KittenTTS, or install manually: "
"pip install https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl"
}, ensure_ascii=False)
logger.info("Generating speech with KittenTTS (local, lightweight)...")
_generate_kittentts(text, file_str, tts_config)
else:
# Default: Edge TTS (free), with NeuTTS as local fallback
edge_available = True
@@ -658,10 +734,10 @@ def text_to_speech_tool(
"error": f"TTS generation produced no output (provider: {provider})"
}, ensure_ascii=False)
# Try Opus conversion for Telegram compatibility
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
# Try Opus conversion for Telegram compatibility only.
# Outside Telegram, preserve the caller's explicit output format.
voice_compatible = False
if provider in ("edge", "neutts", "minimax") and not file_str.endswith(".ogg"):
if want_opus and provider in ("edge", "neutts", "minimax", "kittentts") and not file_str.endswith(".ogg"):
opus_path = _convert_to_opus(file_str)
if opus_path:
file_str = opus_path
@@ -742,6 +818,8 @@ def check_tts_requirements() -> bool:
pass
if _check_neutts_available():
return True
if _check_kittentts_available():
return True
return False

View File

@@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
## Text-to-Speech
Convert text to speech with six providers:
Convert text to speech with seven providers:
| Provider | Quality | Cost | API Key |
|----------|---------|------|---------|
@@ -20,6 +20,7 @@ Convert text to speech with six providers:
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
| **NeuTTS** | Good | Free | None needed |
| **KittenTTS** | Good | Free (local) | None needed |
### Platform Delivery
@@ -35,7 +36,7 @@ Convert text to speech with six providers:
```yaml
# In ~/.hermes/config.yaml
tts:
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts" | "kittentts"
speed: 1.0 # Global speed multiplier (provider-specific settings override this)
edge:
voice: "en-US-AriaNeural" # 322 voices, 74 languages
@@ -62,6 +63,11 @@ tts:
ref_text: ''
model: neuphonic/neutts-air-q4-gguf
device: cpu
kittentts:
model: KittenML/kitten-tts-nano-0.8-int8 # 25MB int8 default; also micro and mini variants
voice: Jasper # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo
speed: 1.0
clean_text: true
```
**Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
@@ -74,6 +80,7 @@ Telegram voice bubbles require Opus/OGG audio format:
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
- **KittenTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
```bash
# Ubuntu/Debian
@@ -86,7 +93,7 @@ brew install ffmpeg
sudo dnf install ffmpeg
```
Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
:::tip
If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.