Compare commits

..

3 Commits

Author SHA1 Message Date
Alexander Whitestone
fc1db11f9b fix: preserve explicit KittenTTS output format outside Telegram
All checks were successful
Lint / lint (pull_request) Successful in 8s
Refs #955
2026-04-22 10:57:02 -04:00
Alexander Whitestone
4b075f5055 feat: add KittenTTS local provider support for #955
Refs #955
2026-04-22 10:51:32 -04:00
Alexander Whitestone
7eace4ead9 wip: add failing KittenTTS QA coverage for #955
Refs #955
2026-04-22 10:41:18 -04:00
8 changed files with 513 additions and 522 deletions

View File

@@ -1,70 +1,43 @@
from __future__ import annotations
"""
A2A agent card generation for fleet discovery.
Agent Card — A2A-compliant agent discovery.
Part of #843: fix: implement A2A agent card for fleet discovery (#819)
Refs #801.
Closes #802.
Provides metadata about the agent's identity, capabilities, and installed skills
for discovery by other agents in the fleet.
"""
import argparse
import json
import logging
import os
import socket
import sys
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Iterable, List, Mapping, Sequence
from urllib.parse import urlparse, urlunparse
from pathlib import Path
from typing import Any, Dict, List, Optional
from hermes_cli import __version__
from hermes_cli.config import load_config
from hermes_cli.config import load_config, get_hermes_home
from agent.skill_utils import (
get_all_skills_dirs,
get_disabled_skill_names,
iter_skill_index_files,
parse_frontmatter,
skill_matches_platform,
get_all_skills_dirs,
get_disabled_skill_names,
skill_matches_platform
)
logger = logging.getLogger(__name__)
DEFAULT_DESCRIPTION = "Sovereign AI agent — orchestration, code, research"
DEFAULT_INPUT_MODES = ["text/plain", "application/json"]
DEFAULT_OUTPUT_MODES = ["text/plain", "application/json"]
_REQUIRED_CAPABILITY_FLAGS = (
"streaming",
"pushNotifications",
"stateTransitionHistory",
)
@dataclass
class AgentSkill:
id: str
name: str
description: str = ""
tags: List[str] = field(default_factory=list)
def to_dict(self) -> Dict[str, Any]:
data: Dict[str, Any] = {"id": self.id, "name": self.name}
if self.description:
data["description"] = self.description
if self.tags:
data["tags"] = self.tags
return data
version: str = "1.0.0"
@dataclass
class AgentCapabilities:
streaming: bool = True
pushNotifications: bool = False
stateTransitionHistory: bool = True
def to_dict(self) -> Dict[str, Any]:
return asdict(self)
tools: bool = True
vision: bool = False
reasoning: bool = False
@dataclass
class AgentCard:
@@ -74,81 +47,14 @@ class AgentCard:
version: str = __version__
capabilities: AgentCapabilities = field(default_factory=AgentCapabilities)
skills: List[AgentSkill] = field(default_factory=list)
defaultInputModes: List[str] = field(default_factory=lambda: list(DEFAULT_INPUT_MODES))
defaultOutputModes: List[str] = field(default_factory=lambda: list(DEFAULT_OUTPUT_MODES))
metadata: Dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
data: Dict[str, Any] = {
"name": self.name,
"description": self.description,
"url": self.url,
"version": self.version,
"capabilities": self.capabilities.to_dict(),
"skills": [skill.to_dict() for skill in self.skills],
"defaultInputModes": list(self.defaultInputModes),
"defaultOutputModes": list(self.defaultOutputModes),
}
if self.metadata:
data["metadata"] = dict(self.metadata)
return data
def to_json(self, indent: int = 2) -> str:
return json.dumps(self.to_dict(), indent=indent)
def _env_or_empty(key: str) -> str:
return os.environ.get(key, "").strip()
def _as_agent_config(config: Mapping[str, Any] | None) -> Dict[str, Any]:
if not isinstance(config, Mapping):
return {}
agent_cfg = config.get("agent")
return dict(agent_cfg) if isinstance(agent_cfg, Mapping) else {}
def _as_a2a_config(config: Mapping[str, Any] | None) -> Dict[str, Any]:
if not isinstance(config, Mapping):
return {}
a2a_cfg = config.get("a2a")
return dict(a2a_cfg) if isinstance(a2a_cfg, Mapping) else {}
def _normalize_string_list(value: Any) -> List[str]:
if value is None:
return []
if isinstance(value, str):
parts = value.split(",")
elif isinstance(value, Sequence) and not isinstance(value, (bytes, bytearray, str)):
parts = list(value)
else:
parts = [value]
out: List[str] = []
seen = set()
for item in parts:
text = str(item).strip()
if not text or text in seen:
continue
seen.add(text)
out.append(text)
return out
def _normalize_skill_tags(frontmatter: Mapping[str, Any]) -> List[str]:
tags = _normalize_string_list(frontmatter.get("tags"))
category = str(frontmatter.get("category") or "").strip()
if category and category not in tags:
tags.append(category)
return tags
defaultInputModes: List[str] = field(default_factory=lambda: ["text/plain"])
defaultOutputModes: List[str] = field(default_factory=lambda: ["text/plain"])
def _load_skills() -> List[AgentSkill]:
"""Scan enabled skills and return A2A skill metadata."""
skills: List[AgentSkill] = []
"""Scan all enabled skills and return metadata."""
skills = []
disabled = get_disabled_skill_names()
seen_ids = set()
for skills_dir in get_all_skills_dirs():
if not skills_dir.is_dir():
continue
@@ -159,262 +65,71 @@ def _load_skills() -> List[AgentSkill]:
except Exception:
continue
skill_name = frontmatter.get("name") or skill_file.parent.name
if str(skill_name) in disabled:
continue
if not skill_matches_platform(frontmatter):
continue
skill_id = str(frontmatter.get("name") or skill_file.parent.name).strip().lower().replace(" ", "-")
if skill_id in disabled or skill_id in seen_ids:
continue
seen_ids.add(skill_id)
skills.append(AgentSkill(
id=str(skill_name),
name=str(frontmatter.get("name", skill_name)),
description=str(frontmatter.get("description", "")),
version=str(frontmatter.get("version", "1.0.0"))
))
return skills
display_name = str(frontmatter.get("title") or frontmatter.get("name") or skill_file.parent.name).strip()
description = str(frontmatter.get("description") or "").strip()
tags = _normalize_skill_tags(frontmatter)
skills.append(
AgentSkill(
id=skill_id,
name=display_name,
description=description,
tags=tags,
)
)
def build_agent_card() -> AgentCard:
"""Build the agent card from current configuration and environment."""
config = load_config()
# Identity
name = os.environ.get("HERMES_AGENT_NAME") or config.get("agent", {}).get("name") or "hermes"
description = os.environ.get("HERMES_AGENT_DESCRIPTION") or config.get("agent", {}).get("description") or "Sovereign AI agent"
# URL - try to determine from environment or config
port = os.environ.get("HERMES_WEB_PORT") or "9119"
host = os.environ.get("HERMES_WEB_HOST") or "localhost"
url = f"http://{host}:{port}"
# Capabilities
# In a real scenario, we'd check model metadata for vision/reasoning
capabilities = AgentCapabilities(
streaming=True,
tools=True,
vision=False, # Default to false unless we can confirm
reasoning=False
)
# Skills
skills = _load_skills()
return AgentCard(
name=name,
description=description,
url=url,
version=__version__,
capabilities=capabilities,
skills=skills
)
return sorted(skills, key=lambda skill: skill.id)
def _get_agent_name(config: Mapping[str, Any] | None, override: str | None = None) -> str:
if override:
return override
env_name = _env_or_empty("HERMES_AGENT_NAME") or _env_or_empty("AGENT_NAME")
if env_name:
return env_name
agent_cfg = _as_agent_config(config)
if agent_cfg.get("name"):
return str(agent_cfg["name"]).strip()
def get_agent_card_json() -> str:
"""Return the agent card as a JSON string."""
try:
hostname = socket.gethostname().split(".", 1)[0].strip()
if hostname:
return hostname
except Exception:
pass
return "hermes"
def _get_description(config: Mapping[str, Any] | None, override: str | None = None) -> str:
if override:
return override
env_description = _env_or_empty("HERMES_AGENT_DESCRIPTION") or _env_or_empty("AGENT_DESCRIPTION")
if env_description:
return env_description
agent_cfg = _as_agent_config(config)
if agent_cfg.get("description"):
return str(agent_cfg["description"]).strip()
return DEFAULT_DESCRIPTION
def _normalize_a2a_url(url: str) -> str:
raw = (url or "").strip()
if not raw:
return ""
parsed = urlparse(raw if "://" in raw else f"https://{raw}")
scheme = parsed.scheme or "https"
netloc = parsed.netloc or parsed.path
path = parsed.path if parsed.netloc else ""
normalized_path = path.rstrip("/") if path not in ("", "/") else ""
if not normalized_path.endswith("/a2a"):
normalized_path = f"{normalized_path}/a2a" if normalized_path else "/a2a"
return urlunparse((scheme, netloc, normalized_path, "", "", ""))
def _get_agent_url(config: Mapping[str, Any] | None, override: str | None = None) -> str:
if override:
return _normalize_a2a_url(override)
agent_cfg = _as_agent_config(config)
a2a_cfg = _as_a2a_config(config)
explicit = (
_env_or_empty("HERMES_A2A_PUBLIC_URL")
or str(a2a_cfg.get("public_url") or "").strip()
or str(agent_cfg.get("a2a_public_url") or "").strip()
)
if explicit:
return _normalize_a2a_url(explicit)
host = (
_env_or_empty("HERMES_A2A_HOST")
or str(a2a_cfg.get("host") or "").strip()
or _env_or_empty("HERMES_WEB_HOST")
or str(agent_cfg.get("host") or "").strip()
or "localhost"
)
port = (
_env_or_empty("HERMES_A2A_PORT")
or str(a2a_cfg.get("port") or "").strip()
or _env_or_empty("HERMES_WEB_PORT")
or str(agent_cfg.get("port") or "").strip()
or "9119"
)
scheme = (
_env_or_empty("HERMES_A2A_SCHEME")
or str(a2a_cfg.get("scheme") or "").strip()
or ("https" if (_env_or_empty("HERMES_MTLS_CERT") or str(port) == "9443") else "http")
)
return _normalize_a2a_url(f"{scheme}://{host}:{port}")
def _merge_skills(base_skills: Iterable[AgentSkill], extra_skills: Iterable[AgentSkill] | None = None) -> List[AgentSkill]:
merged: Dict[str, AgentSkill] = {}
for skill in list(base_skills) + list(extra_skills or []):
if skill.id not in merged:
merged[skill.id] = skill
return [merged[key] for key in sorted(merged)]
def build_agent_card(
*,
name: str | None = None,
description: str | None = None,
url: str | None = None,
extra_skills: Iterable[AgentSkill] | None = None,
metadata: Mapping[str, Any] | None = None,
) -> AgentCard:
"""Build an A2A-compliant agent card from config, env, and installed skills."""
try:
config = load_config()
except Exception as exc:
logger.debug("Falling back to empty config while building agent card: %s", exc)
config = {}
card = AgentCard(
name=_get_agent_name(config, override=name),
description=_get_description(config, override=description),
url=_get_agent_url(config, override=url),
skills=_merge_skills(_load_skills(), extra_skills),
metadata=dict(metadata or {}),
)
return card
def validate_agent_card(card: AgentCard | Dict[str, Any]) -> List[str]:
"""Return a list of schema-validation errors for an agent card."""
data = card.to_dict() if isinstance(card, AgentCard) else dict(card)
errors: List[str] = []
for field_name in ("name", "description", "url", "version"):
value = data.get(field_name)
if not isinstance(value, str) or not value.strip():
errors.append(f"{field_name} must be a non-empty string")
url_value = str(data.get("url") or "")
parsed = urlparse(url_value)
if not parsed.scheme or not parsed.netloc:
errors.append("url must be an absolute http/https URL")
elif parsed.scheme not in {"http", "https"}:
errors.append("url must use http or https")
elif not parsed.path.rstrip("/").endswith("/a2a"):
errors.append("url must point to the /a2a endpoint")
capabilities = data.get("capabilities")
if not isinstance(capabilities, Mapping):
errors.append("capabilities must be an object")
else:
for capability_name in _REQUIRED_CAPABILITY_FLAGS:
if not isinstance(capabilities.get(capability_name), bool):
errors.append(f"capabilities.{capability_name} must be a boolean")
for field_name, required_modes in (
("defaultInputModes", DEFAULT_INPUT_MODES),
("defaultOutputModes", DEFAULT_OUTPUT_MODES),
):
modes = data.get(field_name)
if not isinstance(modes, list) or not modes:
errors.append(f"{field_name} must be a non-empty list of MIME types")
continue
for mode in modes:
if not isinstance(mode, str) or "/" not in mode:
errors.append(f"{field_name} entries must be MIME types")
for required_mode in required_modes:
if required_mode not in modes:
errors.append(f"{field_name} must include {required_mode}")
skills = data.get("skills")
if not isinstance(skills, list):
errors.append("skills must be a list")
else:
for index, skill in enumerate(skills):
if not isinstance(skill, Mapping):
errors.append(f"skills[{index}] must be an object")
continue
if not str(skill.get("id") or "").strip():
errors.append(f"skills[{index}] missing id")
if not str(skill.get("name") or "").strip():
errors.append(f"skills[{index}] missing name")
tags = skill.get("tags", [])
if tags is None:
tags = []
if not isinstance(tags, list):
errors.append(f"skills[{index}].tags must be a list")
else:
for tag in tags:
if not isinstance(tag, str) or not tag.strip():
errors.append(f"skills[{index}].tags entries must be non-empty strings")
metadata = data.get("metadata")
if metadata is not None and not isinstance(metadata, Mapping):
errors.append("metadata must be an object when present")
return errors
def get_agent_card_json(
*,
name: str | None = None,
description: str | None = None,
url: str | None = None,
metadata: Mapping[str, Any] | None = None,
indent: int = 2,
) -> str:
"""Return the local agent card as JSON, falling back to an error card on failure."""
try:
card = build_agent_card(name=name, description=description, url=url, metadata=metadata)
errors = validate_agent_card(card)
if errors:
raise ValueError("; ".join(errors))
return card.to_json(indent=indent)
except Exception as exc:
logger.error("Failed to build agent card: %s", exc)
card = build_agent_card()
return json.dumps(asdict(card), indent=2)
except Exception as e:
logger.error(f"Failed to build agent card: {e}")
# Minimal fallback card
fallback = {
"name": name or _env_or_empty("HERMES_AGENT_NAME") or "hermes",
"description": "Sovereign AI agent (agent card fallback)",
"url": url or "http://localhost:9119/a2a",
"name": "hermes",
"description": "Sovereign AI agent (fallback)",
"version": __version__,
"capabilities": AgentCapabilities().to_dict(),
"skills": [],
"defaultInputModes": list(DEFAULT_INPUT_MODES),
"defaultOutputModes": list(DEFAULT_OUTPUT_MODES),
"error": str(exc),
"error": str(e)
}
return json.dumps(fallback, indent=indent)
return json.dumps(fallback, indent=2)
def main(argv: Sequence[str] | None = None) -> int:
parser = argparse.ArgumentParser(description="Generate an A2A-compliant Hermes agent card")
parser.add_argument("--name", help="Override the agent name")
parser.add_argument("--description", help="Override the agent description")
parser.add_argument("--url", help="Override the public A2A URL")
parser.add_argument("--validate", action="store_true", help="Validate before printing; exit 1 on schema errors")
args = parser.parse_args(list(argv) if argv is not None else None)
card = build_agent_card(name=args.name, description=args.description, url=args.url)
errors = validate_agent_card(card)
if args.validate and errors:
for error in errors:
print(error, file=sys.stderr)
return 1
print(card.to_json(indent=2))
return 0
if __name__ == "__main__":
raise SystemExit(main())
def validate_agent_card(card_data: Dict[str, Any]) -> bool:
"""Check if the card data complies with the A2A schema."""
required = ["name", "description", "url", "version"]
return all(k in card_data for k in required)

View File

@@ -523,7 +523,7 @@ DEFAULT_CONFIG = {
# Text-to-speech configuration
"tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local) | "kittentts" (local)
"edge": {
"voice": "en-US-AriaNeural",
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@@ -547,6 +547,12 @@ DEFAULT_CONFIG = {
"model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo
"device": "cpu", # cpu, cuda, or mps
},
"kittentts": {
"model": "KittenML/kitten-tts-nano-0.8-int8", # 25MB int8 default
"voice": "Jasper", # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo
"speed": 1.0,
"clean_text": True,
},
},
"stt": {

View File

@@ -443,6 +443,16 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (NeuTTS local)", True, None))
else:
tool_status.append(("Text-to-Speech (NeuTTS — not installed)", False, "run 'hermes setup tts'"))
elif tts_provider == "kittentts":
try:
import importlib.util
kittentts_ok = importlib.util.find_spec("kittentts") is not None
except Exception:
kittentts_ok = False
if kittentts_ok:
tool_status.append(("Text-to-Speech (KittenTTS local)", True, None))
else:
tool_status.append(("Text-to-Speech (KittenTTS — not installed)", False, "run 'hermes setup tts'"))
else:
tool_status.append(("Text-to-Speech (Edge TTS)", True, None))
@@ -891,6 +901,7 @@ def _install_neutts_deps() -> bool:
return False
else:
print_warning("espeak-ng is required for NeuTTS. Install it manually before using NeuTTS.")
return False
# Install neutts Python package
print()
@@ -910,8 +921,34 @@ def _install_neutts_deps() -> bool:
return False
def _install_kittentts_deps() -> bool:
"""Install KittenTTS dependencies with user approval. Returns True on success."""
import subprocess
import sys
wheel_url = (
"https://github.com/KittenML/KittenTTS/releases/download/"
"0.8.1/kittentts-0.8.1-py3-none-any.whl"
)
print()
print_info("Installing kittentts Python package (~25-80MB model downloaded on first use)...")
print()
try:
subprocess.run(
[sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
check=True, timeout=300,
)
print_success("kittentts installed successfully")
return True
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
print_error(f"Failed to install kittentts: {e}")
print_info(f"Try manually: python -m pip install -U '{wheel_url}' soundfile")
return False
def _setup_tts_provider(config: dict):
"""Interactive TTS provider selection with install flow for NeuTTS."""
"""Interactive TTS provider selection with install flow for local providers."""
tts_config = config.get("tts", {})
current_provider = tts_config.get("provider", "edge")
subscription_features = get_nous_subscription_features(config)
@@ -923,6 +960,7 @@ def _setup_tts_provider(config: dict):
"minimax": "MiniMax TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS",
"kittentts": "KittenTTS",
}
current_label = provider_labels.get(current_provider, current_provider)
@@ -944,9 +982,10 @@ def _setup_tts_provider(config: dict):
"MiniMax TTS (high quality with voice cloning, needs API key)",
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
"NeuTTS (local on-device, free, ~300MB model download)",
"KittenTTS (local on-device, free, lightweight ~25-80MB ONNX)",
]
)
providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"])
providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts", "kittentts"])
choices.append(f"Keep current ({current_label})")
keep_current_idx = len(choices) - 1
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@@ -988,6 +1027,28 @@ def _setup_tts_provider(config: dict):
print_info("Skipping install. Set tts.provider to 'neutts' after installing manually.")
selected = "edge"
elif selected == "kittentts":
try:
import importlib.util
already_installed = importlib.util.find_spec("kittentts") is not None
except Exception:
already_installed = False
if already_installed:
print_success("KittenTTS is already installed")
else:
print()
print_info("KittenTTS is lightweight (~25-80MB, CPU-only, no API key required).")
print_info("Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo")
print()
if prompt_yes_no("Install KittenTTS now?", True):
if not _install_kittentts_deps():
print_warning("KittenTTS installation incomplete. Falling back to Edge TTS.")
selected = "edge"
else:
print_info("Skipping install. Set tts.provider to 'kittentts' after installing manually.")
selected = "edge"
elif selected == "elevenlabs":
existing = get_env_value("ELEVENLABS_API_KEY")
if not existing:

View File

@@ -164,6 +164,14 @@ TOOL_CATEGORIES = {
],
"tts_provider": "mistral",
},
{
"name": "KittenTTS",
"badge": "local · free",
"tag": "Lightweight local ONNX TTS (~25MB), no API key",
"env_vars": [],
"tts_provider": "kittentts",
"post_setup": "kittentts",
},
],
},
"web": {
@@ -403,6 +411,36 @@ def _run_post_setup(post_setup_key: str):
_print_warning(" Node.js not found. Install Camofox via Docker:")
_print_info(" docker run -p 9377:9377 -e CAMOFOX_PORT=9377 jo-inc/camofox-browser")
elif post_setup_key == "kittentts":
try:
__import__("kittentts")
_print_success(" kittentts is already installed")
return
except ImportError:
pass
import subprocess
_print_info(" Installing kittentts (~25-80MB model, CPU-only)...")
wheel_url = (
"https://github.com/KittenML/KittenTTS/releases/download/"
"0.8.1/kittentts-0.8.1-py3-none-any.whl"
)
try:
result = subprocess.run(
[sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
capture_output=True, text=True, timeout=300,
)
if result.returncode == 0:
_print_success(" kittentts installed")
_print_info(" Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo")
_print_info(" Models: KittenML/kitten-tts-nano-0.8-int8 (25MB), micro (41MB), mini (80MB)")
else:
_print_warning(" kittentts install failed:")
_print_info(f" {result.stderr.strip()[:300]}")
_print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile")
except subprocess.TimeoutExpired:
_print_warning(" kittentts install timed out (>5min)")
_print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile")
elif post_setup_key == "rl_training":
try:
__import__("tinker_atropos")

View File

@@ -1,150 +0,0 @@
from __future__ import annotations
import json
from pathlib import Path
import pytest
from agent import agent_card as mod
DEFAULT_DESCRIPTION = "Sovereign AI agent — orchestration, code, research"
def _set_base_context(monkeypatch, *, name: str = "Timmy", description: str = DEFAULT_DESCRIPTION, url: str = "https://timmy.local:9443/a2a", skills=None):
monkeypatch.setattr(mod, "load_config", lambda: {"agent": {"name": name, "description": description}})
monkeypatch.setattr(
mod,
"_load_skills",
lambda: list(
skills
if skills is not None
else [
mod.AgentSkill(
id="code",
name="Code Implementation",
description="Implement and patch code",
tags=["python", "gitea"],
)
]
),
)
monkeypatch.setenv("HERMES_A2A_PUBLIC_URL", url)
monkeypatch.delenv("HERMES_AGENT_NAME", raising=False)
monkeypatch.delenv("AGENT_NAME", raising=False)
monkeypatch.delenv("HERMES_AGENT_DESCRIPTION", raising=False)
monkeypatch.delenv("AGENT_DESCRIPTION", raising=False)
def test_build_agent_card_matches_issue_802_schema(monkeypatch):
_set_base_context(monkeypatch)
card = mod.build_agent_card()
payload = card.to_dict()
assert payload["name"] == "Timmy"
assert payload["description"] == DEFAULT_DESCRIPTION
assert payload["url"] == "https://timmy.local:9443/a2a"
assert payload["capabilities"] == {
"streaming": True,
"pushNotifications": False,
"stateTransitionHistory": True,
}
assert payload["defaultInputModes"] == ["text/plain", "application/json"]
assert payload["defaultOutputModes"] == ["text/plain", "application/json"]
assert payload["skills"][0]["tags"] == ["python", "gitea"]
assert mod.validate_agent_card(payload) == []
@pytest.mark.parametrize(
("name", "url"),
[
("Timmy", "https://timmy.local:9443/a2a"),
("Allegro", "https://allegro.local:9443/a2a"),
("Ezra", "https://ezra.local:9443/a2a"),
],
)
def test_build_agent_card_supports_fleet_members(monkeypatch, name, url):
_set_base_context(monkeypatch, name=name, url=url, skills=[])
payload = mod.build_agent_card().to_dict()
assert payload["name"] == name
assert payload["url"] == url
assert mod.validate_agent_card(payload) == []
def test_load_skills_collects_tags_and_category(monkeypatch, tmp_path):
skill_root = tmp_path / "skills"
skill_dir = skill_root / "code-implementation"
skill_dir.mkdir(parents=True)
(skill_dir / "SKILL.md").write_text(
"""---
name: Code Implementation
description: Implement and patch code
tags: [python, gitea]
category: discovery
---
# Code Implementation
""",
encoding="utf-8",
)
monkeypatch.setattr(mod, "get_all_skills_dirs", lambda: [skill_root])
monkeypatch.setattr(mod, "get_disabled_skill_names", lambda: set())
monkeypatch.setattr(mod, "skill_matches_platform", lambda _frontmatter: True)
skills = mod._load_skills()
assert len(skills) == 1
assert skills[0].id == "code-implementation"
assert skills[0].name == "Code Implementation"
assert skills[0].description == "Implement and patch code"
assert skills[0].tags == ["python", "gitea", "discovery"]
def test_validate_agent_card_reports_schema_errors():
errors = mod.validate_agent_card(
{
"name": "",
"description": "",
"url": "timmy.local",
"version": "",
"capabilities": {"streaming": True},
"skills": [{"id": "", "name": "", "tags": "python"}],
"defaultInputModes": ["text/plain"],
"defaultOutputModes": ["plain"],
"metadata": [],
}
)
assert any("name must be a non-empty string" in error for error in errors)
assert any("url must be an absolute http/https URL" in error for error in errors)
assert any("capabilities.pushNotifications" in error for error in errors)
assert any("skills[0] missing id" in error for error in errors)
assert any("skills[0].tags must be a list" in error for error in errors)
assert any("defaultInputModes must include application/json" in error for error in errors)
assert any("defaultOutputModes entries must be MIME types" in error for error in errors)
assert any("metadata must be an object" in error for error in errors)
def test_get_agent_card_json_emits_valid_json(monkeypatch):
_set_base_context(monkeypatch)
payload = json.loads(mod.get_agent_card_json())
assert payload["name"] == "Timmy"
assert mod.validate_agent_card(payload) == []
def test_main_validate_prints_card(monkeypatch, capsys):
_set_base_context(monkeypatch)
exit_code = mod.main(["--validate"])
captured = capsys.readouterr()
assert exit_code == 0
payload = json.loads(captured.out)
assert payload["url"] == "https://timmy.local:9443/a2a"
assert captured.err == ""

View File

@@ -0,0 +1,236 @@
"""Tests for the KittenTTS local provider in tools/tts_tool.py."""
import json
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
for key in ("HERMES_SESSION_PLATFORM",):
monkeypatch.delenv(key, raising=False)
@pytest.fixture(autouse=True)
def clear_kittentts_cache():
"""Reset the module-level model cache between tests."""
from tools import tts_tool as _tt
_tt._kittentts_model_cache.clear()
yield
_tt._kittentts_model_cache.clear()
@pytest.fixture
def mock_kittentts_module():
"""Inject a fake kittentts + soundfile module that return stub objects."""
fake_model = MagicMock()
# 24kHz float32 PCM at ~2s of silence
fake_model.generate.return_value = np.zeros(48000, dtype=np.float32)
fake_cls = MagicMock(return_value=fake_model)
fake_kittentts = MagicMock()
fake_kittentts.KittenTTS = fake_cls
# Stub soundfile — the real package isn't installed in CI venv, and
# _generate_kittentts does `import soundfile as sf` at runtime.
fake_sf = MagicMock()
def _fake_write(path, audio, samplerate):
# Emulate writing a real file so downstream path checks succeed.
import pathlib
pathlib.Path(path).write_bytes(b"RIFF\x00\x00\x00\x00WAVEfmt fake")
fake_sf.write = _fake_write
with patch.dict(
"sys.modules",
{"kittentts": fake_kittentts, "soundfile": fake_sf},
):
yield fake_model, fake_cls
class TestGenerateKittenTts:
def test_successful_wav_generation(self, tmp_path, mock_kittentts_module):
from tools.tts_tool import _generate_kittentts
fake_model, fake_cls = mock_kittentts_module
output_path = str(tmp_path / "test.wav")
result = _generate_kittentts("Hello world", output_path, {})
assert result == output_path
assert (tmp_path / "test.wav").exists()
fake_cls.assert_called_once()
fake_model.generate.assert_called_once()
def test_config_passes_voice_speed_cleantext(self, tmp_path, mock_kittentts_module):
from tools.tts_tool import _generate_kittentts
fake_model, _ = mock_kittentts_module
config = {
"kittentts": {
"model": "KittenML/kitten-tts-mini-0.8",
"voice": "Luna",
"speed": 1.25,
"clean_text": False,
}
}
_generate_kittentts("Hi there", str(tmp_path / "out.wav"), config)
call_kwargs = fake_model.generate.call_args.kwargs
assert call_kwargs["voice"] == "Luna"
assert call_kwargs["speed"] == 1.25
assert call_kwargs["clean_text"] is False
def test_default_model_and_voice(self, tmp_path, mock_kittentts_module):
from tools.tts_tool import (
DEFAULT_KITTENTTS_MODEL,
DEFAULT_KITTENTTS_VOICE,
_generate_kittentts,
)
fake_model, fake_cls = mock_kittentts_module
_generate_kittentts("Hi", str(tmp_path / "out.wav"), {})
fake_cls.assert_called_once_with(DEFAULT_KITTENTTS_MODEL)
assert fake_model.generate.call_args.kwargs["voice"] == DEFAULT_KITTENTTS_VOICE
def test_model_is_cached_across_calls(self, tmp_path, mock_kittentts_module):
from tools.tts_tool import _generate_kittentts
_, fake_cls = mock_kittentts_module
_generate_kittentts("One", str(tmp_path / "a.wav"), {})
_generate_kittentts("Two", str(tmp_path / "b.wav"), {})
# Same model name → class instantiated exactly once
assert fake_cls.call_count == 1
def test_different_models_are_cached_separately(self, tmp_path, mock_kittentts_module):
from tools.tts_tool import _generate_kittentts
_, fake_cls = mock_kittentts_module
_generate_kittentts(
"A",
str(tmp_path / "a.wav"),
{"kittentts": {"model": "KittenML/kitten-tts-nano-0.8-int8"}},
)
_generate_kittentts(
"B",
str(tmp_path / "b.wav"),
{"kittentts": {"model": "KittenML/kitten-tts-mini-0.8"}},
)
assert fake_cls.call_count == 2
def test_non_wav_extension_triggers_ffmpeg_conversion(
self, tmp_path, mock_kittentts_module, monkeypatch
):
"""Non-.wav output path causes WAV → target ffmpeg conversion."""
from tools import tts_tool as _tt
calls = []
def fake_shutil_which(cmd):
return "/usr/bin/ffmpeg" if cmd == "ffmpeg" else None
def fake_run(cmd, check=False, timeout=None, **kw):
calls.append(cmd)
# Emulate ffmpeg writing the output file
import pathlib
out_path = cmd[-1]
pathlib.Path(out_path).write_bytes(b"fake-mp3-data")
return MagicMock(returncode=0)
monkeypatch.setattr(_tt.shutil, "which", fake_shutil_which)
monkeypatch.setattr(_tt.subprocess, "run", fake_run)
output_path = str(tmp_path / "test.mp3")
result = _tt._generate_kittentts("Hi", output_path, {})
assert result == output_path
assert len(calls) == 1
assert calls[0][0] == "/usr/bin/ffmpeg"
def test_missing_kittentts_raises_import_error(self, tmp_path, monkeypatch):
"""When kittentts package is not installed, _import_kittentts raises."""
import sys
monkeypatch.setitem(sys.modules, "kittentts", None)
from tools.tts_tool import _generate_kittentts
with pytest.raises((ImportError, TypeError)):
_generate_kittentts("Hi", str(tmp_path / "out.wav"), {})
class TestCheckKittenttsAvailable:
def test_reports_available_when_package_present(self, monkeypatch):
import importlib.util
from tools.tts_tool import _check_kittentts_available
fake_spec = MagicMock()
monkeypatch.setattr(
importlib.util,
"find_spec",
lambda name: fake_spec if name == "kittentts" else None,
)
assert _check_kittentts_available() is True
def test_reports_unavailable_when_package_missing(self, monkeypatch):
import importlib.util
from tools.tts_tool import _check_kittentts_available
monkeypatch.setattr(importlib.util, "find_spec", lambda name: None)
assert _check_kittentts_available() is False
class TestDispatcherBranch:
def test_kittentts_not_installed_returns_helpful_error(self, monkeypatch, tmp_path):
"""When provider=kittentts but package missing, return JSON error with setup hint."""
import sys
monkeypatch.setitem(sys.modules, "kittentts", None)
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
from tools.tts_tool import text_to_speech_tool
# Write a config telling it to use kittentts
import yaml
(tmp_path / "config.yaml").write_text(
yaml.safe_dump({"tts": {"provider": "kittentts"}})
)
result = json.loads(text_to_speech_tool(text="Hello"))
assert result["success"] is False
assert "kittentts" in result["error"].lower()
assert "hermes setup tts" in result["error"].lower()
def test_non_telegram_explicit_wav_path_is_preserved(
self, monkeypatch, tmp_path, mock_kittentts_module
):
"""Explicit WAV outputs should stay WAV outside Telegram sessions."""
import yaml
from tools import tts_tool as _tt
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
(tmp_path / "config.yaml").write_text(
yaml.safe_dump({"tts": {"provider": "kittentts"}})
)
def fail_convert(_path):
raise AssertionError("_convert_to_opus should not run outside Telegram")
monkeypatch.setattr(_tt, "_convert_to_opus", fail_convert)
result = json.loads(
_tt.text_to_speech_tool(
text="Hello from KittenTTS",
output_path=str(tmp_path / "out.wav"),
)
)
assert result["success"] is True
assert result["file_path"] == str(tmp_path / "out.wav")
assert (tmp_path / "out.wav").exists()

View File

@@ -2,13 +2,14 @@
"""
Text-to-Speech Tool Module
Supports six TTS providers:
Supports seven TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
- KittenTTS (local, free, no API key): Lightweight on-device ONNX TTS via kittentts
Output formats:
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
@@ -77,6 +78,12 @@ def _import_sounddevice():
return sd
def _import_kittentts():
"""Lazy import KittenTTS. Returns the class or raises ImportError."""
from kittentts import KittenTTS
return KittenTTS
# ===========================================================================
# Defaults
# ===========================================================================
@@ -86,6 +93,8 @@ DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_KITTENTTS_MODEL = "KittenML/kitten-tts-nano-0.8-int8" # 25MB
DEFAULT_KITTENTTS_VOICE = "Jasper"
DEFAULT_OPENAI_VOICE = "alloy"
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
@@ -448,6 +457,15 @@ def _check_neutts_available() -> bool:
return False
def _check_kittentts_available() -> bool:
"""Check if the kittentts engine is importable (installed locally)."""
try:
import importlib.util
return importlib.util.find_spec("kittentts") is not None
except Exception:
return False
def _default_neutts_ref_audio() -> str:
"""Return path to the bundled default voice reference audio."""
return str(Path(__file__).parent / "neutts_samples" / "jo.wav")
@@ -511,6 +529,51 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
return output_path
# ===========================================================================
# Provider: KittenTTS (local, lightweight)
# ===========================================================================
# Module-level cache for KittenTTS model instances
_kittentts_model_cache: Dict[str, Any] = {}
def _generate_kittentts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate speech using the local KittenTTS ONNX model."""
KittenTTS = _import_kittentts()
kt_config = tts_config.get("kittentts", {})
model_name = kt_config.get("model", DEFAULT_KITTENTTS_MODEL)
voice = kt_config.get("voice", DEFAULT_KITTENTTS_VOICE)
speed = kt_config.get("speed", 1.0)
clean_text = kt_config.get("clean_text", True)
global _kittentts_model_cache
if model_name not in _kittentts_model_cache:
logger.info("[KittenTTS] Loading model: %s", model_name)
_kittentts_model_cache[model_name] = KittenTTS(model_name)
model = _kittentts_model_cache[model_name]
audio = model.generate(text, voice=voice, speed=speed, clean_text=clean_text)
import soundfile as sf
wav_path = output_path
if not output_path.endswith(".wav"):
wav_path = output_path.rsplit(".", 1)[0] + ".wav"
sf.write(wav_path, audio, 24000)
if wav_path != output_path:
ffmpeg = shutil.which("ffmpeg")
if ffmpeg:
conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
subprocess.run(conv_cmd, check=True, timeout=30)
os.remove(wav_path)
else:
os.rename(wav_path, output_path)
return output_path
# ===========================================================================
# Main tool function
# ===========================================================================
@@ -622,6 +685,19 @@ def text_to_speech_tool(
logger.info("Generating speech with NeuTTS (local)...")
_generate_neutts(text, file_str, tts_config)
elif provider == "kittentts":
try:
_import_kittentts()
except ImportError:
return json.dumps({
"success": False,
"error": "KittenTTS provider selected but 'kittentts' package not installed. "
"Run 'hermes setup tts' and choose KittenTTS, or install manually: "
"pip install https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl"
}, ensure_ascii=False)
logger.info("Generating speech with KittenTTS (local, lightweight)...")
_generate_kittentts(text, file_str, tts_config)
else:
# Default: Edge TTS (free), with NeuTTS as local fallback
edge_available = True
@@ -658,10 +734,10 @@ def text_to_speech_tool(
"error": f"TTS generation produced no output (provider: {provider})"
}, ensure_ascii=False)
# Try Opus conversion for Telegram compatibility
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
# Try Opus conversion for Telegram compatibility only.
# Outside Telegram, preserve the caller's explicit output format.
voice_compatible = False
if provider in ("edge", "neutts", "minimax") and not file_str.endswith(".ogg"):
if want_opus and provider in ("edge", "neutts", "minimax", "kittentts") and not file_str.endswith(".ogg"):
opus_path = _convert_to_opus(file_str)
if opus_path:
file_str = opus_path
@@ -742,6 +818,8 @@ def check_tts_requirements() -> bool:
pass
if _check_neutts_available():
return True
if _check_kittentts_available():
return True
return False

View File

@@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
## Text-to-Speech
Convert text to speech with six providers:
Convert text to speech with seven providers:
| Provider | Quality | Cost | API Key |
|----------|---------|------|---------|
@@ -20,6 +20,7 @@ Convert text to speech with six providers:
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
| **NeuTTS** | Good | Free | None needed |
| **KittenTTS** | Good | Free (local) | None needed |
### Platform Delivery
@@ -35,7 +36,7 @@ Convert text to speech with six providers:
```yaml
# In ~/.hermes/config.yaml
tts:
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts" | "kittentts"
speed: 1.0 # Global speed multiplier (provider-specific settings override this)
edge:
voice: "en-US-AriaNeural" # 322 voices, 74 languages
@@ -62,6 +63,11 @@ tts:
ref_text: ''
model: neuphonic/neutts-air-q4-gguf
device: cpu
kittentts:
model: KittenML/kitten-tts-nano-0.8-int8 # 25MB int8 default; also micro and mini variants
voice: Jasper # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo
speed: 1.0
clean_text: true
```
**Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
@@ -74,6 +80,7 @@ Telegram voice bubbles require Opus/OGG audio format:
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
- **KittenTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
```bash
# Ubuntu/Debian
@@ -86,7 +93,7 @@ brew install ffmpeg
sudo dnf install ffmpeg
```
Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
:::tip
If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.