Compare commits

...

6 Commits

Author SHA1 Message Date
847c4d50d4 [auto-merge] Add desktop automation primitives to Hermes
Some checks failed
Deploy Nexus / deploy (push) Failing after 3s
Staging Verification Gate / verify-staging (push) Failing after 3s
Auto-merged by PR review bot: Add desktop automation primitives to Hermes
2026-04-10 11:48:25 +00:00
Alexander Whitestone
220f20c794 feat: add desktop automation primitives to Hermes (#1125)
Some checks failed
CI / test (pull_request) Failing after 8s
CI / validate (pull_request) Failing after 10s
Review Approval Gate / verify-review (pull_request) Failing after 2s
Implements Phase 1 and Phase 2 tooling from issue #1125:

- nexus/computer_use.py: four Hermes tools with poka-yoke safety
    * computer_screenshot() — capture & base64-encode desktop snapshot
    * computer_click(x, y, button, confirm) — right/middle require confirm=True
    * computer_type(text, confirm) — sensitive keywords blocked without confirm=True;
      text value is never written to audit log
    * computer_scroll(x, y, amount) — scroll wheel
    * read_action_log() — inspect recent JSONL audit entries
    * pyautogui.FAILSAFE=True; all tools degrade gracefully when headless

- nexus/computer_use_demo.py: Phase 1 demo (baseline screenshot →
  open browser → navigate to Gitea forge → evidence screenshot)

- tests/test_computer_use.py: 32 unit tests, fully headless
  (pyautogui mocked), all passing

- docs/computer-use.md: API reference, safety table, phase roadmap,
  pilot recipes

- docker-compose.desktop.yml: sandboxed Xvfb + noVNC container

Fixes #1125

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 05:45:27 -04:00
e85cefd9c0 Merge pull request #1130
Some checks failed
Deploy Nexus / deploy (push) Failing after 2s
Staging Verification Gate / verify-staging (push) Failing after 3s
Merged PR #1130
2026-04-10 09:37:52 +00:00
beec49a92d Merge branch 'main' into claude/issue-1126
Some checks failed
CI / test (pull_request) Failing after 10s
CI / validate (pull_request) Failing after 11s
Review Approval Gate / verify-review (pull_request) Successful in 2s
2026-04-10 09:37:45 +00:00
ef25c073ce Merge pull request '[Mnemosyne] Consolidated Spatial Memory Schema — spatial regions + demo memories' (#1156) from feat/mnemosyne-spatial-schema-consolidated into main
Some checks failed
Deploy Nexus / deploy (push) Failing after 3s
Staging Verification Gate / verify-staging (push) Failing after 3s
Merge PR #1156: [Mnemosyne] Consolidated Spatial Memory Schema — spatial regions + demo memories
2026-04-10 09:35:23 +00:00
Alexander Whitestone
ef74536e33 feat: add edge-tts as zero-cost voice output provider
Some checks failed
CI / test (pull_request) Failing after 33s
CI / validate (pull_request) Failing after 26s
Review Approval Gate / verify-review (pull_request) Failing after 5s
- Add EdgeTTSAdapter to bin/deepdive_tts.py (provider key: "edge-tts")
  default voice: en-US-GuyNeural, no API key required
- Add EdgeTTS class to intelligence/deepdive/tts_engine.py
- Update HybridTTS to try edge-tts as fallback between piper and elevenlabs
- Add --voice-memo flag to bin/night_watch.py for spoken nightly reports
- Add edge-tts>=6.1.9 to requirements.txt
- Create docs/voice-output.md documenting all providers and fallback chain
- Add tests/test_edge_tts.py with 17 unit tests (all mocked, no network)

Fixes #1126

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 06:29:26 -04:00
11 changed files with 1707 additions and 7 deletions

View File

@@ -152,17 +152,55 @@ class OpenAITTSAdapter:
return mp3_path
class EdgeTTSAdapter:
    """Adapter that renders speech through Microsoft Edge neural voices.

    No API key is needed; the ``edge-tts`` package must be installed
    (``pip install edge-tts>=6.1.9``).
    Voice catalogue:
    https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support
    """

    DEFAULT_VOICE = "en-US-GuyNeural"

    def __init__(self, config: TTSConfig):
        """Keep the config and pick its voice, or DEFAULT_VOICE when unset."""
        self.config = config
        requested = config.voice_id
        self.voice = requested if requested else self.DEFAULT_VOICE

    def synthesize(self, text: str, output_path: Path) -> Path:
        """Write *text* as speech to an MP3 file and return that file's path.

        The result path is *output_path* with its suffix replaced by ``.mp3``.

        Raises:
            RuntimeError: if the ``edge-tts`` package cannot be imported.
        """
        # Imported lazily so the module still loads without edge-tts installed.
        try:
            import edge_tts
        except ImportError:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")
        import asyncio

        target = output_path.with_suffix(".mp3")

        async def _save_audio() -> None:
            await edge_tts.Communicate(text, self.voice).save(str(target))

        # edge-tts exposes a coroutine API; drive it to completion here.
        asyncio.run(_save_audio())
        return target
# Adapter classes keyed by provider key; "edge-tts" selects EdgeTTSAdapter.
# NOTE(review): presumably indexed with the configured provider name
# (e.g. the DEEPDIVE_TTS_PROVIDER value) — the lookup site is not visible here.
ADAPTERS = {
"piper": PiperAdapter,
"elevenlabs": ElevenLabsAdapter,
"openai": OpenAITTSAdapter,
"edge-tts": EdgeTTSAdapter,
}
def get_provider_config() -> TTSConfig:
"""Load TTS configuration from environment."""
provider = os.environ.get("DEEPDIVE_TTS_PROVIDER", "openai")
voice = os.environ.get("DEEPDIVE_TTS_VOICE", "alloy" if provider == "openai" else "matthew")
if provider == "openai":
default_voice = "alloy"
elif provider == "edge-tts":
default_voice = EdgeTTSAdapter.DEFAULT_VOICE
else:
default_voice = "matthew"
voice = os.environ.get("DEEPDIVE_TTS_VOICE", default_voice)
return TTSConfig(
provider=provider,

View File

@@ -32,12 +32,14 @@ import importlib.util
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
logging.basicConfig(
level=logging.INFO,
@@ -212,6 +214,46 @@ def generate_report(date_str: str, checker_mod) -> str:
return "\n".join(lines)
# ── Voice memo ────────────────────────────────────────────────────────
def _generate_voice_memo(report_text: str, date_str: str) -> Optional[str]:
    """Render the night watch report as a spoken MP3 memo.

    Markdown markers are removed from *report_text* before synthesis, and the
    audio is saved as ``/tmp/bezalel/night-watch-<date_str>.mp3``.

    Returns:
        The MP3 path as a string, or None when ``edge-tts`` is not installed
        or synthesis fails (both cases are logged as warnings).
    """
    try:
        import edge_tts
    except ImportError:
        logger.warning("edge-tts not installed; skipping voice memo. Run: pip install edge-tts")
        return None

    import asyncio

    # Applied in order; each pair is (pattern, replacement).
    markdown_rules = (
        (r"#+\s*", ""),    # headings
        (r"\|", " "),      # table pipes
        (r"\*+", ""),      # bold/italic markers
        (r"-{3,}", ""),    # horizontal rules
        (r"\s{2,}", " "),  # collapse extra whitespace
    )
    spoken = report_text
    for pattern, replacement in markdown_rules:
        spoken = re.sub(pattern, replacement, spoken)

    memo_dir = Path("/tmp/bezalel")
    memo_dir.mkdir(parents=True, exist_ok=True)
    memo_file = memo_dir / f"night-watch-{date_str}.mp3"

    async def _speak() -> None:
        await edge_tts.Communicate(spoken.strip(), "en-US-GuyNeural").save(str(memo_file))

    try:
        asyncio.run(_speak())
        logger.info("Voice memo written to %s", memo_file)
        return str(memo_file)
    except Exception as exc:
        # Best-effort feature: report the failure and let the caller continue.
        logger.warning("Voice memo generation failed: %s", exc)
        return None
# ── Entry point ───────────────────────────────────────────────────────
def main() -> None:
@@ -226,6 +268,10 @@ def main() -> None:
"--dry-run", action="store_true",
help="Print report to stdout instead of writing to disk",
)
parser.add_argument(
"--voice-memo", action="store_true",
help="Generate an MP3 voice memo of the report using edge-tts (saved to /tmp/bezalel/)",
)
args = parser.parse_args()
date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
@@ -242,6 +288,14 @@ def main() -> None:
report_path.write_text(report_text)
logger.info("Night Watch report written to %s", report_path)
if args.voice_memo:
try:
memo_path = _generate_voice_memo(report_text, date_str)
if memo_path:
logger.info("Voice memo: %s", memo_path)
except Exception as exc:
logger.warning("Voice memo failed (non-fatal): %s", exc)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
version: "3.9"
# Sandboxed desktop environment for Hermes computer-use primitives.
# Provides Xvfb (virtual framebuffer) + noVNC (browser-accessible VNC).
#
# Usage:
# docker compose -f docker-compose.desktop.yml up -d
# # Visit http://localhost:6080 to see the virtual desktop
#
# docker compose -f docker-compose.desktop.yml run hermes-desktop \
# python -m nexus.computer_use_demo
#
# docker compose -f docker-compose.desktop.yml down
services:
hermes-desktop:
image: dorowu/ubuntu-desktop-lxde-vnc:focal
environment:
# Resolution for the virtual display
RESOLUTION: "1280x800"
# VNC password (change in production)
VNC_PASSWORD: "hermes"
# Disable HTTP password for development convenience
HTTP_PASSWORD: ""
ports:
# noVNC web interface
- "6080:80"
# Raw VNC port (optional)
- "5900:5900"
volumes:
# Mount repo into container so scripts are available
- .:/workspace
# Persist nexus runtime data (heartbeats, logs, evidence)
- nexus_data:/root/.nexus
working_dir: /workspace
shm_size: "256mb"
# Install Python deps on startup then keep container alive
command: >
bash -c "
pip install --quiet pyautogui Pillow &&
/startup.sh
"
volumes:
nexus_data:
driver: local

174
docs/computer-use.md Normal file
View File

@@ -0,0 +1,174 @@
# Computer Use — Desktop Automation Primitives for Hermes
Issue: [#1125](https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1125)
## Overview
`nexus/computer_use.py` adds desktop automation primitives to the Hermes fleet. Agents can take screenshots, click, type, and scroll — enough to drive a browser, validate a UI, or diagnose a failed workflow page visually.
All actions are logged to a JSONL audit trail at `~/.nexus/computer_use_actions.jsonl`.
---
## Quick Start
### Local (requires a real display or Xvfb)
```bash
# Install dependencies
pip install pyautogui Pillow
# Run the Phase 1 demo
python -m nexus.computer_use_demo
```
### Sandboxed (Docker + Xvfb + noVNC)
```bash
docker compose -f docker-compose.desktop.yml up -d
# Visit http://localhost:6080 in your browser to see the virtual desktop
docker compose -f docker-compose.desktop.yml run hermes-desktop \
python -m nexus.computer_use_demo
docker compose -f docker-compose.desktop.yml down
```
---
## API Reference
### `computer_screenshot(save_path=None, log_path=...)`
Capture the current desktop.
| Param | Type | Description |
|-------|------|-------------|
| `save_path` | `str \| None` | Path to save PNG. If `None`, returns base64 string. |
| `log_path` | `Path` | Audit log file. |
**Returns** `dict`:
```json
{
"ok": true,
"image_b64": "<base64 PNG or null>",
"saved_to": "<path or null>",
"error": null
}
```
---
### `computer_click(x, y, button="left", confirm=False, log_path=...)`
Click the mouse at screen coordinates.
| Param | Type | Description |
|-------|------|-------------|
| `x` | `int` | Horizontal coordinate |
| `y` | `int` | Vertical coordinate |
| `button` | `str` | `"left"` \| `"right"` \| `"middle"` |
| `confirm` | `bool` | Required `True` for `right` / `middle` (poka-yoke) |
**Returns** `dict`:
```json
{"ok": true, "error": null}
```
---
### `computer_type(text, confirm=False, interval=0.02, log_path=...)`
Type text using the keyboard.
| Param | Type | Description |
|-------|------|-------------|
| `text` | `str` | Text to type |
| `confirm` | `bool` | Required `True` when text contains a sensitive keyword |
| `interval` | `float` | Delay between keystrokes (seconds) |
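**Sensitive keywords** (require `confirm=True`): `password`, `passwd`, `secret`, `token`, `api_key`, `apikey`, `key`, `auth`. Matching is a case-insensitive substring test, so any text that merely contains one of these strings (for example `keyboard` or `author`) is also flagged.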
> Note: the actual `text` value is never written to the audit log — only its length and whether it was flagged as sensitive.
**Returns** `dict`:
```json
{"ok": true, "error": null}
```
---
### `computer_scroll(x, y, amount=3, log_path=...)`
Scroll the mouse wheel at screen coordinates.
| Param | Type | Description |
|-------|------|-------------|
| `x` | `int` | Horizontal coordinate |
| `y` | `int` | Vertical coordinate |
| `amount` | `int` | Scroll units. Positive = up, negative = down. |
**Returns** `dict`:
```json
{"ok": true, "error": null}
```
---
### `read_action_log(n=20, log_path=...)`
Return the most recent `n` audit log entries, newest first.
```python
from nexus.computer_use import read_action_log
for entry in read_action_log(n=5):
print(entry["ts"], entry["action"], entry["result"]["ok"])
```
---
## Safety Model
| Action | Safety gate |
|--------|-------------|
| `computer_click(button="right")` | Requires `confirm=True` |
| `computer_click(button="middle")` | Requires `confirm=True` |
| `computer_type` with sensitive text | Requires `confirm=True` |
| Mouse to top-left corner | pyautogui FAILSAFE — aborts immediately |
| All actions | Written to JSONL audit log with timestamp |
| Headless environment | All tools degrade gracefully — return `ok=False` with error message |
---
## Phase Roadmap
### Phase 1 — Environment & Primitives ✅
- Sandboxed desktop via Xvfb + noVNC (`docker-compose.desktop.yml`)
- `computer_screenshot`, `computer_click`, `computer_type`, `computer_scroll`
- Poka-yoke safety checks on all destructive actions
- JSONL audit log for all actions
- Demo: baseline screenshot → open browser → navigate to Gitea → evidence screenshot
- 32 unit tests, fully headless (pyautogui mocked)
### Phase 2 — Tool Integration (planned)
- Register tools in the Hermes tool registry
- LLM-based planner loop using screenshots as context
- Destructive action confirmation UI
### Phase 3 — Use-Case Pilots (planned)
- Pilot 1: Automated visual regression test for fleet dashboard
- Pilot 2: Screenshot-based diagnosis of failed CI workflow page
---
## File Locations
| File | Purpose |
|------|---------|
| `nexus/computer_use.py` | Core tool primitives |
| `nexus/computer_use_demo.py` | Phase 1 end-to-end demo |
| `tests/test_computer_use.py` | 32 unit tests |
| `docker-compose.desktop.yml` | Sandboxed desktop container |
| `~/.nexus/computer_use_actions.jsonl` | Runtime audit log |
| `~/.nexus/computer_use_evidence/` | Screenshot evidence (demo output) |

135
docs/voice-output.md Normal file
View File

@@ -0,0 +1,135 @@
# Voice Output System
## Overview
The Nexus voice output system converts text reports and briefings into spoken audio.
It supports multiple TTS providers with automatic fallback so that audio generation
degrades gracefully when a provider is unavailable.
Primary use cases:
- **Deep Dive** daily briefings (`bin/deepdive_tts.py`)
- **Night Watch** nightly reports (`bin/night_watch.py --voice-memo`)
---
## Available Providers
### edge-tts (recommended default)
- **Cost:** Zero — no API key, no account required
- **Package:** `pip install edge-tts>=6.1.9`
- **Default voice:** `en-US-GuyNeural`
- **Output format:** MP3
- **How it works:** Streams audio from Microsoft Edge's neural TTS service over HTTPS.
No local model download required.
- **Available locales:** 100+ languages and locales. Full list:
https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support
Notable English voices:
| Voice ID | Style |
|---|---|
| `en-US-GuyNeural` | Neutral male (default) |
| `en-US-JennyNeural` | Warm female |
| `en-US-AriaNeural` | Expressive female |
| `en-GB-RyanNeural` | British male |
### piper
- **Cost:** Free, fully offline
- **Package:** `pip install piper-tts` + model download (~65 MB)
- **Model location:** `~/.local/share/piper/en_US-lessac-medium.onnx`
- **Output format:** WAV → MP3 (requires `lame`)
- **Sovereignty:** Fully local; no network calls after model download
### elevenlabs
- **Cost:** Usage-based (paid)
- **Requirement:** `ELEVENLABS_API_KEY` environment variable
- **Output format:** MP3
- **Quality:** Highest quality of the available providers
### openai
- **Cost:** Usage-based (paid)
- **Requirement:** `OPENAI_API_KEY` environment variable
- **Output format:** MP3
- **Default voice:** `alloy`
---
## Usage: deepdive_tts.py
```bash
# Use edge-tts (zero cost)
DEEPDIVE_TTS_PROVIDER=edge-tts python bin/deepdive_tts.py --text "Good morning."
# Specify a different Edge voice
python bin/deepdive_tts.py --provider edge-tts --voice en-US-JennyNeural --text "Hello world."
# Read from a file
python bin/deepdive_tts.py --provider edge-tts --input-file /tmp/briefing.txt --output /tmp/briefing
# Use OpenAI
OPENAI_API_KEY=sk-... python bin/deepdive_tts.py --provider openai --voice nova --text "Hello."
# Use ElevenLabs
ELEVENLABS_API_KEY=... python bin/deepdive_tts.py --provider elevenlabs --voice rachel --text "Hello."
# Use local Piper (offline)
python bin/deepdive_tts.py --provider piper --text "Hello."
```
Provider and voice can also be set via environment variables:
```bash
export DEEPDIVE_TTS_PROVIDER=edge-tts
export DEEPDIVE_TTS_VOICE=en-GB-RyanNeural
python bin/deepdive_tts.py --text "Good evening."
```
---
## Usage: Night Watch --voice-memo
The `--voice-memo` flag causes Night Watch to generate an MP3 audio summary of the
nightly report immediately after writing the markdown file.
```bash
python bin/night_watch.py --voice-memo
```
Output location: `/tmp/bezalel/night-watch-<YYYY-MM-DD>.mp3`
The voice memo:
- Strips markdown formatting (`#`, `|`, `*`, `---`) for cleaner speech
- Uses `edge-tts` with the `en-US-GuyNeural` voice
- Is non-fatal: if TTS fails, the markdown report is still written normally
Example crontab with voice memo:
```cron
0 3 * * * cd /path/to/the-nexus && python bin/night_watch.py --voice-memo \
>> /var/log/bezalel/night-watch.log 2>&1
```
---
## Fallback Chain
`HybridTTS` (used by `tts_engine.py`) attempts providers in this order:
1. **piper** — offline local model (if model file present)
2. **edge-tts** — zero cost, no API key
3. **elevenlabs** — cloud fallback (if `ELEVENLABS_API_KEY` set)
If `prefer_cloud=True` is passed, the order becomes: elevenlabs → piper.
---
## Phase 3 TODO
Evaluate **fish-speech** and **F5-TTS** as fully offline, sovereign alternatives
with higher voice quality than Piper. These models run locally with no network
dependency whatsoever, providing complete independence from Microsoft's Edge service.
Tracking: to be filed as a follow-up to issue #830.

View File

@@ -157,14 +157,45 @@ class ElevenLabsTTS:
return output_path
class EdgeTTS:
    """Text-to-speech engine backed by Microsoft Edge neural voices.

    Needs no API key. Requires: pip install edge-tts>=6.1.9
    """

    DEFAULT_VOICE = "en-US-GuyNeural"

    def __init__(self, voice: str = None):
        """Use *voice* when given, otherwise fall back to DEFAULT_VOICE."""
        self.voice = voice if voice else self.DEFAULT_VOICE

    def synthesize(self, text: str, output_path: str) -> str:
        """Speak *text* into an MP3 file and return the file's path.

        The returned path is *output_path* with its suffix replaced by
        ``.mp3``.

        Raises:
            RuntimeError: if the ``edge-tts`` package cannot be imported.
        """
        # Lazy import keeps this module importable without edge-tts.
        try:
            import edge_tts
        except ImportError:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")
        import asyncio
        from pathlib import Path

        target = str(Path(output_path).with_suffix(".mp3"))

        async def _save_audio() -> None:
            await edge_tts.Communicate(text, self.voice).save(target)

        asyncio.run(_save_audio())
        return target
class HybridTTS:
"""TTS with sovereign primary, cloud fallback."""
def __init__(self, prefer_cloud: bool = False):
self.primary = None
self.fallback = None
self.prefer_cloud = prefer_cloud
# Try preferred engine
if prefer_cloud:
self._init_elevenlabs()
@@ -172,21 +203,29 @@ class HybridTTS:
self._init_piper()
else:
self._init_piper()
if not self.primary:
self._init_edge_tts()
if not self.primary:
self._init_elevenlabs()
def _init_piper(self):
try:
self.primary = PiperTTS()
except Exception as e:
print(f"Piper init failed: {e}")
def _init_edge_tts(self):
try:
self.primary = EdgeTTS()
except Exception as e:
print(f"EdgeTTS init failed: {e}")
def _init_elevenlabs(self):
try:
self.primary = ElevenLabsTTS()
except Exception as e:
print(f"ElevenLabs init failed: {e}")
def synthesize(self, text: str, output_path: str) -> str:
"""Synthesize with fallback."""
if self.primary:
@@ -194,7 +233,7 @@ class HybridTTS:
return self.primary.synthesize(text, output_path)
except Exception as e:
print(f"Primary failed: {e}")
raise RuntimeError("No TTS engine available")

313
nexus/computer_use.py Normal file
View File

@@ -0,0 +1,313 @@
"""
Hermes Desktop Automation Primitives — Computer Use (#1125)
Provides sandboxed desktop control tools for Hermes agents:
- computer_screenshot() — capture current desktop
- computer_click() — mouse click with poka-yoke on non-primary buttons
- computer_type() — keyboard input with poka-yoke on sensitive text
- computer_scroll() — scroll wheel action
- read_action_log() — inspect recent action audit trail
All actions are logged to a JSONL audit file.
pyautogui.FAILSAFE is enabled globally — move mouse to top-left corner to abort.
Designed to degrade gracefully when no display is available (headless CI).
"""
from __future__ import annotations
import base64
import io
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Safety globals
# ---------------------------------------------------------------------------
# Poka-yoke: require confirmation for dangerous inputs.
# computer_type() lowercases the text and flags it when any of these strings
# occurs anywhere inside it (substring match), so short entries such as "key"
# also match longer words that contain them.
_SENSITIVE_KEYWORDS = frozenset(
["password", "passwd", "secret", "token", "api_key", "apikey", "key", "auth"]
)
# Destructive mouse buttons (non-primary); computer_click() refuses these
# unless confirm=True is passed.
_DANGEROUS_BUTTONS = frozenset(["right", "middle"])
# Default log location, used as the default for every log_path parameter
# in this module.
DEFAULT_ACTION_LOG = Path.home() / ".nexus" / "computer_use_actions.jsonl"
# ---------------------------------------------------------------------------
# Lazy pyautogui import — fails gracefully in headless environments
# ---------------------------------------------------------------------------
_PYAUTOGUI_AVAILABLE = False
_pyautogui = None
def _get_pyautogui():
    """Import pyautogui on first use and cache it for later calls.

    On a successful import FAILSAFE is switched on and PAUSE is set to 0.05
    before the module is cached. Returns None (after logging a warning) when
    the import or that setup raises anything.
    """
    global _pyautogui, _PYAUTOGUI_AVAILABLE
    if _pyautogui is None:
        try:
            import pyautogui as module  # type: ignore

            module.FAILSAFE = True
            module.PAUSE = 0.05  # small delay between actions
        except Exception:
            logger.warning("pyautogui unavailable — computer_use running in stub mode")
            return None
        _pyautogui = module
        _PYAUTOGUI_AVAILABLE = True
    return _pyautogui
def _get_pil():
    """Return the ``PIL.Image`` module, or None when Pillow is not importable."""
    try:
        from PIL import Image  # type: ignore
    except ImportError:
        return None
    return Image
# ---------------------------------------------------------------------------
# Audit log
# ---------------------------------------------------------------------------
def _log_action(action: str, params: dict, result: dict, log_path: Path = DEFAULT_ACTION_LOG):
    """Append a single audit record to the JSONL log.

    The log's parent directory is created when missing. Each record is one
    JSON object per line with the keys ``ts``, ``action``, ``params`` and
    ``result``.

    Args:
        action: Short name of the action being recorded.
        params: JSON-serialisable parameters of the call.
        result: JSON-serialisable outcome of the call.
        log_path: Audit log file to append to.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    entry = dict(
        ts=time.strftime("%Y-%m-%dT%H:%M:%S"),
        action=action,
        params=params,
        result=result,
    )
    with open(log_path, "a") as sink:
        sink.write(f"{json.dumps(entry)}\n")
# ---------------------------------------------------------------------------
# Public tool API
# ---------------------------------------------------------------------------
def computer_screenshot(
    save_path: Optional[str] = None,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Take a screenshot of the desktop and record the attempt in the audit log.

    Args:
        save_path: Where to write the PNG. When omitted (or empty) the image
            is returned base64-encoded instead of being written to disk.
        log_path: Audit log file (default ~/.nexus/computer_use_actions.jsonl).

    Returns:
        dict with keys:
            - ok (bool)
            - image_b64 (str | None) — base64 PNG when no save_path was given
            - saved_to (str | None) — the path when save_path was given
            - error (str | None) — human-readable error if ok=False
    """

    def _outcome(ok, image_b64=None, saved_to=None, error=None):
        # Single place that fixes the result's key set and key order.
        return {"ok": ok, "image_b64": image_b64, "saved_to": saved_to, "error": error}

    gui = _get_pyautogui()
    call_args = {"save_path": save_path}

    if gui is None:
        outcome = _outcome(False, error="pyautogui unavailable")
        _log_action("screenshot", call_args, outcome, log_path)
        return outcome

    try:
        image = gui.screenshot()
        if not save_path:
            png = io.BytesIO()
            image.save(png, format="PNG")
            encoded = base64.b64encode(png.getvalue()).decode()
            outcome = _outcome(True, image_b64=encoded)
        else:
            image.save(save_path)
            outcome = _outcome(True, saved_to=save_path)
    except Exception as exc:
        outcome = _outcome(False, error=str(exc))

    # The image payload is kept out of the audit log; everything else is logged.
    loggable = dict(outcome)
    del loggable["image_b64"]
    _log_action("screenshot", call_args, loggable, log_path)
    return outcome
def computer_click(
    x: int,
    y: int,
    button: str = "left",
    confirm: bool = False,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Click a mouse button at screen position (x, y).

    Poka-yoke: "right" and "middle" are refused unless confirm=True. Every
    call, refused or not, is written to the audit log.

    Args:
        x: Horizontal screen coordinate.
        y: Vertical screen coordinate.
        button: "left" | "right" | "middle"
        confirm: Must be True for non-left buttons.
        log_path: Audit log file.

    Returns:
        dict with keys: ok, error
    """
    call_args = {"x": x, "y": y, "button": button, "confirm": confirm}

    def _finish(error):
        # Build the result, audit it, and hand it back in one step.
        outcome = {"ok": error is None, "error": error}
        _log_action("click", call_args, outcome, log_path)
        return outcome

    if button in _DANGEROUS_BUTTONS and not confirm:
        return _finish(
            f"button={button!r} requires confirm=True (poka-yoke). "
            "Pass confirm=True only after verifying this action is intentional."
        )

    if button not in ("left", "right", "middle"):
        return _finish(f"Unknown button {button!r}. Use 'left', 'right', or 'middle'.")

    gui = _get_pyautogui()
    if gui is None:
        return _finish("pyautogui unavailable")

    try:
        gui.click(x, y, button=button)
        failure = None
    except Exception as exc:
        failure = str(exc)
    return _finish(failure)
def computer_type(
    text: str,
    confirm: bool = False,
    interval: float = 0.02,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Send *text* to the keyboard, one keystroke at a time.

    Poka-yoke: when the lowercased text contains any sensitive keyword
    (password, token, key…) the call is refused unless confirm=True.
    The audit log records only the text's length, the sensitivity flag and
    the confirm value — never the text itself.

    Args:
        text: The string to type.
        confirm: Must be True when the text looks sensitive.
        interval: Delay between keystrokes (seconds).
        log_path: Audit log file.

    Returns:
        dict with keys: ok, error
    """
    lowered = text.lower()
    flagged = False
    for keyword in _SENSITIVE_KEYWORDS:
        if keyword in lowered:
            flagged = True
            break
    audit_params = {"length": len(text), "is_sensitive": flagged, "confirm": confirm}

    def _finish(error):
        # Build the result, audit it, and hand it back in one step.
        outcome = {"ok": error is None, "error": error}
        _log_action("type", audit_params, outcome, log_path)
        return outcome

    if flagged and not confirm:
        return _finish(
            "Text contains sensitive keyword. Pass confirm=True to proceed. "
            "Ensure no secrets are being typed into unintended windows."
        )

    gui = _get_pyautogui()
    if gui is None:
        return _finish("pyautogui unavailable")

    try:
        gui.typewrite(text, interval=interval)
        failure = None
    except Exception as exc:
        failure = str(exc)
    return _finish(failure)
def computer_scroll(
    x: int,
    y: int,
    amount: int = 3,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Turn the scroll wheel at screen position (x, y) and audit the call.

    Args:
        x: Horizontal screen coordinate.
        y: Vertical screen coordinate.
        amount: Number of scroll units. Positive = scroll up, negative = down.
        log_path: Audit log file.

    Returns:
        dict with keys: ok, error
    """
    call_args = {"x": x, "y": y, "amount": amount}
    gui = _get_pyautogui()

    if gui is None:
        failure = "pyautogui unavailable"
    else:
        try:
            gui.scroll(amount, x=x, y=y)
            failure = None
        except Exception as exc:
            failure = str(exc)

    outcome = {"ok": failure is None, "error": failure}
    _log_action("scroll", call_args, outcome, log_path)
    return outcome
def read_action_log(
    n: int = 20,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> list[dict]:
    """Return the most recent *n* action records from the audit log.

    Blank lines and lines that are not valid JSON are skipped, so a partially
    written or corrupted entry does not hide the rest of the log.

    Args:
        n: Maximum number of records to return. Zero or a negative value
            yields an empty list.
        log_path: Audit log file.

    Returns:
        List of action dicts, newest first. Empty when the log file does not
        exist.
    """
    # Guard first: records[-0:] is records[0:] (the whole log), and a negative
    # n would slice from the front instead of returning nothing.
    if n <= 0:
        return []
    if not log_path.exists():
        return []

    records: list[dict] = []
    with open(log_path) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Deliberately tolerant: ignore unreadable lines.
                continue
    return list(reversed(records[-n:]))

118
nexus/computer_use_demo.py Normal file
View File

@@ -0,0 +1,118 @@
"""
Phase 1 Demo — Desktop Automation via Hermes (#1125)
Demonstrates the computer_use primitives end-to-end:
1. Take a baseline screenshot
2. Open a browser and navigate to the Gitea forge
3. Take an evidence screenshot
Run inside a desktop session (Xvfb or real display):
python -m nexus.computer_use_demo
Or via Docker:
docker compose -f docker-compose.desktop.yml run hermes-desktop \
python -m nexus.computer_use_demo
"""
from __future__ import annotations
import logging
import sys
import time
from pathlib import Path
from nexus.computer_use import (
computer_click,
computer_screenshot,
computer_type,
read_action_log,
)
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
GITEA_URL = "https://forge.alexanderwhitestone.com"
EVIDENCE_DIR = Path.home() / ".nexus" / "computer_use_evidence"
def run_demo() -> bool:
    """Execute the Phase 1 demo. Returns True on success.

    Steps: baseline screenshot, launch a browser on GITEA_URL, type the URL
    into the address bar, evidence screenshot, then print the recent audit
    log. Only the two screenshots are mandatory; the browser and typing steps
    are best-effort and merely log a warning when they fail.

    Returns:
        True when both screenshots were captured, False otherwise.
    """
    EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
    log.info("=== Phase 1 Computer-Use Demo ===")

    # --- Step 1: baseline screenshot ---
    baseline = EVIDENCE_DIR / "01_baseline.png"
    log.info("Step 1: capturing baseline screenshot → %s", baseline)
    result = computer_screenshot(save_path=str(baseline))
    if not result["ok"]:
        log.error("Baseline screenshot failed: %s", result["error"])
        return False
    log.info(" ✓ baseline saved")

    # --- Step 2: open browser ---
    log.info("Step 2: opening browser")
    try:
        import subprocess

        # Use xdg-open / open depending on platform; fallback to chromium
        for cmd in (
            ["xdg-open", GITEA_URL],
            ["chromium-browser", "--no-sandbox", GITEA_URL],
            ["chromium", "--no-sandbox", GITEA_URL],
            ["google-chrome", "--no-sandbox", GITEA_URL],
            ["open", GITEA_URL],  # macOS
        ):
            try:
                subprocess.Popen(cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
                log.info(" ✓ browser opened with: %s", cmd[0])
                break
            except FileNotFoundError:
                # This launcher is not installed; try the next candidate.
                continue
        else:
            log.warning(" ⚠ no browser found — skipping open step")
    except Exception as exc:
        log.warning(" ⚠ could not open browser: %s", exc)

    # Give the browser time to load
    time.sleep(3)

    # --- Step 3: click address bar and navigate (best-effort) ---
    log.info("Step 3: attempting to type URL in browser address bar (best-effort)")
    try:
        import pyautogui  # type: ignore

        # Common shortcut to focus address bar
        pyautogui.hotkey("ctrl", "l")
        time.sleep(0.3)
        result_type = computer_type(GITEA_URL)
        if result_type["ok"]:
            pyautogui.press("enter")
            time.sleep(2)
            log.info(" ✓ URL typed")
        else:
            log.warning(" ⚠ type failed: %s", result_type["error"])
    except ImportError:
        log.warning(" ⚠ pyautogui not available — skipping URL type step")
    except Exception as exc:
        # Importing or driving pyautogui without a usable display raises
        # errors other than ImportError; this step is best-effort, so warn
        # and continue instead of aborting the demo.
        log.warning(" ⚠ URL type step failed: %s", exc)

    # --- Step 4: evidence screenshot ---
    evidence = EVIDENCE_DIR / "02_gitea.png"
    log.info("Step 4: capturing evidence screenshot → %s", evidence)
    result = computer_screenshot(save_path=str(evidence))
    if not result["ok"]:
        log.error("Evidence screenshot failed: %s", result["error"])
        return False
    log.info(" ✓ evidence saved")

    # --- Step 5: summary ---
    log.info("Step 5: recent action log")
    for entry in read_action_log(n=10):
        log.info(" %s %s ok=%s", entry["ts"], entry["action"], entry["result"].get("ok"))

    log.info("=== Demo complete — evidence in %s ===", EVIDENCE_DIR)
    return True
if __name__ == "__main__":
success = run_demo()
sys.exit(0 if success else 1)

View File

@@ -1,3 +1,4 @@
pytest>=7.0
pytest-asyncio>=0.21.0
pyyaml>=6.0
edge-tts>=6.1.9

362
tests/test_computer_use.py Normal file
View File

@@ -0,0 +1,362 @@
"""
Tests for nexus.computer_use — Desktop Automation Primitives (#1125)
All tests run fully headless: pyautogui is mocked throughout.
No display is required.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch, call
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent))
from nexus.computer_use import (
_DANGEROUS_BUTTONS,
_SENSITIVE_KEYWORDS,
computer_click,
computer_screenshot,
computer_scroll,
computer_type,
read_action_log,
)
# ---------------------------------------------------------------------------
# Helpers / fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def tmp_log(tmp_path):
    """Provide a JSONL audit log path inside the test's ``tmp_path`` directory."""
    log_file = tmp_path / "actions.jsonl"
    return log_file
def _last_log_entry(log_path: Path) -> dict:
    """Decode and return the final non-blank line of the JSONL file at *log_path*.

    Raises:
        IndexError: if the file holds no non-blank lines.
    """
    stripped_lines = (raw.strip() for raw in log_path.read_text().splitlines())
    non_blank = list(filter(None, stripped_lines))
    final_line = non_blank[-1]
    return json.loads(final_line)
def _make_mock_pag(screenshot_raises=None):
    """Create a ``MagicMock`` that stands in for the pyautogui module.

    Args:
        screenshot_raises: When truthy, installed as the ``side_effect`` of
            ``screenshot``. Otherwise ``screenshot()`` returns a mock image
            whose ``save`` is itself a ``MagicMock``.

    Returns:
        The configured mock, with ``FAILSAFE`` set to ``True`` and ``PAUSE``
        set to ``0.05``.
    """
    pag = MagicMock()
    pag.FAILSAFE, pag.PAUSE = True, 0.05
    if not screenshot_raises:
        image = MagicMock()
        image.save = MagicMock()
        pag.screenshot.return_value = image
        return pag
    pag.screenshot.side_effect = screenshot_raises
    return pag
# ---------------------------------------------------------------------------
# computer_screenshot
# ---------------------------------------------------------------------------
class TestComputerScreenshot:
    """Checks for ``computer_screenshot`` run against a mocked pyautogui backend."""

    @staticmethod
    def _capture(backend, **kwargs):
        """Call ``computer_screenshot`` while ``_get_pyautogui`` is patched to return *backend*."""
        with patch("nexus.computer_use._get_pyautogui", return_value=backend):
            return computer_screenshot(**kwargs)

    def test_returns_b64_when_no_save_path(self, tmp_log):
        import io

        png_stub = io.BytesIO(b"\x89PNG\r\n\x1a\n" + b"\x00" * 20)

        def write_stub(target, format=None):
            # Stand-in for the image's save(): copy the stub bytes into
            # whatever writable object it is handed.
            target.write(png_stub.getvalue())

        backend = _make_mock_pag()
        backend.screenshot.return_value.save = MagicMock(side_effect=write_stub)
        outcome = self._capture(backend, log_path=tmp_log)
        assert outcome["ok"] is True
        assert outcome["image_b64"] is not None
        assert outcome["saved_to"] is None
        assert outcome["error"] is None

    def test_saves_to_path(self, tmp_log, tmp_path):
        backend = _make_mock_pag()
        target_png = tmp_path / "shot.png"
        outcome = self._capture(backend, save_path=str(target_png), log_path=tmp_log)
        assert outcome["ok"] is True
        assert outcome["saved_to"] == str(target_png)
        assert outcome["image_b64"] is None
        backend.screenshot.return_value.save.assert_called_once_with(str(target_png))

    def test_logs_action(self, tmp_log):
        self._capture(_make_mock_pag(), log_path=tmp_log)
        logged = _last_log_entry(tmp_log)
        assert logged["action"] == "screenshot"
        assert "ts" in logged

    def test_returns_error_when_headless(self, tmp_log):
        # A None backend is how the tests model "no pyautogui available".
        outcome = self._capture(None, log_path=tmp_log)
        assert outcome["ok"] is False
        assert "unavailable" in outcome["error"]

    def test_handles_screenshot_exception(self, tmp_log):
        backend = _make_mock_pag(screenshot_raises=RuntimeError("display error"))
        outcome = self._capture(backend, log_path=tmp_log)
        assert outcome["ok"] is False
        assert "display error" in outcome["error"]

    def test_image_b64_not_written_to_log(self, tmp_log):
        """The audit log must never contain the (possibly very large) base64 payload."""
        self._capture(_make_mock_pag(), log_path=tmp_log)
        log_text = tmp_log.read_text()
        assert "image_b64" not in log_text
# ---------------------------------------------------------------------------
# computer_click
# ---------------------------------------------------------------------------
class TestComputerClick:
    """Checks for ``computer_click``: button validation, confirm gating, logging and errors."""

    @staticmethod
    def _click(backend, *args, **kwargs):
        """Call ``computer_click`` while ``_get_pyautogui`` is patched to return *backend*."""
        with patch("nexus.computer_use._get_pyautogui", return_value=backend):
            return computer_click(*args, **kwargs)

    def test_left_click_succeeds(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._click(backend, 100, 200, log_path=tmp_log)
        assert outcome["ok"] is True
        backend.click.assert_called_once_with(100, 200, button="left")

    def test_right_click_blocked_without_confirm(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._click(backend, 100, 200, button="right", log_path=tmp_log)
        assert outcome["ok"] is False
        assert "confirm=True" in outcome["error"]
        backend.click.assert_not_called()

    def test_right_click_allowed_with_confirm(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._click(
            backend, 100, 200, button="right", confirm=True, log_path=tmp_log
        )
        assert outcome["ok"] is True
        backend.click.assert_called_once_with(100, 200, button="right")

    def test_middle_click_blocked_without_confirm(self, tmp_log):
        outcome = self._click(
            _make_mock_pag(), 50, 50, button="middle", log_path=tmp_log
        )
        assert outcome["ok"] is False

    def test_middle_click_allowed_with_confirm(self, tmp_log):
        outcome = self._click(
            _make_mock_pag(), 50, 50, button="middle", confirm=True, log_path=tmp_log
        )
        assert outcome["ok"] is True

    def test_unknown_button_rejected(self, tmp_log):
        outcome = self._click(_make_mock_pag(), 0, 0, button="turbo", log_path=tmp_log)
        assert outcome["ok"] is False
        assert "Unknown button" in outcome["error"]

    def test_logs_click_action(self, tmp_log):
        self._click(_make_mock_pag(), 10, 20, log_path=tmp_log)
        logged = _last_log_entry(tmp_log)
        assert logged["action"] == "click"
        assert logged["params"]["x"] == 10
        assert logged["params"]["y"] == 20

    def test_returns_error_when_headless(self, tmp_log):
        # A None backend is how the tests model "no pyautogui available".
        outcome = self._click(None, 0, 0, log_path=tmp_log)
        assert outcome["ok"] is False

    def test_handles_click_exception(self, tmp_log):
        backend = _make_mock_pag()
        backend.click.side_effect = Exception("out of bounds")
        outcome = self._click(backend, 99999, 99999, log_path=tmp_log)
        assert outcome["ok"] is False
        assert "out of bounds" in outcome["error"]
# ---------------------------------------------------------------------------
# computer_type
# ---------------------------------------------------------------------------
class TestComputerType:
    """Checks for ``computer_type``: confirm gating, audit-log redaction and error paths."""

    @staticmethod
    def _type(backend, *args, **kwargs):
        """Call ``computer_type`` while ``_get_pyautogui`` is patched to return *backend*."""
        with patch("nexus.computer_use._get_pyautogui", return_value=backend):
            return computer_type(*args, **kwargs)

    def test_plain_text_succeeds(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._type(backend, "hello world", log_path=tmp_log)
        assert outcome["ok"] is True
        backend.typewrite.assert_called_once_with("hello world", interval=0.02)

    def test_sensitive_text_blocked_without_confirm(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._type(backend, "mypassword123", log_path=tmp_log)
        assert outcome["ok"] is False
        assert "confirm=True" in outcome["error"]
        backend.typewrite.assert_not_called()

    def test_sensitive_text_allowed_with_confirm(self, tmp_log):
        outcome = self._type(
            _make_mock_pag(), "mypassword123", confirm=True, log_path=tmp_log
        )
        assert outcome["ok"] is True

    def test_sensitive_keywords_all_blocked(self, tmp_log):
        # One shared backend; each keyword is embedded in otherwise harmless text.
        backend = _make_mock_pag()
        for keyword in _SENSITIVE_KEYWORDS:
            outcome = self._type(backend, f"my{keyword}value", log_path=tmp_log)
            assert outcome["ok"] is False, f"keyword {keyword!r} should be blocked"

    def test_text_not_logged(self, tmp_log):
        """The typed text itself must never show up in the audit log."""
        secret = "super_secret_value_xyz"
        self._type(_make_mock_pag(), secret, confirm=True, log_path=tmp_log)
        log_text = tmp_log.read_text()
        assert secret not in log_text

    def test_logs_length_not_content(self, tmp_log):
        self._type(_make_mock_pag(), "hello", log_path=tmp_log)
        logged = _last_log_entry(tmp_log)
        assert logged["params"]["length"] == 5

    def test_returns_error_when_headless(self, tmp_log):
        # A None backend is how the tests model "no pyautogui available".
        outcome = self._type(None, "abc", log_path=tmp_log)
        assert outcome["ok"] is False

    def test_handles_type_exception(self, tmp_log):
        backend = _make_mock_pag()
        backend.typewrite.side_effect = Exception("keyboard error")
        outcome = self._type(backend, "hello", log_path=tmp_log)
        assert outcome["ok"] is False
        assert "keyboard error" in outcome["error"]
# ---------------------------------------------------------------------------
# computer_scroll
# ---------------------------------------------------------------------------
class TestComputerScroll:
    """Checks for ``computer_scroll``: argument forwarding, logging and error paths."""

    @staticmethod
    def _scroll(backend, *args, **kwargs):
        """Call ``computer_scroll`` while ``_get_pyautogui`` is patched to return *backend*."""
        with patch("nexus.computer_use._get_pyautogui", return_value=backend):
            return computer_scroll(*args, **kwargs)

    def test_scroll_up(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._scroll(backend, 400, 300, amount=5, log_path=tmp_log)
        assert outcome["ok"] is True
        backend.scroll.assert_called_once_with(5, x=400, y=300)

    def test_scroll_down_negative(self, tmp_log):
        backend = _make_mock_pag()
        outcome = self._scroll(backend, 400, 300, amount=-3, log_path=tmp_log)
        assert outcome["ok"] is True
        backend.scroll.assert_called_once_with(-3, x=400, y=300)

    def test_logs_scroll_action(self, tmp_log):
        self._scroll(_make_mock_pag(), 10, 20, amount=2, log_path=tmp_log)
        logged = _last_log_entry(tmp_log)
        assert logged["action"] == "scroll"
        assert logged["params"]["amount"] == 2

    def test_returns_error_when_headless(self, tmp_log):
        # A None backend is how the tests model "no pyautogui available".
        outcome = self._scroll(None, 0, 0, log_path=tmp_log)
        assert outcome["ok"] is False

    def test_handles_scroll_exception(self, tmp_log):
        backend = _make_mock_pag()
        backend.scroll.side_effect = Exception("scroll error")
        outcome = self._scroll(backend, 0, 0, log_path=tmp_log)
        assert outcome["ok"] is False
# ---------------------------------------------------------------------------
# read_action_log
# ---------------------------------------------------------------------------
class TestReadActionLog:
    """Checks for ``read_action_log``: missing file, entry limit, ordering, bad lines."""

    @staticmethod
    def _backend_patch():
        """Return a patcher that makes ``_get_pyautogui`` yield a fresh mock backend."""
        return patch(
            "nexus.computer_use._get_pyautogui", return_value=_make_mock_pag()
        )

    def test_returns_empty_list_when_no_log(self, tmp_path):
        absent = tmp_path / "nonexistent.jsonl"
        assert read_action_log(log_path=absent) == []

    def test_returns_recent_entries(self, tmp_log):
        # Record three clicks, then request only the two most recent entries.
        with self._backend_patch():
            for coord in (1, 2, 3):
                computer_click(coord, coord, log_path=tmp_log)
        recent = read_action_log(n=2, log_path=tmp_log)
        assert len(recent) == 2

    def test_newest_first(self, tmp_log):
        with self._backend_patch():
            computer_click(1, 1, log_path=tmp_log)
            computer_scroll(5, 5, log_path=tmp_log)
        recent = read_action_log(log_path=tmp_log)
        # The scroll was recorded last, so it must come back first.
        newest, previous = recent[0], recent[1]
        assert newest["action"] == "scroll"
        assert previous["action"] == "click"

    def test_skips_malformed_lines(self, tmp_log):
        # One valid JSON line followed by one line that is not JSON at all.
        tmp_log.parent.mkdir(parents=True, exist_ok=True)
        tmp_log.write_text('{"action": "click", "ts": "2026-01-01", "params": {}, "result": {}}\nNOT JSON\n')
        parsed = read_action_log(log_path=tmp_log)
        assert len(parsed) == 1

420
tests/test_edge_tts.py Normal file
View File

@@ -0,0 +1,420 @@
"""Tests for the edge-tts voice provider integration.
Issue: #1126 — edge-tts voice provider
"""
from __future__ import annotations
import asyncio
import sys
import types
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
# ---------------------------------------------------------------------------
# Helpers — build a minimal fake edge_tts module so tests don't need the
# real package installed.
# ---------------------------------------------------------------------------
def _make_fake_edge_tts():
    """Build a stand-in ``edge_tts`` module exposing a stub ``Communicate`` class.

    Returns:
        A ``types.ModuleType`` named ``"edge_tts"`` whose ``Communicate``
        records its arguments and whose async ``save`` writes a few
        placeholder bytes to the requested path.
    """

    class FakeCommunicate:
        def __init__(self, text, voice):
            self.text, self.voice = text, voice

        async def save(self, path: str):
            # Placeholder content is enough for callers that only check the file exists.
            target = Path(path)
            target.write_bytes(b"FAKE_MP3")

    stub_module = types.ModuleType("edge_tts")
    setattr(stub_module, "Communicate", FakeCommunicate)
    return stub_module
# ---------------------------------------------------------------------------
# Tests for EdgeTTSAdapter (bin/deepdive_tts.py)
# ---------------------------------------------------------------------------
class TestEdgeTTSAdapter:
    """Tests for EdgeTTSAdapter in bin/deepdive_tts.py.

    Tests that replace ``sys.modules["edge_tts"]`` do so through pytest's
    ``monkeypatch`` fixture, so the entry is restored at teardown and a fake
    module cannot leak into unrelated tests.
    """

    @staticmethod
    def _load_module(monkeypatch=None, fake_edge_tts=None):
        """Reload ``bin.deepdive_tts`` and return the module object.

        Args:
            monkeypatch: pytest ``monkeypatch`` fixture. When given, the fake
                module is registered through it and undone at teardown; when
                ``None`` the entry is written to ``sys.modules`` directly and
                stays there.
            fake_edge_tts: Optional module object to register as ``edge_tts``
                before the reload.
        """
        import importlib

        if fake_edge_tts is not None:
            if monkeypatch is not None:
                monkeypatch.setitem(sys.modules, "edge_tts", fake_edge_tts)
            else:
                sys.modules["edge_tts"] = fake_edge_tts
        import bin.deepdive_tts as mod

        # Reload so the module is re-executed against the current sys.modules.
        return importlib.reload(mod)

    def _import_adapter(self, fake_edge_tts=None, monkeypatch=None):
        """Return ``(EdgeTTSAdapter, TTSConfig)`` from a freshly reloaded module."""
        mod = self._load_module(monkeypatch, fake_edge_tts)
        return mod.EdgeTTSAdapter, mod.TTSConfig

    @staticmethod
    def _config(mod, output_dir, voice_id=""):
        """Build an edge-tts ``TTSConfig`` writing into *output_dir*."""
        return mod.TTSConfig(
            provider="edge-tts",
            voice_id=voice_id,
            output_dir=output_dir,
        )

    def test_default_voice(self, tmp_path, monkeypatch):
        """EdgeTTSAdapter falls back to DEFAULT_VOICE when voice_id is empty."""
        mod = self._load_module(monkeypatch, _make_fake_edge_tts())
        adapter = mod.EdgeTTSAdapter(self._config(mod, tmp_path))
        assert adapter.voice == mod.EdgeTTSAdapter.DEFAULT_VOICE

    def test_custom_voice(self, tmp_path, monkeypatch):
        """EdgeTTSAdapter respects explicit voice_id."""
        mod = self._load_module(monkeypatch, _make_fake_edge_tts())
        adapter = mod.EdgeTTSAdapter(
            self._config(mod, tmp_path, voice_id="en-US-JennyNeural")
        )
        assert adapter.voice == "en-US-JennyNeural"

    def test_synthesize_returns_mp3(self, tmp_path, monkeypatch):
        """synthesize() returns a .mp3 path and creates the file."""
        mod = self._load_module(monkeypatch, _make_fake_edge_tts())
        adapter = mod.EdgeTTSAdapter(self._config(mod, tmp_path))
        result = adapter.synthesize("Hello world", tmp_path / "test_output")
        assert result.suffix == ".mp3"
        assert result.exists()

    def test_synthesize_passes_text_and_voice(self, tmp_path, monkeypatch):
        """synthesize() passes the correct text and voice to Communicate."""
        communicate_calls = []

        class TrackingCommunicate:
            def __init__(self, text, voice):
                communicate_calls.append((text, voice))

            async def save(self, path):
                Path(path).write_bytes(b"FAKE")

        fake = _make_fake_edge_tts()
        fake.Communicate = TrackingCommunicate
        mod = self._load_module(monkeypatch, fake)
        adapter = mod.EdgeTTSAdapter(
            self._config(mod, tmp_path, voice_id="en-GB-RyanNeural")
        )
        adapter.synthesize("Test sentence.", tmp_path / "out")
        assert len(communicate_calls) == 1
        assert communicate_calls[0] == ("Test sentence.", "en-GB-RyanNeural")

    def test_missing_package_raises(self, tmp_path, monkeypatch):
        """synthesize() raises RuntimeError or ImportError when edge-tts is unavailable."""
        # Drop any cached edge_tts for the reload; monkeypatch restores it afterwards.
        monkeypatch.delitem(sys.modules, "edge_tts", raising=False)
        mod = self._load_module()
        adapter = mod.EdgeTTSAdapter(self._config(mod, tmp_path))
        # A None entry in sys.modules makes ``import edge_tts`` raise ImportError.
        with patch.dict(sys.modules, {"edge_tts": None}):
            with pytest.raises((RuntimeError, ImportError)):
                adapter.synthesize("Hello", tmp_path / "out")

    def test_adapters_dict_includes_edge_tts(self):
        """ADAPTERS dict contains the edge-tts key."""
        mod = self._load_module()
        assert "edge-tts" in mod.ADAPTERS
        assert mod.ADAPTERS["edge-tts"] is mod.EdgeTTSAdapter

    def test_get_provider_config_edge_tts_default_voice(self, monkeypatch):
        """get_provider_config() returns GuyNeural as default for edge-tts."""
        monkeypatch.setenv("DEEPDIVE_TTS_PROVIDER", "edge-tts")
        monkeypatch.delenv("DEEPDIVE_TTS_VOICE", raising=False)
        mod = self._load_module()
        config = mod.get_provider_config()
        assert config.provider == "edge-tts"
        assert config.voice_id == "en-US-GuyNeural"
# ---------------------------------------------------------------------------
# Tests for EdgeTTS class (intelligence/deepdive/tts_engine.py)
# ---------------------------------------------------------------------------
class TestEdgeTTSEngine:
    """Tests for EdgeTTS class in intelligence/deepdive/tts_engine.py."""

    def _import_engine(self, fake_edge_tts=None, monkeypatch=None):
        """Reload ``intelligence.deepdive.tts_engine`` and return the module.

        Args:
            fake_edge_tts: Optional module object to register as ``edge_tts``
                before the reload.
            monkeypatch: pytest ``monkeypatch`` fixture. When given, every
                ``sys.modules`` entry added here is undone at teardown; when
                ``None`` entries are written directly and stay in place.
        """
        import importlib

        def register(name, module):
            if monkeypatch is not None:
                monkeypatch.setitem(sys.modules, name, module)
            else:
                sys.modules[name] = module

        if fake_edge_tts is not None:
            register("edge_tts", fake_edge_tts)
        # tts_engine imports requests; stub it only when it cannot be imported.
        # Checking ``sys.modules`` alone would also replace an installed but
        # not-yet-imported ``requests`` with a mock.
        try:
            import requests  # noqa: F401
        except ImportError:
            register("requests", MagicMock())
        import intelligence.deepdive.tts_engine as eng

        return importlib.reload(eng)

    def test_default_voice(self, monkeypatch):
        """EdgeTTS falls back to DEFAULT_VOICE when no voice is given."""
        eng = self._import_engine(_make_fake_edge_tts(), monkeypatch)
        tts = eng.EdgeTTS()
        assert tts.voice == eng.EdgeTTS.DEFAULT_VOICE

    def test_custom_voice(self, monkeypatch):
        """EdgeTTS respects explicit voice argument."""
        eng = self._import_engine(_make_fake_edge_tts(), monkeypatch)
        tts = eng.EdgeTTS(voice="en-US-AriaNeural")
        assert tts.voice == "en-US-AriaNeural"

    def test_synthesize_creates_mp3(self, tmp_path, monkeypatch):
        """EdgeTTS.synthesize() writes an MP3 file and returns the path."""
        eng = self._import_engine(_make_fake_edge_tts(), monkeypatch)
        tts = eng.EdgeTTS()
        out = str(tmp_path / "output.mp3")
        result = tts.synthesize("Hello from engine.", out)
        assert result.endswith(".mp3")
        assert Path(result).exists()
# ---------------------------------------------------------------------------
# Tests for HybridTTS fallback to edge-tts
# ---------------------------------------------------------------------------
class TestHybridTTSFallback:
    """Tests for HybridTTS falling back to EdgeTTS when Piper fails."""

    def _import_engine(self, fake_edge_tts=None, monkeypatch=None):
        """Reload ``intelligence.deepdive.tts_engine`` and return the module.

        Args:
            fake_edge_tts: Optional module object to register as ``edge_tts``
                before the reload.
            monkeypatch: pytest ``monkeypatch`` fixture. When given, every
                ``sys.modules`` entry added here is undone at teardown; when
                ``None`` entries are written directly and stay in place.
        """
        import importlib

        def register(name, module):
            if monkeypatch is not None:
                monkeypatch.setitem(sys.modules, name, module)
            else:
                sys.modules[name] = module

        if fake_edge_tts is not None:
            register("edge_tts", fake_edge_tts)
        # Stub requests only when it cannot be imported. A bare
        # ``"requests" not in sys.modules`` check would also replace an
        # installed but not-yet-imported package with a mock.
        try:
            import requests  # noqa: F401
        except ImportError:
            register("requests", MagicMock())
        import intelligence.deepdive.tts_engine as eng

        return importlib.reload(eng)

    def test_hybrid_falls_back_to_edge_tts_when_piper_fails(self, tmp_path, monkeypatch):
        """HybridTTS uses EdgeTTS when PiperTTS init fails."""
        eng = self._import_engine(_make_fake_edge_tts(), monkeypatch)
        # Make PiperTTS always raise on init.
        with patch.object(eng, "PiperTTS", side_effect=RuntimeError("no piper model")):
            hybrid = eng.HybridTTS(prefer_cloud=False)
            # primary should be an EdgeTTS instance.
            assert isinstance(hybrid.primary, eng.EdgeTTS)

    def test_hybrid_synthesize_via_edge_tts(self, tmp_path, monkeypatch):
        """HybridTTS.synthesize() succeeds via EdgeTTS fallback."""
        eng = self._import_engine(_make_fake_edge_tts(), monkeypatch)
        with patch.object(eng, "PiperTTS", side_effect=RuntimeError("no piper")):
            hybrid = eng.HybridTTS(prefer_cloud=False)
            out = str(tmp_path / "hybrid_out.mp3")
            result = hybrid.synthesize("Hybrid test.", out)
            assert Path(result).exists()

    def test_hybrid_raises_when_no_engine_available(self, tmp_path, monkeypatch):
        """HybridTTS raises RuntimeError when all engines fail."""
        eng = self._import_engine(_make_fake_edge_tts(), monkeypatch)
        # Every engine constructor fails, so no primary engine can be chosen.
        with patch.object(eng, "PiperTTS", side_effect=RuntimeError("piper gone")), \
                patch.object(eng, "EdgeTTS", side_effect=RuntimeError("edge gone")), \
                patch.object(eng, "ElevenLabsTTS", side_effect=ValueError("no key")):
            hybrid = eng.HybridTTS(prefer_cloud=False)
            assert hybrid.primary is None
            with pytest.raises(RuntimeError, match="No TTS engine available"):
                hybrid.synthesize("Text", str(tmp_path / "out.mp3"))
# ---------------------------------------------------------------------------
# Tests for night_watch.py --voice-memo flag
# ---------------------------------------------------------------------------
class TestNightWatchVoiceMemo:
    """Tests for _generate_voice_memo and --voice-memo CLI flag.

    ``sys.modules`` changes go through pytest's ``monkeypatch`` fixture so a
    fake ``edge_tts`` never outlives the test that installed it.
    """

    def _import_night_watch(self, fake_edge_tts=None, monkeypatch=None):
        """Reload ``bin.night_watch`` and return the module.

        Args:
            fake_edge_tts: Optional module object to register as ``edge_tts``
                before the reload.
            monkeypatch: pytest ``monkeypatch`` fixture. When given, the
                registration is undone at teardown; when ``None`` the entry
                is written to ``sys.modules`` directly and stays in place.
        """
        import importlib

        if fake_edge_tts is not None:
            if monkeypatch is not None:
                monkeypatch.setitem(sys.modules, "edge_tts", fake_edge_tts)
            else:
                sys.modules["edge_tts"] = fake_edge_tts
        import bin.night_watch as nw

        return importlib.reload(nw)

    def test_generate_voice_memo_returns_path(self, tmp_path, monkeypatch):
        """_generate_voice_memo() returns the mp3 path on success."""
        self._import_night_watch(_make_fake_edge_tts(), monkeypatch)
        # NOTE(review): the function under test is replaced by a mock here, so
        # this only checks the mock's configured return value. The real
        # implementation should be called once its output directory can be
        # redirected away from /tmp.
        with patch("bin.night_watch._generate_voice_memo") as mock_gen:
            mock_gen.return_value = str(tmp_path / "night-watch-2026-04-08.mp3")
            result = mock_gen("# Report\n\nAll OK.", "2026-04-08")
        assert result is not None
        assert "2026-04-08" in result

    def test_generate_voice_memo_returns_none_when_edge_tts_missing(self, monkeypatch):
        """_generate_voice_memo() returns None when edge-tts is not installed."""
        # Drop any cached edge_tts for the reload; monkeypatch restores it afterwards.
        monkeypatch.delitem(sys.modules, "edge_tts", raising=False)
        nw = self._import_night_watch()
        # A None entry in sys.modules makes ``import edge_tts`` raise ImportError.
        with patch.dict(sys.modules, {"edge_tts": None}):
            result = nw._generate_voice_memo("Some report text.", "2026-04-08")
        assert result is None

    def test_generate_voice_memo_strips_markdown(self, tmp_path, monkeypatch):
        """Markdown headers, pipes and asterisks are stripped before synthesis."""
        import re

        communicate_calls = []

        class TrackingCommunicate:
            def __init__(self, text, voice):
                communicate_calls.append(text)

            async def save(self, path):
                Path(path).write_bytes(b"FAKE")

        fake = types.ModuleType("edge_tts")
        fake.Communicate = TrackingCommunicate
        # The reload still verifies that bin.night_watch imports cleanly with the fake.
        self._import_night_watch(fake, monkeypatch)

        report = "# Bezalel Night Watch\n\n| Check | Status |\n|---|---|\n| Disk | OK |\n\n**Overall:** OK"

        # NOTE(review): this is a local re-implementation of the stripping
        # logic that writes into tmp_path; it does not call
        # bin.night_watch._generate_voice_memo. Switch to the real function
        # once its output directory can be redirected.
        def reference_voice_memo(report_text, date_str):
            try:
                import edge_tts as et
            except ImportError:
                return None
            clean = report_text
            for pattern, replacement in (
                (r"#+\s*", ""),
                (r"\|", " "),
                (r"\*+", ""),
                (r"-{3,}", ""),
                (r"\s{2,}", " "),
            ):
                clean = re.sub(pattern, replacement, clean)
            mp3 = tmp_path / f"night-watch-{date_str}.mp3"

            async def _run():
                c = et.Communicate(clean.strip(), "en-US-GuyNeural")
                await c.save(str(mp3))

            asyncio.run(_run())
            return str(mp3)

        result = reference_voice_memo(report, "2026-04-08")
        assert result is not None
        assert len(communicate_calls) == 1
        spoken = communicate_calls[0]
        # Markdown headers, pipes, and asterisks should be stripped.
        assert "#" not in spoken
        assert "|" not in spoken
        assert "**" not in spoken

    def test_voice_memo_flag_in_parser(self):
        """A ``--voice-memo`` store_true flag parses to True when given, False otherwise."""
        import argparse

        # Reloading only confirms that bin.night_watch imports cleanly.
        self._import_night_watch()
        # NOTE(review): the parser below is built locally, so this checks
        # argparse semantics and not the parser night_watch itself registers.
        parser = argparse.ArgumentParser()
        parser.add_argument("--voice-memo", action="store_true")
        args = parser.parse_args(["--voice-memo"])
        assert args.voice_memo is True
        args_no_flag = parser.parse_args([])
        assert args_no_flag.voice_memo is False