Implements Phase 1 and Phase 2 tooling from issue #1125:

- nexus/computer_use.py: four Hermes tools with poka-yoke safety, plus an audit-log reader
  * computer_screenshot() — capture & base64-encode desktop snapshot
  * computer_click(x, y, button, confirm) — right/middle require confirm=True
  * computer_type(text, confirm) — sensitive keywords blocked without confirm=True; text value is never written to audit log
  * computer_scroll(x, y, amount) — scroll wheel
  * read_action_log() — inspect recent JSONL audit entries
  * pyautogui.FAILSAFE=True; all tools degrade gracefully when headless
- nexus/computer_use_demo.py: Phase 1 demo (baseline screenshot → open browser → navigate to Gitea forge → evidence screenshot)
- tests/test_computer_use.py: 32 unit tests, fully headless (pyautogui mocked), all passing
- docs/computer-use.md: API reference, safety table, phase roadmap, pilot recipes
- docker-compose.desktop.yml: sandboxed Xvfb + noVNC container

Fixes #1125

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
314 lines
9.5 KiB
Python
314 lines
9.5 KiB
Python
"""
|
|
Hermes Desktop Automation Primitives — Computer Use (#1125)
|
|
|
|
Provides sandboxed desktop control tools for Hermes agents:
|
|
- computer_screenshot() — capture current desktop
|
|
- computer_click() — mouse click with poka-yoke on non-primary buttons
|
|
- computer_type() — keyboard input with poka-yoke on sensitive text
|
|
- computer_scroll() — scroll wheel action
|
|
- read_action_log() — inspect recent action audit trail
|
|
|
|
All actions are logged to a JSONL audit file.
|
|
pyautogui.FAILSAFE is enabled globally — move mouse to top-left corner to abort.
|
|
|
|
Designed to degrade gracefully when no display is available (headless CI).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import io
|
|
import json
|
|
import logging
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Safety globals
# ---------------------------------------------------------------------------

# Poka-yoke: substrings that mark typed text as sensitive (matched against
# the lower-cased text by computer_type, which then demands confirm=True).
_SENSITIVE_KEYWORDS = frozenset(
    {
        "password",
        "passwd",
        "secret",
        "token",
        "api_key",
        "apikey",
        "key",
        "auth",
    }
)

# Non-primary mouse buttons; computer_click refuses them without confirm=True.
_DANGEROUS_BUTTONS = frozenset({"right", "middle"})

# Default location of the JSONL audit log.
DEFAULT_ACTION_LOG = Path.home().joinpath(".nexus", "computer_use_actions.jsonl")

# ---------------------------------------------------------------------------
# Lazy pyautogui import — fails gracefully in headless environments
# ---------------------------------------------------------------------------

# Module-level cache populated by _get_pyautogui() on its first successful import.
_pyautogui = None
_PYAUTOGUI_AVAILABLE = False
|
|
|
|
|
|
def _get_pyautogui():
    """Lazily import pyautogui and return the cached module.

    On the first successful import the module is configured with
    FAILSAFE=True and a 0.05 s PAUSE between actions, stored in the
    module-level cache and reused by later calls.  A failed import is not
    cached, so the next call tries again.

    Returns:
        The pyautogui module, or None when importing/configuring it raised.
    """
    global _pyautogui, _PYAUTOGUI_AVAILABLE

    if _pyautogui is None:
        try:
            import pyautogui as gui  # type: ignore

            gui.FAILSAFE = True
            gui.PAUSE = 0.05  # small delay between actions
        except Exception:
            # Anything raised while importing counts as "no usable display";
            # callers fall back to their stub result.
            logger.warning("pyautogui unavailable — computer_use running in stub mode")
            return None
        _pyautogui = gui
        _PYAUTOGUI_AVAILABLE = True

    return _pyautogui
|
|
|
|
|
|
def _get_pil():
    """Import Pillow on demand.

    Returns:
        The ``PIL.Image`` module, or None when the import raises ImportError.
    """
    try:
        from PIL import Image as pil_image  # type: ignore
    except ImportError:
        return None
    return pil_image
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Audit log
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _log_action(action: str, params: dict, result: dict, log_path: Path = DEFAULT_ACTION_LOG):
    """Append one action record to the JSONL audit log.

    Each record is a single JSON object on its own line with the keys
    ``ts``, ``action``, ``params`` and ``result``.

    Args:
        action: Short action name stored under the "action" key.
        params: Call parameters to record.  They are written as given, so
            callers must leave out anything that should not be persisted.
        result: Outcome dict to record.
        log_path: Audit log file.  A plain string path is accepted as well;
            missing parent directories are created.

    Raises:
        OSError: If the directory or the log file cannot be created/written.
    """
    log_file = Path(log_path)
    log_file.parent.mkdir(parents=True, exist_ok=True)

    record = {
        # Local wall-clock time with one-second resolution (no UTC offset).
        "ts": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "action": action,
        "params": params,
        "result": result,
    }

    # default=str is deliberate: this runs after the desktop action has
    # already happened, so a value json cannot encode natively (e.g. a Path)
    # must not abort the write and lose the audit record.
    line = json.dumps(record, default=str)

    with open(log_file, "a", encoding="utf-8") as fh:
        fh.write(line + "\n")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public tool API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def computer_screenshot(
    save_path: Optional[str] = None,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Capture a screenshot of the current desktop.

    Args:
        save_path: Optional file path to save the PNG.  Path-like objects
            (e.g. ``pathlib.Path``) are accepted and converted to ``str``.
            If omitted (or empty) the image is returned as a base64-encoded
            string instead.
        log_path: Audit log file (default ~/.nexus/computer_use_actions.jsonl).

    Returns:
        dict with keys:
            - ok (bool)
            - image_b64 (str | None) — base64 PNG when save_path is None
            - saved_to (str | None) — path when save_path was given
            - error (str | None) — human-readable error if ok=False
    """
    # Normalise path-like objects to str up front: the path is written to the
    # JSON audit log and returned in "saved_to", and a raw Path object is not
    # JSON-serializable.
    if isinstance(save_path, os.PathLike):
        save_path = os.fsdecode(save_path)
    params = {"save_path": save_path}

    pag = _get_pyautogui()
    if pag is None:
        result = {"ok": False, "image_b64": None, "saved_to": None, "error": "pyautogui unavailable"}
        _log_action("screenshot", params, result, log_path)
        return result

    try:
        screenshot = pag.screenshot()
        if save_path:
            screenshot.save(save_path)
            result = {"ok": True, "image_b64": None, "saved_to": save_path, "error": None}
        else:
            buf = io.BytesIO()
            screenshot.save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode()
            result = {"ok": True, "image_b64": b64, "saved_to": None, "error": None}
    except Exception as exc:
        # Tool boundary: report the failure to the caller instead of raising.
        result = {"ok": False, "image_b64": None, "saved_to": None, "error": str(exc)}

    # The image payload is deliberately kept out of the audit log.
    logged = {key: value for key, value in result.items() if key != "image_b64"}
    _log_action("screenshot", params, logged, log_path)
    return result
|
|
|
|
|
|
def computer_click(
    x: int,
    y: int,
    button: str = "left",
    confirm: bool = False,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Perform a mouse click at the screen position (x, y).

    Safety interlock (poka-yoke): the "right" and "middle" buttons are
    refused unless the caller explicitly passes confirm=True.  Every call,
    including refused ones, is appended to the audit log.

    Args:
        x: Horizontal screen coordinate.
        y: Vertical screen coordinate.
        button: One of "left", "right" or "middle".
        confirm: Explicit acknowledgement, required for non-left buttons.
        log_path: Audit log file.

    Returns:
        dict with keys ``ok`` (bool) and ``error`` (str | None).
    """
    params = {"x": x, "y": y, "button": button, "confirm": confirm}

    def _finish(outcome: dict) -> dict:
        # Single exit path: audit the outcome, then hand it to the caller.
        _log_action("click", params, outcome, log_path)
        return outcome

    if button in _DANGEROUS_BUTTONS and not confirm:
        message = (
            f"button={button!r} requires confirm=True (poka-yoke). "
            "Pass confirm=True only after verifying this action is intentional."
        )
        return _finish({"ok": False, "error": message})

    if button not in ("left", "right", "middle"):
        return _finish(
            {"ok": False, "error": f"Unknown button {button!r}. Use 'left', 'right', or 'middle'."}
        )

    gui = _get_pyautogui()
    if gui is None:
        return _finish({"ok": False, "error": "pyautogui unavailable"})

    try:
        gui.click(x, y, button=button)
    except Exception as exc:
        outcome = {"ok": False, "error": str(exc)}
    else:
        outcome = {"ok": True, "error": None}
    return _finish(outcome)
|
|
|
|
|
|
def computer_type(
    text: str,
    confirm: bool = False,
    interval: float = 0.02,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Send *text* to the desktop as keyboard input.

    Safety interlock (poka-yoke): when the lower-cased text contains any of
    the sensitive keywords (password, token, key…) the call is refused
    unless confirm=True.  The audit parameters hold only the text length,
    the sensitivity flag and the confirm flag — never the text itself.

    Args:
        text: The string to type.
        confirm: Explicit acknowledgement, required when the text looks sensitive.
        interval: Delay between keystrokes (seconds).
        log_path: Audit log file.

    Returns:
        dict with keys ``ok`` (bool) and ``error`` (str | None).
    """
    lowered = text.lower()
    is_sensitive = any(keyword in lowered for keyword in _SENSITIVE_KEYWORDS)
    params = {"length": len(text), "is_sensitive": is_sensitive, "confirm": confirm}

    def _finish(outcome: dict) -> dict:
        # Single exit path: audit the outcome, then hand it to the caller.
        _log_action("type", params, outcome, log_path)
        return outcome

    if is_sensitive and not confirm:
        message = (
            "Text contains sensitive keyword. Pass confirm=True to proceed. "
            "Ensure no secrets are being typed into unintended windows."
        )
        return _finish({"ok": False, "error": message})

    gui = _get_pyautogui()
    if gui is None:
        return _finish({"ok": False, "error": "pyautogui unavailable"})

    try:
        gui.typewrite(text, interval=interval)
    except Exception as exc:
        outcome = {"ok": False, "error": str(exc)}
    else:
        outcome = {"ok": True, "error": None}
    return _finish(outcome)
|
|
|
|
|
|
def computer_scroll(
    x: int,
    y: int,
    amount: int = 3,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Turn the mouse wheel while pointing at screen position (x, y).

    Args:
        x: Horizontal screen coordinate.
        y: Vertical screen coordinate.
        amount: Number of scroll units. Positive = scroll up, negative = down.
        log_path: Audit log file.

    Returns:
        dict with keys ``ok`` (bool) and ``error`` (str | None).
    """
    params = {"x": x, "y": y, "amount": amount}

    def _finish(outcome: dict) -> dict:
        # Single exit path: audit the outcome, then hand it to the caller.
        _log_action("scroll", params, outcome, log_path)
        return outcome

    gui = _get_pyautogui()
    if gui is None:
        return _finish({"ok": False, "error": "pyautogui unavailable"})

    try:
        gui.scroll(amount, x=x, y=y)
    except Exception as exc:
        outcome = {"ok": False, "error": str(exc)}
    else:
        outcome = {"ok": True, "error": None}
    return _finish(outcome)
|
|
|
|
|
|
def read_action_log(
    n: int = 20,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> list[dict]:
    """Return the most recent *n* action records from the audit log.

    Blank lines and lines that are not valid JSON are skipped.

    Args:
        n: Maximum number of records to return.  Zero or a negative value
            yields an empty list.
        log_path: Audit log file.

    Returns:
        List of action dicts, newest first.  Empty when the log file does
        not exist.
    """
    from collections import deque

    # Guard explicitly: with a plain slice, records[-0:] would be the whole
    # log and a negative n would drop records from the wrong end.
    if n <= 0:
        return []

    # A bounded deque keeps only the last n parsed records, so memory use
    # does not grow with the size of the log file.
    tail: deque = deque(maxlen=n)
    try:
        with open(log_path, encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    tail.append(json.loads(raw))
                except json.JSONDecodeError:
                    # Tolerate a corrupt or partially written line.
                    continue
    except (FileNotFoundError, NotADirectoryError):
        # No log yet — same result as an empty log.
        return []

    return list(reversed(tail))
|