Files
the-nexus/nexus/computer_use.py
Alexander Whitestone 220f20c794
Some checks failed
CI / test (pull_request) Failing after 8s
CI / validate (pull_request) Failing after 10s
Review Approval Gate / verify-review (pull_request) Failing after 2s
feat: add desktop automation primitives to Hermes (#1125)
Implements Phase 1 and Phase 2 tooling from issue #1125:

- nexus/computer_use.py: four Hermes tools with poka-yoke safety
    * computer_screenshot() — capture & base64-encode desktop snapshot
    * computer_click(x, y, button, confirm) — right/middle require confirm=True
    * computer_type(text, confirm) — sensitive keywords blocked without confirm=True;
      text value is never written to audit log
    * computer_scroll(x, y, amount) — scroll wheel
    * read_action_log() — inspect recent JSONL audit entries
    * pyautogui.FAILSAFE=True; all tools degrade gracefully when headless

- nexus/computer_use_demo.py: Phase 1 demo (baseline screenshot →
  open browser → navigate to Gitea forge → evidence screenshot)

- tests/test_computer_use.py: 32 unit tests, fully headless
  (pyautogui mocked), all passing

- docs/computer-use.md: API reference, safety table, phase roadmap,
  pilot recipes

- docker-compose.desktop.yml: sandboxed Xvfb + noVNC container

Fixes #1125

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 05:45:27 -04:00

314 lines
9.5 KiB
Python

"""
Hermes Desktop Automation Primitives — Computer Use (#1125)
Provides sandboxed desktop control tools for Hermes agents:
- computer_screenshot() — capture current desktop
- computer_click() — mouse click with poka-yoke on non-primary buttons
- computer_type() — keyboard input with poka-yoke on sensitive text
- computer_scroll() — scroll wheel action
- read_action_log() — inspect recent action audit trail
All actions are logged to a JSONL audit file.
pyautogui.FAILSAFE is enabled globally — move mouse to top-left corner to abort.
Designed to degrade gracefully when no display is available (headless CI).
"""
from __future__ import annotations
import base64
import io
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Safety globals
# ---------------------------------------------------------------------------
# Poka-yoke: require confirmation for dangerous inputs
_SENSITIVE_KEYWORDS = frozenset(
["password", "passwd", "secret", "token", "api_key", "apikey", "key", "auth"]
)
# Destructive mouse buttons (non-primary)
_DANGEROUS_BUTTONS = frozenset(["right", "middle"])
# Default log location
DEFAULT_ACTION_LOG = Path.home() / ".nexus" / "computer_use_actions.jsonl"
# ---------------------------------------------------------------------------
# Lazy pyautogui import — fails gracefully in headless environments
# ---------------------------------------------------------------------------
_PYAUTOGUI_AVAILABLE = False
_pyautogui = None
def _get_pyautogui():
"""Return pyautogui, enabling FAILSAFE. Returns None if unavailable."""
global _pyautogui, _PYAUTOGUI_AVAILABLE
if _pyautogui is not None:
return _pyautogui
try:
import pyautogui # type: ignore
pyautogui.FAILSAFE = True
pyautogui.PAUSE = 0.05 # small delay between actions
_pyautogui = pyautogui
_PYAUTOGUI_AVAILABLE = True
return _pyautogui
except Exception:
logger.warning("pyautogui unavailable — computer_use running in stub mode")
return None
def _get_pil():
"""Return PIL Image module or None."""
try:
from PIL import Image # type: ignore
return Image
except ImportError:
return None
# ---------------------------------------------------------------------------
# Audit log
# ---------------------------------------------------------------------------
def _log_action(action: str, params: dict, result: dict, log_path: Path = DEFAULT_ACTION_LOG):
"""Append one action record to the JSONL audit log."""
log_path.parent.mkdir(parents=True, exist_ok=True)
record = {
"ts": time.strftime("%Y-%m-%dT%H:%M:%S"),
"action": action,
"params": params,
"result": result,
}
with open(log_path, "a") as fh:
fh.write(json.dumps(record) + "\n")
# ---------------------------------------------------------------------------
# Public tool API
# ---------------------------------------------------------------------------
def computer_screenshot(
save_path: Optional[str] = None,
log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
"""Capture a screenshot of the current desktop.
Args:
save_path: Optional file path to save the PNG. If omitted the image
is returned as a base64-encoded string.
log_path: Audit log file (default ~/.nexus/computer_use_actions.jsonl).
Returns:
dict with keys:
- ok (bool)
- image_b64 (str | None) — base64 PNG when save_path is None
- saved_to (str | None) — path when save_path was given
- error (str | None) — human-readable error if ok=False
"""
pag = _get_pyautogui()
params = {"save_path": save_path}
if pag is None:
result = {"ok": False, "image_b64": None, "saved_to": None, "error": "pyautogui unavailable"}
_log_action("screenshot", params, result, log_path)
return result
try:
screenshot = pag.screenshot()
if save_path:
screenshot.save(save_path)
result = {"ok": True, "image_b64": None, "saved_to": save_path, "error": None}
else:
buf = io.BytesIO()
screenshot.save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode()
result = {"ok": True, "image_b64": b64, "saved_to": None, "error": None}
except Exception as exc:
result = {"ok": False, "image_b64": None, "saved_to": None, "error": str(exc)}
_log_action("screenshot", params, {k: v for k, v in result.items() if k != "image_b64"}, log_path)
return result
def computer_click(
x: int,
y: int,
button: str = "left",
confirm: bool = False,
log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
"""Click the mouse at screen coordinates (x, y).
Poka-yoke: right/middle clicks require confirm=True.
Args:
x: Horizontal screen coordinate.
y: Vertical screen coordinate.
button: "left" | "right" | "middle"
confirm: Must be True for non-left buttons.
log_path: Audit log file.
Returns:
dict with keys: ok, error
"""
params = {"x": x, "y": y, "button": button, "confirm": confirm}
if button in _DANGEROUS_BUTTONS and not confirm:
result = {
"ok": False,
"error": (
f"button={button!r} requires confirm=True (poka-yoke). "
"Pass confirm=True only after verifying this action is intentional."
),
}
_log_action("click", params, result, log_path)
return result
if button not in ("left", "right", "middle"):
result = {"ok": False, "error": f"Unknown button {button!r}. Use 'left', 'right', or 'middle'."}
_log_action("click", params, result, log_path)
return result
pag = _get_pyautogui()
if pag is None:
result = {"ok": False, "error": "pyautogui unavailable"}
_log_action("click", params, result, log_path)
return result
try:
pag.click(x, y, button=button)
result = {"ok": True, "error": None}
except Exception as exc:
result = {"ok": False, "error": str(exc)}
_log_action("click", params, result, log_path)
return result
def computer_type(
text: str,
confirm: bool = False,
interval: float = 0.02,
log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
"""Type text using the keyboard.
Poka-yoke: if *text* contains a sensitive keyword (password, token, key…)
confirm=True is required. The actual text value is never written to the
audit log.
Args:
text: The string to type.
confirm: Must be True when the text looks sensitive.
interval: Delay between keystrokes (seconds).
log_path: Audit log file.
Returns:
dict with keys: ok, error
"""
lower = text.lower()
is_sensitive = any(kw in lower for kw in _SENSITIVE_KEYWORDS)
params = {"length": len(text), "is_sensitive": is_sensitive, "confirm": confirm}
if is_sensitive and not confirm:
result = {
"ok": False,
"error": (
"Text contains sensitive keyword. Pass confirm=True to proceed. "
"Ensure no secrets are being typed into unintended windows."
),
}
_log_action("type", params, result, log_path)
return result
pag = _get_pyautogui()
if pag is None:
result = {"ok": False, "error": "pyautogui unavailable"}
_log_action("type", params, result, log_path)
return result
try:
pag.typewrite(text, interval=interval)
result = {"ok": True, "error": None}
except Exception as exc:
result = {"ok": False, "error": str(exc)}
_log_action("type", params, result, log_path)
return result
def computer_scroll(
x: int,
y: int,
amount: int = 3,
log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
"""Scroll the mouse wheel at screen coordinates (x, y).
Args:
x: Horizontal screen coordinate.
y: Vertical screen coordinate.
amount: Number of scroll units. Positive = scroll up, negative = down.
log_path: Audit log file.
Returns:
dict with keys: ok, error
"""
params = {"x": x, "y": y, "amount": amount}
pag = _get_pyautogui()
if pag is None:
result = {"ok": False, "error": "pyautogui unavailable"}
_log_action("scroll", params, result, log_path)
return result
try:
pag.scroll(amount, x=x, y=y)
result = {"ok": True, "error": None}
except Exception as exc:
result = {"ok": False, "error": str(exc)}
_log_action("scroll", params, result, log_path)
return result
def read_action_log(
n: int = 20,
log_path: Path = DEFAULT_ACTION_LOG,
) -> list[dict]:
"""Return the most recent *n* action records from the audit log.
Args:
n: Maximum number of records to return.
log_path: Audit log file.
Returns:
List of action dicts, newest first.
"""
if not log_path.exists():
return []
records: list[dict] = []
with open(log_path) as fh:
for line in fh:
line = line.strip()
if line:
try:
records.append(json.loads(line))
except json.JSONDecodeError:
pass
return list(reversed(records[-n:]))