Files
the-nexus/nexus/computer_use.py
Alexander Whitestone a3a28aa4c2
Some checks failed
CI / test (pull_request) Failing after 20s
CI / validate (pull_request) Failing after 25s
Review Approval Gate / verify-review (pull_request) Failing after 5s
feat: add desktop automation primitives to Hermes (#1125)
Implements Phase 1 & 2 of the [COMPUTER_USE] epic:

- nexus/computer_use.py — four Hermes tools with safety guards and
  JSONL action logging:
    computer_screenshot(), computer_click(), computer_type(), computer_scroll()
  Poka-yoke: right/middle clicks require confirm=True; text containing
  password/token/key keywords is refused without confirm=True.
  pyautogui.FAILSAFE=True enabled globally (corner-abort).

- nexus/computer_use_demo.py — end-to-end Phase 1 demo: baseline
  screenshot → open browser → navigate to Gitea → evidence screenshot.

- tests/test_computer_use.py — 29 unit tests, fully headless (pyautogui
  mocked); all pass.

- docs/computer-use.md — full Phase 1–3 documentation including API
  reference, safety table, action-log format, and pilot recipes.

- docker-compose.desktop.yml — sandboxed Xvfb + noVNC container for
  safe headless desktop automation.

The existing mcp_servers/desktop_control_server.py is unchanged; it
remains available for external/MCP callers (Bannerlord harness etc).

Fixes #1125
2026-04-08 06:29:27 -04:00

370 lines
12 KiB
Python

"""
nexus/computer_use.py — Hermes Desktop Automation Primitives
Provides computer-use tools so Hermes agents can control a desktop:
computer_screenshot(output_path=None) -> dict
computer_click(x, y, *, button="left", confirm=False) -> dict
computer_type(text, *, confirm=False) -> dict
computer_scroll(x, y, amount) -> dict
Design principles:
- pyautogui.FAILSAFE = True (move mouse to screen corner to abort)
- Poka-yoke: destructive/sensitive actions require confirm=True
- Every action is logged to ~/.nexus/computer_use_log.jsonl (or to $NEXUS_HOME/computer_use_log.jsonl when NEXUS_HOME is set)
- Screenshot evidence is captured before & after click/type actions
- All public functions return a consistent result dict:
{"ok": bool, "tool": str, ...fields..., "screenshot": path_or_None}
Usage::
from nexus.computer_use import computer_screenshot, computer_click, computer_type, computer_scroll
result = computer_screenshot()
# e.g. {"ok": True, "tool": "computer_screenshot", "path": "/home/user/.nexus/nexus_snap_1712572167000.png"}
result = computer_click(960, 540)
# Clicks centre of screen (no confirm needed for bare click)
result = computer_type("hello")
# confirm=True is only required when the text looks sensitive (e.g. contains "password" or "token")
Refs: #1125
"""
from __future__ import annotations
import json
import logging
import os
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# pyautogui — optional; degrades gracefully in headless environments
# ---------------------------------------------------------------------------
# pyautogui is an optional dependency, and importing it can fail for reasons
# other than "not installed" (e.g. a headless host with no DISPLAY).  Either
# way the module stays importable: ``pyautogui`` is set to None and
# ``_PYAUTOGUI_OK`` to False so the public tools can report an error instead
# of crashing.
try:
    import pyautogui  # type: ignore
except ImportError:
    log.warning("pyautogui not installed — desktop primitives will return errors")
    pyautogui = None  # type: ignore
    _PYAUTOGUI_OK = False
except Exception as exc:  # headless / no DISPLAY
    log.warning("pyautogui unavailable (%s) — running in degraded mode", exc)
    pyautogui = None  # type: ignore
    _PYAUTOGUI_OK = False
else:
    # Import succeeded: switch on the global safety settings.
    pyautogui.FAILSAFE = True  # moving the mouse into a screen corner aborts
    pyautogui.PAUSE = 0.05  # small inter-action pause (seconds)
    _PYAUTOGUI_OK = True
# ---------------------------------------------------------------------------
# Action log — JSONL, one record per tool invocation
# ---------------------------------------------------------------------------
# Directory that holds the JSONL action log and the screenshot evidence.
# NEXUS_HOME overrides the default of ~/.nexus.  An empty NEXUS_HOME is
# treated as unset: Path("") means "current working directory", which would
# scatter logs and screenshots wherever the process happened to be started.
# Using ``or`` also means Path.home() is only resolved when it is needed.
_LOG_DIR = Path(os.environ.get("NEXUS_HOME") or Path.home() / ".nexus")
# Cached location of the action log file; None until it is first resolved.
_ACTION_LOG: Optional[Path] = None
def _action_log_path() -> Path:
    """Return the path of the JSONL action log.

    The first call creates the log directory (including missing parents) and
    caches the resulting path in the module-level ``_ACTION_LOG``.  Later
    calls return the cached path without touching the filesystem.
    """
    global _ACTION_LOG
    if _ACTION_LOG is not None:
        return _ACTION_LOG
    _LOG_DIR.mkdir(parents=True, exist_ok=True)
    _ACTION_LOG = _LOG_DIR / "computer_use_log.jsonl"
    return _ACTION_LOG
def _write_log(record: dict[str, Any]) -> None:
    """Append one JSON record to the action log (best effort).

    A UTC ISO-8601 timestamp is stored under ``"ts"`` unless the record
    already carries one.  The timestamp is added to a *copy* of ``record``:
    callers hand in the very dict they return to their own caller, and the
    log writer must not alter that result as a side effect.

    Logging never raises for I/O or serialisation problems; a failure is
    reported through the module logger and the record is dropped.

    Args:
        record: The fields to log.  Values that are not natively JSON
            serialisable are written via ``str()``.
    """
    entry = dict(record)
    entry.setdefault("ts", datetime.now(timezone.utc).isoformat())
    try:
        # default=str is deliberate: for an audit trail a stringified value
        # (e.g. a Path) is better than losing the whole record.
        line = json.dumps(entry, default=str)
        with open(_action_log_path(), "a", encoding="utf-8") as fh:
            fh.write(line + "\n")
    except (OSError, TypeError, ValueError) as exc:
        # TypeError/ValueError: unserialisable keys or circular references.
        log.warning("Could not write computer_use log: %s", exc)
# ---------------------------------------------------------------------------
# Screenshot helper
# ---------------------------------------------------------------------------
def _snap(prefix: str = "nexus_snap") -> Optional[str]:
    """Take a screenshot and return the saved path, or None on failure.

    The image is written to ``_LOG_DIR`` as ``<prefix>_<epoch-ms>.png``.

    Args:
        prefix: File-name prefix identifying why the screenshot was taken
            (e.g. ``"before_click"``).

    Returns:
        The path of the saved PNG as a string, or None when pyautogui is
        unavailable or when creating the directory, capturing, or saving
        fails.  This helper never raises for those failures.
    """
    if not _PYAUTOGUI_OK or pyautogui is None:
        return None
    ts = int(time.time() * 1000)
    path = str(_LOG_DIR / f"{prefix}_{ts}.png")
    try:
        # Directory creation is inside the try so that an unwritable log
        # directory degrades to "no screenshot" instead of raising into the
        # calling tool.
        _LOG_DIR.mkdir(parents=True, exist_ok=True)
        img = pyautogui.screenshot()
        img.save(path)
    except Exception as exc:
        log.warning("Screenshot failed: %s", exc)
        return None
    return path
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def computer_screenshot(output_path: Optional[str] = None) -> dict[str, Any]:
    """Capture a screenshot of the current desktop.

    Every call, successful or not, is recorded in the action log.

    Args:
        output_path: Where to save the PNG.  When omitted, a timestamped
            ``nexus_snap_<epoch-ms>.png`` is created in the nexus log
            directory.  Path-like objects are accepted and converted to a
            string.

    Returns:
        ``{"ok": True, "tool": "computer_screenshot", "path": "<saved path>"}``
        or ``{"ok": False, "tool": "computer_screenshot", "error": "<reason>"}``.
    """
    tool = "computer_screenshot"
    result: dict[str, Any]
    if not _PYAUTOGUI_OK or pyautogui is None:
        result = {"ok": False, "tool": tool, "error": "pyautogui not available"}
        _write_log(result)
        return result
    try:
        if output_path is None:
            # Directory creation is inside the try so a failure becomes an
            # error result rather than an exception.
            _LOG_DIR.mkdir(parents=True, exist_ok=True)
            ts = int(time.time() * 1000)
            output_path = str(_LOG_DIR / f"nexus_snap_{ts}.png")
        else:
            # Normalise path-like input so the returned/logged value is a
            # plain string.
            output_path = os.fspath(output_path)
        img = pyautogui.screenshot()
        img.save(output_path)
        result = {"ok": True, "tool": tool, "path": output_path}
    except Exception as exc:
        result = {"ok": False, "tool": tool, "error": str(exc)}
    _write_log(result)
    return result
def computer_click(
    x: int,
    y: int,
    *,
    button: str = "left",
    confirm: bool = False,
) -> dict[str, Any]:
    """Click at screen coordinates (x, y).

    Poka-yoke: ``button="right"`` and ``button="middle"`` are refused unless
    ``confirm=True`` is passed.  Left clicks never need confirmation.

    A screenshot is taken before the click and another one after it.  Every
    call, including refused ones, is recorded in the action log.

    Args:
        x, y: Screen coordinates.
        button: "left" (default), "right", or "middle".  Any other value
            produces an error result.
        confirm: Set True to acknowledge a right or middle click.

    Returns:
        On success:
            {"ok": True, "tool": "computer_click", "x": x, "y": y,
             "button": button, "before_screenshot": path_or_None,
             "screenshot": path_or_None}
        On failure the dict has "ok": False and an "error" message; the
        "screenshot" key is absent.
    """
    tool = "computer_click"

    def outcome(ok: bool, **extra: Any) -> dict[str, Any]:
        # Shared leading fields first, so every record has the same key order.
        return {"ok": ok, "tool": tool, "x": x, "y": y, "button": button, **extra}

    # Poka-yoke: right-clicks and middle-clicks must be acknowledged.
    needs_confirmation = button in ("right", "middle")
    if needs_confirmation and not confirm:
        refused = outcome(
            False,
            error=(
                f"button='{button}' requires confirm=True "
                "(pass confirm=True to acknowledge the action)"
            ),
        )
        _write_log(refused)
        return refused

    if not _PYAUTOGUI_OK or pyautogui is None:
        unavailable = outcome(False, error="pyautogui not available")
        _write_log(unavailable)
        return unavailable

    before = _snap("before_click")
    # Button name -> name of the pyautogui function that performs the click.
    click_methods = (
        ("left", "click"),
        ("right", "rightClick"),
        ("middle", "middleClick"),
    )
    try:
        method_name = next(
            (method for name, method in click_methods if button == name),
            None,
        )
        if method_name is None:
            raise ValueError(f"Unknown button: {button!r}")
        getattr(pyautogui, method_name)(x, y)
        after = _snap("after_click")
        result = outcome(True, before_screenshot=before, screenshot=after)
    except Exception as exc:
        result = outcome(False, error=str(exc), before_screenshot=before)
    _write_log(result)
    return result
# Patterns that indicate potentially sensitive text being typed.
# Matching is a case-insensitive substring test, so it errs on the side of
# refusing (e.g. "key" also matches "keyboard").
_SENSITIVE_PATTERNS = ("password", "secret", "token", "key", "pass", "pwd")


def computer_type(text: str, *, confirm: bool = False) -> dict[str, Any]:
    """Type a string of text using keyboard simulation.

    Poka-yoke: if the text contains one of ``_SENSITIVE_PATTERNS`` the call
    is refused unless ``confirm=True`` is passed explicitly.

    ASCII text is typed key by key.  Text containing non-ASCII characters is
    pasted through the clipboard instead, which requires ``pyperclip`` and
    overwrites the current clipboard content.

    Only the length of the text is logged and returned, never the text
    itself.  A screenshot is taken before and after typing.

    Args:
        text: The string to type.
        confirm: Required when text looks sensitive (contains
            "password", "token", "key", etc.).

    Returns:
        On success:
            {"ok": True, "tool": "computer_type", "length": len(text),
             "before_screenshot": path_or_None, "screenshot": path_or_None}
        On failure the dict has "ok": False and an "error" message; the
        "screenshot" key is absent.
    """
    import sys  # local import: only needed to pick the paste shortcut

    tool = "computer_type"
    lower = text.lower()
    looks_sensitive = any(pat in lower for pat in _SENSITIVE_PATTERNS)
    if looks_sensitive and not confirm:
        result: dict[str, Any] = {
            "ok": False,
            "tool": tool,
            "length": len(text),
            "error": (
                "Text appears to contain sensitive data "
                "(password/token/key). Pass confirm=True to proceed."
            ),
        }
        _write_log({**result, "text_length": len(text)})
        return result
    if not _PYAUTOGUI_OK or pyautogui is None:
        result = {"ok": False, "tool": tool, "length": len(text),
                  "error": "pyautogui not available"}
        _write_log(result)
        return result
    before = _snap("before_type")
    try:
        # typewrite handles printable ASCII; for unicode use pyperclip+hotkey
        if text.isascii():
            pyautogui.typewrite(text, interval=0.02)
        else:
            # Fallback: copy-paste via clipboard for unicode
            try:
                import pyperclip  # type: ignore
            except ImportError as exc:
                raise RuntimeError(
                    "Unicode text requires pyperclip: pip install pyperclip"
                ) from exc
            pyperclip.copy(text)
            # Paste is Cmd+V on macOS and Ctrl+V elsewhere.
            paste_modifier = "command" if sys.platform == "darwin" else "ctrl"
            pyautogui.hotkey(paste_modifier, "v")
        after = _snap("after_type")
        result = {
            "ok": True, "tool": tool,
            "length": len(text),
            "before_screenshot": before,
            "screenshot": after,
        }
    except Exception as exc:
        result = {
            "ok": False, "tool": tool,
            "length": len(text),
            "error": str(exc),
            "before_screenshot": before,
        }
    # Log a copy so the returned dict is not altered by the log writer.
    _write_log({**result})
    return result
def computer_scroll(
    x: int,
    y: int,
    amount: int,
) -> dict[str, Any]:
    """Scroll the mouse wheel at position (x, y).

    The pointer is first moved to (x, y), then the wheel is scrolled and a
    screenshot is taken.  Every call is recorded in the action log.

    Args:
        x, y: Coordinates to move the mouse to before scrolling.
        amount: Number of scroll clicks. Positive = scroll up / zoom in,
            negative = scroll down / zoom out.

    Returns:
        On success:
            {"ok": True, "tool": "computer_scroll", "x": x, "y": y,
             "amount": amount, "screenshot": path_or_None}
        On failure the dict has "ok": False and an "error" message instead
        of the "screenshot" key.
    """
    tool = "computer_scroll"

    def outcome(ok: bool, **extra: Any) -> dict[str, Any]:
        # Shared leading fields first, so every record has the same key order.
        return {"ok": ok, "tool": tool, "x": x, "y": y, "amount": amount, **extra}

    if not _PYAUTOGUI_OK or pyautogui is None:
        unavailable = outcome(False, error="pyautogui not available")
        _write_log(unavailable)
        return unavailable

    try:
        pyautogui.moveTo(x, y)
        pyautogui.scroll(amount)
        result = outcome(True, screenshot=_snap("after_scroll"))
    except Exception as exc:
        result = outcome(False, error=str(exc))
    _write_log(result)
    return result
# ---------------------------------------------------------------------------
# Convenience: read action log
# ---------------------------------------------------------------------------
def read_action_log(last_n: int = 20) -> list[dict[str, Any]]:
    """Return the last *last_n* records from the action log.

    Blank lines and lines that are not valid JSON are skipped, so the result
    holds the last *last_n* parseable records in file order.

    Args:
        last_n: Maximum number of records to return.  Zero or a negative
            number yields an empty list.

    Returns:
        The parsed records, oldest first.  Empty when the log file does not
        exist yet.
    """
    from collections import deque  # local import: only this helper needs it

    # Guard explicitly: with slicing, records[-0:] would return *everything*
    # and a negative count would drop records from the front instead.
    if last_n <= 0:
        return []
    path = _action_log_path()
    if not path.exists():
        return []
    # A bounded deque keeps only the newest records while the file streams
    # through, so memory use does not grow with the size of the log.
    tail: deque = deque(maxlen=last_n)
    with path.open(encoding="utf-8", errors="replace") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                tail.append(json.loads(raw))
            except json.JSONDecodeError:
                # Deliberately tolerant: one corrupt or half-written line
                # must not make the rest of the log unreadable.
                continue
    return list(tail)