Implements Phase 1 & 2 of the [COMPUTER_USE] epic:
- nexus/computer_use.py — four Hermes tools with safety guards and
JSONL action logging:
computer_screenshot(), computer_click(), computer_type(), computer_scroll()
Poka-yoke: right/middle clicks require confirm=True; text containing
password/token/key keywords is refused without confirm=True.
pyautogui.FAILSAFE=True enabled globally (corner-abort).
- nexus/computer_use_demo.py — end-to-end Phase 1 demo: baseline
screenshot → open browser → navigate to Gitea → evidence screenshot.
- tests/test_computer_use.py — 29 unit tests, fully headless (pyautogui
mocked); all pass.
- docs/computer-use.md — full Phase 1–3 documentation including API
reference, safety table, action-log format, and pilot recipes.
- docker-compose.desktop.yml — sandboxed Xvfb + noVNC container for
safe headless desktop automation.
The existing mcp_servers/desktop_control_server.py is unchanged; it
remains available for external/MCP callers (Bannerlord harness etc).
Fixes #1125
370 lines
12 KiB
Python
370 lines
12 KiB
Python
"""
|
|
nexus/computer_use.py — Hermes Desktop Automation Primitives
|
|
|
|
Provides computer-use tools so Hermes agents can control a desktop:
|
|
|
|
computer_screenshot(output_path=None) -> dict
|
|
computer_click(x, y, *, confirm=False) -> dict
|
|
computer_type(text, *, confirm=False) -> dict
|
|
computer_scroll(x, y, amount) -> dict
|
|
|
|
Design principles:
|
|
- pyautogui.FAILSAFE = True (move mouse to screen corner to abort)
|
|
- Poka-yoke: destructive/sensitive actions require confirm=True
|
|
- Every action is logged to ~/.nexus/computer_use_log.jsonl
|
|
- Screenshot evidence is captured before & after click/type actions
|
|
- All public functions return a consistent result dict:
|
|
{"ok": bool, "tool": str, ...fields..., "screenshot": path_or_None}
|
|
|
|
Usage::
|
|
|
|
from nexus.computer_use import computer_screenshot, computer_click, computer_type, computer_scroll
|
|
|
|
result = computer_screenshot()
|
|
# result == {"ok": True, "tool": "computer_screenshot", "path": "<NEXUS_HOME or ~/.nexus>/nexus_snap_<ms>.png", "ts": "<UTC ISO-8601 timestamp>"}
|
|
|
|
result = computer_click(960, 540)
|
|
# Clicks centre of screen (no confirm needed for bare click)
|
|
|
|
result = computer_type("hello", confirm=True)  # confirm is only required when the text looks sensitive
|
|
|
|
Refs: #1125
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import time
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# pyautogui — optional; degrades gracefully in headless environments
|
|
# ---------------------------------------------------------------------------
|
|
try:
    import pyautogui  # type: ignore

    # Safety defaults applied once, at import time.
    pyautogui.FAILSAFE = True  # move mouse to corner (0,0) to abort
    pyautogui.PAUSE = 0.05  # small inter-action pause (seconds)
    _PYAUTOGUI_OK = True
except Exception as exc:
    # Two failure modes share the same fallback state: the package is missing
    # (ImportError), or importing it failed for another reason such as a
    # headless session with no DISPLAY.
    if isinstance(exc, ImportError):
        log.warning("pyautogui not installed — desktop primitives will return errors")
    else:
        log.warning("pyautogui unavailable (%s) — running in degraded mode", exc)
    pyautogui = None  # type: ignore
    _PYAUTOGUI_OK = False
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Action log — JSONL, one record per tool invocation
|
|
# ---------------------------------------------------------------------------
|
|
# Directory holding the action log and screenshots: $NEXUS_HOME when it is set
# to a non-empty value, otherwise ~/.nexus.  The `or` keeps Path.home() lazy,
# so an unresolvable home directory cannot break import when NEXUS_HOME is
# provided, and an empty NEXUS_HOME no longer resolves to the current directory.
_LOG_DIR = Path(os.environ.get("NEXUS_HOME") or Path.home() / ".nexus")
# Cached action-log path; filled in on first use by _action_log_path().
_ACTION_LOG: Optional[Path] = None
|
|
|
|
|
|
def _action_log_path() -> Path:
    """Return the JSONL action-log path, creating its directory on first use.

    The resolved path is cached in the module-level ``_ACTION_LOG`` so the
    directory is only created once per process.
    """
    global _ACTION_LOG
    if _ACTION_LOG is not None:
        return _ACTION_LOG

    _LOG_DIR.mkdir(parents=True, exist_ok=True)
    _ACTION_LOG = _LOG_DIR / "computer_use_log.jsonl"
    return _ACTION_LOG
|
|
|
|
|
|
def _write_log(record: dict[str, Any]) -> None:
    """Append one JSON record to the action log (best effort).

    A ``"ts"`` key holding the current UTC time in ISO-8601 form is added to
    *record* in place when it is missing, so a caller that returns the same
    dict object will also expose the timestamp.

    Logging never raises: values JSON cannot represent natively (for example
    a ``Path``) are written as ``str(value)``, and serialisation or
    filesystem failures are reported as a warning on the module logger.

    Args:
        record: The result dict of a tool invocation.
    """
    record.setdefault("ts", datetime.now(timezone.utc).isoformat())
    try:
        # Serialise before opening so a bad record never touches the file.
        # default=str is deliberate: an audit line with a stringified value
        # is better than losing the record or crashing the tool.
        line = json.dumps(record, default=str)
        with open(_action_log_path(), "a", encoding="utf-8") as fh:
            fh.write(line + "\n")
    except (OSError, TypeError, ValueError) as exc:
        log.warning("Could not write computer_use log: %s", exc)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Screenshot helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _snap(prefix: str = "nexus_snap") -> Optional[str]:
    """Capture the screen to a timestamped PNG in the log directory.

    Args:
        prefix: File-name prefix placed before the millisecond timestamp.

    Returns:
        The saved image path as a string, or None when pyautogui is
        unavailable or the capture/save fails (the failure is logged as a
        warning).
    """
    if pyautogui is None or not _PYAUTOGUI_OK:
        return None

    _LOG_DIR.mkdir(parents=True, exist_ok=True)
    stamp_ms = int(time.time() * 1000)
    target = str(_LOG_DIR / f"{prefix}_{stamp_ms}.png")
    try:
        pyautogui.screenshot().save(target)
    except Exception as exc:
        log.warning("Screenshot failed: %s", exc)
        return None
    return target
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def computer_screenshot(output_path: Optional[str] = None) -> dict[str, Any]:
    """
    Capture a screenshot of the current desktop.

    The outcome, success or failure, is appended to the action log.

    Args:
        output_path: Where to save the PNG.  When omitted, a file named
                     nexus_snap_<milliseconds>.png is created in the log
                     directory.

    Returns:
        {"ok": True, "tool": "computer_screenshot", "path": "<saved path>"}
        or {"ok": False, "tool": "computer_screenshot", "error": "<reason>"}
    """
    # Start from the failure shape and upgrade it on success; keys are added
    # in the same order the log records have always used.
    outcome: dict[str, Any] = {"ok": False, "tool": "computer_screenshot"}

    if pyautogui is None or not _PYAUTOGUI_OK:
        outcome["error"] = "pyautogui not available"
        _write_log(outcome)
        return outcome

    destination = output_path
    if destination is None:
        _LOG_DIR.mkdir(parents=True, exist_ok=True)
        destination = str(_LOG_DIR / f"nexus_snap_{int(time.time() * 1000)}.png")

    try:
        pyautogui.screenshot().save(destination)
    except Exception as exc:
        outcome["error"] = str(exc)
    else:
        outcome["ok"] = True
        outcome["path"] = destination

    _write_log(outcome)
    return outcome
|
|
|
|
|
|
def computer_click(
    x: int,
    y: int,
    *,
    button: str = "left",
    confirm: bool = False,
) -> dict[str, Any]:
    """
    Click at screen coordinates (x, y).

    Poka-yoke: right-clicks and middle-clicks are refused unless
    confirm=True is passed; left-clicks never need confirmation.  An
    unrecognised button name is rejected up front, before any screenshot
    is taken or any click is sent.

    A screenshot is attempted before and after the click, and every outcome
    (including refusals) is appended to the action log.

    Args:
        x, y:    Screen coordinates.
        button:  "left" (default), "right", or "middle".
        confirm: Set True to acknowledge a right- or middle-click, whose
                 effects may be hard to reverse.

    Returns:
        On success:
            {"ok": True, "tool": "computer_click", "x": x, "y": y,
             "button": button, "before_screenshot": path_or_None,
             "screenshot": path_or_None}
        Otherwise "ok" is False and an "error" string replaces
        "screenshot".
    """
    tool = "computer_click"
    # pyautogui method name per supported button; resolved with getattr at
    # click time so validation works even when pyautogui is unavailable.
    click_methods = {"left": "click", "right": "rightClick", "middle": "middleClick"}

    result: dict[str, Any]

    # Reject bad input first: no point capturing evidence for a request that
    # can never be executed, and the caller should see the real cause even in
    # degraded (no pyautogui) mode.
    if not isinstance(button, str) or button not in click_methods:
        result = {
            "ok": False, "tool": tool,
            "x": x, "y": y, "button": button,
            "error": f"Unknown button: {button!r}",
            # Kept for record-shape compatibility; nothing was captured.
            "before_screenshot": None,
        }
        _write_log(result)
        return result

    # Poka-yoke: right-clicks and middle-clicks without confirm are rejected
    if button in ("right", "middle") and not confirm:
        result = {
            "ok": False,
            "tool": tool,
            "x": x, "y": y,
            "button": button,
            "error": (
                f"button='{button}' requires confirm=True "
                "(pass confirm=True to acknowledge the action)"
            ),
        }
        _write_log(result)
        return result

    if not _PYAUTOGUI_OK or pyautogui is None:
        result = {"ok": False, "tool": tool, "x": x, "y": y,
                  "button": button, "error": "pyautogui not available"}
        _write_log(result)
        return result

    before = _snap("before_click")
    try:
        getattr(pyautogui, click_methods[button])(x, y)
        after = _snap("after_click")
        result = {
            "ok": True, "tool": tool,
            "x": x, "y": y, "button": button,
            "before_screenshot": before,
            "screenshot": after,
        }
    except Exception as exc:
        # Includes pyautogui's fail-safe abort; report it instead of raising.
        result = {
            "ok": False, "tool": tool,
            "x": x, "y": y, "button": button,
            "error": str(exc),
            "before_screenshot": before,
        }

    _write_log(result)
    return result
|
|
|
|
|
|
# Lower-case substrings that mark text as potentially sensitive; matched
# case-insensitively anywhere in the text by computer_type().
_SENSITIVE_PATTERNS = ("password", "secret", "token", "key", "pass", "pwd")


def computer_type(text: str, *, confirm: bool = False) -> dict[str, Any]:
    """
    Type a string of text using keyboard simulation.

    Poka-yoke: if the text contains common sensitive keywords the call
    is refused unless confirm=True is passed explicitly.

    ASCII-only text is typed key by key.  Any other text is pasted through
    the clipboard, which needs the optional ``pyperclip`` package and
    overwrites the current clipboard contents.  Only the length of the text
    is recorded in the result and the action log, never the text itself.

    Args:
        text:    The string to type.
        confirm: Required when text looks sensitive (contains
                 "password", "token", "key", etc.).

    Returns:
        On success:
            {"ok": True, "tool": "computer_type", "length": len(text),
             "before_screenshot": path_or_None, "screenshot": path_or_None}
        Otherwise "ok" is False and an "error" string replaces
        "screenshot".
    """
    import sys  # function-local: only used to choose the paste shortcut

    tool = "computer_type"
    result: dict[str, Any]

    lowered = text.lower()
    if not confirm and any(pat in lowered for pat in _SENSITIVE_PATTERNS):
        result = {
            "ok": False,
            "tool": tool,
            "length": len(text),
            "error": (
                "Text appears to contain sensitive data "
                "(password/token/key). Pass confirm=True to proceed."
            ),
        }
        _write_log({**result, "text_length": len(text)})
        return result

    if not _PYAUTOGUI_OK or pyautogui is None:
        result = {"ok": False, "tool": tool, "length": len(text),
                  "error": "pyautogui not available"}
        _write_log(result)
        return result

    before = _snap("before_type")
    try:
        # typewrite only handles ASCII; anything else goes via the clipboard.
        if all(ord(ch) < 128 for ch in text):
            pyautogui.typewrite(text, interval=0.02)
        else:
            try:
                import pyperclip  # type: ignore
            except ImportError as exc:
                raise RuntimeError(
                    "Unicode text requires pyperclip: pip install pyperclip"
                ) from exc
            pyperclip.copy(text)
            # Paste is Cmd+V on macOS and Ctrl+V elsewhere.
            paste_modifier = "command" if sys.platform == "darwin" else "ctrl"
            pyautogui.hotkey(paste_modifier, "v")
        after = _snap("after_type")
        result = {
            "ok": True, "tool": tool,
            "length": len(text),
            "before_screenshot": before,
            "screenshot": after,
        }
    except Exception as exc:
        result = {
            "ok": False, "tool": tool,
            "length": len(text),
            "error": str(exc),
            "before_screenshot": before,
        }

    # Log a copy so the dict handed back to the caller is left untouched.
    _write_log({**result})
    return result
|
|
|
|
|
|
def computer_scroll(x: int, y: int, amount: int) -> dict[str, Any]:
    """
    Scroll the mouse wheel at position (x, y).

    The pointer is moved to (x, y) first, then the wheel is scrolled and a
    screenshot is attempted.  The outcome is appended to the action log.

    Args:
        x, y:   Coordinates to move the mouse to before scrolling.
        amount: Number of scroll clicks.  Positive = scroll up / zoom in,
                negative = scroll down / zoom out.

    Returns:
        {"ok": True, "tool": "computer_scroll", "x": x, "y": y,
         "amount": amount, "screenshot": path_or_None}
        or the same leading fields with "ok": False and an "error" string.
    """
    # Shared leading fields; "error" or "screenshot" is appended below.
    outcome: dict[str, Any] = {
        "ok": False, "tool": "computer_scroll",
        "x": x, "y": y, "amount": amount,
    }

    if pyautogui is None or not _PYAUTOGUI_OK:
        outcome["error"] = "pyautogui not available"
        _write_log(outcome)
        return outcome

    try:
        pyautogui.moveTo(x, y)
        pyautogui.scroll(amount)
        shot = _snap("after_scroll")
    except Exception as exc:
        outcome["error"] = str(exc)
    else:
        outcome["ok"] = True
        outcome["screenshot"] = shot

    _write_log(outcome)
    return outcome
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Convenience: read action log
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def read_action_log(last_n: int = 20) -> list[dict[str, Any]]:
    """Return the last *last_n* records from the action log.

    Blank lines and lines that are not valid JSON are skipped, so a partially
    written or corrupted line never hides the surrounding records.

    Args:
        last_n: Maximum number of most-recent records to return.  Zero or a
            negative value yields an empty list.

    Returns:
        The parsed records, oldest first; an empty list when the log file
        does not exist.
    """
    from collections import deque  # function-local: only needed here

    # A slice like records[-0:] would return the whole log, so handle
    # non-positive requests explicitly.
    if last_n <= 0:
        return []

    # The bounded deque keeps only the newest last_n records while the file
    # is streamed, so memory does not grow with the size of the log.
    tail: deque[dict[str, Any]] = deque(maxlen=last_n)
    try:
        with open(_action_log_path(), encoding="utf-8", errors="replace") as fh:
            for raw in fh:
                entry = raw.strip()
                if not entry:
                    continue
                try:
                    tail.append(json.loads(entry))
                except json.JSONDecodeError:
                    # Deliberate best effort: ignore malformed lines.
                    continue
    except FileNotFoundError:
        return []
    return list(tail)
|