""" Hermes Desktop Automation Primitives — Computer Use (#1125) Provides sandboxed desktop control tools for Hermes agents: - computer_screenshot() — capture current desktop - computer_click() — mouse click with poka-yoke on non-primary buttons - computer_type() — keyboard input with poka-yoke on sensitive text - computer_scroll() — scroll wheel action - read_action_log() — inspect recent action audit trail All actions are logged to a JSONL audit file. pyautogui.FAILSAFE is enabled globally — move mouse to top-left corner to abort. Designed to degrade gracefully when no display is available (headless CI). """ from __future__ import annotations import base64 import io import json import logging import os import time from pathlib import Path from typing import Optional logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Safety globals # --------------------------------------------------------------------------- # Poka-yoke: require confirmation for dangerous inputs _SENSITIVE_KEYWORDS = frozenset( ["password", "passwd", "secret", "token", "api_key", "apikey", "key", "auth"] ) # Destructive mouse buttons (non-primary) _DANGEROUS_BUTTONS = frozenset(["right", "middle"]) # Default log location DEFAULT_ACTION_LOG = Path.home() / ".nexus" / "computer_use_actions.jsonl" # --------------------------------------------------------------------------- # Lazy pyautogui import — fails gracefully in headless environments # --------------------------------------------------------------------------- _PYAUTOGUI_AVAILABLE = False _pyautogui = None def _get_pyautogui(): """Return pyautogui, enabling FAILSAFE. Returns None if unavailable.""" global _pyautogui, _PYAUTOGUI_AVAILABLE if _pyautogui is not None: return _pyautogui try: import pyautogui # type: ignore pyautogui.FAILSAFE = True pyautogui.PAUSE = 0.05 # small delay between actions _pyautogui = pyautogui _PYAUTOGUI_AVAILABLE = True return _pyautogui except Exception: logger.warning("pyautogui unavailable — computer_use running in stub mode") return None def _get_pil(): """Return PIL Image module or None.""" try: from PIL import Image # type: ignore return Image except ImportError: return None # --------------------------------------------------------------------------- # Audit log # --------------------------------------------------------------------------- def _log_action(action: str, params: dict, result: dict, log_path: Path = DEFAULT_ACTION_LOG): """Append one action record to the JSONL audit log.""" log_path.parent.mkdir(parents=True, exist_ok=True) record = { "ts": time.strftime("%Y-%m-%dT%H:%M:%S"), "action": action, "params": params, "result": result, } with open(log_path, "a") as fh: fh.write(json.dumps(record) + "\n") # --------------------------------------------------------------------------- # Public tool API # --------------------------------------------------------------------------- def computer_screenshot( save_path: Optional[str] = None, log_path: Path = DEFAULT_ACTION_LOG, ) -> dict: """Capture a screenshot of the current desktop. Args: save_path: Optional file path to save the PNG. If omitted the image is returned as a base64-encoded string. log_path: Audit log file (default ~/.nexus/computer_use_actions.jsonl). Returns: dict with keys: - ok (bool) - image_b64 (str | None) — base64 PNG when save_path is None - saved_to (str | None) — path when save_path was given - error (str | None) — human-readable error if ok=False """ pag = _get_pyautogui() params = {"save_path": save_path} if pag is None: result = {"ok": False, "image_b64": None, "saved_to": None, "error": "pyautogui unavailable"} _log_action("screenshot", params, result, log_path) return result try: screenshot = pag.screenshot() if save_path: screenshot.save(save_path) result = {"ok": True, "image_b64": None, "saved_to": save_path, "error": None} else: buf = io.BytesIO() screenshot.save(buf, format="PNG") b64 = base64.b64encode(buf.getvalue()).decode() result = {"ok": True, "image_b64": b64, "saved_to": None, "error": None} except Exception as exc: result = {"ok": False, "image_b64": None, "saved_to": None, "error": str(exc)} _log_action("screenshot", params, {k: v for k, v in result.items() if k != "image_b64"}, log_path) return result def computer_click( x: int, y: int, button: str = "left", confirm: bool = False, log_path: Path = DEFAULT_ACTION_LOG, ) -> dict: """Click the mouse at screen coordinates (x, y). Poka-yoke: right/middle clicks require confirm=True. Args: x: Horizontal screen coordinate. y: Vertical screen coordinate. button: "left" | "right" | "middle" confirm: Must be True for non-left buttons. log_path: Audit log file. Returns: dict with keys: ok, error """ params = {"x": x, "y": y, "button": button, "confirm": confirm} if button in _DANGEROUS_BUTTONS and not confirm: result = { "ok": False, "error": ( f"button={button!r} requires confirm=True (poka-yoke). " "Pass confirm=True only after verifying this action is intentional." ), } _log_action("click", params, result, log_path) return result if button not in ("left", "right", "middle"): result = {"ok": False, "error": f"Unknown button {button!r}. Use 'left', 'right', or 'middle'."} _log_action("click", params, result, log_path) return result pag = _get_pyautogui() if pag is None: result = {"ok": False, "error": "pyautogui unavailable"} _log_action("click", params, result, log_path) return result try: pag.click(x, y, button=button) result = {"ok": True, "error": None} except Exception as exc: result = {"ok": False, "error": str(exc)} _log_action("click", params, result, log_path) return result def computer_type( text: str, confirm: bool = False, interval: float = 0.02, log_path: Path = DEFAULT_ACTION_LOG, ) -> dict: """Type text using the keyboard. Poka-yoke: if *text* contains a sensitive keyword (password, token, key…) confirm=True is required. The actual text value is never written to the audit log. Args: text: The string to type. confirm: Must be True when the text looks sensitive. interval: Delay between keystrokes (seconds). log_path: Audit log file. Returns: dict with keys: ok, error """ lower = text.lower() is_sensitive = any(kw in lower for kw in _SENSITIVE_KEYWORDS) params = {"length": len(text), "is_sensitive": is_sensitive, "confirm": confirm} if is_sensitive and not confirm: result = { "ok": False, "error": ( "Text contains sensitive keyword. Pass confirm=True to proceed. " "Ensure no secrets are being typed into unintended windows." ), } _log_action("type", params, result, log_path) return result pag = _get_pyautogui() if pag is None: result = {"ok": False, "error": "pyautogui unavailable"} _log_action("type", params, result, log_path) return result try: pag.typewrite(text, interval=interval) result = {"ok": True, "error": None} except Exception as exc: result = {"ok": False, "error": str(exc)} _log_action("type", params, result, log_path) return result def computer_scroll( x: int, y: int, amount: int = 3, log_path: Path = DEFAULT_ACTION_LOG, ) -> dict: """Scroll the mouse wheel at screen coordinates (x, y). Args: x: Horizontal screen coordinate. y: Vertical screen coordinate. amount: Number of scroll units. Positive = scroll up, negative = down. log_path: Audit log file. Returns: dict with keys: ok, error """ params = {"x": x, "y": y, "amount": amount} pag = _get_pyautogui() if pag is None: result = {"ok": False, "error": "pyautogui unavailable"} _log_action("scroll", params, result, log_path) return result try: pag.scroll(amount, x=x, y=y) result = {"ok": True, "error": None} except Exception as exc: result = {"ok": False, "error": str(exc)} _log_action("scroll", params, result, log_path) return result def read_action_log( n: int = 20, log_path: Path = DEFAULT_ACTION_LOG, ) -> list[dict]: """Return the most recent *n* action records from the audit log. Args: n: Maximum number of records to return. log_path: Audit log file. Returns: List of action dicts, newest first. """ if not log_path.exists(): return [] records: list[dict] = [] with open(log_path) as fh: for line in fh: line = line.strip() if line: try: records.append(json.loads(line)) except json.JSONDecodeError: pass return list(reversed(records[-n:]))