# the-nexus/nexus/computer_use.py

"""
Hermes Desktop Automation Primitives — Computer Use (#1125)

Provides sandboxed desktop control tools for Hermes agents:
  - computer_screenshot()   — capture current desktop
  - computer_click()        — mouse click with poka-yoke on non-primary buttons
  - computer_type()         — keyboard input with poka-yoke on sensitive text
  - computer_scroll()       — scroll wheel action
  - read_action_log()       — inspect recent action audit trail

All actions are logged to a JSONL audit file.
pyautogui.FAILSAFE is enabled globally — move mouse to top-left corner to abort.

Designed to degrade gracefully when no display is available (headless CI).
"""

from __future__ import annotations

import base64
import io
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Safety globals
# ---------------------------------------------------------------------------

# Poka-yoke: require confirmation for dangerous inputs.
# computer_type() lower-cases the text and refuses to type it (unless
# confirm=True) when any of these entries occurs in it as a substring.
# NOTE(review): because matching is by substring, short entries such as "key"
# and "auth" also match ordinary words like "keyboard" or "author".
_SENSITIVE_KEYWORDS = frozenset(
    ["password", "passwd", "secret", "token", "api_key", "apikey", "key", "auth"]
)

# Destructive mouse buttons (non-primary).
# computer_click() rejects these buttons unless the caller passes confirm=True.
_DANGEROUS_BUTTONS = frozenset(["right", "middle"])

# Default log location: ~/.nexus/computer_use_actions.jsonl
# Used as the default `log_path` of every public tool function in this module.
DEFAULT_ACTION_LOG = Path.home() / ".nexus" / "computer_use_actions.jsonl"

# ---------------------------------------------------------------------------
# Lazy pyautogui import — fails gracefully in headless environments
# ---------------------------------------------------------------------------

# Flipped to True by _get_pyautogui() once pyautogui has been imported.
_PYAUTOGUI_AVAILABLE = False
# Cached pyautogui module; remains None until _get_pyautogui() succeeds.
_pyautogui = None


def _get_pyautogui():
    """Lazily import and configure pyautogui.

    The first successful import turns on FAILSAFE, sets a 0.05 s pause
    between actions, and caches the module in the module-level
    ``_pyautogui`` global so later calls return it directly.

    Returns:
        The pyautogui module, or None when importing/configuring it fails
        (a warning is logged on every failed attempt).
    """
    global _pyautogui, _PYAUTOGUI_AVAILABLE

    if _pyautogui is None:
        try:
            import pyautogui as gui_module  # type: ignore

            gui_module.FAILSAFE = True
            # Small delay between consecutive actions.
            gui_module.PAUSE = 0.05
            _pyautogui = gui_module
            _PYAUTOGUI_AVAILABLE = True
        except Exception:
            # Importing can fail for more reasons than a missing package
            # (e.g. no display), hence the broad catch.
            logger.warning("pyautogui unavailable — computer_use running in stub mode")
            return None

    return _pyautogui


def _get_pil():
    """Return PIL Image module or None."""
    try:
        from PIL import Image  # type: ignore

        return Image
    except ImportError:
        return None


# ---------------------------------------------------------------------------
# Audit log
# ---------------------------------------------------------------------------


def _log_action(action: str, params: dict, result: dict, log_path: Path = DEFAULT_ACTION_LOG):
    """Append one action record to the JSONL audit log.

    Each record is a single JSON object on its own line with the keys
    ``ts``, ``action``, ``params`` and ``result``. ``ts`` is the local time
    formatted as ``%Y-%m-%dT%H:%M:%S`` (no UTC offset).

    Args:
        action:   Short action name stored under ``"action"``.
        params:   Call parameters stored under ``"params"``.
        result:   Call outcome stored under ``"result"``.
        log_path: Audit log file. A plain string path is accepted as well as
                  a ``Path``; missing parent directories are created.

    Raises:
        OSError: if the directory or file cannot be created or written.
    """
    # Callers may hand in a str; normalise so `.parent` is always available.
    log_path = Path(log_path)
    log_path.parent.mkdir(parents=True, exist_ok=True)
    record = {
        "ts": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "action": action,
        "params": params,
        "result": result,
    }
    # default=str is deliberate: logging usually happens after the desktop
    # action has already been performed, so a value json cannot encode natively
    # (e.g. a Path) must not abort the audit write and lose the record.
    line = json.dumps(record, default=str)
    with open(log_path, "a", encoding="utf-8") as fh:
        fh.write(line + "\n")


# ---------------------------------------------------------------------------
# Public tool API
# ---------------------------------------------------------------------------


def computer_screenshot(
    save_path: Optional[str] = None,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Grab an image of the current desktop.

    Args:
        save_path: Where to write the image. When falsy (None or ""), the
                   screenshot is instead returned as a base64-encoded PNG.
        log_path:  Audit log file (default ~/.nexus/computer_use_actions.jsonl).

    Returns:
        dict with keys:
          - ok (bool)
          - image_b64 (str | None) — base64 PNG when no save_path was given
          - saved_to (str | None)  — the save_path when one was given
          - error (str | None)     — error text when ok is False

    The base64 payload is never written to the audit log.
    """
    gui = _get_pyautogui()
    request = {"save_path": save_path}

    if gui is None:
        outcome = dict(ok=False, image_b64=None, saved_to=None, error="pyautogui unavailable")
        _log_action("screenshot", request, outcome, log_path)
        return outcome

    try:
        shot = gui.screenshot()
        if not save_path:
            # No destination given: encode the PNG in memory.
            png_buffer = io.BytesIO()
            shot.save(png_buffer, format="PNG")
            encoded = base64.b64encode(png_buffer.getvalue()).decode()
            outcome = dict(ok=True, image_b64=encoded, saved_to=None, error=None)
        else:
            shot.save(save_path)
            outcome = dict(ok=True, image_b64=None, saved_to=save_path, error=None)
    except Exception as exc:
        outcome = dict(ok=False, image_b64=None, saved_to=None, error=str(exc))

    # Keep the (potentially large) image payload out of the audit trail.
    audit_view = dict(outcome)
    audit_view.pop("image_b64")
    _log_action("screenshot", request, audit_view, log_path)
    return outcome


def computer_click(
    x: int,
    y: int,
    button: str = "left",
    confirm: bool = False,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Perform a mouse click at screen position (x, y).

    Poka-yoke: a "right" or "middle" click is refused unless confirm=True.

    Args:
        x:        Horizontal screen coordinate.
        y:        Vertical screen coordinate.
        button:   "left" | "right" | "middle"
        confirm:  Must be True for non-left buttons.
        log_path: Audit log file.

    Returns:
        dict with keys: ok, error
    """
    request = {"x": x, "y": y, "button": button, "confirm": confirm}

    def _finish(ok: bool, error: Optional[str]) -> dict:
        # Every exit path is written to the audit log before returning.
        outcome = {"ok": ok, "error": error}
        _log_action("click", request, outcome, log_path)
        return outcome

    # The confirmation gate runs before button validation.
    if button in _DANGEROUS_BUTTONS and not confirm:
        return _finish(
            False,
            f"button={button!r} requires confirm=True (poka-yoke). "
            "Pass confirm=True only after verifying this action is intentional.",
        )

    valid_buttons = ("left", "right", "middle")
    if button not in valid_buttons:
        return _finish(False, f"Unknown button {button!r}. Use 'left', 'right', or 'middle'.")

    gui = _get_pyautogui()
    if gui is None:
        return _finish(False, "pyautogui unavailable")

    try:
        gui.click(x, y, button=button)
    except Exception as exc:
        succeeded, failure = False, str(exc)
    else:
        succeeded, failure = True, None

    return _finish(succeeded, failure)


def computer_type(
    text: str,
    confirm: bool = False,
    interval: float = 0.02,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Send *text* to the desktop as keyboard input.

    Poka-yoke: when the lower-cased text contains any entry of
    ``_SENSITIVE_KEYWORDS`` as a substring, the call is refused unless
    confirm=True. Only the text's length and the sensitivity flag are
    written to the audit log — never the text itself.

    Args:
        text:     The string to type.
        confirm:  Must be True when the text looks sensitive.
        interval: Delay between keystrokes (seconds).
        log_path: Audit log file.

    Returns:
        dict with keys: ok, error
    """
    lowered = text.lower()
    looks_sensitive = False
    for keyword in _SENSITIVE_KEYWORDS:
        if keyword in lowered:
            looks_sensitive = True
            break

    request = {"length": len(text), "is_sensitive": looks_sensitive, "confirm": confirm}

    def _finish(ok: bool, error: Optional[str]) -> dict:
        # Every exit path is written to the audit log before returning.
        outcome = {"ok": ok, "error": error}
        _log_action("type", request, outcome, log_path)
        return outcome

    if looks_sensitive and not confirm:
        return _finish(
            False,
            "Text contains sensitive keyword. Pass confirm=True to proceed. "
            "Ensure no secrets are being typed into unintended windows.",
        )

    gui = _get_pyautogui()
    if gui is None:
        return _finish(False, "pyautogui unavailable")

    try:
        gui.typewrite(text, interval=interval)
    except Exception as exc:
        succeeded, failure = False, str(exc)
    else:
        succeeded, failure = True, None

    return _finish(succeeded, failure)


def computer_scroll(
    x: int,
    y: int,
    amount: int = 3,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> dict:
    """Turn the mouse wheel with the pointer at screen position (x, y).

    Args:
        x:        Horizontal screen coordinate.
        y:        Vertical screen coordinate.
        amount:   Number of scroll units. Positive = scroll up, negative = down.
        log_path: Audit log file.

    Returns:
        dict with keys: ok, error
    """
    request = {"x": x, "y": y, "amount": amount}

    def _finish(ok: bool, error: Optional[str]) -> dict:
        # Every exit path is written to the audit log before returning.
        outcome = {"ok": ok, "error": error}
        _log_action("scroll", request, outcome, log_path)
        return outcome

    gui = _get_pyautogui()
    if gui is None:
        return _finish(False, "pyautogui unavailable")

    try:
        gui.scroll(amount, x=x, y=y)
    except Exception as exc:
        succeeded, failure = False, str(exc)
    else:
        succeeded, failure = True, None

    return _finish(succeeded, failure)


def read_action_log(
    n: int = 20,
    log_path: Path = DEFAULT_ACTION_LOG,
) -> list[dict]:
    """Return the most recent *n* action records from the audit log.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped silently.

    Args:
        n:        Maximum number of records to return. Zero or a negative
                  value yields an empty list.
        log_path: Audit log file (``Path`` or plain string path).

    Returns:
        List of action dicts, newest first. Empty when the log file does not
        exist.
    """
    if n <= 0:
        # Without this guard `records[-0:]` is `records[0:]` (the whole log),
        # and a negative n would slice from the front instead of the tail.
        return []

    records: list[dict] = []
    try:
        # EAFP: open directly instead of exists()+open(), which can race with
        # the file being removed and only works for Path objects.
        with open(log_path) as fh:
            for raw_line in fh:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                try:
                    entry = json.loads(stripped)
                except json.JSONDecodeError:
                    # Best-effort reader: a corrupt line must not hide the rest.
                    continue
                if isinstance(entry, dict):
                    records.append(entry)
    except FileNotFoundError:
        return []

    return list(reversed(records[-n:]))