# the-nexus/nexus/computer_use.py

"""
nexus/computer_use.py — Hermes Desktop Automation Primitives

Provides computer-use tools so Hermes agents can control a desktop:

    computer_screenshot(output_path=None) -> dict
    computer_click(x, y, *, button="left", confirm=False) -> dict
    computer_type(text, *, confirm=False) -> dict
    computer_scroll(x, y, amount) -> dict

Design principles:
- pyautogui.FAILSAFE = True (move mouse to screen corner to abort)
- Poka-yoke: destructive/sensitive actions require confirm=True
- Every action is logged to $NEXUS_HOME/computer_use_log.jsonl
  (NEXUS_HOME defaults to ~/.nexus)
- Screenshot evidence is captured before & after click/type actions
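- All public tools return a result dict of the same basic shape:
    {"ok": bool, "tool": str, ...fields..., "screenshot": path_or_None}
  On failure the dict carries "error": str instead of "screenshot";
  computer_screenshot reports its saved file under "path".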

Usage::

    from nexus.computer_use import computer_screenshot, computer_click, computer_type, computer_scroll

    result = computer_screenshot()
    # result -> {"ok": True, "tool": "computer_screenshot", "path": "<NEXUS_HOME>/nexus_snap_<epoch_ms>.png", ...}

    result = computer_click(960, 540)
    # Left-clicks at (960, 540), the centre of a 1920x1080 screen; a plain
    # left click needs no confirm, right/middle clicks do

    result = computer_type("hello", confirm=True)  # confirm is only required when the text looks sensitive

Refs: #1125
"""

from __future__ import annotations

import json
import logging
import os
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# pyautogui — optional; degrades gracefully in headless environments
# ---------------------------------------------------------------------------
# Import pyautogui once at module load.  If anything goes wrong the name is
# rebound to None and _PYAUTOGUI_OK is set to False; the public tools check
# both before touching the desktop and return an error dict instead.
try:
    import pyautogui  # type: ignore

    pyautogui.FAILSAFE = True   # move mouse to corner (0,0) to abort
    pyautogui.PAUSE = 0.05      # small inter-action pause (seconds)
    _PYAUTOGUI_OK = True
except ImportError:
    # The package is not installed at all.
    log.warning("pyautogui not installed — desktop primitives will return errors")
    pyautogui = None  # type: ignore
    _PYAUTOGUI_OK = False
except Exception as exc:          # headless / no DISPLAY
    # Any other failure while importing or configuring pyautogui; the
    # exception text is included in the warning for diagnosis.
    log.warning("pyautogui unavailable (%s) — running in degraded mode", exc)
    pyautogui = None  # type: ignore
    _PYAUTOGUI_OK = False

# ---------------------------------------------------------------------------
# Action log — JSONL, one record per tool invocation
# ---------------------------------------------------------------------------
# Directory that holds the action log and screenshots: $NEXUS_HOME when set,
# otherwise ~/.nexus.
_LOG_DIR = Path(os.getenv("NEXUS_HOME", Path.home() / ".nexus"))
# Resolved log-file path; filled in by the first _action_log_path() call.
_ACTION_LOG: Optional[Path] = None


def _action_log_path() -> Path:
    """Return the JSONL action-log path, creating its directory on first use.

    The path is computed once and cached in the module-level ``_ACTION_LOG``;
    later calls return the cached value without touching the filesystem.
    """
    global _ACTION_LOG
    if _ACTION_LOG is not None:
        return _ACTION_LOG

    _LOG_DIR.mkdir(parents=True, exist_ok=True)
    _ACTION_LOG = _LOG_DIR / "computer_use_log.jsonl"
    return _ACTION_LOG


def _write_log(record: dict[str, Any]) -> None:
    """Append one JSON record to the action log (best effort).

    A UTC ISO-8601 ``ts`` field is added when *record* does not already
    have one.  The stamp is applied to a shallow copy, so the caller's dict
    (usually the result handed back to the agent) is left untouched.

    Logging must never break the tool call that triggered it: I/O errors
    and records that cannot be serialised to JSON are reported through the
    module logger and otherwise ignored.

    Args:
        record: JSON-serialisable mapping describing one tool invocation.
    """
    entry = dict(record)
    entry.setdefault("ts", datetime.now(timezone.utc).isoformat())
    try:
        # Serialise before opening the file so a bad record cannot leave a
        # partially written line behind.
        line = json.dumps(entry)
        with open(_action_log_path(), "a", encoding="utf-8") as fh:
            fh.write(line + "\n")
    except (OSError, TypeError, ValueError) as exc:
        log.warning("Could not write computer_use log: %s", exc)

# ---------------------------------------------------------------------------
# Screenshot helper
# ---------------------------------------------------------------------------

def _snap(prefix: str = "nexus_snap") -> Optional[str]:
    """Take a screenshot and return the saved path, or None on failure.

    The image is written to ``_LOG_DIR`` as ``<prefix>_<epoch-ms>.png``.

    Args:
        prefix: File-name prefix used to tell evidence shots apart
                (e.g. "before_click", "after_type").

    Returns:
        The saved file path as a string, or None when pyautogui is
        unavailable or when creating the directory, capturing the screen
        or saving the image raises.
    """
    if not _PYAUTOGUI_OK or pyautogui is None:
        return None
    try:
        # Directory creation lives inside the try so that an unwritable
        # log directory yields None instead of an exception in the caller.
        _LOG_DIR.mkdir(parents=True, exist_ok=True)
        ts = int(time.time() * 1000)
        path = str(_LOG_DIR / f"{prefix}_{ts}.png")
        pyautogui.screenshot().save(path)
    except Exception as exc:
        log.warning("Screenshot failed: %s", exc)
        return None
    return path


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def computer_screenshot(output_path: Optional[str] = None) -> dict[str, Any]:
    """
    Capture a screenshot of the current desktop.

    Args:
        output_path: Where to save the PNG.  When omitted, a timestamped
                     ``nexus_snap_<epoch-ms>.png`` is created in the log
                     directory (NEXUS_HOME, default ~/.nexus).  Path-like
                     objects are accepted and reported back as a string.

    Returns:
        {"ok": True, "tool": "computer_screenshot", "path": "<saved path>"}
        or {"ok": False, "tool": "computer_screenshot", "error": "<reason>"}

    The outcome is appended to the action log in both cases.
    """
    tool = "computer_screenshot"
    if not _PYAUTOGUI_OK or pyautogui is None:
        result: dict[str, Any] = {
            "ok": False, "tool": tool, "error": "pyautogui not available",
        }
        _write_log(result)
        return result

    try:
        if output_path is None:
            # Directory creation is inside the try so that an unwritable
            # log directory becomes an error result, not an exception.
            _LOG_DIR.mkdir(parents=True, exist_ok=True)
            ts = int(time.time() * 1000)
            target = str(_LOG_DIR / f"nexus_snap_{ts}.png")
        else:
            # Normalise path-like input so the result stays JSON-friendly.
            target = os.fspath(output_path)
        pyautogui.screenshot().save(target)
        result = {"ok": True, "tool": tool, "path": target}
    except Exception as exc:
        result = {"ok": False, "tool": tool, "error": str(exc)}

    _write_log(result)
    return result


def computer_click(
    x: int,
    y: int,
    *,
    button: str = "left",
    confirm: bool = False,
) -> dict[str, Any]:
    """
    Click a mouse button at screen coordinates (x, y).

    Poka-yoke: "right" and "middle" clicks are refused unless confirm=True
    is passed; a plain left click never needs confirmation.  A refusal, a
    missing pyautogui and a failed click are each reported through the
    returned dict.

    A screenshot is taken before the click and another one after it
    succeeds.  Every outcome is appended to the action log.

    Args:
        x, y:    Screen coordinates.
        button:  "left" (default), "right", or "middle".  Any other value
                 produces an error result.
        confirm: Set True to acknowledge that the action may have
                 irreversible effects (required for right/middle).

    Returns:
        On success:
            {"ok": True, "tool": "computer_click", "x": x, "y": y,
             "button": button, "before_screenshot": path_or_None,
             "screenshot": path_or_None}
        On failure:
            {"ok": False, "tool": "computer_click", "x": x, "y": y,
             "button": button, "error": str}, plus "before_screenshot"
            when the click itself was attempted.
    """
    tool = "computer_click"
    # Fields shared by every result this call can produce.
    common: dict[str, Any] = {"tool": tool, "x": x, "y": y, "button": button}

    # Poka-yoke: non-left buttons must be explicitly acknowledged.
    needs_confirm = button in ("right", "middle")
    if needs_confirm and not confirm:
        outcome: dict[str, Any] = {
            "ok": False,
            **common,
            "error": (
                f"button='{button}' requires confirm=True "
                "(pass confirm=True to acknowledge the action)"
            ),
        }
        _write_log(outcome)
        return outcome

    if pyautogui is None or not _PYAUTOGUI_OK:
        outcome = {"ok": False, **common, "error": "pyautogui not available"}
        _write_log(outcome)
        return outcome

    before = _snap("before_click")
    try:
        # Map the button name to the pyautogui function that performs it.
        method_name = {
            "left": "click",
            "right": "rightClick",
            "middle": "middleClick",
        }.get(button)
        if method_name is None:
            raise ValueError(f"Unknown button: {button!r}")
        getattr(pyautogui, method_name)(x, y)
        outcome = {
            "ok": True,
            **common,
            "before_screenshot": before,
            "screenshot": _snap("after_click"),
        }
    except Exception as exc:
        outcome = {
            "ok": False,
            **common,
            "error": str(exc),
            "before_screenshot": before,
        }

    _write_log(outcome)
    return outcome


# Substrings (matched case-insensitively) that mark text as potentially
# sensitive.  Matching is deliberately broad: "key" also matches "keyboard".
_SENSITIVE_PATTERNS = ("password", "secret", "token", "key", "pass", "pwd")


def computer_type(text: str, *, confirm: bool = False) -> dict[str, Any]:
    """
    Type a string of text using keyboard simulation.

    Poka-yoke: if the lower-cased text contains any substring from
    ``_SENSITIVE_PATTERNS`` the call is refused unless confirm=True is
    passed explicitly.

    Pure-ASCII text is typed key by key.  Anything else is copied to the
    clipboard with pyperclip and pasted with the platform shortcut
    (Cmd+V on macOS, Ctrl+V elsewhere); this overwrites the clipboard.

    Only the text length is logged and returned, never the text itself.
    The dict written to the action log is always a copy of the result.

    Args:
        text:    The string to type.
        confirm: Required when text looks sensitive (contains
                 "password", "token", "key", etc.).

    Returns:
        On success:
            {"ok": True, "tool": "computer_type", "length": len(text),
             "before_screenshot": path_or_None, "screenshot": path_or_None}
        On failure:
            {"ok": False, "tool": "computer_type", "length": len(text),
             "error": str}, plus "before_screenshot" when typing was
            attempted.
    """
    import sys  # local import: only needed to pick the paste shortcut

    tool = "computer_type"

    lower = text.lower()
    looks_sensitive = any(pat in lower for pat in _SENSITIVE_PATTERNS)
    if looks_sensitive and not confirm:
        result: dict[str, Any] = {
            "ok": False,
            "tool": tool,
            "length": len(text),
            "error": (
                "Text appears to contain sensitive data "
                "(password/token/key). Pass confirm=True to proceed."
            ),
        }
        _write_log({**result, "text_length": len(text)})
        return result

    if not _PYAUTOGUI_OK or pyautogui is None:
        result = {"ok": False, "tool": tool, "length": len(text),
                  "error": "pyautogui not available"}
        # Log a copy, like every other branch of this function.
        _write_log({**result})
        return result

    before = _snap("before_type")
    try:
        # typewrite handles ASCII; for unicode use pyperclip + paste hotkey
        if text.isascii():
            pyautogui.typewrite(text, interval=0.02)
        else:
            # Fallback: copy-paste via clipboard for unicode
            try:
                import pyperclip  # type: ignore
            except ImportError as exc:
                raise RuntimeError(
                    "Unicode text requires pyperclip: pip install pyperclip"
                ) from exc
            pyperclip.copy(text)
            # macOS pastes with Cmd+V; Ctrl+V does nothing useful there.
            paste_modifier = "command" if sys.platform == "darwin" else "ctrl"
            pyautogui.hotkey(paste_modifier, "v")
        after = _snap("after_type")
        result = {
            "ok": True, "tool": tool,
            "length": len(text),
            "before_screenshot": before,
            "screenshot": after,
        }
    except Exception as exc:
        result = {
            "ok": False, "tool": tool,
            "length": len(text),
            "error": str(exc),
            "before_screenshot": before,
        }

    _write_log({**result})
    return result


def computer_scroll(
    x: int,
    y: int,
    amount: int,
) -> dict[str, Any]:
    """
    Move the pointer to (x, y) and turn the mouse wheel there.

    A screenshot is taken after a successful scroll, and every outcome is
    appended to the action log.

    Args:
        x, y:   Coordinates to move the mouse to before scrolling.
        amount: Number of scroll clicks.  Positive = scroll up / zoom in,
                negative = scroll down / zoom out.

    Returns:
        On success:
            {"ok": True, "tool": "computer_scroll", "x": x, "y": y,
             "amount": amount, "screenshot": path_or_None}
        On failure:
            {"ok": False, "tool": "computer_scroll", "x": x, "y": y,
             "amount": amount, "error": str}
    """
    tool = "computer_scroll"
    # Fields shared by every result this call can produce.
    fields: dict[str, Any] = {"tool": tool, "x": x, "y": y, "amount": amount}

    if pyautogui is None or not _PYAUTOGUI_OK:
        outcome: dict[str, Any] = {
            "ok": False,
            **fields,
            "error": "pyautogui not available",
        }
        _write_log(outcome)
        return outcome

    try:
        pyautogui.moveTo(x, y)
        pyautogui.scroll(amount)
        # Evidence shot is taken only once both calls above have returned.
        outcome = {"ok": True, **fields, "screenshot": _snap("after_scroll")}
    except Exception as exc:
        outcome = {"ok": False, **fields, "error": str(exc)}

    _write_log(outcome)
    return outcome


# ---------------------------------------------------------------------------
# Convenience: read action log
# ---------------------------------------------------------------------------

def read_action_log(last_n: int = 20) -> list[dict[str, Any]]:
    """Return the last *last_n* records from the action log.

    Args:
        last_n: Maximum number of most-recent records to return.  Zero or
                a negative value yields an empty list.

    Returns:
        The parsed records, oldest first.  Returns an empty list when the
        log file does not exist.  Blank lines and lines that are not valid
        JSON are skipped.
    """
    if last_n <= 0:
        # records[-0:] is the whole list, so "no records" must be explicit.
        return []

    try:
        content = _action_log_path().read_text(encoding="utf-8")
    except FileNotFoundError:
        return []

    records: list[dict[str, Any]] = []
    for raw in content.splitlines():
        line = raw.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Best effort: one corrupt or partially written line should not
            # hide the rest of the log.
            continue
    return records[-last_n:]