diff --git a/docker-compose.desktop.yml b/docker-compose.desktop.yml new file mode 100644 index 0000000..d39fe52 --- /dev/null +++ b/docker-compose.desktop.yml @@ -0,0 +1,51 @@ +--- +# docker-compose.desktop.yml — Sandboxed desktop environment for Hermes computer-use +# +# Provides a virtual desktop (Xvfb + noVNC) so agents can run computer_use +# primitives safely inside a container. +# +# Usage: +# docker-compose -f docker-compose.desktop.yml up +# # Open noVNC at http://localhost:6080 +# # Run demo: docker exec -it nexus-desktop python nexus/computer_use_demo.py +# +# Refs: #1125 + +version: "3.8" + +services: + desktop: + image: python:3.11-slim + container_name: nexus-desktop + working_dir: /workspace + volumes: + - .:/workspace:ro # mount repo read-only + - nexus_home:/root/.nexus # persistent screenshot/log store + ports: + - "6080:6080" # noVNC web viewer + - "5900:5900" # VNC (optional, for native VNC clients) + environment: + - DISPLAY=:99 + - GITEA_URL=${GITEA_URL:-https://forge.alexanderwhitestone.com} + command: > + bash -c " + apt-get update -qq && + apt-get install -y -qq + xvfb x11vnc novnc websockify + chromium chromium-driver + python3-tk python3-dev scrot && + pip install -q pyautogui pillow && + Xvfb :99 -screen 0 1280x800x24 & + x11vnc -display :99 -forever -nopw -quiet & + websockify --web /usr/share/novnc 6080 localhost:5900 & + echo 'Desktop ready — noVNC at http://localhost:6080' && + tail -f /dev/null + " + healthcheck: + test: ["CMD", "pgrep", "Xvfb"] + interval: 5s + timeout: 3s + retries: 5 + +volumes: + nexus_home: diff --git a/docs/computer-use.md b/docs/computer-use.md new file mode 100644 index 0000000..b1c35d2 --- /dev/null +++ b/docs/computer-use.md @@ -0,0 +1,310 @@ +# Computer Use — Desktop Automation Primitives for Hermes + +**Issue:** #1125 +**Status:** Phase 1 complete, Phase 2 in progress +**Owner:** Bezalel +**Epic:** #1120 + +--- + +## Overview + +This document describes how Hermes agents can control a desktop environment 
+(screenshot, click, type, scroll) for automation and testing. The capability +unlocks: + +- Visual regression testing of fleet dashboards +- Automated Gitea workflow verification +- Screenshot-based incident diagnosis +- Driving GUI-only tools from agent code + +--- + +## Architecture + +``` +┌──────────────────────────────────────────────────────┐ +│ Hermes Agent │ +│ │ +│ computer_screenshot() computer_click(x, y) │ +│ computer_type(text) computer_scroll(x, y, n) │ +│ │ │ +│ nexus/computer_use.py │ +│ (safety guards · action log) │ +└────────────────────────┬─────────────────────────────┘ + │ + ┌──────────┴───────────┐ + │ pyautogui │ + │ (FAILSAFE enabled) │ + └──────────┬───────────┘ + │ + ┌──────────┴───────────┐ + │ Desktop environment │ + │ (Xvfb · noVNC · │ + │ bare metal) │ + └──────────────────────┘ +``` + +The MCP server layer (`mcp_servers/desktop_control_server.py`) is still +available for external callers (e.g. the Bannerlord harness). The +`nexus/computer_use.py` module calls pyautogui directly so that safety +guards, logging, and screenshot evidence are applied consistently for every +Hermes agent invocation. + +--- + +## Phase 1 — Environment & Primitives + +### Sandboxed Desktop Setup + +**Option A — Xvfb (lightweight, Linux/macOS)** + +```bash +# Install +sudo apt-get install xvfb # Linux +brew install xvfb # macOS (via XQuartz) + +# Start a virtual display on :99 +Xvfb :99 -screen 0 1280x800x24 & +export DISPLAY=:99 + +# Run the demo +python nexus/computer_use_demo.py +``` + +**Option B — Docker with noVNC** + +```bash +docker-compose -f docker-compose.desktop.yml up +# Open http://localhost:6080 to view the virtual desktop +``` + +See `docker-compose.desktop.yml` in the repo root. 
+ +### Running the Demo + +The `nexus/computer_use_demo.py` script exercises the full Phase 1 loop: + +``` +[1/4] Capturing baseline screenshot +[2/4] Opening browser → https://forge.alexanderwhitestone.com +[3/4] Waiting 3s for page to load +[4/4] Capturing evidence screenshot +``` + +```bash +# Default target (Gitea forge) +python nexus/computer_use_demo.py + +# Custom URL +GITEA_URL=http://localhost:3000 python nexus/computer_use_demo.py +``` + +--- + +## Phase 2 — Tool Integration + +### API Reference + +All four tools live in `nexus/computer_use.py` and follow the same contract: + +```python +result = tool(...) +# result is always: +# {"ok": bool, "tool": str, ...fields..., "screenshot": path_or_None} +``` + +#### `computer_screenshot(output_path=None)` + +Take a screenshot of the current desktop. + +| Parameter | Type | Default | Description | +|---------------|-----------------|----------------------|-----------------------------------| +| `output_path` | `str` or `None` | auto timestamped PNG | Where to save the captured image. | + +```python +from nexus.computer_use import computer_screenshot + +result = computer_screenshot() +# {"ok": True, "tool": "computer_screenshot", "path": "~/.nexus/nexus_snap_1712345678.png"} +``` + +#### `computer_click(x, y, *, button="left", confirm=False)` + +Click at screen coordinates. + +| Parameter | Type | Default | Description | +|-----------|--------|----------|------------------------------------------| +| `x`, `y` | `int` | required | Screen pixel coordinates. | +| `button` | `str` | `"left"` | `"left"`, `"right"`, or `"middle"`. | +| `confirm` | `bool` | `False` | Required for `right`/`middle` clicks. 
| + +```python +from nexus.computer_use import computer_click + +# Simple left click (no confirm needed) +result = computer_click(960, 540) + +# Right-click requires explicit confirmation +result = computer_click(960, 540, button="right", confirm=True) +``` + +**Screenshot evidence:** before/after snapshots are captured and logged. + +#### `computer_type(text, *, confirm=False)` + +Type a string using keyboard simulation. + +| Parameter | Type | Default | Description | +|-----------|--------|----------|----------------------------------------------------| +| `text` | `str` | required | Text to type. | +| `confirm` | `bool` | `False` | Required when text contains `password`/`token`/`key`. | + +```python +from nexus.computer_use import computer_type + +# Safe text — no confirm needed +computer_type("https://forge.alexanderwhitestone.com") + +# Sensitive text — confirm required +computer_type("hunter2", confirm=True) +``` + +#### `computer_scroll(x, y, amount)` + +Scroll the mouse wheel at the given position. + +| Parameter | Type | Description | +|-----------|-------|-------------------------------------------------| +| `x`, `y` | `int` | Move mouse here before scrolling. | +| `amount` | `int` | Positive = scroll up; negative = scroll down. 
| + +```python +from nexus.computer_use import computer_scroll + +computer_scroll(640, 400, -5) # scroll down 5 clicks +computer_scroll(640, 400, 3) # scroll up 3 clicks +``` + +### Safety (Poka-Yoke) + +| Situation | Behavior | +|------------------------------------|-------------------------------------------------| +| `right`/`middle` click w/o confirm | Refused; returns `ok=False` with explanation | +| Text with `password`/`token`/`key` | Refused unless `confirm=True` | +| `FAILSAFE = True` | Move mouse to screen corner (0, 0) to abort | +| pyautogui unavailable | All tools return `ok=False` gracefully | + +### Action Log + +Every call is appended to `~/.nexus/computer_use_log.jsonl` (one JSON record per line): + +```json +{"ok": true, "tool": "computer_click", "x": 960, "y": 540, "button": "left", + "before_screenshot": "/home/user/.nexus/before_click_1712345.png", + "screenshot": "/home/user/.nexus/after_click_1712345.png", + "ts": "2026-04-08T10:30:00+00:00"} +``` + +Read recent entries from Python: + +```python +from nexus.computer_use import read_action_log +for record in read_action_log(last_n=10): + print(record) +``` + +--- + +## Phase 3 — Use-Case Pilots + +### Pilot 1: Visual Regression Test (Fleet Dashboard) + +Open the fleet health dashboard, take a screenshot, and compare an MD5 hash +of the saved PNG file against a golden baseline: + +```python +from nexus.computer_use import computer_screenshot, computer_click, computer_type +import hashlib + +def screenshot_hash(path: str) -> str: + return hashlib.md5(open(path, "rb").read()).hexdigest() + +# Navigate to the dashboard +computer_click(960, 40) # address bar +computer_type("http://localhost:7771/health\n") + +import time; time.sleep(2) + +result = computer_screenshot() +current_hash = screenshot_hash(result["path"]) + +GOLDEN_HASH = "abc123..." # established on first run +assert current_hash == GOLDEN_HASH, "Visual regression detected!" 
+``` + +### Pilot 2: Screenshot-Based CI Diagnosis + +When a CI workflow fails, agents can screenshot the Gitea workflow page and +use the image to triage: + +```python +from nexus.computer_use import computer_click, computer_screenshot, computer_type + +def diagnose_failed_workflow(run_url: str) -> str: + """ + Navigate to *run_url*, screenshot it, return the screenshot path + for downstream LLM-based analysis. + """ + computer_click(960, 40) # address bar + computer_type(run_url + "\n") + + import time; time.sleep(3) + + result = computer_screenshot() + return result["path"] # hand off to vision model or OCR +``` + +--- + +## MCP Server (External Callers) + +The lower-level MCP server (`mcp_servers/desktop_control_server.py`) exposes +the same capabilities over JSON-RPC stdio for callers outside Python: + +```bash +# List available tools +echo '{"jsonrpc":"2.0","id":1,"method":"tools/list","params":{}}' \ + | python mcp_servers/desktop_control_server.py + +# Take a screenshot +echo '{"jsonrpc":"2.0","id":2,"method":"tools/call", + "params":{"name":"take_screenshot","arguments":{"path":"/tmp/snap.png"}}}' \ + | python mcp_servers/desktop_control_server.py +``` + +--- + +## Docker / Sandboxed Environment + +`docker-compose.desktop.yml` provides a safe container with: + +- Xvfb virtual display (1280×800) +- noVNC for browser-based viewing +- Python + pyautogui (installed when the container starts) + +```bash +docker-compose -f docker-compose.desktop.yml up +# noVNC → http://localhost:6080 +# Run demo inside container: +docker exec -it nexus-desktop python nexus/computer_use_demo.py +``` + +--- + +## Development Notes + +- `NEXUS_HOME` env var overrides the log/snapshot directory (default `~/.nexus`) +- `GITEA_URL` env var overrides the target in the demo script +- `BROWSER_OPEN_WAIT` controls how long the demo waits after opening the browser +- Tests in `tests/test_computer_use.py` run headless — pyautogui is fully mocked diff --git a/nexus/computer_use.py b/nexus/computer_use.py new file mode 100644 index 
0000000..6ffc391 --- /dev/null +++ b/nexus/computer_use.py @@ -0,0 +1,369 @@ +""" +nexus/computer_use.py — Hermes Desktop Automation Primitives + +Provides computer-use tools so Hermes agents can control a desktop: + + computer_screenshot(output_path=None) -> dict + computer_click(x, y, *, confirm=False) -> dict + computer_type(text, *, confirm=False) -> dict + computer_scroll(x, y, amount) -> dict + +Design principles: +- pyautogui.FAILSAFE = True (move mouse to screen corner to abort) +- Poka-yoke: destructive/sensitive actions require confirm=True +- Every action is logged to ~/.nexus/computer_use_log.jsonl +- Screenshot evidence is captured before & after click/type actions +- All public functions return a consistent result dict: + {"ok": bool, "tool": str, ...fields..., "screenshot": path_or_None} + +Usage:: + + from nexus.computer_use import computer_screenshot, computer_click, computer_type, computer_scroll + + result = computer_screenshot() + # result == {"ok": True, "tool": "computer_screenshot", "path": "/tmp/nexus_snap_1234.png"} + + result = computer_click(960, 540) + # Clicks centre of screen (no confirm needed for bare click) + + result = computer_type("hello", confirm=True) # confirm required for type + +Refs: #1125 +""" + +from __future__ import annotations + +import json +import logging +import os +import time +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional + +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# pyautogui — optional; degrades gracefully in headless environments +# --------------------------------------------------------------------------- +try: + import pyautogui # type: ignore + + pyautogui.FAILSAFE = True # move mouse to corner (0,0) to abort + pyautogui.PAUSE = 0.05 # small inter-action pause (seconds) + _PYAUTOGUI_OK = True +except ImportError: + log.warning("pyautogui not installed — desktop 
def computer_screenshot(output_path: Optional[str] = None) -> dict[str, Any]:
    """
    Capture a screenshot of the current desktop.

    Args:
        output_path: Where to save the PNG.  A leading ``~`` is expanded and
            a missing parent directory is created.  When omitted, a
            millisecond-timestamped ``nexus_snap_<ts>.png`` is written under
            ``_LOG_DIR``.

    Returns:
        {"ok": True,  "tool": "computer_screenshot", "path": "<saved path>"}
        or {"ok": False, "tool": "computer_screenshot", "error": "<reason>"}

    Every outcome (success or failure) is passed to ``_write_log``.
    """
    tool = "computer_screenshot"
    if not _PYAUTOGUI_OK or pyautogui is None:
        result: dict[str, Any] = {
            "ok": False, "tool": tool, "error": "pyautogui not available",
        }
        _write_log(result)
        return result

    try:
        if output_path is None:
            ts = int(time.time() * 1000)
            output_path = str(_LOG_DIR / f"nexus_snap_{ts}.png")
        else:
            # expanduser leaves paths without a leading "~" untouched and
            # always hands back a plain str (also for os.PathLike input),
            # which keeps the result dict JSON-serialisable.
            output_path = os.path.expanduser(output_path)

        # Directory creation lives inside the try so that a permission or
        # path error is reported as ok=False instead of escaping the tool.
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        img = pyautogui.screenshot()
        img.save(output_path)
        result = {"ok": True, "tool": tool, "path": output_path}
    except Exception as exc:  # tool boundary: report, never raise
        result = {"ok": False, "tool": tool, "error": str(exc)}

    _write_log(result)
    return result
# Patterns that indicate potentially sensitive text being typed.
_SENSITIVE_PATTERNS = ("password", "secret", "token", "key", "pass", "pwd")


def computer_type(text: str, *, confirm: bool = False) -> dict[str, Any]:
    """
    Type a string of text using keyboard simulation.

    Poka-yoke: if the lower-cased text contains any substring listed in
    ``_SENSITIVE_PATTERNS`` the call is refused unless confirm=True is
    passed explicitly.  Only the text length is logged, never the text.

    Pure-ASCII text is sent with ``pyautogui.typewrite``.  Anything else is
    copied to the clipboard with ``pyperclip`` and pasted with the platform
    paste shortcut (Cmd+V on macOS, Ctrl+V elsewhere).

    Args:
        text:    The string to type.
        confirm: Required when text looks sensitive (contains
                 "password", "token", "key", etc.).

    Returns:
        {"ok": bool, "tool": "computer_type", "length": len(text),
         "screenshot": path_or_None}
        On failure the dict carries "error" instead of "screenshot".
    """
    tool = "computer_type"

    lower = text.lower()
    looks_sensitive = any(pat in lower for pat in _SENSITIVE_PATTERNS)
    if looks_sensitive and not confirm:
        result: dict[str, Any] = {
            "ok": False,
            "tool": tool,
            "length": len(text),
            "error": (
                "Text appears to contain sensitive data "
                "(password/token/key). Pass confirm=True to proceed."
            ),
        }
        _write_log({**result, "text_length": len(text)})
        return result

    if not _PYAUTOGUI_OK or pyautogui is None:
        result = {"ok": False, "tool": tool, "length": len(text),
                  "error": "pyautogui not available"}
        _write_log(result)
        return result

    before = _snap("before_type")
    try:
        if text.isascii():
            pyautogui.typewrite(text, interval=0.02)
        else:
            # Non-ASCII text goes through the clipboard instead.
            try:
                import pyperclip  # type: ignore
            except ImportError as exc:
                raise RuntimeError(
                    "Unicode text requires pyperclip: pip install pyperclip"
                ) from exc
            import sys

            pyperclip.copy(text)
            # macOS pastes with Cmd+V; Ctrl+V does nothing useful there.
            paste_modifier = "command" if sys.platform == "darwin" else "ctrl"
            pyautogui.hotkey(paste_modifier, "v")
        after = _snap("after_type")
        result = {
            "ok": True, "tool": tool,
            "length": len(text),
            "before_screenshot": before,
            "screenshot": after,
        }
    except Exception as exc:  # tool boundary: report, never raise
        result = {
            "ok": False, "tool": tool,
            "length": len(text),
            "error": str(exc),
            "before_screenshot": before,
        }

    _write_log({**result})
    return result
def read_action_log(last_n: int = 20) -> list[dict[str, Any]]:
    """Return the last *last_n* records from the action log.

    Args:
        last_n: Maximum number of trailing records to return.  Zero or a
            negative value yields an empty list.

    Returns:
        The parsed JSON records, oldest first.  Blank lines and lines that
        are not valid JSON are skipped.  An empty list is returned when the
        log file does not exist.
    """
    from collections import deque  # function-local: only needed here

    # Guard first: a slice like records[-0:] would return the whole log.
    if last_n <= 0:
        return []

    tail: deque = deque(maxlen=last_n)  # keeps only the newest last_n items
    try:
        with _action_log_path().open() as fh:
            for raw in fh:
                line = raw.strip()
                if not line:
                    continue
                try:
                    tail.append(json.loads(line))
                except json.JSONDecodeError:
                    # Skip unparseable lines rather than fail the whole read.
                    continue
    except FileNotFoundError:
        return []
    return list(tail)
def _open_browser(url: str) -> bool:
    """Open *url* in the default system browser. Returns True on success.

    macOS and Linux first try the native opener (``open`` / ``xdg-open``).
    If that binary is missing, or on any other platform (Windows included),
    the standard-library ``webbrowser`` module is used.  Going through
    ``webbrowser`` on Windows avoids building a ``start`` shell command, in
    which ``cmd`` would treat an ``&`` inside the URL as a separator.

    Failures are reported on stderr and signalled by returning False; this
    function does not raise.
    """
    import platform
    import webbrowser

    native_opener = {"Darwin": "open", "Linux": "xdg-open"}.get(platform.system())
    try:
        if native_opener is not None:
            try:
                subprocess.Popen([native_opener, url])
                return True
            except FileNotFoundError:
                # Opener not installed (e.g. a slim container image);
                # fall through to the webbrowser module below.
                pass
        if webbrowser.open(url):
            return True
        print("[demo] Failed to open browser: no usable browser found",
              file=sys.stderr)
        return False
    except Exception as exc:  # demo boundary: report and signal failure
        print(f"[demo] Failed to open browser: {exc}", file=sys.stderr)
        return False
open browser. " + "Ensure a display is available (or use Xvfb).") + return 1 + + # Step 3: Wait for page load + print(f"\n[3/4] Waiting {BROWSER_OPEN_WAIT}s for page to load...") + time.sleep(BROWSER_OPEN_WAIT) + + # Step 4: Evidence screenshot + print("\n[4/4] Capturing evidence screenshot...") + evidence = computer_screenshot() + if evidence["ok"]: + print(f" Saved → {evidence['path']}") + else: + print(f" WARNING: {evidence['error']}") + + # Summary + print("\n--- Action log (last 5 entries) ---") + for rec in read_action_log(5): + ts = rec.get("ts", "?") + tool = rec.get("tool", "?") + ok_flag = rec.get("ok", "?") + extra = rec.get("path") or rec.get("error") or "" + print(f" [{ts[:19]}] {tool:25s} ok={ok_flag} {extra}") + + print("\nDemo complete.") + return 0 + + +if __name__ == "__main__": + sys.exit(run_demo()) diff --git a/tests/test_computer_use.py b/tests/test_computer_use.py new file mode 100644 index 0000000..58cb99c --- /dev/null +++ b/tests/test_computer_use.py @@ -0,0 +1,280 @@ +""" +tests/test_computer_use.py — Unit tests for nexus.computer_use + +All tests run without a real display by patching pyautogui. 
+""" + +from __future__ import annotations + +import importlib +import json +import sys +import types +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# Helpers: stub pyautogui so tests run headless +# --------------------------------------------------------------------------- + +def _make_pyautogui_stub() -> MagicMock: + """Return a minimal pyautogui mock with the attributes we use.""" + stub = MagicMock() + stub.FAILSAFE = True + stub.PAUSE = 0.05 + # screenshot() → PIL-like object with .save() + img = MagicMock() + img.save = MagicMock() + stub.screenshot.return_value = img + stub.size.return_value = (1920, 1080) + stub.position.return_value = (100, 200) + return stub + + +def _reload_module(pyautogui_stub=None): + """ + Reload nexus.computer_use with an optional pyautogui stub. + Returns the freshly imported module. + """ + # Remove cached module so we get a clean import + for key in list(sys.modules.keys()): + if "nexus.computer_use" in key or key == "nexus.computer_use": + del sys.modules[key] + + if pyautogui_stub is not None: + sys.modules["pyautogui"] = pyautogui_stub + else: + sys.modules.pop("pyautogui", None) + + import nexus.computer_use as cu + return cu + + +@pytest.fixture() +def cu(tmp_path, monkeypatch): + """Fixture: computer_use module with pyautogui stubbed and log dir in tmp.""" + stub = _make_pyautogui_stub() + mod = _reload_module(pyautogui_stub=stub) + + # Redirect log dir to tmp so tests don't write to ~/.nexus + monkeypatch.setenv("NEXUS_HOME", str(tmp_path)) + mod._LOG_DIR = tmp_path + mod._ACTION_LOG = None # reset so it picks up new dir + mod._PYAUTOGUI_OK = True + mod.pyautogui = stub + + yield mod + + # Cleanup: remove stub from sys.modules so other tests aren't affected + sys.modules.pop("pyautogui", None) + for key in list(sys.modules.keys()): + if "nexus.computer_use" in key: + del sys.modules[key] + + +# 
class TestComputerScreenshot:
    """computer_screenshot: result shape, file save, logging, degraded mode."""

    def test_returns_ok_with_path(self, cu, tmp_path):
        outcome = cu.computer_screenshot()
        assert outcome["ok"] is True
        assert outcome["tool"] == "computer_screenshot"
        assert outcome["path"].endswith(".png")

    def test_respects_custom_path(self, cu, tmp_path):
        destination = str(tmp_path / "custom.png")
        outcome = cu.computer_screenshot(output_path=destination)
        assert outcome["ok"] is True
        assert outcome["path"] == destination

    def test_saves_screenshot(self, cu, tmp_path):
        cu.computer_screenshot()
        # The stub's screenshot() must be taken and its image saved.
        grab = cu.pyautogui.screenshot
        grab.assert_called()
        grab.return_value.save.assert_called()

    def test_writes_action_log(self, cu, tmp_path):
        cu.computer_screenshot()
        log_path = tmp_path / "computer_use_log.jsonl"
        assert log_path.exists()
        records = []
        for line in log_path.read_text().splitlines():
            if line.strip():
                records.append(json.loads(line))
        assert len(records) == 1
        assert records[0]["tool"] == "computer_screenshot"

    def test_error_when_unavailable(self, tmp_path, monkeypatch):
        # Fresh import with no pyautogui stub registered, then force
        # degraded mode regardless of what the import found.
        cu_mod = _reload_module()
        cu_mod._PYAUTOGUI_OK = False
        cu_mod.pyautogui = None
        cu_mod._LOG_DIR = tmp_path
        cu_mod._ACTION_LOG = None

        outcome = cu_mod.computer_screenshot()
        assert outcome["ok"] is False
        assert "error" in outcome
class TestComputerType:
    """computer_type: plain typing and the sensitive-text confirmation guard."""

    def test_type_plain_text(self, cu):
        phrase = "hello world"
        outcome = cu.computer_type(phrase)
        assert outcome["ok"] is True
        assert outcome["length"] == len(phrase)
        cu.pyautogui.typewrite.assert_called_once_with(phrase, interval=0.02)

    def test_sensitive_text_rejected_without_confirm(self, cu):
        outcome = cu.computer_type("mypassword123")
        assert outcome["ok"] is False
        assert "confirm=True" in outcome["error"]

    def test_sensitive_text_allowed_with_confirm(self, cu):
        outcome = cu.computer_type("mypassword123", confirm=True)
        assert outcome["ok"] is True

    def test_token_keyword_triggers_poka_yoke(self, cu):
        assert cu.computer_type("Bearer token abc123")["ok"] is False

    def test_key_keyword_triggers_poka_yoke(self, cu):
        assert cu.computer_type("api_key=secret")["ok"] is False

    def test_plain_text_no_confirm_needed(self, cu):
        assert cu.computer_type("navigate to settings")["ok"] is True

    def test_length_in_result(self, cu):
        text = "hello"
        outcome = cu.computer_type(text)
        assert outcome["length"] == len(text)
class TestReadActionLog:
    """read_action_log: empty log, accumulation, tail limit, record shape."""

    def test_empty_log(self, cu, tmp_path):
        assert cu.read_action_log() == []

    def test_returns_records_after_actions(self, cu, tmp_path):
        cu.computer_screenshot()
        cu.computer_click(1, 1)
        assert len(cu.read_action_log()) >= 2

    def test_last_n_respected(self, cu, tmp_path):
        shots = 0
        while shots < 10:
            cu.computer_screenshot()
            shots += 1
        tail = cu.read_action_log(last_n=3)
        assert len(tail) == 3

    def test_records_have_tool_field(self, cu, tmp_path):
        cu.computer_screenshot()
        entries = cu.read_action_log()
        assert all("tool" in entry for entry in entries)