WIP: Gemini Code progress on #1008

Automated salvage commit — agent session ended (exit 124). Work in progress, may need continuation.
2026-03-23 14:48:35 -04:00
parent cd1bc2bf6b
commit 037494c19b
9 changed files with 380 additions and 8 deletions
--- a/src/infrastructure/perception/init.py
+++ b/src/infrastructure/perception/init.py
--- a/src/infrastructure/perception/processor.py
+++ b/src/infrastructure/perception/processor.py
@@ -0,0 +1,117 @@
+"""
+Metal-accelerated image processing and UI state classification for perception.
+
+This module provides functions to preprocess raw image frames (resizing,
+grayscale, contrast enhancement) using Metal shaders and to classify
+UI states using Core ML models, explicitly pinned to the GPU/Neural Engine.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+# For now, we'll use Pillow as a placeholder for image operations.
+# In a real Metal integration, these would be replaced by calls to Metal shaders.
+try:
+    from PIL import Image
+    from PIL import ImageOps
+except ImportError:
+    Image = None
+    ImageOps = None
+    logging.warning("Pillow not installed. Image processing will be a no-op.")
+
+
+logger = logging.getLogger(__name__)
+
+
+class MetalPerceptionProcessor:
+    """
+    Handles Metal-accelerated frame preprocessing and Core ML UI state classification.
+    """
+
+    def __init__(self):
+        logger.info("Initializing MetalPerceptionProcessor.")
+        # REAL IMPLEMENTATION: Initialize Metal devices and command queue.
+        # This would involve using a library like PyObjC to interact with CoreGraphics,
+        # CoreImage, and MetalKit to set up the GPU for image processing.
+        # For this mock, we simulate readiness if Pillow is installed.
+        self.metal_ready = False
+        if Image is None:
+            logger.warning("MetalPerceptionProcessor will operate in no-op mode due to missing Pillow.")
+        else:
+            self.metal_ready = True # Simulate readiness if Pillow is available
+
+        # REAL IMPLEMENTATION: Load Core ML model and configure it to use the GPU/Neural Engine.
+        # This would involve using MLModel.load() and setting MLCpuOnly to false or
+        # specifying MLComputeUnits.all for optimal performance on Apple Silicon.
+        # For this mock, we just have a placeholder.
+        self.core_ml_model = None
+        logger.info("MetalPerceptionProcessor initialized.")
+
+    def preprocess_frame(self, raw_image_data: bytes, target_size: tuple[int, int] = (256, 256)) -> bytes:
+        """
+        Preprocesses a raw image frame using (simulated) Metal shaders.
+
+        Args:
+            raw_image_data: Raw bytes of the image (e.g., PNG, JPEG).
+            target_size: The desired (width, height) for resizing.
+
+        Returns:
+            Processed image data as bytes.
+        """
+        if not self.metal_ready:
+            logger.warning("Pillow not available. Skipping frame preprocessing.")
+            return raw_image_data
+
+        logger.debug("Preprocessing frame with (simulated) Metal shaders.")
+        try:
+            # Simulate Metal operations using Pillow
+            image = Image.open(io.BytesIO(raw_image_data))
+
+            # 1. Resize
+            image = image.resize(target_size, Image.Resampling.LANCZOS)
+
+            # 2. Grayscale
+            image = ImageOps.grayscale(image)
+
+            # 3. Contrast enhancement (simple example)
+            # This is a very basic contrast adjustment. Metal shaders would allow for more sophisticated algorithms.
+            image = ImageOps.autocontrast(image)
+
+            output_buffer = io.BytesIO()
+            image.save(output_buffer, format="PNG")  # Or appropriate format
+            return output_buffer.getvalue()
+
+        except Exception as e:
+            logger.error("Simulated Metal frame preprocessing failed: %s", e)
+            return raw_image_data # Return original data on failure
+
+    def classify_ui_state(self, preprocessed_image_data: bytes) -> dict[str, Any]:
+        """
+        Classifies the UI state using a (simulated) Core ML model.
+
+        Args:
+            preprocessed_image_data: Image data after preprocessing.
+
+        Returns:
+            A dictionary containing classification results (e.g., {"state": "dashboard", "confidence": 0.9}).
+        """
+        if not self.metal_ready:
+            logger.warning("Pillow not available. Skipping UI state classification.")
+            return {"state": "unknown", "confidence": 0.0}
+
+        logger.debug("Classifying UI state with (simulated) Core ML.")
+        # TODO: Implement actual Core ML model inference here.
+        # This would involve converting preprocessed_image_data to a CVPixelBuffer
+        # and feeding it to the Core ML model, ensuring it runs on GPU/Neural Engine.
+        # For now, return a mock result.
+        mock_results = {
+            "state": "dashboard_overview",
+            "confidence": 0.85,
+            "detected_elements": ["chart_widget", "notification_bell"],
+        }
+        return mock_results
+
+# Module-level singleton for easy access
+metal_perception_processor = MetalPerceptionProcessor()
--- a/src/infrastructure/world/adapters/desktop_mock.py
+++ b/src/infrastructure/world/adapters/desktop_mock.py
@@ -0,0 +1,102 @@
+"""Mock desktop world adapter — returns canned perception with a mock image.
+
+Useful for testing the heartbeat loop and Metal perception integration
+without needing a live desktop capture or a running game server.
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+from dataclasses import dataclass
+from datetime import UTC, datetime
+
+from infrastructure.world.interface import WorldInterface
+from infrastructure.world.types import (
+    ActionResult,
+    ActionStatus,
+    CommandInput,
+    PerceptionOutput,
+)
+
+logger = logging.getLogger(__name__)
+
+# A very simple 1x1 black PNG image, base64 encoded.
+# In a real scenario, this would be a more representative screenshot.
+_MOCK_PNG_IMAGE_DATA = (
+    b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
+)
+
+
+class DesktopMockWorldAdapter(WorldInterface):
+    """In-memory mock adapter simulating a desktop environment.
+
+    * ``observe()`` returns configurable canned perception, including a mock image in raw.
+    * ``act()`` logs the command and returns success.
+    * ``speak()`` logs the message.
+    """
+
+    def __init__(
+        self,
+        *,
+        location: str = "Desktop",
+        entities: list[str] | None = None,
+        events: list[str] | None = None,
+        mock_image_data: bytes = _MOCK_PNG_IMAGE_DATA,
+    ) -> None:
+        self._location = location
+        self._entities = entities or ["Browser", "Terminal"]
+        self._events = events or ["Window focused"]
+        self._connected = False
+        self._mock_image_data = mock_image_data
+        self.action_log: list[CommandInput] = []
+        self.speech_log: list[dict] = []
+
+    # -- lifecycle ---------------------------------------------------------
+
+    def connect(self) -> None:
+        self._connected = True
+        logger.info("DesktopMockWorldAdapter connected")
+
+    def disconnect(self) -> None:
+        self._connected = False
+        logger.info("DesktopMockWorldAdapter disconnected")
+
+    @property
+    def is_connected(self) -> bool:
+        return self._connected
+
+    # -- core contract -----------------------------------------------------
+
+    def observe(self) -> PerceptionOutput:
+        logger.debug("DesktopMockWorldAdapter.observe()")
+        return PerceptionOutput(
+            timestamp=datetime.now(UTC),
+            location=self._location,
+            entities=list(self._entities),
+            events=list(self._events),
+            raw={
+                "adapter": "desktop_mock",
+                "image": base64.b64decode(self._mock_image_data),
+                "image_format": "png",
+            },
+        )
+
+    def act(self, command: CommandInput) -> ActionResult:
+        logger.debug("DesktopMockWorldAdapter.act(%s)", command.action)
+        self.action_log.append(command)
+        return ActionResult(
+            status=ActionStatus.SUCCESS,
+            message=f"Mock desktop action executed: {command.action}",
+            data={"adapter": "desktop_mock"},
+        )
+
+    def speak(self, message: str, target: str | None = None) -> None:
+        logger.debug("DesktopMockWorldAdapter.speak(%r, target=%r)", message, target)
+        self.speech_log.append(
+            {
+                "message": message,
+                "target": target,
+                "timestamp": datetime.now(UTC).isoformat(),
+            }
+        )
--- a/src/infrastructure/world/benchmark/metrics.py
+++ b/src/infrastructure/world/benchmark/metrics.py
@@ -38,6 +38,7 @@ class ScenarioResult:
    wall_time_ms: int = 0
    llm_calls: int = 0
    metabolic_cost: float = 0.0
+    perception_latency_ms: int = 0 # New: Time taken for image preprocessing and UI classification.
    error: str | None = None
    tags: list[str] = field(default_factory=list)

@@ -120,11 +121,12 @@ class BenchmarkMetrics:
        lines.append("")
        for r in self.results:
            status = "PASS" if r.success else "FAIL"
+            perception_info = f", {r.perception_latency_ms} ms perception" if r.perception_latency_ms > 0 else ""
            lines.append(
                f"  [{status}] {r.scenario_name} — "
                f"{r.cycles_used}/{r.max_cycles} cycles, "
                f"{r.wall_time_ms} ms, "
-                f"{r.llm_calls} LLM calls"
+                f"{r.llm_calls} LLM calls{perception_info}"
            )
            if r.error:
                lines.append(f"         Error: {r.error}")
--- a/src/infrastructure/world/benchmark/runner.py
+++ b/src/infrastructure/world/benchmark/runner.py
@@ -13,6 +13,7 @@ import time
 from datetime import UTC, datetime

 from infrastructure.world.adapters.mock import MockWorldAdapter
+from infrastructure.world.adapters.desktop_mock import DesktopMockWorldAdapter # New import
 from infrastructure.world.benchmark.metrics import BenchmarkMetrics, ScenarioResult
 from infrastructure.world.benchmark.scenarios import BenchmarkScenario
 from infrastructure.world.interface import WorldInterface
@@ -96,9 +97,15 @@ class BenchmarkRunner:
        start = time.monotonic()
        try:
            for cycle in range(1, scenario.max_cycles + 1):
+                cycle_start_time = time.monotonic() # Record start time for the cycle
                record = await hb.run_once()
+                cycle_end_time = time.monotonic() # Record end time for the cycle
                result.cycles_used = cycle

+                # If this is a perception benchmark, record the perception latency
+                if "perception" in scenario.tags:
+                    result.perception_latency_ms = int((cycle_end_time - cycle_start_time) * 1000)
+
                # Track LLM calls (each cycle has 3 phases that may call LLM)
                result.llm_calls += 3

@@ -136,7 +143,13 @@ class BenchmarkRunner:

    @staticmethod
    def _default_adapter(scenario: BenchmarkScenario) -> WorldInterface:
-        """Build a MockWorldAdapter from a scenario's starting state."""
+        """Build a MockWorldAdapter (or DesktopMockWorldAdapter) from a scenario's starting state."""
+        if "perception" in scenario.tags:
+            return DesktopMockWorldAdapter(
+                location=scenario.start_location,
+                entities=list(scenario.entities),
+                events=list(scenario.events),
+            )
        return MockWorldAdapter(
            location=scenario.start_location,
            entities=list(scenario.entities),
--- a/src/infrastructure/world/benchmark/scenarios.py
+++ b/src/infrastructure/world/benchmark/scenarios.py
@@ -140,6 +140,20 @@ BUILTIN_SCENARIOS: list[BenchmarkScenario] = [
        max_cycles=10,
        tags=["observation", "passive"],
    ),
+    BenchmarkScenario(
+        name="Perception Latency Test",
+        description=(
+            "Measures the latency of the Metal-accelerated perception pipeline "
+            "including frame preprocessing and UI state classification."
+        ),
+        start_location="Desktop",
+        goal_location="",
+        entities=["Browser", "Terminal", "VS Code"],
+        events=["desktop_active", "ui_changed"],
+        max_cycles=1, # One cycle to measure a single perception pass
+        goal_predicate=lambda actions, current_location: True, # Always succeed, we just measure time
+        tags=["perception", "performance", "metal_accelerated"],
+    ),
 ]


--- a/src/infrastructure/world/registry.py
+++ b/src/infrastructure/world/registry.py
@@ -13,6 +13,8 @@ import logging
 from typing import Any

 from infrastructure.world.interface import WorldInterface
+from infrastructure.world.adapters.mock import MockWorldAdapter
+from infrastructure.world.adapters.desktop_mock import DesktopMockWorldAdapter

 logger = logging.getLogger(__name__)

@@ -22,6 +24,8 @@ class AdapterRegistry:

    def __init__(self) -> None:
        self._adapters: dict[str, type[WorldInterface]] = {}
+        self.register("mock", MockWorldAdapter)
+        self.register("desktop_mock", DesktopMockWorldAdapter)

    def register(self, name: str, cls: type[WorldInterface]) -> None:
        """Register an adapter class under *name*.
--- a/src/loop/heartbeat.py
+++ b/src/loop/heartbeat.py
@@ -198,16 +198,35 @@ class Heartbeat:
            "events": perception.events,
        }

+        from infrastructure.perception.processor import metal_perception_processor
+
        # 2. Feed observation into the three-phase loop
-        obs_content = (
-            f"Location: {perception.location}\n"
-            f"Entities: {', '.join(perception.entities)}\n"
-            f"Events: {', '.join(perception.events)}"
-        )
+        extra_metadata = {"perception": record.observation}
+        obs_content_parts = [
+            f"Location: {perception.location}",
+            f"Entities: {', '.join(perception.entities)}",
+            f"Events: {', '.join(perception.events)}",
+        ]
+
+        # Process raw image data if available
+        if perception.raw and "image" in perception.raw:
+            raw_image_data = perception.raw["image"]
+            processed_image_data = metal_perception_processor.preprocess_frame(raw_image_data)
+            ui_classification = metal_perception_processor.classify_ui_state(processed_image_data)
+            extra_metadata["preprocessed_frame"] = processed_image_data # Or a reference/hash
+            extra_metadata["ui_classification"] = ui_classification
+            obs_content_parts.append(f"UI State: {ui_classification.get('state', 'unknown')}")
+            obs_content_parts.append(f"Detected Elements: {', '.join(ui_classification.get('detected_elements', []))}")
+            logger.info(
+                "Perception: preprocessed_frame_size=%d ui_state=%s",
+                len(processed_image_data), ui_classification.get('state', 'unknown')
+            )
+        
+        obs_content = "\n".join(obs_content_parts)
        payload = ContextPayload(
            source="world",
            content=obs_content,
-            metadata={"perception": record.observation},
+            metadata=extra_metadata,
        )

        gathered = gather(payload)
--- a/tests/infrastructure/world/test_perception.py
+++ b/tests/infrastructure/world/test_perception.py
@@ -0,0 +1,101 @@
+"""Tests for the Metal-accelerated perception pipeline and mock adapter.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+from unittest.mock import MagicMock
+
+import pytest
+from PIL import Image
+
+from infrastructure.perception.processor import MetalPerceptionProcessor, metal_perception_processor
+from infrastructure.world.adapters.desktop_mock import DesktopMockWorldAdapter, _MOCK_PNG_IMAGE_DATA
+from infrastructure.world.types import PerceptionOutput
+from loop.heartbeat import Heartbeat
+
+
+@pytest.fixture(autouse=True)
+def mock_pillow_imports(monkeypatch):
+    """Ensure Pillow is available for tests, mock if not really installed."""
+    if Image is None:
+        mock_image = MagicMock()
+        mock_image.Resampling.LANCZOS = 1 # Mimic constant
+        monkeypatch.setattr("infrastructure.perception.processor.Image", mock_image)
+        monkeypatch.setattr("infrastructure.perception.processor.ImageOps", MagicMock())
+        monkeypatch.setattr("infrastructure.perception.processor.io", io)
+
+
+@pytest.fixture
+def desktop_mock_adapter():
+    return DesktopMockWorldAdapter()
+
+
+@pytest.fixture
+def perception_processor():
+    return MetalPerceptionProcessor()
+
+
+class TestDesktopMockWorldAdapter:
+    def test_observe_returns_mock_image_data(self, desktop_mock_adapter):
+        perception = desktop_mock_adapter.observe()
+        assert isinstance(perception, PerceptionOutput)
+        assert "image" in perception.raw
+        assert perception.raw["image"] == base64.b64decode(_MOCK_PNG_IMAGE_DATA)
+        assert perception.raw["image_format"] == "png"
+        assert perception.location == "Desktop"
+        assert "Browser" in perception.entities
+
+
+class TestMetalPerceptionProcessor:
+    def test_init_sets_metal_ready_if_pillow_available(self, perception_processor):
+        if Image is not None:
+            assert perception_processor.metal_ready is True
+        else:
+            assert perception_processor.metal_ready is False
+
+    def test_preprocess_frame_simulates_processing(self, perception_processor, desktop_mock_adapter):
+        raw_image_data = desktop_mock_adapter.observe().raw["image"]
+
+        if perception_processor.metal_ready:
+            processed_image_data = perception_processor.preprocess_frame(raw_image_data)
+            assert processed_image_data != raw_image_data # Should be transformed
+            # Attempt to open as an image to ensure it's valid PNG after processing
+            processed_image = Image.open(io.BytesIO(processed_image_data))
+            assert processed_image.format == "PNG"
+            assert processed_image.size == (256, 256) # Default target size
+            assert processed_image.mode == "L" # Grayscale
+        else:
+            # If Pillow is not available, it should return the original data
+            processed_image_data = perception_processor.preprocess_frame(raw_image_data)
+            assert processed_image_data == raw_image_data
+
+    def test_classify_ui_state_returns_mock_result(self, perception_processor, desktop_mock_adapter):
+        raw_image_data = desktop_mock_adapter.observe().raw["image"]
+        preprocessed_data = perception_processor.preprocess_frame(raw_image_data)
+
+        if perception_processor.metal_ready:
+            classification = perception_processor.classify_ui_state(preprocessed_data)
+            assert "state" in classification
+            assert "confidence" in classification
+            assert classification["state"] == "dashboard_overview"
+            assert classification["confidence"] == 0.85
+        else:
+            classification = perception_processor.classify_ui_state(preprocessed_data)
+            assert classification == {"state": "unknown", "confidence": 0.0}
+
+
+@pytest.mark.asyncio
+class TestHeartbeatPerceptionIntegration:
+    async def test_embodied_cycle_processes_perception_raw_image(self, desktop_mock_adapter):
+        heartbeat = Heartbeat(world=desktop_mock_adapter, interval=0.1)
+        record = await heartbeat.run_once()
+
+        assert record.observation["location"] == "Desktop"
+        assert "preprocessed_frame" in record.reasoning_summary # Check that processed frame is reflected in summary
+        assert "UI State: dashboard_overview" in record.reasoning_summary
+
+        # Verify metadata contains processed data and classification
+        # This requires inspecting the payload passed to gather, which is internal to Heartbeat for now
+        # For now, we rely on the summary containing the info.