2 changed files with 398 additions and 0 deletions
--- a/scripts/visual_state_verifier.py
+++ b/scripts/visual_state_verifier.py
@@ -0,0 +1,285 @@
 """
 Visual State Verification Module for Game Agents
 =================================================
 Provides screenshot-based environmental state verification for game agents
 (Morrowind, Minecraft, or any game with a screenshot API). Uses multimodal
 analysis to confirm agent expectations match actual game state.
 Usage:
    from scripts.visual_state_verifier import VisualStateVerifier
    verifier = VisualStateVerifier()
    result = verifier.verify_state(
        screenshot_path="/tmp/game_screenshot.png",
        expected_state={"location": "Balmora", "health_above": 50, "has_weapon": True},
        context="Player should be in Balmora with a weapon equipped"
    )
    print(result.verified)  # True/False
    print(result.details)   # Human-readable analysis
 """
 import json
 import os
 import subprocess
 from dataclasses import dataclass, field
 from enum import Enum
 from pathlib import Path
 from typing import Optional
 class VerificationStatus(Enum):
    """Outcome categories for one screenshot-versus-expectation check."""
    # The analysis reported that the expected conditions hold.
    VERIFIED = "verified"
    # The analysis reported that the expected conditions do not hold.
    FAILED = "failed"
    # No usable verdict is available (analysis pending or not parseable).
    UNCERTAIN = "uncertain"
    # The check could not be attempted, e.g. the screenshot file is missing.
    ERROR = "error"
@dataclass
class VerificationResult:
    """Outcome of comparing one screenshot with an expected game state."""
    # Overall outcome category of the check.
    status: VerificationStatus
    # True only when the analysis confirmed the expected state.
    verified: bool
    # Confidence reported for the verdict, 0.0 - 1.0.
    confidence: float
    # Human-readable summary of the analysis, or the error/pending message.
    details: str
    # The expected-state dict the screenshot was checked against.
    expected: dict
    # NOTE(review): presumably the state read from the screenshot; confirm
    # against whichever caller is meant to fill it in.
    observed: dict = field(default_factory=dict)
    # Conditions the analysis reported as not met.
    mismatches: list = field(default_factory=list)
    # Path of the screenshot that was (or was to be) analyzed.
    screenshot_path: Optional[str] = None
 class VisualStateVerifier:
    """
    Verifies game state by analyzing screenshots against expected conditions.
    Supports any game that can produce screenshots. The class builds the
    prompt for a vision model and interprets the model's reply; it does not
    call a vision model itself (see _analyze_screenshot).
    """
    def __init__(self, vision_backend: str = "builtin"):
        """
        Args:
            vision_backend: "builtin" for MCP vision, "ollama" for local model.
                The value is only stored on the instance; no method of this
                class currently branches on it.
        """
        self.vision_backend = vision_backend
    def verify_state(
        self,
        screenshot_path: str,
        expected_state: dict,
        context: str = "",
        game: str = "generic"
    ) -> VerificationResult:
        """
        Verify a game screenshot matches expected state conditions.
        Because _analyze_screenshot only packages the request, an existing
        screenshot currently yields an UNCERTAIN "Pending analysis" result
        that tells the calling agent which vision call to make.
        Args:
            screenshot_path: Path to the screenshot file
            expected_state: Dict of expected conditions, e.g.:
                {
                    "location": "Balmora",
                    "health_above": 50,
                    "has_weapon": True,
                    "time_of_day": "day",
                    "nearby_npcs": ["Caius Cosades"]
                }
            context: Additional context for the vision model
            game: Game name for context ("morrowind", "minecraft", "generic")
        Returns:
            VerificationResult with status, confidence, and details. The
            status is ERROR when screenshot_path is not an existing file.
        """
        # is_file() rather than exists(): a directory is not a screenshot.
        if not Path(screenshot_path).is_file():
            return VerificationResult(
                status=VerificationStatus.ERROR,
                verified=False,
                confidence=0.0,
                details=f"Screenshot not found: {screenshot_path}",
                expected=expected_state,
                screenshot_path=screenshot_path
            )
        prompt = self._build_prompt(expected_state, context, game)
        analysis = self._analyze_screenshot(screenshot_path, prompt)
        return self._parse_analysis(analysis, expected_state, screenshot_path)
    def _build_prompt(self, expected: dict, context: str, game: str) -> str:
        """
        Build a structured verification prompt for the vision model.
        Args:
            expected: Condition name -> expected value. Booleans are rendered
                as yes/no, numbers as "<value> or better", lists as
                "should include ...", anything else verbatim.
            context: Free-text context; a placeholder sentence is used when empty.
            game: Game name inserted into the first line of the prompt.
        Returns:
            The prompt text, ending with the JSON reply template.
        """
        conditions = []
        for key, value in expected.items():
            # bool must be tested before int/float: bool is a subclass of int.
            if isinstance(value, bool):
                conditions.append(f"- {key}: {'yes' if value else 'no'}")
            elif isinstance(value, (int, float)):
                conditions.append(f"- {key}: {value} or better")
            elif isinstance(value, list):
                conditions.append(f"- {key}: should include {', '.join(str(v) for v in value)}")
            else:
                conditions.append(f"- {key}: {value}")
        condition_block = "\n".join(conditions)
        context_text = context if context else 'No additional context provided.'
        prompt = f"""Analyze this {game} game screenshot and verify the following conditions:
 {condition_block}
 Context: {context_text}
 For each condition, state VERIFIED, FAILED, or UNCERTAIN with a brief reason.
 End with a JSON block:
 ```json
 {{
  "verified": true/false,
  "confidence": 0.0-1.0,
  "details": "brief summary",
  "mismatches": ["list of failed conditions"]
 }}
 ```
 """
        return prompt
    def _analyze_screenshot(self, path: str, prompt: str) -> str:
        """
        Package the screenshot and prompt as a request for the calling agent.
        No vision backend is invoked here. The returned JSON object carries
        the prompt, the screenshot path and an "instruction" key, which
        _parse_analysis recognises as "not analyzed yet".
        Args:
            path: Path to the screenshot file.
            prompt: Prompt produced by _build_prompt.
        Returns:
            A JSON string with the keys "prompt", "screenshot_path" and
            "instruction".
        """
        return json.dumps({
            "prompt": prompt,
            "screenshot_path": str(path),
            "instruction": "Use vision_analyze tool with this prompt and screenshot_path"
        })
    @staticmethod
    def _coerce_confidence(value) -> float:
        """Return value as a float clamped to 0.0 - 1.0, or 0.0 if unusable."""
        # bool is an int subclass; "confidence": true carries no real score.
        if isinstance(value, bool):
            return 0.0
        try:
            score = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN compares unequal to itself and would defeat the clamp below.
        if score != score:
            return 0.0
        return min(1.0, max(0.0, score))
    def _result_from_verdict(
        self, verdict: dict, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """Convert a decoded verdict object into a VerificationResult."""
        # Only a literal JSON true counts as verified, so a malformed value
        # such as the string "false" can never be reported as a pass.
        verified = verdict.get("verified") is True
        raw_mismatches = verdict.get("mismatches")
        if isinstance(raw_mismatches, list):
            mismatches = raw_mismatches
        elif isinstance(raw_mismatches, str):
            mismatches = [raw_mismatches]
        else:
            mismatches = []
        raw_details = verdict.get("details")
        return VerificationResult(
            status=VerificationStatus.VERIFIED if verified else VerificationStatus.FAILED,
            verified=verified,
            confidence=self._coerce_confidence(verdict.get("confidence", 0.0)),
            details="" if raw_details is None else str(raw_details),
            expected=expected,
            mismatches=mismatches,
            screenshot_path=screenshot_path
        )
    def _parse_analysis(
        self, analysis: str, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """
        Parse vision analysis into a VerificationResult.
        Three reply shapes are recognised, in this order:
        1. A JSON object with an "instruction" key (the request built by
           _analyze_screenshot): reported as UNCERTAIN / pending.
        2. A verdict object, either as the whole reply or inside a
           ```json fenced block: reported as VERIFIED or FAILED.
        3. Anything else: reported as UNCERTAIN with confidence 0.3 and the
           first 500 characters of the reply as details.
        Args:
            analysis: Raw text returned for the screenshot.
            expected: The expected-state dict, echoed into the result.
            screenshot_path: Screenshot path, echoed into the result.
        Returns:
            The VerificationResult described above.
        """
        # Function-local import: the module does not import re at file level.
        import re
        try:
            data = json.loads(analysis)
        except json.JSONDecodeError:
            data = None
        # json.loads may also yield a list, string or number; only objects
        # can be a pending request or a verdict.
        if isinstance(data, dict):
            if "instruction" in data:
                pending_path = data.get("screenshot_path", screenshot_path)
                prompt_preview = str(data.get("prompt", ""))[:100]
                return VerificationResult(
                    status=VerificationStatus.UNCERTAIN,
                    verified=False,
                    confidence=0.0,
                    # Single-quoted f-string so the literal double quotes in
                    # the suggested call need no escaping.
                    details=(
                        "Pending analysis. Run: "
                        f'vision_analyze("{pending_path}", "{prompt_preview}...")'
                    ),
                    expected=expected,
                    screenshot_path=screenshot_path
                )
            if "verified" in data:
                return self._result_from_verdict(data, expected, screenshot_path)
        json_match = re.search(r"```json\s*({.*?})\s*```", analysis, re.DOTALL)
        if json_match:
            try:
                verdict = json.loads(json_match.group(1))
            except json.JSONDecodeError:
                # Malformed fenced block: fall through to the uncertain result.
                verdict = None
            if isinstance(verdict, dict):
                return self._result_from_verdict(verdict, expected, screenshot_path)
        return VerificationResult(
            status=VerificationStatus.UNCERTAIN,
            verified=False,
            confidence=0.3,
            details=analysis[:500],
            expected=expected,
            screenshot_path=screenshot_path
        )
    @staticmethod
    def morrowind_state(
        location: Optional[str] = None,
        health_min: Optional[int] = None,
        has_weapon: Optional[bool] = None,
        is_indoors: Optional[bool] = None,
        time_of_day: Optional[str] = None,
        nearby_npcs: Optional[list] = None,
        **extra
    ) -> dict:
        """
        Build expected state dict for Morrowind.
        Args:
            location: Stored under "location"; omitted when empty or None.
            health_min: Stored under "health_above"; omitted only when None,
                so 0 is kept.
            has_weapon: Stored under "has_weapon"; omitted only when None,
                so False is kept.
            is_indoors: Stored under "indoors"; omitted only when None.
            time_of_day: Stored under "time_of_day"; omitted when empty or None.
            nearby_npcs: Stored under "nearby_npcs"; omitted when empty or None.
            **extra: Additional conditions copied in last, so they override
                any key set from the named arguments.
        Returns:
            A dict suitable for verify_state's expected_state argument.
        """
        state = {}
        if location:
            state["location"] = location
        if health_min is not None:
            state["health_above"] = health_min
        if has_weapon is not None:
            state["has_weapon"] = has_weapon
        if is_indoors is not None:
            state["indoors"] = is_indoors
        if time_of_day:
            state["time_of_day"] = time_of_day
        if nearby_npcs:
            state["nearby_npcs"] = nearby_npcs
        state.update(extra)
        return state
 # --- Example Verification Flows ---
 # The snippets below are documentation only: they are stored as strings and
 # are not executed by this module.
 EXAMPLE_MORROWIND_VERIFICATION = """
 # Verify player is in Balmora with a weapon
 verifier = VisualStateVerifier()
 result = verifier.verify_state(
    screenshot_path="/tmp/morrowind_screenshot.png",
    expected_state=VisualStateVerifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True
    ),
    context="After completing the first Caius Cosades quest",
    game="morrowind"
 )
 if result.verified:
    print(f"State confirmed: {result.details}")
 else:
    print(f"State mismatch: {result.mismatches}")
 """
 # The dict keys must match verify_state's parameter names because each dict
 # is unpacked straight into the call with **state.
 EXAMPLE_BATCH_VERIFICATION = """
 # Verify multiple game states in sequence
 states = [
    {"screenshot_path": "screen1.png", "expected_state": {"location": "Seyda Neen"}, "context": "After character creation"},
    {"screenshot_path": "screen2.png", "expected_state": {"location": "Balmora", "has_weapon": True}, "context": "After buying weapon"},
    {"screenshot_path": "screen3.png", "expected_state": {"health_above": 80}, "context": "After resting"},
 ]
 verifier = VisualStateVerifier()
 for state in states:
    result = verifier.verify_state(**state, game="morrowind")
    print(f"{state['context']}: {'PASS' if result.verified else 'FAIL'} (confidence: {result.confidence:.0%})")
 """
 if __name__ == "__main__":
    # Demo: build and display a verification prompt, then run the verifier
    # against a sample path. Unless /tmp/demo_screenshot.png exists, the
    # second line of output is the "Screenshot not found" error message.
    verifier = VisualStateVerifier()
    expected = verifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True,
        nearby_npcs=["Caius Cosades"]
    )
    demo_context = "Player should have completed the first quest"
    demo_game = "morrowind"
    # Show the full prompt; result.details below carries at most a
    # 100-character preview of it, and only when the screenshot exists.
    print(verifier._build_prompt(expected, demo_context, demo_game))
    result = verifier.verify_state(
        screenshot_path="/tmp/demo_screenshot.png",
        expected_state=expected,
        context=demo_context,
        game=demo_game
    )
    print(result.details)
--- a/tests/test_visual_state_verifier.py
+++ b/tests/test_visual_state_verifier.py
@@ -0,0 +1,113 @@
 """Tests for visual state verification module."""
 import json
 import tempfile
 from pathlib import Path
 import pytest
 # Add parent to path for import
 import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.visual_state_verifier import (
    VisualStateVerifier,
    VerificationResult,
    VerificationStatus,
 )
 class TestVisualStateVerifier:
    """Unit tests for the state builder, prompt builder and reply parser."""
    def test_missing_screenshot_returns_error(self):
        """A path that does not exist yields an ERROR result saying 'not found'."""
        outcome = VisualStateVerifier().verify_state(
            screenshot_path="/nonexistent/screenshot.png",
            expected_state={"location": "Balmora"},
            game="morrowind"
        )
        assert outcome.status == VerificationStatus.ERROR
        assert not outcome.verified
        assert "not found" in outcome.details.lower()
    def test_morrowind_state_builder(self):
        """Named arguments land under their documented dictionary keys."""
        built = VisualStateVerifier.morrowind_state(
            location="Balmora",
            health_min=50,
            has_weapon=True,
            nearby_npcs=["Caius Cosades"]
        )
        assert built["location"] == "Balmora"
        assert built["health_above"] == 50
        assert built["has_weapon"] is True
        assert built["nearby_npcs"] == ["Caius Cosades"]
    def test_morrowind_state_minimal(self):
        """Arguments left at their defaults add no keys."""
        built = VisualStateVerifier.morrowind_state(location="Vivec")
        assert built == {"location": "Vivec"}
    def test_morrowind_state_with_extras(self):
        """Unknown keyword arguments are copied into the state unchanged."""
        built = VisualStateVerifier.morrowind_state(
            location="Balmora",
            quest_complete=True,
            gold_min=1000
        )
        assert built["quest_complete"] is True
        assert built["gold_min"] == 1000
    def test_prompt_includes_conditions(self):
        """The prompt mentions each expected value, the context and the game."""
        checker = VisualStateVerifier()
        conditions = {"location": "Balmora", "health_above": 50}
        prompt_text = checker._build_prompt(conditions, "Test context", "morrowind")
        for fragment in ("Balmora", "50", "Test context", "morrowind"):
            assert fragment in prompt_text
    def test_parse_analysis_returns_pending_for_raw(self):
        """An un-analyzed request payload is reported as UNCERTAIN, not verified."""
        checker = VisualStateVerifier()
        request_payload = {
            "prompt": "test",
            "screenshot_path": "/tmp/test.png",
            "instruction": "Use vision_analyze"
        }
        outcome = checker._parse_analysis(json.dumps(request_payload), {}, "/tmp/test.png")
        assert outcome.status == VerificationStatus.UNCERTAIN
        assert not outcome.verified
    def test_parse_analysis_extracts_json(self):
        """A fenced JSON verdict after free text is decoded into the result."""
        model_reply = """
        The player appears to be in Balmora.
        Health looks good.
        ```json
        {
            "verified": true,
            "confidence": 0.85,
            "details": "Player is in Balmora with weapon equipped",
            "mismatches": []
        }
        ```
        """
        checker = VisualStateVerifier()
        outcome = checker._parse_analysis(model_reply, {"location": "Balmora"}, "/tmp/test.png")
        assert outcome.status == VerificationStatus.VERIFIED
        assert outcome.verified
        assert outcome.confidence == 0.85
        assert outcome.mismatches == []
    def test_parse_analysis_handles_failures(self):
        """A verdict with "verified": false maps to FAILED and keeps its mismatches."""
        model_reply = """
        ```json
        {
            "verified": false,
            "confidence": 0.9,
            "details": "Player is not in Balmora",
            "mismatches": ["location"]
        }
        ```
        """
        checker = VisualStateVerifier()
        outcome = checker._parse_analysis(model_reply, {"location": "Balmora"}, "/tmp/test.png")
        assert outcome.status == VerificationStatus.FAILED
        assert not outcome.verified
        assert "location" in outcome.mismatches