# scripts/visual_state_verifier.py
"""
Visual State Verification Module for Game Agents
=================================================

Provides screenshot-based environmental state verification for game agents
(Morrowind, Minecraft, or any game with a screenshot API). Uses multimodal
analysis to confirm agent expectations match actual game state.

Usage:
    from scripts.visual_state_verifier import VisualStateVerifier

    verifier = VisualStateVerifier()
    result = verifier.verify_state(
        screenshot_path="/tmp/game_screenshot.png",
        expected_state={"location": "Balmora", "health_above": 50, "has_weapon": True},
        context="Player should be in Balmora with a weapon equipped"
    )
    print(result.verified)  # True/False
    print(result.details)   # Human-readable analysis

Without an ``analyzer`` callable the verifier cannot look at the image itself:
``verify_state`` then returns an UNCERTAIN result whose ``details`` tell the
calling agent which vision tool invocation to run.
"""

import json
import os
import re
import subprocess
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Callable, Optional

# Matches the fenced ```json { ... } ``` block the prompt asks the model to emit.
# Non-greedy on purpose: the requested verdict object is flat (no nested braces).
_JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)


class VerificationStatus(Enum):
    """Status of a visual state verification."""
    VERIFIED = "verified"
    FAILED = "failed"
    UNCERTAIN = "uncertain"
    ERROR = "error"


@dataclass
class VerificationResult:
    """Result of a visual state verification."""
    status: VerificationStatus
    verified: bool
    confidence: float  # 0.0 - 1.0
    details: str  # human-readable summary, error message, or pending instruction
    expected: dict  # the expected-state dict the caller passed in
    observed: dict = field(default_factory=dict)
    mismatches: list = field(default_factory=list)  # conditions reported as failed
    screenshot_path: Optional[str] = None


class VisualStateVerifier:
    """
    Verifies game state by analyzing screenshots against expected conditions.

    Supports any game that can produce screenshots. Designed for integration
    with MCP screenshot tools and vision analysis capabilities.
    """

    def __init__(
        self,
        vision_backend: str = "builtin",
        analyzer: Optional[Callable[[str, str], str]] = None,
    ):
        """
        Args:
            vision_backend: "builtin" for MCP vision, "ollama" for local model.
                Stored for callers; this module does not dispatch on it.
            analyzer: Optional callable ``(screenshot_path, prompt) -> str``
                that performs the actual vision analysis and returns the
                model's text response. When omitted, verification yields a
                pending (UNCERTAIN) result describing the call to make.
        """
        self.vision_backend = vision_backend
        self.analyzer = analyzer

    def verify_state(
        self,
        screenshot_path: str,
        expected_state: dict,
        context: str = "",
        game: str = "generic"
    ) -> VerificationResult:
        """
        Verify a game screenshot matches expected state conditions.

        Args:
            screenshot_path: Path to the screenshot file
            expected_state: Dict of expected conditions, e.g.:
                {
                    "location": "Balmora",
                    "health_above": 50,
                    "has_weapon": True,
                    "time_of_day": "day",
                    "nearby_npcs": ["Caius Cosades"]
                }
            context: Additional context for the vision model
            game: Game name for context ("morrowind", "minecraft", "generic")

        Returns:
            VerificationResult with status, confidence, and details. A missing
            screenshot yields status ERROR rather than raising.
        """
        # is_file(), not exists(): a directory is not a usable screenshot.
        if not Path(screenshot_path).is_file():
            return VerificationResult(
                status=VerificationStatus.ERROR,
                verified=False,
                confidence=0.0,
                details=f"Screenshot not found: {screenshot_path}",
                expected=expected_state,
                screenshot_path=screenshot_path
            )

        prompt = self._build_prompt(expected_state, context, game)
        analysis = self._analyze_screenshot(screenshot_path, prompt)
        return self._parse_analysis(analysis, expected_state, screenshot_path)

    def _build_prompt(self, expected: dict, context: str, game: str) -> str:
        """Build a structured verification prompt for the vision model."""
        conditions = []
        for key, value in expected.items():
            # bool must be tested before int/float: bool is a subclass of int.
            if isinstance(value, bool):
                conditions.append(f"- {key}: {'yes' if value else 'no'}")
            elif isinstance(value, (int, float)):
                conditions.append(f"- {key}: {value} or better")
            elif isinstance(value, list):
                conditions.append(f"- {key}: should include {', '.join(str(v) for v in value)}")
            else:
                conditions.append(f"- {key}: {value}")

        condition_block = "\n".join(conditions)
        context_text = context if context else "No additional context provided."

        prompt = f"""Analyze this {game} game screenshot and verify the following conditions:

{condition_block}

Context: {context_text}

For each condition, state VERIFIED, FAILED, or UNCERTAIN with a brief reason.
End with a JSON block:
```json
{{
    "verified": true/false,
    "confidence": 0.0-1.0,
    "details": "brief summary",
    "mismatches": ["list of failed conditions"]
}}
```
"""
        return prompt

    def _analyze_screenshot(self, path: str, prompt: str) -> str:
        """
        Send screenshot to vision backend for analysis.

        If an ``analyzer`` callable was supplied at construction, its text
        response is returned. Otherwise a JSON "pending" payload is returned
        that carries the prompt and path for the calling agent to run through
        its own vision tool.
        """
        if self.analyzer is not None:
            return self.analyzer(str(path), prompt)

        # Return structured prompt for the calling agent to process
        return json.dumps({
            "prompt": prompt,
            "screenshot_path": str(path),
            "instruction": "Use vision_analyze tool with this prompt and screenshot_path"
        })

    def _parse_analysis(
        self, analysis: str, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """
        Parse vision analysis into a VerificationResult.

        Recognized inputs, in order:
          1. The pending payload produced by ``_analyze_screenshot`` (a JSON
             object with an "instruction" key) -> UNCERTAIN, confidence 0.0.
          2. A bare JSON object carrying a "verified" key -> VERIFIED/FAILED.
          3. Free text containing a fenced ```json block -> VERIFIED/FAILED.
          4. Anything else -> UNCERTAIN, confidence 0.3, with the first 500
             characters of the analysis as details.
        """
        try:
            data = json.loads(analysis)
        except json.JSONDecodeError:
            data = None

        # json.loads may legally return a list/number/string; only a dict can
        # be a pending payload or a verdict.
        if isinstance(data, dict):
            if "instruction" in data:
                pending_path = data.get("screenshot_path", screenshot_path)
                prompt_preview = str(data.get("prompt", ""))[:100]
                return VerificationResult(
                    status=VerificationStatus.UNCERTAIN,
                    verified=False,
                    confidence=0.0,
                    details=(
                        f'Pending analysis. Run: vision_analyze("{pending_path}", '
                        f'"{prompt_preview}...")'
                    ),
                    expected=expected,
                    screenshot_path=screenshot_path
                )
            if "verified" in data:
                return self._result_from_verdict(data, expected, screenshot_path)

        json_match = _JSON_BLOCK_RE.search(analysis)
        if json_match:
            try:
                verdict = json.loads(json_match.group(1))
            except json.JSONDecodeError:
                verdict = None
            if isinstance(verdict, dict):
                return self._result_from_verdict(verdict, expected, screenshot_path)

        # Fallback: return as uncertain
        return VerificationResult(
            status=VerificationStatus.UNCERTAIN,
            verified=False,
            confidence=0.3,
            details=analysis[:500],
            expected=expected,
            screenshot_path=screenshot_path
        )

    @staticmethod
    def _result_from_verdict(
        verdict: dict, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """Convert a model verdict dict into a VERIFIED/FAILED result."""
        # Only a real JSON `true` counts; a truthy string such as "false"
        # must not be treated as a successful verification.
        verified = verdict.get("verified") is True

        try:
            confidence = float(verdict.get("confidence", 0.0))
        except (TypeError, ValueError):
            confidence = 0.0
        confidence = min(1.0, max(0.0, confidence))  # keep within documented range

        mismatches = verdict.get("mismatches") or []
        if not isinstance(mismatches, list):
            mismatches = [mismatches]

        return VerificationResult(
            status=VerificationStatus.VERIFIED if verified else VerificationStatus.FAILED,
            verified=verified,
            confidence=confidence,
            details=str(verdict.get("details", "")),
            expected=expected,
            mismatches=mismatches,
            screenshot_path=screenshot_path
        )

    @staticmethod
    def morrowind_state(
        location: Optional[str] = None,
        health_min: Optional[int] = None,
        has_weapon: Optional[bool] = None,
        is_indoors: Optional[bool] = None,
        time_of_day: Optional[str] = None,
        nearby_npcs: Optional[list] = None,
        **extra
    ) -> dict:
        """
        Build expected state dict for Morrowind.

        Only arguments that were actually supplied appear in the result.
        ``health_min`` maps to the "health_above" key and ``is_indoors`` to
        "indoors". Extra keyword arguments are merged in last, so they can
        override the named keys.
        """
        state = {}
        if location:
            state["location"] = location
        # `is not None` so that 0 / False remain expressible expectations.
        if health_min is not None:
            state["health_above"] = health_min
        if has_weapon is not None:
            state["has_weapon"] = has_weapon
        if is_indoors is not None:
            state["indoors"] = is_indoors
        if time_of_day:
            state["time_of_day"] = time_of_day
        if nearby_npcs:
            state["nearby_npcs"] = nearby_npcs
        state.update(extra)
        return state


# --- Example Verification Flows ---

EXAMPLE_MORROWIND_VERIFICATION = """
# Verify player is in Balmora with a weapon
verifier = VisualStateVerifier()
result = verifier.verify_state(
    screenshot_path="/tmp/morrowind_screenshot.png",
    expected_state=VisualStateVerifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True
    ),
    context="After completing the first Caius Cosades quest",
    game="morrowind"
)

if result.verified:
    print(f"State confirmed: {result.details}")
else:
    print(f"State mismatch: {result.mismatches}")
"""

# Dict keys match verify_state's parameter names so `**state` unpacks cleanly.
EXAMPLE_BATCH_VERIFICATION = """
# Verify multiple game states in sequence
states = [
    {"screenshot_path": "screen1.png", "expected_state": {"location": "Seyda Neen"}, "context": "After character creation"},
    {"screenshot_path": "screen2.png", "expected_state": {"location": "Balmora", "has_weapon": True}, "context": "After buying weapon"},
    {"screenshot_path": "screen3.png", "expected_state": {"health_above": 80}, "context": "After resting"},
]

verifier = VisualStateVerifier()
for state in states:
    result = verifier.verify_state(**state, game="morrowind")
    print(f"{state['context']}: {'PASS' if result.verified else 'FAIL'} (confidence: {result.confidence:.0%})")
"""

if __name__ == "__main__":
    # Demo: build and display a verification prompt
    verifier = VisualStateVerifier()
    expected = verifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True,
        nearby_npcs=["Caius Cosades"]
    )
    result = verifier.verify_state(
        screenshot_path="/tmp/demo_screenshot.png",
        expected_state=expected,
        context="Player should have completed the first quest",
        game="morrowind"
    )
    print(result.details)