# Multimodal screenshot-based state verification:
# - Generic verifier for any game with screenshots
# - Morrowind-specific state builder
# - Structured prompt generation for vision models
# - JSON result parsing with confidence scoring
# - Batch verification examples
#
# Relates to #1482
"""
Visual State Verification Module for Game Agents
=================================================

Provides screenshot-based environmental state verification for game agents
(Morrowind, Minecraft, or any game with a screenshot API). Uses multimodal
analysis to confirm agent expectations match actual game state.

Usage:
    from scripts.visual_state_verifier import VisualStateVerifier

    verifier = VisualStateVerifier()
    result = verifier.verify_state(
        screenshot_path="/tmp/game_screenshot.png",
        expected_state={"location": "Balmora", "health_above": 50, "has_weapon": True},
        context="Player should be in Balmora with a weapon equipped"
    )
    print(result.verified)  # True/False
    print(result.details)   # Human-readable analysis
"""

import json
import os
import re
import subprocess
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Optional
|
|
|
|
|
|
class VerificationStatus(Enum):
    """Status of a visual state verification."""

    # The vision analysis reported that the expected conditions hold.
    VERIFIED = "verified"
    # The vision analysis reported that the expected conditions do not hold.
    FAILED = "failed"
    # No usable verdict yet (analysis still pending, or the reply could not
    # be parsed into a verdict).
    UNCERTAIN = "uncertain"
    # Verification could not be attempted (e.g. the screenshot is missing).
    ERROR = "error"
|
|
|
|
|
|
@dataclass
class VerificationResult:
    """Result of a visual state verification."""

    # Overall outcome category.
    status: VerificationStatus
    # Convenience flag mirroring a positive outcome.
    verified: bool
    # Confidence reported for the verdict, 0.0 - 1.0.
    confidence: float
    # Human-readable explanation (or an error / pending message).
    details: str
    # The expected-state dict that was being checked.
    expected: dict
    # Observed state, when available; each instance gets its own dict.
    observed: dict = field(default_factory=dict)
    # Conditions reported as not matching; each instance gets its own list.
    mismatches: list = field(default_factory=list)
    # Path of the screenshot this result refers to, if any.
    screenshot_path: Optional[str] = None
|
|
|
|
|
|
class VisualStateVerifier:
    """
    Verifies game state by analyzing screenshots against expected conditions.

    Supports any game that can produce screenshots. Designed for integration
    with MCP screenshot tools and vision analysis capabilities.
    """

    # Matches the fenced ```json { ... } ``` block that the generated prompt
    # asks the vision model to end its reply with.
    _JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)

    def __init__(self, vision_backend: str = "builtin"):
        """
        Args:
            vision_backend: "builtin" for MCP vision, "ollama" for local model.
                The value is only stored on the instance; nothing in this
                class dispatches on it yet.
        """
        self.vision_backend = vision_backend

    def verify_state(
        self,
        screenshot_path: str,
        expected_state: dict,
        context: str = "",
        game: str = "generic"
    ) -> VerificationResult:
        """
        Verify a game screenshot matches expected state conditions.

        Args:
            screenshot_path: Path to the screenshot file
            expected_state: Dict of expected conditions, e.g.:
                {
                    "location": "Balmora",
                    "health_above": 50,
                    "has_weapon": True,
                    "time_of_day": "day",
                    "nearby_npcs": ["Caius Cosades"]
                }
            context: Additional context for the vision model
            game: Game name for context ("morrowind", "minecraft", "generic")

        Returns:
            VerificationResult with status, confidence, and details. A missing
            screenshot yields an ERROR result rather than an exception.
        """
        # is_file() rather than exists(): a directory is not a screenshot.
        if not Path(screenshot_path).is_file():
            return VerificationResult(
                status=VerificationStatus.ERROR,
                verified=False,
                confidence=0.0,
                details=f"Screenshot not found: {screenshot_path}",
                expected=expected_state,
                screenshot_path=screenshot_path
            )

        # Build verification prompt
        prompt = self._build_prompt(expected_state, context, game)

        # Analyze screenshot
        analysis = self._analyze_screenshot(screenshot_path, prompt)

        # Parse results
        return self._parse_analysis(analysis, expected_state, screenshot_path)

    def _build_prompt(self, expected: dict, context: str, game: str) -> str:
        """Build a structured verification prompt for the vision model.

        Each expected condition becomes one bullet line; the prompt ends by
        asking for a fenced JSON verdict that ``_parse_analysis`` can read.
        """
        conditions = []
        for key, value in expected.items():
            # bool is a subclass of int, so it has to be tested first.
            if isinstance(value, bool):
                conditions.append(f"- {key}: {'yes' if value else 'no'}")
            elif isinstance(value, (int, float)):
                conditions.append(f"- {key}: {value} or better")
            elif isinstance(value, list):
                conditions.append(f"- {key}: should include {', '.join(str(v) for v in value)}")
            else:
                conditions.append(f"- {key}: {value}")

        condition_block = "\n".join(conditions)
        context_line = context if context else "No additional context provided."

        return f"""Analyze this {game} game screenshot and verify the following conditions:

{condition_block}

Context: {context_line}

For each condition, state VERIFIED, FAILED, or UNCERTAIN with a brief reason.
End with a JSON block:
```json
{{
  "verified": true/false,
  "confidence": 0.0-1.0,
  "details": "brief summary",
  "mismatches": ["list of failed conditions"]
}}
```
"""

    def _analyze_screenshot(self, path: str, prompt: str) -> str:
        """
        Send screenshot to vision backend for analysis.

        In a live agent context, this would call the MCP vision tool.
        For standalone use, it returns the prompt for manual invocation.

        Returns:
            A JSON string with "prompt", "screenshot_path" and "instruction"
            keys. No vision backend is actually invoked here.
        """
        # Return structured prompt for the calling agent to process
        return json.dumps({
            "prompt": prompt,
            "screenshot_path": str(path),
            "instruction": "Use vision_analyze tool with this prompt and screenshot_path"
        })

    @classmethod
    def _extract_verdict(cls, analysis: str) -> Optional[dict]:
        """Return the dict inside the reply's fenced ```json block, or None."""
        match = cls._JSON_BLOCK_RE.search(analysis)
        if match is None:
            return None
        try:
            verdict = json.loads(match.group(1))
        except json.JSONDecodeError:
            return None
        return verdict if isinstance(verdict, dict) else None

    @staticmethod
    def _coerce_confidence(value) -> float:
        """Turn a model-supplied confidence into a float clamped to [0.0, 1.0].

        Anything that is not a usable number (including booleans and NaN)
        becomes 0.0.
        """
        if isinstance(value, bool):
            return 0.0
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        if number != number:  # NaN never equals itself
            return 0.0
        return max(0.0, min(1.0, number))

    def _parse_analysis(
        self, analysis: str, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """Parse vision analysis into a VerificationResult.

        Handles three shapes of ``analysis``:
          * the pending envelope produced by ``_analyze_screenshot``
            (has an "instruction" key) -> UNCERTAIN, confidence 0.0;
          * a verdict, either as a fenced ```json block inside free text or
            as a bare JSON object with a "verified" key -> VERIFIED / FAILED;
          * anything else -> UNCERTAIN, confidence 0.3, with the first 500
            characters of the text as details.
        """
        try:
            data = json.loads(analysis)
        except json.JSONDecodeError:
            # Free-form model text; a fenced verdict is looked for below.
            data = None

        # json.loads may legitimately return a list/number/string, so only
        # treat dicts as candidate envelopes or verdicts.
        if isinstance(data, dict) and "instruction" in data:
            # Not yet analyzed - return pending
            pending_path = data.get("screenshot_path", screenshot_path)
            prompt_preview = str(data.get("prompt", ""))[:100]
            return VerificationResult(
                status=VerificationStatus.UNCERTAIN,
                verified=False,
                confidence=0.0,
                details=f'Pending analysis. Run: vision_analyze("{pending_path}", "{prompt_preview}...")',
                expected=expected,
                screenshot_path=screenshot_path
            )

        if isinstance(data, dict) and "verified" in data:
            verdict = data
        else:
            verdict = self._extract_verdict(analysis)

        if verdict is not None:
            # Only a literal JSON `true` counts; a string such as "false"
            # is truthy and must not be mistaken for a positive verdict.
            verified = verdict.get("verified") is True

            mismatches = verdict.get("mismatches", [])
            if isinstance(mismatches, str):
                mismatches = [mismatches]
            elif not isinstance(mismatches, list):
                mismatches = []

            return VerificationResult(
                status=VerificationStatus.VERIFIED if verified else VerificationStatus.FAILED,
                verified=verified,
                confidence=self._coerce_confidence(verdict.get("confidence", 0.0)),
                details=str(verdict.get("details", "")),
                expected=expected,
                mismatches=mismatches,
                screenshot_path=screenshot_path
            )

        # Fallback: return as uncertain
        return VerificationResult(
            status=VerificationStatus.UNCERTAIN,
            verified=False,
            confidence=0.3,
            details=analysis[:500],
            expected=expected,
            screenshot_path=screenshot_path
        )

    @staticmethod
    def morrowind_state(
        location: Optional[str] = None,
        health_min: Optional[int] = None,
        has_weapon: Optional[bool] = None,
        is_indoors: Optional[bool] = None,
        time_of_day: Optional[str] = None,
        nearby_npcs: Optional[list] = None,
        **extra
    ) -> dict:
        """Build expected state dict for Morrowind.

        Only supplied conditions are included. ``health_min``, ``has_weapon``
        and ``is_indoors`` are compared against None so that 0 and False are
        kept; empty strings / empty lists for the other arguments are
        omitted. ``extra`` keyword arguments are merged in last and may
        override the named keys.
        """
        state = {}
        if location:
            state["location"] = location
        if health_min is not None:
            state["health_above"] = health_min
        if has_weapon is not None:
            state["has_weapon"] = has_weapon
        if is_indoors is not None:
            state["indoors"] = is_indoors
        if time_of_day:
            state["time_of_day"] = time_of_day
        if nearby_npcs:
            state["nearby_npcs"] = nearby_npcs
        state.update(extra)
        return state
|
|
|
|
|
|
# --- Example Verification Flows ---
# The constants below hold copy-pasteable snippets as plain strings; they are
# documentation only and are never executed by this module.

EXAMPLE_MORROWIND_VERIFICATION = """
# Verify player is in Balmora with a weapon
verifier = VisualStateVerifier()
result = verifier.verify_state(
    screenshot_path="/tmp/morrowind_screenshot.png",
    expected_state=VisualStateVerifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True
    ),
    context="After completing the first Caius Cosades quest",
    game="morrowind"
)

if result.verified:
    print(f"State confirmed: {result.details}")
else:
    print(f"State mismatch: {result.mismatches}")
"""
|
|
|
|
# Batch example. The dict keys deliberately match verify_state's parameter
# names (screenshot_path / expected_state / context) so that each entry can
# be splatted into the call with **state.
EXAMPLE_BATCH_VERIFICATION = """
# Verify multiple game states in sequence
states = [
    {"screenshot_path": "screen1.png", "expected_state": {"location": "Seyda Neen"}, "context": "After character creation"},
    {"screenshot_path": "screen2.png", "expected_state": {"location": "Balmora", "has_weapon": True}, "context": "After buying weapon"},
    {"screenshot_path": "screen3.png", "expected_state": {"health_above": 80}, "context": "After resting"},
]

verifier = VisualStateVerifier()
for state in states:
    result = verifier.verify_state(**state, game="morrowind")
    print(f"{state['context']}: {'PASS' if result.verified else 'FAIL'} (confidence: {result.confidence:.0%})")
"""
|
|
|
|
if __name__ == "__main__":
    # Demo: build an expected Morrowind state and run a verification against
    # a demo screenshot path. If that file does not exist, the printed details
    # are the "Screenshot not found" message; otherwise they are the
    # pending-analysis instruction, which embeds the start of the prompt.
    verifier = VisualStateVerifier()
    expected = verifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True,
        nearby_npcs=["Caius Cosades"]
    )
    result = verifier.verify_state(
        screenshot_path="/tmp/demo_screenshot.png",
        expected_state=expected,
        context="Player should have completed the first quest",
        game="morrowind"
    )
    print(result.details)
|