Files
Timmy-time-dashboard/scripts/visual_state_verifier.py
Alexander Whitestone 0fcd53cf3e feat: add visual state verification module for game agents
Multimodal screenshot-based state verification:
- Generic verifier for any game with screenshots
- Morrowind-specific state builder
- Structured prompt generation for vision models
- JSON result parsing with confidence scoring
- Batch verification examples

Relates to #1482
2026-04-09 14:52:08 +00:00

286 lines
9.6 KiB
Python

"""
Visual State Verification Module for Game Agents
=================================================
Provides screenshot-based environmental state verification for game agents
(Morrowind, Minecraft, or any game with a screenshot API). Uses multimodal
analysis to confirm agent expectations match actual game state.
Usage:
    from scripts.visual_state_verifier import VisualStateVerifier

    verifier = VisualStateVerifier()
    result = verifier.verify_state(
        screenshot_path="/tmp/game_screenshot.png",
        expected_state={"location": "Balmora", "health_above": 50, "has_weapon": True},
        context="Player should be in Balmora with a weapon equipped"
    )
    print(result.verified)  # True/False
    print(result.details)   # Human-readable analysis
"""
import json
import os
import subprocess
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Optional
class VerificationStatus(Enum):
    """Status of a visual state verification."""

    # The vision model's verdict says the expected conditions hold.
    VERIFIED = "verified"
    # The vision model's verdict says the expected conditions do not hold.
    FAILED = "failed"
    # No usable verdict is available (analysis pending or unparseable).
    UNCERTAIN = "uncertain"
    # Verification could not run at all (e.g. the screenshot is missing).
    ERROR = "error"
@dataclass
class VerificationResult:
    """Result of a visual state verification."""

    # Overall outcome category of the verification.
    status: VerificationStatus
    # Convenience flag: True only when the expected state was confirmed.
    verified: bool
    # Confidence in the verdict.
    confidence: float  # 0.0 - 1.0
    # Human-readable summary (or error / pending message).
    details: str
    # The expected-state dict the screenshot was checked against.
    expected: dict
    # State reported as actually observed; defaults to an empty dict.
    observed: dict = field(default_factory=dict)
    # Conditions reported as failed; defaults to an empty list.
    mismatches: list = field(default_factory=list)
    # Path of the screenshot that was (or was meant to be) analyzed.
    screenshot_path: Optional[str] = None
class VisualStateVerifier:
    """
    Verifies game state by analyzing screenshots against expected conditions.

    Supports any game that can produce screenshots. Designed for integration
    with MCP screenshot tools and vision analysis capabilities.

    Note: ``_analyze_screenshot`` does not call a vision model itself; it
    returns a JSON "instruction" payload for the calling agent. Until that
    method is replaced with a real backend call, ``verify_state`` on an
    existing screenshot therefore yields an UNCERTAIN "Pending analysis"
    result.
    """

    def __init__(self, vision_backend: str = "builtin"):
        """
        Args:
            vision_backend: "builtin" for MCP vision, "ollama" for local model.
                The value is stored on the instance; no method in this class
                reads it yet.
        """
        self.vision_backend = vision_backend

    def verify_state(
        self,
        screenshot_path: str,
        expected_state: dict,
        context: str = "",
        game: str = "generic"
    ) -> VerificationResult:
        """
        Verify a game screenshot matches expected state conditions.

        Args:
            screenshot_path: Path to the screenshot file
            expected_state: Dict of expected conditions, e.g.:
                {
                    "location": "Balmora",
                    "health_above": 50,
                    "has_weapon": True,
                    "time_of_day": "day",
                    "nearby_npcs": ["Caius Cosades"]
                }
            context: Additional context for the vision model
            game: Game name for context ("morrowind", "minecraft", "generic")

        Returns:
            VerificationResult with status, confidence, and details. A missing
            screenshot produces an ERROR result instead of raising.
        """
        # is_file() rather than exists(): a directory is not a screenshot.
        if not Path(screenshot_path).is_file():
            return VerificationResult(
                status=VerificationStatus.ERROR,
                verified=False,
                confidence=0.0,
                details=f"Screenshot not found: {screenshot_path}",
                expected=expected_state,
                screenshot_path=screenshot_path
            )

        # Build verification prompt
        prompt = self._build_prompt(expected_state, context, game)
        # Analyze screenshot
        analysis = self._analyze_screenshot(screenshot_path, prompt)
        # Parse results
        return self._parse_analysis(analysis, expected_state, screenshot_path)

    def _build_prompt(self, expected: dict, context: str, game: str) -> str:
        """Build a structured verification prompt for the vision model.

        Each expected condition becomes one "- key: ..." bullet; the prompt
        ends by asking the model for a fenced JSON verdict block.
        """
        conditions = []
        for key, value in expected.items():
            # bool must be tested before int/float: bool is a subclass of int.
            if isinstance(value, bool):
                conditions.append(f"- {key}: {'yes' if value else 'no'}")
            elif isinstance(value, (int, float)):
                conditions.append(f"- {key}: {value} or better")
            elif isinstance(value, list):
                conditions.append(f"- {key}: should include {', '.join(str(v) for v in value)}")
            else:
                conditions.append(f"- {key}: {value}")

        condition_block = "\n".join(conditions)
        context_line = context if context else "No additional context provided."
        return (
            f"Analyze this {game} game screenshot and verify the following conditions:\n"
            f"{condition_block}\n"
            f"Context: {context_line}\n"
            "For each condition, state VERIFIED, FAILED, or UNCERTAIN with a brief reason.\n"
            "End with a JSON block:\n"
            "```json\n"
            "{\n"
            '"verified": true/false,\n'
            '"confidence": 0.0-1.0,\n'
            '"details": "brief summary",\n'
            '"mismatches": ["list of failed conditions"]\n'
            "}\n"
            "```\n"
        )

    def _analyze_screenshot(self, path: str, prompt: str) -> str:
        """
        Send screenshot to vision backend for analysis.

        In a live agent context, this would call the MCP vision tool.
        For standalone use, it returns the prompt for manual invocation.

        Returns:
            A JSON string with "prompt", "screenshot_path" and "instruction"
            keys; the screenshot itself is not read here.
        """
        # Return structured prompt for the calling agent to process
        return json.dumps({
            "prompt": prompt,
            "screenshot_path": str(path),
            "instruction": "Use vision_analyze tool with this prompt and screenshot_path"
        })

    @staticmethod
    def _extract_verdict(analysis: str) -> Optional[dict]:
        """Pull the fenced ```json verdict out of a vision-model reply.

        Returns:
            A dict with normalized "verified" (bool), "confidence" (float
            clamped to 0.0-1.0), "details" (str) and "mismatches" (list)
            keys, or None when no parseable JSON object is present.
        """
        import re

        match = re.search(r"```json\s*({.*?})\s*```", analysis, re.DOTALL)
        if match is None:
            return None
        try:
            payload = json.loads(match.group(1))
        except json.JSONDecodeError:
            return None
        if not isinstance(payload, dict):
            return None

        # Model output is untrusted: a non-numeric confidence becomes 0.0.
        try:
            confidence = float(payload.get("confidence", 0.0))
        except (TypeError, ValueError):
            confidence = 0.0
        # Keep 0.0 as the first argument to max(): that ordering also maps
        # NaN to 0.0, because every comparison against NaN is False.
        confidence = min(1.0, max(0.0, confidence))

        mismatches = payload.get("mismatches")
        if mismatches is None:
            mismatches = []
        elif not isinstance(mismatches, list):
            mismatches = [mismatches]

        return {
            # Only a real JSON `true` counts; the string "false" must not.
            "verified": payload.get("verified") is True,
            "confidence": confidence,
            "details": str(payload.get("details") or ""),
            "mismatches": mismatches,
        }

    def _parse_analysis(
        self, analysis: str, expected: dict, screenshot_path: str
    ) -> VerificationResult:
        """Parse vision analysis into a VerificationResult.

        Three outcomes, tried in order:
          1. ``analysis`` is the JSON instruction payload produced by
             ``_analyze_screenshot`` -> UNCERTAIN "Pending analysis".
          2. ``analysis`` contains a fenced JSON verdict -> VERIFIED/FAILED.
          3. Anything else -> UNCERTAIN, with the first 500 characters of the
             raw text as details.
        """
        try:
            data = json.loads(analysis)
        except json.JSONDecodeError:
            data = None

        # json.loads can return any JSON type; only a dict can be the payload.
        if isinstance(data, dict) and "instruction" in data:
            pending_path = data.get("screenshot_path", screenshot_path)
            pending_prompt = str(data.get("prompt", ""))
            # Not yet analyzed - return pending
            return VerificationResult(
                status=VerificationStatus.UNCERTAIN,
                verified=False,
                confidence=0.0,
                details=f'Pending analysis. Run: vision_analyze("{pending_path}", "{pending_prompt[:100]}...")',
                expected=expected,
                screenshot_path=screenshot_path
            )

        # Parse text analysis for JSON block
        verdict = self._extract_verdict(analysis)
        if verdict is not None:
            status = VerificationStatus.VERIFIED if verdict["verified"] else VerificationStatus.FAILED
            return VerificationResult(
                status=status,
                verified=verdict["verified"],
                confidence=verdict["confidence"],
                details=verdict["details"],
                expected=expected,
                mismatches=verdict["mismatches"],
                screenshot_path=screenshot_path
            )

        # Fallback: return as uncertain
        return VerificationResult(
            status=VerificationStatus.UNCERTAIN,
            verified=False,
            confidence=0.3,
            details=analysis[:500],
            expected=expected,
            screenshot_path=screenshot_path
        )

    @staticmethod
    def morrowind_state(
        location: Optional[str] = None,
        health_min: Optional[int] = None,
        has_weapon: Optional[bool] = None,
        is_indoors: Optional[bool] = None,
        time_of_day: Optional[str] = None,
        nearby_npcs: Optional[list] = None,
        **extra
    ) -> dict:
        """Build expected state dict for Morrowind.

        Only supplied arguments produce keys. ``health_min`` is stored as
        "health_above" and ``is_indoors`` as "indoors". Empty strings and
        empty lists are skipped, while ``0`` and ``False`` are kept. Any
        ``extra`` keyword arguments are merged in last and can override the
        named keys.
        """
        state = {}
        if location:
            state["location"] = location
        if health_min is not None:
            state["health_above"] = health_min
        if has_weapon is not None:
            state["has_weapon"] = has_weapon
        if is_indoors is not None:
            state["indoors"] = is_indoors
        if time_of_day:
            state["time_of_day"] = time_of_day
        if nearby_npcs:
            state["nearby_npcs"] = nearby_npcs
        state.update(extra)
        return state
# --- Example Verification Flows ---
# Example snippet (kept as text, not executed on import): verify a single
# Morrowind screenshot against a state built with morrowind_state().
EXAMPLE_MORROWIND_VERIFICATION = """
# Verify player is in Balmora with a weapon
verifier = VisualStateVerifier()
result = verifier.verify_state(
    screenshot_path="/tmp/morrowind_screenshot.png",
    expected_state=VisualStateVerifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True
    ),
    context="After completing the first Caius Cosades quest",
    game="morrowind"
)
if result.verified:
    print(f"State confirmed: {result.details}")
else:
    print(f"State mismatch: {result.mismatches}")
"""
# Example snippet (kept as text, not executed on import): verify several
# screenshots in a loop. The dict keys must match verify_state()'s parameter
# names (screenshot_path, expected_state, context) because each dict is
# unpacked with ** into the call.
EXAMPLE_BATCH_VERIFICATION = """
# Verify multiple game states in sequence
states = [
    {"screenshot_path": "screen1.png", "expected_state": {"location": "Seyda Neen"}, "context": "After character creation"},
    {"screenshot_path": "screen2.png", "expected_state": {"location": "Balmora", "has_weapon": True}, "context": "After buying weapon"},
    {"screenshot_path": "screen3.png", "expected_state": {"health_above": 80}, "context": "After resting"},
]
verifier = VisualStateVerifier()
for state in states:
    result = verifier.verify_state(**state, game="morrowind")
    print(f"{state['context']}: {'PASS' if result.verified else 'FAIL'} (confidence: {result.confidence:.0%})")
"""
if __name__ == "__main__":
    # Demo: build and display a verification prompt
    verifier = VisualStateVerifier()
    expected = verifier.morrowind_state(
        location="Balmora",
        health_min=50,
        has_weapon=True,
        nearby_npcs=["Caius Cosades"]
    )
    demo_context = "Player should have completed the first quest"

    # Print the prompt directly: verify_state() returns early with
    # "Screenshot not found" when the demo file is absent, so its result
    # alone would never show the prompt this demo is meant to display.
    print(verifier._build_prompt(expected, demo_context, "morrowind"))

    result = verifier.verify_state(
        screenshot_path="/tmp/demo_screenshot.png",
        expected_state=expected,
        context=demo_context,
        game="morrowind"
    )
    print(result.details)